[Caic-commits] r89 - in pkg: . R man

Wed Apr 22 13:17:16 CEST 2009

Author: davidorme
Date: 2009-04-22 13:17:16 +0200 (Wed, 22 Apr 2009)
New Revision: 89

Added:
   pkg/R/phylo.d.R
   pkg/man/phylo.d.Rd
Modified:
   pkg/DESCRIPTION
Log:
Added phylo.d (aka SOCC, sum of character change) code

Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2009-04-07 15:58:25 UTC (rev 88)
+++ pkg/DESCRIPTION	2009-04-22 11:17:16 UTC (rev 89)
@@ -1,11 +1,11 @@
 Package: CAIC
 Type: Package
 Title: Comparative Analyses using Independent Contrasts
-Version: 1.0.4-86
+Version: 1.0.4-89
 Date: 2008-10-01
-Author: David Orme, Rob Freckleton, Gavin Thomas, Thomas Petzoldt
+Author: David Orme, Rob Freckleton, Gavin Thomas, Thomas Petzoldt, Susanne Fritz
 Maintainer: David Orme <d.orme at imperial.ac.uk>
-Depends: R (>= 2.6.0), ape, MASS, mvtnorm
+Depends: R (>= 2.6.0), ape, MASS, mvtnorm, geiger
 Description: Functions for performing comparative analysis using independent contrasts and tree simulation.
 License: GPL version 2 or newer 
 

Added: pkg/R/phylo.d.R
===================================================================
--- pkg/R/phylo.d.R	                        (rev 0)
+++ pkg/R/phylo.d.R	2009-04-22 11:17:16 UTC (rev 89)
@@ -0,0 +1,156 @@
+phylo.d <- function(ds, phy, binvar, tipnames, permut=1000, verbose=TRUE) {
+	
+	# data checking
+	if (!is.data.frame(ds)) (stop("'", deparse(substitute(ds)), "' must be an object of class 'data.frame'."))
+	if (!inherits(phy, "phylo")) (stop("'", deparse(substitute(phy)), "' is not of class 'phylo'."))
+	binvar <- deparse(substitute(binvar))
+    bininds <- match(binvar, names(ds))
+    if (is.na(bininds)) (stop("'", binvar, "' is not a column name in ds."))
+    tipnames <- deparse(substitute(tipnames))
+    tipsinds <- match(tipnames, names(ds))
+    if (is.na(tipsinds)) (stop("'", tipnames, "' is not a column name in ds."))
+    if (!is.numeric(permut)) (stop("'", permut, "' is not numeric.")) 
+    phyName <- deparse(substitute(phy))
+    dsName <- deparse(substitute(ds))
+
+	d.data <- function (ds, phy, bininds, tipsinds, permut=1000, verbose=FALSE) {
+		
+	    # make up 1000 random data huffles
+		if (verbose) (print("Computing phylogenetically random permutations..."))
+		kseq <- seq(length(ds)+1, by=1, length.out=permut)
+		for (k in kseq) { 
+			ds[,k] <- sample(ds[,bininds])
+			names(ds)[k] <- paste("Var", k, sep="")
+		}
+		names(ds)[tipsinds] <- "tipnames"
+		explvarsran <- paste(paste("Var", kseq, sep=""), collapse="+")
+		
+		# make up 1000 simulated characters, brownian, lambda=0.999
+		if (verbose) (print("Computing phylogenetically clumped permutations..."))
+	    lamtree <- lambdaTree(phy, 1)
+	    simmat <- matrix(rep(0,permut^2), nrow=permut, ncol=permut) # covariance between traits is zero
+	    for (i in 1:permut) (simmat[i,i] <- 1) # trait variance is 1
+	    simchar <- sim.char(lamtree, simmat)
+	    simds <- as.data.frame(simchar[1:length(ds[,1]),1:permut,1])
+	    simds$tipnames <- rownames(simds)
+	    simds$resp <- sample(simds[,1])
+	    propone <- length(which(ds[,bininds]==1)) / length(ds[,bininds]) # proportion of ones in the response variable
+	    
+	    # convert the simulated traits into binary traits using the proportion of ones in the response variable as threshold
+	    for (k in 1:permut) {
+		    tipvals <- simds[,k]
+		    names(tipvals) <- rownames(simds)
+		    tipvals <- sort(tipvals)
+			ones <- names(tipvals)[(length(tipvals)-round(length(tipvals)*propone, 0)+1):length(tipvals)]
+			simds[which(is.element(rownames(simds), ones)),k] <- round(1,0)
+			simds[which(!is.element(rownames(simds), ones)),k] <- round(0,0)
+		}
+		explvarssim <- paste(paste("V", c(1:permut), sep=""), collapse="+")
+		
+		return(list(ds.ran=ds, formula.ran=paste(names(ds)[bininds], "~", explvarsran, sep=""), ds.sim=simds, formula.sim=paste("resp~", explvarssim, sep="")))
+		
+	}
+
+	
+	d.fit <- function(dds, phy, verbose=FALSE) {
+		
+		if (verbose) (print("Getting nodal value estimates..."))
+		caicran <- crunch(as.formula(dds[[2]]), dds[[1]], phy, names.col=tipnames, polytomy.brlen=0)
+		caicsim <- crunch(as.formula(dds[[4]]), dds[[3]], phy, names.col=tipnames, polytomy.brlen=0)
+		return(list(caicran=caicran, caicsim=caicsim, ds.ran=dds$ds.ran, ds.sim=dds$ds.sim))
+		
+	}
+	
+	sumofdiffs <- function (caicobj, ds, tipnames, resp=FALSE) {
+		
+		# data checking
+		if (!is.data.frame(ds)) (stop("'", deparse(substitute(ds)), "' must be an object of class 'data.frame'."))
+		if (!inherits(caicobj, "caic")) (stop("'", deparse(substitute(caicobj)), "' is not of class 'caic'."))
+	    tipnames <- deparse(substitute(tipnames))
+	    tipsinds <- match(tipnames, names(ds))
+	    if (is.na(tipsinds)) (stop("'", tipnames, "' is not a column name in ds."))
+	    
+		edges <- as.data.frame(caicobj$phy$edge)
+		edges$bl <- caicobj$phy$edge.length
+		responsev <- attr(caicobj$contrast.data$contr$response, "dimnames")[[2]]
+		variables <- attr(caicobj$contrast.data$contr$explanatory, "dimnames")[[2]]
+		
+		# response variable
+		if (resp) {
+			nvs <- caicobj$contrast.data$nodalVals$response[,1]
+			edges$startnv <- nvs[match(edges[,1], names(nvs))]
+			tips <- ds[match(caicobj$phy$tip.label, ds[,tipsinds]),which(names(ds)==responsev)]
+			names(tips) <- c(1:length(tips))
+			allnvs <- c(nvs, tips)
+			allnvs <- allnvs[order(as.numeric(names(allnvs)))]
+			edges$endnv <- allnvs[match(edges[,2], names(allnvs))]
+			edges$change <- abs(edges$startnv-edges$endnv)
+			rtvr <- sum(edges$change)
+			names(rtvr) <- responsev
+			obsnvs <- nvs
+		} else {
+			rtvr <- NA
+			obsnvs <- NA
+		}
+		
+		# explanatory variables
+		rtv <- numeric(0)
+		for (i in seq(along=variables)) {
+			nvs <- caicobj$contrast.data$nodalVals$explanatory[,which(attr(caicobj$contrast.data$nodalVals$explanatory, "dimnames")[[2]]==variables[i])]
+			edges$startnv <- nvs[match(edges[,1], names(nvs))]
+			tips <- ds[,which(names(ds)==variables[i])][match(caicobj$phy$tip.label, ds[,tipsinds])]
+			names(tips) <- c(1:length(tips))
+			allnvs <- c(nvs, tips)
+			allnvs <- allnvs[order(as.numeric(names(allnvs)))]
+			edges$endnv <- allnvs[match(edges[,2], names(allnvs))]
+			edges$change <- abs(edges$startnv-edges$endnv)
+			rtv <- append(rtv, sum(edges$change))
+			names(rtv)[i] <- variables[i]
+			if (i==1) (nvsds <- data.frame(t(nvs))) else (nvsds[i,] <- nvs)
+		}
+		
+		return(list(sumoc.resp=rtvr, sumoc.expl=rtv, obs.nvs=obsnvs, perm.nvs=nvsds))
+	}
+	
+	
+	d.compute <- function(dfit, verbose=FALSE) {
+		
+		if (verbose) (print("Computing output..."))
+		ransocc <- sumofdiffs(dfit$caicran, dfit$ds.ran, tipnames, resp=TRUE)
+		simsocc <- sumofdiffs(dfit$caicsim, dfit$ds.sim, tipnames)
+		soccratio <- (as.numeric(ransocc$sumoc.resp)-mean(simsocc$sumoc.expl)) / (mean(ransocc$sumoc.expl)-mean(simsocc$sumoc.expl))
+		soccpval1 <- length(which(ransocc$sumoc.expl<as.numeric(ransocc$sumoc.resp))) / length(ransocc$sumoc.expl)
+		soccpval0 <- length(which(simsocc$sumoc.expl>as.numeric(ransocc$sumoc.resp))) / length(simsocc$sumoc.expl)	
+		ret <- list(DEstimate=soccratio, Pval1=soccpval1, Pval0=soccpval0, Parameters=list(Observed=as.numeric(ransocc$sumoc.resp), MeanRandom=mean(ransocc$sumoc.expl), MeanBrownian=mean(simsocc$sumoc.expl)), Permutations=list(random=ransocc$sumoc.expl, brownian=simsocc$sumoc.expl), NodalVals=list(observed=ransocc$obs.nvs, random=ransocc$perm.nvs, brownian=simsocc$perm.nvs))
+		# if (verbose) (cat(" D estimate - ", ret$DEstimate, "\n P value for D<1 - ", ret$Pval1, "\n P value for D>0 - ", ret$Pval0, "\n"))
+		return(ret)
+	}
+	
+
+	dds <- d.data(ds=ds, phy=phy, bininds=bininds, tipsinds=tipsinds, permut=permut, verbose=verbose)
+	dfit <- d.fit(dds=dds, phy=phy, verbose=verbose)
+	dvals <- d.compute(dfit, verbose=verbose)
+	dvals$binvar <- binvar
+	dvals$phyName <- phyName
+	dvals$dsName <- dsName
+	dvals$nPermut <- permut
+	return(dvals)
+	
+}
+
+print.phylo.d <- function(x, ...){
+    summary(x)
+}
+
+summary.phylo.d <- function(object, ...){
+    cat('\nCalculation of D statistic for the phylogenetic structure of a binary variable\n')
+    cat('\n  Data : ', object$dsName)
+    cat('\n  Binary variable : ', object$binvar)
+    cat('\n  Phylogeny : ', object$phyName)
+    cat('\n  Number of permutations : ', object$nPermut)
+    
+    cat("\n\nEstimated D : ", object$DEstimate)
+    cat("\nProbability of E(D) resulting from no (random) phylogenetic structure : ", object$Pval1)
+    cat("\nProbability of E(D) resulting from Brownian phylogenetic structure    : ", object$Pval0)
+    cat("\n\n")
+}
\ No newline at end of file

Added: pkg/man/phylo.d.Rd
===================================================================
--- pkg/man/phylo.d.Rd	                        (rev 0)
+++ pkg/man/phylo.d.Rd	2009-04-22 11:17:16 UTC (rev 89)
@@ -0,0 +1,75 @@
+\name{phylo.d}
+\alias{phylo.d}
+\alias{print.phylo.d}
+\alias{summary.phylo.d}
+
+%- Also NEED an '\alias' for EACH other topic documented here.
+\title{Calculates the phylogenetic D statistic}
+\description{
+Calculates the D value, a measure of phylogenetic signal in a binary trait, and tests the estimated D value for significant departure from one (phylogenetically random) and zero (phylogenetically as clumped as expected under a Brownian evolution threshold model).
+}
+\usage{
+phylo.d(ds, phy, binvar, tipnames, permut = 1000, verbose = TRUE)
+\method{print}{phylo.d}(x, ...)
+\method{summary}{phylo.d}(object, ...)
+
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{ds}{An object of class 'data.frame'.}
+  \item{phy}{An object of class 'phylo'.}
+  \item{binvar}{The name of the column in \code{ds} holding the binary variable of interest.}
+  \item{tipnames}{The name of the column in \code{ds} holding the names to be used to match rows in the dataset (\code{ds}) to the tipnames in the phylogeny (\code{phy})}
+  \item{permut}{Number of permutations to be used in the randomisation test.}
+  \item{verbose}{If TRUE, the function reports progress and other output likely to be important to users}
+  \item{x}{An object of class 'phylo.d'}
+  \item{object}{An object of class 'phylo.d'}
+}
+
+\details{
+D is one if there is no phylogenetic signal in the trait, and zero if the phylogenetic signal is the same as that expected under a Brownian threshold model (see also reference for details).
+Calculation of D is based on the observed sum of sister-clade differences for the binary trait of interest, which is scaled using two expectations: a phylogenetically random distribution of zeros and ones at the tips of the phylogeny, and a phylogenetically clumped distribution of ones that is simulated under a Brownian model. The scaling and significance testing uses 1000 permutations each (default) for the random and Brownian expectations, which are based on the phylogeny and binary data of interest, so that D itself becomes independent of phylogeny size and proportion of ones in the binary trait (prevalence).
+For the random expectation, trait values are randomly shuffled along the tips of the phylogeny. For the Brownian expectation, a continuous trait is evolved under Brownian evolution in each permutation, and then the sum of sister-clade differences is computed for a binary trait ranked on the basis of that continuous trait (values higher than a threshold are ones, and the threshold is defined so that the simulated binary trait has the same prevalence as the observed one).
+}
+\value{
+Returns an object of class 'phylo.d', which is a list of the following:
+\item{DEstimate} {The estimated D value}
+\item{Pval1} {A p value, giving the result of testing whether D is significantly different from one}
+\item{Pval0} {A p value, giving the result of testing whether D is significantly different from zero}
+\item{Parameters} {A list of the Observed, MeanRandom and MeanBrownian sums of sister-clade differences}
+\item{Permutations} {A list with elements random and brownian, containing the sums of sister-clade differences from random permutations and simulations of Brownian evolution under a threshold model}
+\item{NodalVals} {A list with elements observed, random and brownian, containing the nodal values estimated for the observed trait (as a numeric with names that correspond to node numbers in the phylo object) and those estimated in the permutations (as dataframes with a column for each node, named e.g. X17 for node 17)}
+\item{binvar}{The binary variable used}
+\item{phyName}{The name of the phylogeny object used}
+\item{dsName}{The name of the dataframe used}
+\item{nPermut}{The number of permutations used}
+}
+}
+\references{ S. A. Fritz & A. Purvis (2009) In prep. }
+\author{Susanne Fritz (SFritz at bio.ku.dk)}
+\examples{
+data(shorebird)
+
+# add threatened status (threat data from IUCN Red List 2008)
+# and a phylogenetically clumped dummy example to the data frame
+shorebird.data$threat <- rep(c(0,1,0,1,0,1,0,1,0,1,0,1,0,1,0), times=c(22,2,2,2,1,1,4,1,8,1,3,1,1,2,20))
+shorebird.data$clustered <- rep(c(1,0,1,0,1,0,1,0,1,0,1,0), times=c(4,13,2,12,1,2,1,7,2,2,2,23))
+
+# plot the data
+par(mfrow=c(1,2))
+tipOrd <- match(shorebird.data$Species, shorebird.tree$tip.label)
+plot(shorebird.tree, show.tip.label=FALSE) 
+tiplabels(tip=tipOrd, pch=21, bg=ifelse(shorebird.data$threat==0, 'white','black'))
+plot(shorebird.tree, show.tip.label=FALSE) 
+tiplabels(tip=tipOrd, pch=21, bg=ifelse(shorebird.data$clustered==0, 'white','black'))
+
+# an example where D is not significantly different from a brownian threshold model of evolution (0)
+ex1 <- phylo.d(shorebird.data, shorebird.tree, threat, Species)
+# an example where D is not significantly different from random (1)
+ex2 <- phylo.d(shorebird.data, shorebird.tree, clustered, Species)
+}
+
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{ utilities }
+\keyword{ htest }% __ONLY ONE__ keyword per line