[Picante-commits] r214 - in pkg: R man

Thu Feb 11 08:01:01 CET 2010

Author: skembel
Date: 2010-02-11 08:01:01 +0100 (Thu, 11 Feb 2010)
New Revision: 214

Modified:
   pkg/R/data.checking.R
   pkg/R/phylobeta.R
   pkg/R/raoD.R
   pkg/man/comdist.Rd
   pkg/man/data.checking.Rd
   pkg/man/evolve.brownian.Rd
   pkg/man/phylocom.Rd
   pkg/man/raoD.Rd
Log:
Fix documentation typos. Update comdist/comdistnt functions for better performance.

Modified: pkg/R/data.checking.R
===================================================================

--- pkg/R/data.checking.R	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/R/data.checking.R	2010-02-11 07:01:01 UTC (rev 214)
@@ -91,10 +91,63 @@
             res$phy <- phy
         }
         
-        res$data <- data[res$phy$tip.label,]
+        if (dataclass == "data.frame") {
+            res$data <- as.data.frame(data[res$phy$tip.label,])
+        } else {
+            res$data <- data[res$phy$tip.label,]
+        }
         return(res)   
              
     } else {
         stop("Data must be a vector, data.frame, or matrix")
     }
+}
+
+
+match.comm.dist <- function(comm, dis) {
+
+    res <- list()
+
+    commtaxa <- colnames(comm)
+
+     if(is.null(commtaxa)) {
+        stop("Community data set lacks taxa (column) names, these are required to match distance matrix and community data")
+    }
+
+    disclass <- class(dis)
+    dis <- as.matrix(dis)
+
+    distaxa <- rownames(dis)
+
+    if(is.null(distaxa)) {
+        warning("Distance matrix lacks taxa names, these are required to match community and distance matrix. Data are returned unsorted. Assuming that distance matrix and community data taxa columns are in the same order!")
+        if (disclass == "dist") {
+            return(list(comm=comm,dist=as.dist(dis))) 
+        } else {
+            return(list(comm=comm,dist=dis))             
+        }
+    }
+
+    if(!all(distaxa %in% commtaxa)) {
+        print("Dropping taxa from the distance matrix because they are not present in the community data:")
+        print(setdiff(distaxa,commtaxa))
+        dis <- dis[intersect(distaxa,commtaxa), intersect(distaxa,commtaxa)]
+        distaxa <- rownames(dis)
+    }
+    
+    if(any(!(commtaxa %in% distaxa))) {
+        print("Dropping taxa from the community because they are not present in the distance matrix:")
+        print(setdiff(commtaxa,distaxa))
+        res$comm <- comm[,intersect(commtaxa,distaxa)]
+    } else {
+        res$comm <- comm
+    }
+    
+    if (disclass == "dist") {
+        res$dist <- as.dist(dis[colnames(comm),colnames(comm)])
+    } else {
+        res$dist <- dis[colnames(comm),colnames(comm)]
+    }
+    return(res)   
+
 }
\ No newline at end of file

Modified: pkg/R/phylobeta.R
===================================================================
--- pkg/R/phylobeta.R	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/R/phylobeta.R	2010-02-11 07:01:01 UTC (rev 214)
@@ -2,63 +2,80 @@
 
 #comdist
 #mean pairwise distance among taxa from two communities
-comdist <- function(comm,dis,abundance.weighted=FALSE,exclude.conspecifics=FALSE) {
-    N <- dim(comm)[1]
-    #relativize abundances for inter-sample comparisons
-    comm <- decostand(comm,method="total",MARGIN=1)
-    comdis <- matrix(nrow=N,ncol=N)
-    for (i in 1:(N-1)) {
-        for (j in (i+1):N) {
-            sppInSample1 <- names(comm[i, comm[i, ] > 0])
-            sppInSample2 <- names(comm[j, comm[j, ] > 0])
-            if ((length(sppInSample1) > 1) && (length(sppInSample2) > 1)) {
-                sample.dis <- dis[sppInSample1, sppInSample2]
-                if (exclude.conspecifics) {
-                    sample.dis[sample.dis == 0] <- NA
-                }                
-                if (abundance.weighted) {
-                    sample.weights <- comm[i,sppInSample1] %*% t(comm[j,sppInSample2])
-                    comdis[i,j] <- weighted.mean(sample.dis,sample.weights,na.rm=TRUE)
-                }
-                else {
-                    comdis[i,j] <- mean(sample.dis,na.rm=TRUE)                
-                }
-            }
-            else{
-                comdis[i,j] <- NA
-            }
-        }
+comdist <- function(comm, dis, abundance.weighted=FALSE) {
+
+    x <- as.matrix(comm)
+    dat <- match.comm.dist(comm, dis)
+    x <- dat$comm
+    dis <- as.matrix(dat$dist)
+    if (!abundance.weighted) {
+        x <- decostand(x, method="pa")
     }
-    rownames(comdis) <- colnames(comdis) <- rownames(comm)
-    as.dist(t(comdis))
+    N <- dim(x)[1]
+    S <- dim(x)[2]
+    x <- decostand(x, method="total", MARGIN=1)
+
+	comdist <- matrix(nrow=N,ncol=N)    
+	for (l in 1:(N-1)) {
+		for (k in 2:N) {
+			comdist[k,l] <-
+			sum ( dis * outer(as.vector(t(x[k,])),as.vector(t(x[l,]))) )
+		}
+	}
+
+    row.names(comdist) <- row.names(x)
+    colnames(comdist) <- row.names(x)
+    return(as.dist(comdist))
+
 }
 
-#comdistnn
+
+#comdistnt
 #mean distance to closest relative between taxa from two communities
-comdistnt <- function(comm,dis,abundance.weighted=FALSE,exclude.conspecifics=FALSE) {
+comdistnt <- function(comm, dis, abundance.weighted=FALSE, exclude.conspecifics=FALSE) {
+
     N <- dim(comm)[1]
+    dat <- match.comm.dist(comm, dis)
+    comm <- dat$comm
+    dis <- dat$dist
     comm <- decostand(comm,method="total",MARGIN=1)    
     comdisnt <- matrix(nrow=N,ncol=N)
     for (i in 1:(N-1)) {
         for (j in (i+1):N) {
-            sppInSample1 <- names(comm[i, comm[i, ] > 0])
-            sppInSample2 <- names(comm[j, comm[j, ] > 0])
+            sppInSample1 <- names(comm[i, comm[i, ] > 0, drop=FALSE])
+            sppInSample2 <- names(comm[j, comm[j, ] > 0, drop=FALSE])
             if ((length(sppInSample1) >= 1) && (length(sppInSample2) >= 1)) {
-                sample.dis <- dis[sppInSample1, sppInSample2]
+                sample.dis <- dis[sppInSample1, sppInSample2, drop=FALSE]
                 if (exclude.conspecifics) {
-                    sample.dis[sample.dis == 0] <- NA
+                    sample.dis[sample.dis==0] <- NA
                 }
+                #TODO fix min throws errors on empty set
                 sample1NT <- apply(sample.dis,1,min,na.rm=TRUE)
+                sample1NT[sample1NT == Inf] <- NA
                 sample2NT <- apply(sample.dis,2,min,na.rm=TRUE)
+                sample2NT[sample2NT == Inf] <- NA                
                 if (abundance.weighted) {
-                    sample1.weights <- comm[i,sppInSample1]
-                    sample2.weights <- comm[j,sppInSample2]
-                    sample.weights <- c(sample1.weights,sample2.weights)              
-                    comdisnt[i,j] <- weighted.mean(c(sample1NT,sample2NT),sample.weights,na.rm=TRUE)
+                    sample1.weights <- as.numeric(comm[i,sppInSample1])
+                    sample2.weights <- as.numeric(comm[j,sppInSample2])
+                    if (any(is.na(sample1NT))) {
+                        miss <- which(is.na(sample1NT))
+                        sample1NT <- sample1NT[-miss]
+                        sample1.weights <- sample1.weights[-miss]
+                        sample1.weights <- sample1.weights / sum(sample1.weights)
+                    }
+                    if (any(is.na(sample2NT))) {
+                        miss <- which(is.na(sample2NT))
+                        sample2NT <- sample2NT[-miss]
+                        sample2.weights <- sample2.weights[-miss]
+                        sample2.weights <- sample2.weights / sum(sample2.weights)
+                    }
+                    sampleNT <- c(sample1NT, sample2NT)
+                    sample.weights <- c(sample1.weights,sample2.weights)
+                    comdisnt[i,j] <- weighted.mean(sampleNT, sample.weights, na.rm=TRUE)
                 }
                 else
                 {
-                    comdisnt[i,j] <- mean(c(sample1NT,sample2NT))
+                    comdisnt[i,j] <- mean(c(sample1NT,sample2NT), na.rm=TRUE)                
                 }
             }
             else{

Modified: pkg/R/raoD.R
===================================================================
--- pkg/R/raoD.R	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/R/raoD.R	2010-02-11 07:01:01 UTC (rev 214)
@@ -29,17 +29,17 @@
 			sum ( tij * outer(as.vector(t(x[k,])),as.vector(t(x[k,]))) )
     res$Dkk <- D
 
-	H <- matrix(nrow=N,ncol=N)    
+	Dkl <- matrix(nrow=N,ncol=N)    
 	for (k in 1:N) {
 		for (l in 1:N) {
-			H[k,l] <-
+			Dkl[k,l] <-
 			sum ( tij * outer(as.vector(t(x[k,])),as.vector(t(x[l,]))) )
 		}
 	}
 
-    row.names(H) <- row.names(x)
-    colnames(H) <- row.names(x)
-	Dkl <- H
+    row.names(Dkl) <- row.names(x)
+    colnames(Dkl) <- row.names(x)
+	H <- Dkl
 	res$Dkl <- Dkl
 
 	for (k in 1:N) {

Modified: pkg/man/comdist.Rd
===================================================================
--- pkg/man/comdist.Rd	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/man/comdist.Rd	2010-02-11 07:01:01 UTC (rev 214)
@@ -5,7 +5,7 @@
   Calculates MPD (mean pairwise distance) separating taxa in two communities, a measure of phylogenetic beta diversity
 }
 \usage{
-comdist(comm, dis, abundance.weighted = FALSE, exclude.conspecifics = FALSE)
+comdist(comm, dis, abundance.weighted = FALSE)
 
 }
 
@@ -13,7 +13,6 @@
   \item{comm}{ Community data matrix }
   \item{dis}{ Interspecific distance matrix }
   \item{abundance.weighted}{ Should mean pairwise distances separating species in two communities be weighted by species abundances? (default = FALSE)}
-  \item{exclude.conspecifics}{ Should conspecific taxa in different communities be excluded from MPD calculations? (default = FALSE)}
 }  
 \value{
 	Distance object of MPD values separating each pair of communities.

Modified: pkg/man/data.checking.Rd
===================================================================
--- pkg/man/data.checking.Rd	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/man/data.checking.Rd	2010-02-11 07:01:01 UTC (rev 214)
@@ -1,6 +1,7 @@
 \name{match.phylo.data}
 \alias{match.phylo.data}
 \alias{match.phylo.comm}
+\alias{match.comm.dist}
 
 \title{ Match taxa in phylogeny and data }
 \description{
@@ -9,28 +10,31 @@
 \usage{
 match.phylo.comm(phy, comm)
 match.phylo.data(phy, data)
+match.comm.dist(comm, dis)
 }
 
 \arguments{
   \item{ phy }{ A phylogeny object of class phylo }
   \item{ comm }{ Community data matrix }
   \item{ data }{ A data object - a vector (with names matching phy) or a data.frame or matrix (with row names matching phy) }
+  \item{ dis }{ A distance matrix - a dist or matrix object }
 }
 
 \value{
   A list containing the following elements, pruned and sorted to match one another:
   \item{phy}{ A phylogeny object of class phylo }
   \item{comm}{ Community data matrix }
-  \item{data}{ A data object (vector, data.frame or matrix) }  
+  \item{data}{ A data object (vector, data.frame or matrix) }
+  \item{dist}{ A distance matrix - a dist or matrix object }
 }
 \details{
 A common pitfall in comparative analyses in R is that taxa labels are assumed to match between phylogenetic and other data sets. These functions prune a phylogeny and community or trait data set to match one another, reporting taxa that are missing from one data set or the other.
 
-Taxa names for phylogeny objects are taken from the phylogeny's tip labels. Taxa names for community data are taken from the column names. Taxa names for trait data are taken from the element names (vector) or row names (data.frame or matrix).
+Taxa names for phylogeny objects are taken from the phylogeny's tip labels. Taxa names for community data are taken from the column names. Taxa names for trait data are taken from the element names (vector) or row names (data.frame or matrix). Taxa names for distance data are taken from column/row names of the distance matrix/dist object.
 
 If community data lack taxa names, the function will issue a warning and no result will be returned, since the community-phylogenetic analyses in \code{picante} require named taxa in the community data set.
 
-If trait data lack names, a warning is issued and the data are assumed to be sorted in the same order as the phylogeny's tip labels.
+If trait data or distance matrix lack names, a warning is issued and the data are assumed to be sorted in the same order as the phylogeny's tip labels or community's column labels.
 
 These utility functions are used by several of the functions that assume taxa labels in phylogeny and data match, including \code{\link{Kcalc}}, \code{\link{phylosignal}}, and \code{\link{raoD}}.
 }

Modified: pkg/man/evolve.brownian.Rd
===================================================================
--- pkg/man/evolve.brownian.Rd	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/man/evolve.brownian.Rd	2010-02-11 07:01:01 UTC (rev 214)
@@ -14,7 +14,7 @@
   \item{var}{ variance }
 }
 \value{
-  Vector of trait values with names corresponding to phylo\$tip.label
+  Vector of trait values with names corresponding to phylo$tip.label
 }
 \author{ David Ackerly <dackerly at berkeley.edu> and Steven Kembel <skembel at uoregon.edu> }
 \keyword{datagen}

Modified: pkg/man/phylocom.Rd
===================================================================
--- pkg/man/phylocom.Rd	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/man/phylocom.Rd	2010-02-11 07:01:01 UTC (rev 214)
@@ -3,21 +3,21 @@
 \docType{data}
 \title{ Phylocom default data }
 \description{
-  Tree, community and trait data from the Phylocom 4.0 distribution
+  Phylogeny, community and trait data from the Phylocom 4.0 distribution
 }
 \usage{data(phylocom)}
 \format{
     A list with three elements:
     \itemize{
-      \item{phylocom\$phylo}{   Phylogenetic tree}
-      \item{phylocom\$sample}{  Community data}
-      \item{phylocom\$traits}{  Trait data}
+      \item{ phylo }{ Phylogenetic tree (an object of class phylo) }
+      \item{ sample }{ Community data (a data.frame with samples in rows and species in columns }
+      \item{ traits }{ Trait data (a data.frame with species in rows and traits in columns }
       }
 }
 
 \source{
-  Webb, C.O., Ackerly, D.D., and Kembel, S.W. 2008. Phylocom: software for the analysis of phylogenetic community structure and trait evolution. Version 4.0.1.
-   \url{http://www.phylodiversity.net/phylocom/}.
+Webb, C.O., Ackerly, D.D., and Kembel, S.W. 2008. Phylocom: software for the analysis of phylogenetic community structure and trait evolution. Version 4.0.1.
+\url{http://www.phylodiversity.net/phylocom/}.
 }
 
 \keyword{datasets}

Modified: pkg/man/raoD.Rd
===================================================================
--- pkg/man/raoD.Rd	2010-02-11 01:33:54 UTC (rev 213)
+++ pkg/man/raoD.Rd	2010-02-11 07:01:01 UTC (rev 214)
@@ -30,21 +30,13 @@
 
 If no phylogeny is supplied, Dkk is equivalent to Simpson's diversity (probability that two individuals drawn from a community are from different taxa), Dkl is a beta-diversity equivalent of Simpson's diversity (probability that individuals drawn from each of two communities belong to different taxa), and H is Dkl standardized to account for within-community diversity.
 
-If an ultrametric phylogeny is supplied, Dkk is equivalent to the mean pairwise phylogenetic distance (distance to MRCA) between two individuals from different species drawn from a community, Dkl is the mean pairwise phylogenetic distance between individuals drawn from each of two communities, and H is Dkl standardized to account for within-community diversity.
+If an ultrametric phylogeny is supplied, Dkk is equivalent to the mean pairwise phylogenetic distance (distance to MRCA) between two individuals drawn from a community, Dkl is the mean pairwise phylogenetic distance between individuals drawn from each of two communities, and H is Dkl standardized to account for within-community diversity.
 
-\emph{
-    D[kl] = sum(t[ij] * x[ki] * x[lj])
+\emph{D[kl] = sum(t[ij] * x[ki] * x[lj])}
     
-        where x[ki] is the relative abundance of taxon i in community k
-        
-        t[ij] is a matrix of weights for all pairs of taxa i,j
-        
-            Without a phylogeny, when i=j, t[ij]=0, otherwise t[ij]=1
+where \emph{x[ki]} is the relative abundance of taxon \emph{i} in community \emph{k} and \emph{t[ij]} is a matrix of weights for all pairs of taxa \emph{i,j}. Without a phylogeny, when \emph{i=j}, \emph{t[ij]=0}, otherwise \emph{t[ij]=1}. With a phylogeny, \emph{t[ij]} is the phylogenetic distance to MRCA for taxa \emph{i,j}.
             
-            With a phylogeny, t[ij] is the phylogenetic distance to MRCA for taxa i and j
-            
-    H[kl] = D[kl] - (D[kk] + D[ll])/2
-}
+\emph{H[kl] = D[kl] - (D[kk] + D[ll])/2}
 
 Alpha, beta and total measure the average diversity within, among, and across all communities based on Dkk and H values taking variation in number of individuals per community into account. A Fst-like measure is calculated by dividing beta by the total diversity across all samples.
 }