[Vegan-commits] r1902 - in branches/2.0: . R inst man

Tue Sep 27 15:22:43 CEST 2011

Author: jarioksa
Date: 2011-09-27 15:22:42 +0200 (Tue, 27 Sep 2011)
New Revision: 1902

Added:
   branches/2.0/R/clamtest.R
   branches/2.0/R/plot.clamtest.R
   branches/2.0/R/print.summary.clamtest.R
   branches/2.0/R/summary.clamtest.R
   branches/2.0/man/clamtest.Rd
Modified:
   branches/2.0/NAMESPACE
   branches/2.0/inst/ChangeLog
Log:
merge clamtest to branches/2.0

Modified: branches/2.0/NAMESPACE
===================================================================

--- branches/2.0/NAMESPACE	2011-09-27 12:50:02 UTC (rev 1901)
+++ branches/2.0/NAMESPACE	2011-09-27 13:22:42 UTC (rev 1902)
@@ -6,7 +6,7 @@
 export(CCorA, MOStest, RsquareAdj, SSarrhenius, SSgitay, SSgleason,
 SSlomolino, adipart, adonis, anosim, beals, betadisper, betadiver,
 bgdispersal, bioenv, bstick, cIndexKM, calibrate, capscale, cascadeKM,
-cca, commsimulator, contribdiv, decorana, decostand, designdist,
+cca, commsimulator, contribdiv, clamtest, decorana, decostand, designdist,
 coverscale, dispindmorisita, distconnected, diversity, downweight,
 drarefy, eigengrad, eigenvals, envfit, estaccumR, estimateR, eventstar,
 factorfit, fisherfit, fitspecaccum, goodness, hiersimu, humpfit,
@@ -210,6 +210,7 @@
 S3method(plot, cascadeKM)
 S3method(plot, cca)
 S3method(plot, contribdiv)
+S3method(plot, clamtest)
 S3method(plot, decorana)
 S3method(plot, envfit)
 S3method(plot, fisherfit)
@@ -301,6 +302,7 @@
 S3method(print, specaccum)
 S3method(print, summary.bioenv)
 S3method(print, summary.cca)
+S3method(print, summary.clamtest)
 S3method(print, summary.decorana)
 S3method(print, summary.humpfit)
 S3method(print, summary.isomap)
@@ -359,6 +361,7 @@
 S3method(summary, anosim)
 S3method(summary, bioenv)
 S3method(summary, cca)
+S3method(summary, clamtest)
 S3method(summary, decorana)
 S3method(summary, eigenvals)
 S3method(summary, humpfit)

Copied: branches/2.0/R/clamtest.R (from rev 1901, pkg/vegan/R/clamtest.R)
===================================================================
--- branches/2.0/R/clamtest.R	                        (rev 0)
+++ branches/2.0/R/clamtest.R	2011-09-27 13:22:42 UTC (rev 1902)
@@ -0,0 +1,112 @@
+## CLAM, reproduction of software described in Chazdon et al. 2011
+## Ecology, 92, 1332--1343
+clamtest <- 
+function(comm, groups, coverage.limit = 10,
+specialization = 2/3, npoints = 20, alpha = 0.05/20) 
+{
+    ## inital checks
+    comm <- as.matrix(comm)
+    if (NROW(comm) < 2)
+        stop("'comm' must have at least 2 rows")
+    if (nrow(comm) > 2 && missing(groups))
+        stop("'groups' is missing")
+    if (nrow(comm) == 2 && missing(groups))
+        groups <- if (is.null(rownames(comm)))
+            c("Group.1", "Group.2") else rownames(comm)
+    if (length(groups) != nrow(comm))
+        stop("length of 'groups' must equal 'nrow(comm)'")
+    if (length(unique(groups)) != 2)
+        stop("number of groups must be 2")
+    glabel <- as.character(unique(groups))
+    if (is.null(colnames(comm)))
+        colnames(comm) <- paste("Species", 1:nrow(comm), sep=".")
+    if (any(colSums(comm) <= 0))
+        stop("'comm' contains zero sum columns")
+    spp <- colnames(comm)
+    ## reproduced from Chazdon et al. 2011, Ecology 92, 1332--1343
+    S <- ncol(comm)
+    if (nrow(comm) == 2) {
+        Y <- comm[glabel[1],]
+        X <- comm[glabel[2],]
+    } else {
+        Y <- colSums(comm[which(groups==glabel[1]),])
+        X <- colSums(comm[which(groups==glabel[2]),])
+    }
+    names(X) <- names(Y) <- NULL
+    #all(ct$Total_SG == Y)
+    #all(ct$Total_OG == X)
+    m <- sum(Y)
+    n <- sum(X)
+    if (sum(Y) <= 0 || sum(X) <= 0)
+        stop("zero group totals not allowed")
+    ## check if comm contains integer, especially for singletons
+    if (any(X[X>0] < 1) || any(Y[Y>0] < 1))
+        warning("<1 non integer values detected: analysis might not be meaningful")
+    if (abs(sum(X,Y) - sum(as.integer(X), as.integer(Y))) > 10^-6)
+        warning("non integer values detected")
+    C1 <- 1 - sum(X==1)/n
+    C2 <- 1 - sum(Y==1)/m
+    ## this stands for other than 2/3 cases
+    uu <- specialization/(1-specialization)
+    ## critical level
+    Zp <- qnorm(alpha, lower.tail=FALSE)
+    #p_i=a
+    #pi_i=b
+    ## function to calculate test statistic from Appendix D 
+    ## (Ecological Archives E092-112-A4)
+    testfun <- function(p_i, pi_i, C1, C2, n, m) {
+        C1 <- ifelse(p_i < coverage.limit, C1, 1)
+        C2 <- ifelse(pi_i < coverage.limit, C2, 1)
+        Var <- C1^2*(p_i*(1-p_i)/n) + uu^2*C2^2*(pi_i*(1-pi_i)/m)
+        C1*p_i - C2*pi_i*uu - Zp*sqrt(Var)
+    }
+    ## root finding for iso-lines (instead of itarative search)
+    rootfun <- function(pi_i, C1, C2, n, m, upper) {
+        f <- function(p_i) testfun(p_i/n, pi_i/m, C1, C2, n, m)
+        if (length(unique(sign(c(f(1), f(upper))))) > 1)
+        ceiling(uniroot(f, lower=1, upper=upper)$root) else NA
+    }
+    ## sequences for finding Xmin and Ymin values
+    Xseq <- as.integer(trunc(seq(1, max(X), len=npoints)))
+    Yseq <- as.integer(trunc(seq(1, max(Y), len=npoints)))
+    ## finding Xmin and Ymin values for Xseq and Yseq
+    Xmins <- sapply(Yseq, function(z) rootfun(z, C1, C2, n, m, upper=max(X)))
+    Ymins <- sapply(Xseq, function(z) rootfun(z, C2, C1, m, n, upper=max(Y)))
+    minval <- list(data.frame(x=Xseq[!is.na(Ymins)], y=Ymins[!is.na(Ymins)]),
+        data.frame(x=Xmins[!is.na(Xmins)], y=Yseq[!is.na(Xmins)]))
+    ## shared but too rare
+    Ymin <- Ymins[1]
+    Xmin <- Xmins[1]
+    sr <- X < Xmin & Y < Ymin
+    tmp1 <- approx(c(Xmin, 1), c(1, Ymin), xout=1:Xmin)
+    tmp2 <- approx(c(1, Ymin), c(Xmin, 1), xout=1:Ymin)
+    for (i in 1:S) {
+        if (X[i] %in% tmp1$x)
+            sr[i] <- Y[i] < tmp1$y[which(X[i]==tmp1$x)]
+        if (Y[i] %in% tmp2$x)
+            sr[i] <- X[i] < tmp2$y[which(Y[i]==tmp2$x)]
+    }
+    ## classification
+    a <- ifelse(X==0, 1, X)/n # \hat{p_i}
+    b <- ifelse(Y==0, 1, Y)/m # \hat{\pi_i}
+    specX <- !sr & testfun(a, b, C1, C2, n, m) > 0
+    specY <- !sr & testfun(b, a, C2, C1, m, n) > 0
+    gen <- !sr & !specX & !specY
+    ## crosstable
+    tmp <- ifelse(cbind(gen, specY, specX, sr), 1, 0)
+    types <- c("Generalist", paste("Specialist", glabel[1], sep="_"),
+        paste("Specialist", glabel[2], sep="_"), "Too_rare")
+    classes <- as.factor((1:4)[rowSums(tmp*col(tmp))])
+    levels(classes) <- c("Generalist", paste("Specialist", glabel[1], sep="_"),
+        paste("Specialist", glabel[2], sep="_"), "Too_rare")
+    tab <- data.frame(Species=spp, y=Y, x=X, Classes=classes)
+    colnames(tab)[2:3] <- paste("Total", glabel, sep="_")
+    rownames(tab) <- NULL
+    class(tab) <- c("clamtest","data.frame")
+    attr(tab, "settings") <- list(labels = glabel,
+        coverage.limit = coverage.limit, specialization = specialization, 
+        npoints = npoints, alpha = alpha)
+    attr(tab, "minv") <- minval
+    attr(tab, "coverage") <- structure(c(C2, C1), .Names=glabel)
+    tab
+}

Copied: branches/2.0/R/plot.clamtest.R (from rev 1901, pkg/vegan/R/plot.clamtest.R)
===================================================================
--- branches/2.0/R/plot.clamtest.R	                        (rev 0)
+++ branches/2.0/R/plot.clamtest.R	2011-09-27 13:22:42 UTC (rev 1902)
@@ -0,0 +1,39 @@
+plot.clamtest <- function(x, xlab, ylab, main,
+    pch=21:24, col.points=1:4, col.lines=2:4, lty=1:3,
+    position="bottomright", ...) {
+    summ <- summary(x)
+    glabel <- summ$labels
+    if (missing(main))
+        main <- "Species Classification"
+    if (missing(xlab))
+        xlab <- paste(glabel[2], "(abundance + 1)")
+    if (missing(ylab))
+        ylab <- paste(glabel[1], "(abundance + 1)")
+    Y <- x[,2]
+    X <- x[,3]
+    minval <- summ$minv
+    ## plot the dots
+    rr <- range(X+1,Y+1)
+    plot(X+1, Y+1, log = "xy", xaxt = "n", yaxt = "n",
+        col=col.points[as.integer(x$Classes)],
+        pch=pch[as.integer(x$Classes)], 
+        xlab=xlab, ylab=ylab, main=main,
+        xlim=rr, ylim=rr, ...)
+    axis(1, c(1,10,100,1000,10000))
+    axis(2, c(1,10,100,1000,10000))
+    ## too rare threshold
+    Ymin <- minval[[1]][1,2]
+    Xmin <- minval[[2]][1,1]
+    lines(rep(Xmin, 2)+1, c(0, 1)+1, col=col.lines[1], lty=lty[1])
+    lines(c(0, 1)+1, rep(Ymin, 2)+1, col=col.lines[1], lty=lty[1])
+    tmp <- approx(c(Xmin, 1), c(1, Ymin))
+    lines(tmp$x+1, tmp$y+1, col=col.lines[1], lty=lty[1])
+    ## Y vs. gen threshold
+    lines(minval[[1]]+1, col=col.lines[2], lty=lty[2])
+    ## X vs. gen threshold
+    lines(minval[[2]]+1, col=col.lines[3], lty=lty[3])
+    if (!is.null(position))
+        legend(position, col=col.points, pch=pch, 
+            legend=rownames(summ$summary))
+    invisible(x)
+}

Copied: branches/2.0/R/print.summary.clamtest.R (from rev 1901, pkg/vegan/R/print.summary.clamtest.R)
===================================================================
--- branches/2.0/R/print.summary.clamtest.R	                        (rev 0)
+++ branches/2.0/R/print.summary.clamtest.R	2011-09-27 13:22:42 UTC (rev 1902)
@@ -0,0 +1,13 @@
+print.summary.clamtest <- function(x, digits=max(3, getOption("digits") - 3), ...) {
+    cat("Two Groups Species Classification Method (CLAM)\n\n")
+    cat("Specialization threshold =", x$specialization)
+    cat("\nAlpha level =", x$alpha)
+    cat("\n\nEstimated sample coverage:\n")
+    print(x$coverage, digits=digits)
+    cat("\nMinimum abundance for classification:\n")
+    print(structure(c(x$minv[[1]][1,2], x$minv[[2]][1,1]),
+        .Names=x$labels))
+    cat("\n")
+    printCoefmat(x$summary, digits=digits, ...)
+}
+

Copied: branches/2.0/R/summary.clamtest.R (from rev 1901, pkg/vegan/R/summary.clamtest.R)
===================================================================
--- branches/2.0/R/summary.clamtest.R	                        (rev 0)
+++ branches/2.0/R/summary.clamtest.R	2011-09-27 13:22:42 UTC (rev 1902)
@@ -0,0 +1,7 @@
+summary.clamtest <- function(object, ...) {
+    structure(c(attr(object, "settings"), 
+        list(summary=cbind(Species=table(object$Classes), 
+            Proportion=table(object$Classes)/nrow(object)),
+        minv=attr(object, "minv"),
+        coverage=attr(object, "coverage"))), class="summary.clamtest")
+}

Modified: branches/2.0/inst/ChangeLog
===================================================================
--- branches/2.0/inst/ChangeLog	2011-09-27 12:50:02 UTC (rev 1901)
+++ branches/2.0/inst/ChangeLog	2011-09-27 13:22:42 UTC (rev 1902)
@@ -4,6 +4,9 @@
 
 Version 2.0-1 (opened September 8, 2011)
 
+	* r1897, 1840, 1825, 1823: copy clamtest (at r1897),
+	summary.clamtest, print.summary.clamtest, plot.clamtest (all at
+	r1823) and clamtest.Rd (at r1897)
 	* merge r1872: raupcrick doc fixes.
 	* merge r1869: meandist re-ordering bug fix.
 	* merge r1846: tweak speed (a bit).

Copied: branches/2.0/man/clamtest.Rd (from rev 1901, pkg/vegan/man/clamtest.Rd)
===================================================================
--- branches/2.0/man/clamtest.Rd	                        (rev 0)
+++ branches/2.0/man/clamtest.Rd	2011-09-27 13:22:42 UTC (rev 1902)
@@ -0,0 +1,149 @@
+\name{clamtest}
+\alias{clamtest}
+\alias{summary.clamtest}
+\alias{plot.clamtest}
+
+\title{
+Multinomial Species Classification Method (CLAM)
+}
+
+\description{
+The CLAM statistical approach for classifying generalists and
+specialists in two distinct habitats is described in Chazdon et al. (2011).
+}
+\usage{
+clamtest(comm, groups, coverage.limit = 10, specialization = 2/3, 
+   npoints = 20, alpha = 0.05/20)
+\method{summary}{clamtest}(object, ...)
+\method{plot}{clamtest}(x, xlab, ylab, main,  pch = 21:24, col.points = 1:4, 
+   col.lines = 2:4, lty = 1:3, position = "bottomright", ...)
+}
+\arguments{
+  \item{comm}{
+Community matrix, consisting of counts.
+}
+  \item{groups}{
+A vector identifying the two habitats. Must have exactly
+two unique values or levels.
+}
+  \item{coverage.limit}{
+Integer, below this limit the sample coverage based correction
+is applied to rare species. Sample coverage is calculated separately 
+for the two habitats. Sample relative abundances are used for species 
+with higher than or equal to \code{coverage.limit} total counts per habitat.
+}
+  \item{specialization}{
+Numeric, specialization threshold value between 0 and 1.
+The value of \eqn{2/3} represents \sQuote{supermajority} rule,
+while a value of \eqn{1/2} represents a \sQuote{simple majority} rule
+to assign shared species as habitat specialists.
+}
+  \item{npoints}{
+Integer, number of points used to determine the boundary lines
+in the plots.
+}
+\item{alpha}{ Numeric, nominal significance level for individual
+  tests.  The default value reduces the conventional limit of
+  \eqn{0.05} to account for overdispersion and multiple testing for
+  several species simultaneously. However, the is no firm reason for
+  exactly this limit.  }
+  \item{x, object}{
+Fitted model object of class \code{"clamtest"}.
+}
+  \item{xlab, ylab}{
+Labels for the plot axes.
+}
+  \item{main}{
+Main title of the plot.
+}
+  \item{pch, col.points}{
+Symbols and colors used in plotting species groups.
+}
+  \item{lty, col.lines}{
+Line types and colors for boundary lines in plot to separate species groups.
+}
+  \item{position}{
+Position of figure legend, see \code{\link{legend}} for specification details.
+Legend not shown if \code{position = NULL}.
+}
+  \item{\dots}{
+Additional arguments passed to methods.
+}
+}
+
+\details{ The method uses a multinomial model based on estimated
+  species relative abundance in two habitats (A, B). It minimizes bias
+  due to differences in sampling intensities between two habitat types
+  as well as bias due to insufficient sampling within each
+  habitat. The method permits a robust statistical classification of
+  habitat specialists and generalists, without excluding rare species
+  \emph{a priori} (Chazdon et al. 2011).  Based on a user-defined
+  \code{specialization} threshold, the model classifies species into
+  one of four groups: (1) generalists; (2) habitat A specialists; (3)
+  habitat B specialists; and (4) too rare to classify with confidence.
+  } 
+
+\value{ A data frame (with class attribute \code{"clamtest"}),
+  with columns: 
+  \itemize{ 
+    \item{\code{Species}:}{ species name (column names from \code{comm}),} 
+    \item{\code{Total_*A*}:}{ total count in habitat A,} 
+    \item{\code{Total_*B*}:}{ total count in habitat B,} 
+    \item{\code{Classes}:}{ species classification, a factor with
+       levels \code{Generalist}, \code{Specialist_*A*},
+       \code{Specialist_*B*}, and \code{Too_rare}.}  
+}
+  \code{*A*} and \code{*B*} are placeholders for habitat names/labels found in the
+  data.
+
+The \code{summary} method returns descriptive statistics of the results.
+The \code{plot} method returns values invisibly and produces a bivariate
+scatterplot of species total abundances in the two habitats. Symbols and
+boundary lines are shown for species groups.
+}
+\references{
+Chazdon, R. L., Chao, A., Colwell, R. K., Lin, S.-Y., Norden, N., 
+Letcher, S. G., Clark, D. B., Finegan, B. and Arroyo J. P.(2011). 
+A novel statistical method for classifying habitat
+generalists and specialists. \emph{Ecology} \bold{92}, 1332--1343.
+}
+\author{
+Peter Solymos \email{solymos at ualberta.ca}
+}
+\note{
+The code was tested against standalone CLAM software provided
+on the website of Anne Chao (\url{http://chao.stat.nthu.edu.tw/softwarece.html});
+minor inconsistencies were found, especially for finding the
+threshold for 'too rare' species.
+These inconsistencies are probably due to numerical differences between the
+two implementation. The current \R implementation uses 
+root finding for iso-lines instead of iterative search.
+
+The original method (Chazdon et al. 2011) has two major problems:
+\enumerate{
+  
+  \item It assumes that the error distribution is multinomial. This is
+    a justified choice if individuals are freely distributed, and
+    there is no over-dispersion or clustering of individuals. In most
+    ecological data, the variance is much higher than multinomial
+    assumption, and therefore test statistic are too optimistic.
+
+  \item The original authors suggest that multiple testing adjustment
+    for multiple testing should be based on the number of points
+    (\code{npoints}) used to draw the critical lines on the plot,
+    whereas the adjustment should be based on the number tests (i.e,
+    tested species). The function uses the same numerical values as
+    the original paper, but there is no automatic connection between
+    \code{npoints} and \code{alpha} arguments, but you must work out
+    the adjustment yourself.
+}
+}
+\examples{
+data(mite)
+data(mite.env)
+x <- clamtest(mite, mite.env$Shrub=="None", alpha=0.005)
+summary(x)
+head(x)
+plot(x)
+}
+\keyword{ htest }