[Vegan-commits] r2104 - in pkg/vegan: R man src

Thu Feb 23 15:37:52 CET 2012

Author: jarioksa
Date: 2012-02-23 15:37:51 +0100 (Thu, 23 Feb 2012)
New Revision: 2104

Modified:
   pkg/vegan/R/vegdist.R
   pkg/vegan/man/vegdist.Rd
   pkg/vegan/src/vegdist.c
Log:
merge Cao index (CYd) in vegdist

Modified: pkg/vegan/R/vegdist.R
===================================================================

--- pkg/vegan/R/vegdist.R	2012-02-22 08:05:56 UTC (rev 2103)
+++ pkg/vegan/R/vegdist.R	2012-02-23 14:37:51 UTC (rev 2104)
@@ -7,7 +7,7 @@
         method <- "euclidean"
     METHODS <- c("manhattan", "euclidean", "canberra", "bray", 
                  "kulczynski", "gower", "morisita", "horn", "mountford", 
-                 "jaccard", "raup", "binomial", "chao", "altGower")
+                 "jaccard", "raup", "binomial", "chao", "altGower", "cao")
     method <- pmatch(method, METHODS)
     inm <- METHODS[method]
     if (is.na(method)) 
@@ -15,19 +15,23 @@
     if (method == -1) 
         stop("ambiguous distance method")
     if (method > 2 && any(rowSums(x, na.rm = TRUE) == 0)) 
-        warning("you have empty rows: their dissimilarities may be meaningless in method ", inm,"\n")
+        warning("you have empty rows: their dissimilarities may be meaningless in method ",
+                dQuote(inm))
     if (method > 2 && any(x < 0, na.rm = TRUE)) 
-        warning("results may be meaningless because data have negative entries in method ", inm,"\n")
+        warning("results may be meaningless because data have negative entries in method ",
+                dQuote(inm))
     if (method == 11 && any(colSums(x) == 0)) 
-        warning("data have empty species which influence the results im method ", inm, "\n")
+        warning("data have empty species which influence the results im method ",
+                dQuote(inm))
     if (method == 6) # gower, but no altGower
         x <- decostand(x, "range", 2, na.rm = TRUE, ...)
     if (binary) 
         x <- decostand(x, "pa")
     N <- nrow(x <- as.matrix(x))
-    if (method %in% c(7, 13) && !identical(all.equal(as.integer(x), 
+    if (method %in% c(7, 13, 15) && !identical(all.equal(as.integer(x), 
                                                      as.vector(x)), TRUE)) 
-        warning("results may be meaningless with non-integer data in method ", inm, "\n")
+        warning("results may be meaningless with non-integer data in method ",
+                dQuote(inm))
     d <- .C("veg_distance", x = as.double(x), nr = N, nc = ncol(x), 
             d = double(N * (N - 1)/2), diag = as.integer(FALSE), 
             method = as.integer(method), NAOK = na.rm, PACKAGE = "vegan")$d

Modified: pkg/vegan/man/vegdist.Rd
===================================================================
--- pkg/vegan/man/vegdist.Rd	2012-02-22 08:05:56 UTC (rev 2103)
+++ pkg/vegan/man/vegdist.Rd	2012-02-23 14:37:51 UTC (rev 2104)
@@ -13,7 +13,7 @@
   Gower, Bray--Curtis, Jaccard and
   Kulczynski indices are good in detecting underlying
   ecological gradients (Faith et al. 1987). Morisita, Horn--Morisita,
-  Binomial and Chao
+  Binomial, Cao and Chao
   indices should be able to handle different sample sizes (Wolda 1981,
   Krebs 1999, Anderson & Millar 2004),
   and Mountford (1962) and Raup-Crick indices for presence--absence data should
@@ -27,8 +27,8 @@
   \item{method}{Dissimilarity index, partial match to  \code{"manhattan"},
     \code{"euclidean"}, \code{"canberra"}, \code{"bray"}, \code{"kulczynski"},
      \code{"jaccard"}, \code{"gower"}, \code{"altGower"}, \code{"morisita"}, 
-     \code{"horn"}, \code{"mountford"}, \code{"raup"} , \code{"binomial"} or 
-     \code{"chao"}.}
+     \code{"horn"}, \code{"mountford"}, \code{"raup"} , \code{"binomial"}, 
+     \code{"chao"} or \code{"cao"}.}
   \item{binary}{Perform presence/absence standardization before analysis
     using \code{\link{decostand}}.}
   \item{diag}{Compute diagonals. }
@@ -116,6 +116,15 @@
     \cr
     \tab where \eqn{n_i = x_{ij} + x_{ik}}{n[i] = x[ij] + x[ik]}
     \cr \tab binary: \eqn{\log(2) \times (A+B-2J)}{log(2)*(A+B-2*J)}
+    \cr
+    \code{cao}
+    \tab \eqn{d_{jk} = \frac{1}{S} \sum_i \left[ \log
+    \left(\frac{n_i}{2}\right) - (x_{ij} \log(x_{ik}) + x_{ik}
+    \log(x_{ij}))/n_i \right]}{d[jk] = (1/S) * sum(log(n[i]/2) -
+    (x[ij]*log(x[ik]) + x[ik]*log(x[ij]))/n[i])},
+  \cr
+  \tab where \eqn{S} is the number of species in compared sites and
+    \eqn{n_i = x_{ij}+x_{ik}}{n[i] = x[ij] + x[ik]}
   }
 
   Jaccard index is computed as \eqn{2B/(1+B)}, where \eqn{B} is
@@ -126,6 +135,18 @@
   handle variable sample sizes. The index does not have a fixed upper
   limit, but can vary among sites with no shared species. For further
   discussion, see Anderson & Millar (2004).
+
+  Cao index or CYd index (Cao et al. 1997) was suggested as a minimally
+  biased index for high beta diversity and variable sampling intensity.
+  Cao index does not have a fixed upper limit, but can vary among sites
+  with no shared species.  The index is intended for count (integer)
+  data, and it is undefined for zero abundances; these are replaced with
+  arbitrary value \eqn{0.1} following Cao et al. (1997).  Cao et
+  al. (1997) used \eqn{\log_{10}}{log10}, but the current function uses
+  natural logarithms so that the values are approximately \eqn{2.30}
+  times higher than with 10-based logarithms. Anderson & Thompson (2004)
+  give an alternative formulation of Cao index to highlight its
+  relationship with Binomial index (above).
   
   Mountford index is defined as \eqn{M = 1/\alpha} where \eqn{\alpha}
   is the parameter of Fisher's logseries assuming that the compared
@@ -223,10 +244,18 @@
   Zealand.  \emph{Journal of Experimental Marine Biology and Ecology}
   305, 191--221.
 
-  Anderson, M.J., Ellingsen, K.E. & McArdle, B.H. (2006) Multivariate
+  Anderson, M.J., Ellingsen, K.E. & McArdle, B.H. (2006). Multivariate
   dispersion as a measure of beta diversity. \emph{Ecology Letters} 
   9, 683--693.
 
+  Anderson, M.J. & Thompson, A.A. (2004). Multivariate control charts for
+  ecological and environmental monitoring. \emph{Ecological
+    Applications} 14, 1921--1935.
+
+  Cao, Y., Williams, W.P. & Bark, A.W. (1997). Similarity measure bias
+  in river benthic Aufwuchs community analysis. \emph{Water
+  Environment Research} 69, 95--106.
+
   Chao, A., Chazdon, R. L., Colwell, R. K. and Shen, T. (2005). A new
   statistical approach for assessing similarity of species composition
   with incidence and abundance data. \emph{Ecology Letters} 8, 148--159.

Modified: pkg/vegan/src/vegdist.c
===================================================================
--- pkg/vegan/src/vegdist.c	2012-02-22 08:05:56 UTC (rev 2103)
+++ pkg/vegan/src/vegdist.c	2012-02-23 14:37:51 UTC (rev 2104)
@@ -43,6 +43,7 @@
 #define MILLAR 12
 #define CHAO 13
 #define GOWERDZ 14
+#define CAO 15
 #define MATCHING 50
 #define NOSHARED 99
 
@@ -526,6 +527,46 @@
     return dist;
 }
 
+/* veg_cao implements Cao index (CYd) of Cao Y, Williams WP, Bark AW:
+ *   Water Envir Res 69, 95-106; 1997. Anderson MJ & Thompson AA: Ecol
+ *   Appl 14, 1921-1935; 2004 use a different but equivalent formulation.
+ */
+
+double veg_cao(double *x, int nr, int nc, int i1, int i2)
+{
+     double dist, x1, x2, t1, t2, t3, tlog;
+     int count, j;
+  
+     count = 0;
+     dist = 0;
+     for (j=0; j<nc; j++, i1 += nr, i2 += nr) {
+	  if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	       /* skip the rest of the loop if both species are
+		  absent */
+	       if (x[i1] == 0 && x[i2] == 0) continue;
+	       /* Cao uses arbitrary value of 0.1 for zeros to avoid
+		  log(0). Obviously this indicates the use of counts
+		  (integer), but we accept non-integer data (with a
+		  warning in R) and put the truncation to the same 0.1
+		  to avoid discontinuities with non-integer data */
+	       x1 = (x[i1] < 0.1) ? 0.1 : x[i1];
+	       x2 = (x[i2] < 0.1) ? 0.1 : x[i2];
+	       t1 = x1 + x2;
+	       /* Cao et al. used log10, but we do not and so our
+		  results are log(10) = 2.302585 times higher */
+	       t2 = x1 * log(x2) + x2 * log(x1);
+	       dist += log(t1) - M_LN2 - t2/t1;
+	       count++;
+	  }
+     }
+     if (count==0) return NA_REAL;
+     if (dist < 0)
+	 dist = 0;
+     dist /= (double)count;
+     return dist;
+}
+
+
 /* veg_noshared is not a proper dissimilarity index, but a pretty
  * useless helper function. It returns 1 when there are no shared
  * species, and 0 if two sites have at least one shared species, and
@@ -626,6 +667,9 @@
     case GOWERDZ:
 	distfun = veg_gowerDZ;
 	break;
+    case CAO:
+        distfun = veg_cao;
+        break;
     case MATCHING:
 	distfun = veg_matching;
 	break;