[Vegan-commits] r2499 - in pkg/vegan: . R inst man

Sun May 5 17:58:34 CEST 2013

Author: jarioksa
Date: 2013-05-05 17:58:34 +0200 (Sun, 05 May 2013)
New Revision: 2499

Added:
   pkg/vegan/R/veganMahatrans.R
Modified:
   pkg/vegan/DESCRIPTION
   pkg/vegan/NAMESPACE
   pkg/vegan/R/bioenv.default.R
   pkg/vegan/R/bioenv.formula.R
   pkg/vegan/R/print.bioenv.R
   pkg/vegan/inst/ChangeLog
   pkg/vegan/man/bioenv.Rd
   pkg/vegan/man/vegan-internal.Rd
Log:
implement Mahalanobis, Manhattan and Gower metrics for bioenv

Modified: pkg/vegan/DESCRIPTION
===================================================================

--- pkg/vegan/DESCRIPTION	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/DESCRIPTION	2013-05-05 15:58:34 UTC (rev 2499)
@@ -1,7 +1,7 @@
 Package: vegan
 Title: Community Ecology Package
-Version: 2.1-29
-Date: April 19, 2013
+Version: 2.1-30
+Date: May 5, 2013
 Author: Jari Oksanen, F. Guillaume Blanchet, Roeland Kindt, Pierre Legendre, 
    Peter R. Minchin, R. B. O'Hara, Gavin L. Simpson, Peter Solymos, 
    M. Henry H. Stevens, Helene Wagner  

Modified: pkg/vegan/NAMESPACE
===================================================================
--- pkg/vegan/NAMESPACE	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/NAMESPACE	2013-05-05 15:58:34 UTC (rev 2499)
@@ -58,7 +58,7 @@
 ## export(ade2vegancca, orderingKM, ordiArgAbsorber, ordiArrowMul,
 ## ordiGetData, ordimedian, ordiNAexclude, ordiNApredict,
 ## ordiParseFormula, ordiTerminfo, pregraphKM, simpleRDA2, varpart2,
-## varpart3, varpart4, veganCovEllipse)
+## varpart3, varpart4, veganCovEllipse, veganMahatrans)
 
 ## Registration of S3 methods
 import(stats)

Modified: pkg/vegan/R/bioenv.default.R
===================================================================
--- pkg/vegan/R/bioenv.default.R	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/R/bioenv.default.R	2013-05-05 15:58:34 UTC (rev 2499)
@@ -1,8 +1,14 @@
 `bioenv.default` <-
 function (comm, env, method = "spearman", index = "bray", upto = ncol(env), 
-              trace = FALSE, partial = NULL, parallel = getOption("mc.cores"),
+          trace = FALSE, partial = NULL,
+          metric = c("euclidean", "mahalanobis", "manhattan", "gower"),
+          parallel = getOption("mc.cores"),
           ...) 
 {
+    metric <- match.arg(metric)
+    method <- match.arg(method, eval(formals(cor)$method))
+    if (any(sapply(env, is.factor)) && metric != "gower")
+        stop("you have factors in 'env': only 'metric = \"gower\"' is allowed")
     if (is.null(partial)) {
         corfun <- function(dx, dy, dz, method) {
             cor(dx, dy, method=method)
@@ -26,13 +32,31 @@
     n <- ncol(env)
     ntake <- 2^n - 1
     ndone <- 0
+    upto <- min(upto, n)
     if (n > 8 || trace) {
         if (upto < n) 
             cat("Studying", nall <- sum(choose(n, 1:upto)), "of ")
         cat(ntake, "possible subsets (this may take time...)\n")
         flush.console()
     }
-    x <- scale(env)
+    ## Check metric and adapt data and distance function
+    if (metric == "euclidean") {
+        x <- scale(env, scale = TRUE)
+        distfun <- function(x) dist(x)
+    } else if (metric == "mahalanobis") {
+        x <- as.matrix(scale(env, scale = FALSE))
+        distfun <- function(x) dist(veganMahatrans(x))
+    } else if (metric == "gower") {
+        require(cluster) ||
+        stop("package 'cluster' needed for factor variables in 'env'")
+        x <- env
+        distfun <- function(x) daisy(x, metric = "gower")
+    } else if (metric == "manhattan") {
+        x <- decostand(env, "range")
+        distfun <- function(x) dist(x, "manhattan")
+    } else {
+        stop("unknown metric")
+    }
     best <- list()
     if (inherits(comm, "dist")) {
         comdis <- comm
@@ -77,18 +101,19 @@
         if (isParal && nrow(sets) >= CLUSLIM*nclus) {
             if (isMulticore) {
                 est <- unlist(mclapply(1:nrow(sets), function(j)
-                                       corfun(comdis, dist(x[,sets[j, ]]), partial,
-                                              method = method),
+                                       corfun(comdis,
+                                              distfun(x[,sets[j,],drop = FALSE]),
+                                              partial, method = method),
                                        mc.cores = parallel))
             } else {
                 est <- parSapply(parallel, 1:nrow(sets), function(j)
-                                  corfun(comdis, dist(x[,sets[j, ]]), partial,
-                                         method = method))
+                                  corfun(comdis, distfun(x[,sets[j,],drop = FALSE]),
+                                         partial, method = method))
             }
         } else {
             est <- sapply(1:nrow(sets), function(j) 
-                          corfun(comdis, dist(x[,sets[j, ]]), partial,
-                                 method = method))
+                          corfun(comdis, distfun(x[,sets[j,], drop=FALSE ]),
+                                 partial, method = method))
         }
         best[[i]] <- list(best = sets[which.max(est), ], est = max(est))
         if (trace) {
@@ -98,8 +123,9 @@
             flush.console()
         }
     }
-    out <- list(names = colnames(env), method = method, index = index, 
-                upto = upto, models = best, partial = partpart)
+    out <- list(names = colnames(env), method = method, index = index,
+                metric = metric, upto = upto, models = best,
+                partial = partpart)
     out$call <- match.call()
     out$call[[1]] <- as.name("bioenv")
     class(out) <- "bioenv"

Modified: pkg/vegan/R/bioenv.formula.R
===================================================================
--- pkg/vegan/R/bioenv.formula.R	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/R/bioenv.formula.R	2013-05-05 15:58:34 UTC (rev 2499)
@@ -7,12 +7,7 @@
     comm <- formula[[2]]
     comm <- eval(comm, data, parent.frame())
     formula[[2]] <- NULL
-    mf <- model.frame(formula, data, na.action = NULL)
-    if (any(sapply(mf, function(x) is.factor(x) || !is.numeric(x)))) 
-        stop("bioenv applies only to numeric variables")
-    env <- attr(mf, "terms")
-    attr(env, "intercept") <- 0
-    env <- model.matrix(env, mf)
+    env <- model.frame(formula, data, na.action = NULL)
     out <- bioenv(comm, env, ...)
     out$formula <- fla
     out$call <- match.call()

Modified: pkg/vegan/R/print.bioenv.R
===================================================================
--- pkg/vegan/R/print.bioenv.R	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/R/print.bioenv.R	2013-05-05 15:58:34 UTC (rev 2499)
@@ -1,11 +1,12 @@
-"print.bioenv" <-
+`print.bioenv` <-
     function (x, ...) 
 {
     cat("\nCall:\n")
     cat(deparse(x$call), "\n")
     cat("\nSubset of environmental variables with best correlation to community data.\n\n")
-    cat("Correlations:     ", x$method, "\n")
-    cat("Dissimilarities:  ", x$index, "\n\n")
+    cat("Correlations:   ", x$method, "\n")
+    cat("Dissimilarities:", x$index, "\n")
+    cat("Metric:         ", x$metric, "\n\n") 
     i <- which.max(lapply(x$models, function(tmp) tmp$est))
     cat("Best model has", i, "parameters (max.", x$upto, "allowed):\n")
     cat(paste(x$names[x$models[[i]]$best], collapse = " "))

Added: pkg/vegan/R/veganMahatrans.R
===================================================================
--- pkg/vegan/R/veganMahatrans.R	                        (rev 0)
+++ pkg/vegan/R/veganMahatrans.R	2013-05-05 15:58:34 UTC (rev 2499)
@@ -0,0 +1,20 @@
+### Internal function for Mahalanobis transformation of the matrix.
+### Mahalanobis transformation of matrix X is M = X S^(-1/2) where S
+### is the covariance matrix. The inverse square root of S is found
+### via eigen decomposition S = G L G^T, where G is the matrix of
+### eigenvectors, and L is the diagonal matrix of eigenvalues. Thus
+### S^(-1/2) = G L^(-1/2) G^T. This is an internal function so that
+### input must be correct: 'x' must be a centred matrix (not a
+### data.frame, not raw data).
+`veganMahatrans` <-
+    function (x, s2, tol = 1e-8) 
+{
+    n <- nrow(x)
+    if (missing(s2))
+        s2 <- cov(x) 
+    e <- eigen(s2)
+    k <- e$values > tol
+    sisqr <- e$vectors[,k, drop=FALSE] %*%
+        (sqrt(1/e$values[k]) * t(e$vectors[,k, drop = FALSE]))
+    x %*% sisqr 
+}

Modified: pkg/vegan/inst/ChangeLog
===================================================================
--- pkg/vegan/inst/ChangeLog	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/inst/ChangeLog	2013-05-05 15:58:34 UTC (rev 2499)
@@ -2,8 +2,23 @@
 
 VEGAN DEVEL VERSIONS at http://r-forge.r-project.org/
 
-Version 2.1-29 (opened April 19, 2013)
+Version 2.1-30 (opened May 5, 2013)
 
+	* bioenv: can now use Mahalanobis, Manhattan and Gower distances
+	for environmental variables.  The Mahalanobis distances are based
+	on orthogonalized data, Manhattan distances give the direct sum of
+	differences of environmental variables, and Gower distances can
+	also handle factor variables. This involves adding internal
+	function veganMahatrans() for Mahalanobis transformation.  The
+	change was triggered by a recent email by Lydia Beaudrot (UC
+	Davis) to implement Mahalanobis distances, and in the same I
+	also implemented Robby Marotte's suggestion of using Gower
+	distances (vegan Forum item in R-Forge in July 2012). The output
+	is changed to show the 'metric' and the name of the 'method' is
+	fully expanded. No more fails if 'upto' is too large.
+
+Version 2.1-29 (closed April 19, 2013)
+
 	* ordisurf: significant changes were made to this function:
 
 	 - The default for `method` and `select` were changed to `"REML"`

Modified: pkg/vegan/man/bioenv.Rd
===================================================================
--- pkg/vegan/man/bioenv.Rd	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/man/bioenv.Rd	2013-05-05 15:58:34 UTC (rev 2499)
@@ -14,6 +14,7 @@
 \usage{
 \method{bioenv}{default}(comm, env, method = "spearman", index = "bray",
        upto = ncol(env), trace = FALSE, partial = NULL, 
+       metric = c("euclidean", "mahalanobis", "manhattan", "gower"),
        parallel = getOption("mc.cores"), ...)
 \method{bioenv}{formula}(formula, data, ...)
 }
@@ -30,6 +31,8 @@
   \item{trace}{Trace the calculations }
   \item{partial}{Dissimilarities partialled out when inspecting
     variables in \code{env}.}
+  \item{metric}{Metric used for distances of environmental distances. See 
+    Details.}
   \item{parallel}{Number of parallel processes or a predefined socket
     cluster.  With \code{parallel = 1} uses ordinary, non-parallel
     processing. The parallel processing is done with \pkg{parallel}
@@ -37,17 +40,33 @@
   \item{...}{Other arguments passed to \code{\link{cor}}.}
 }
 \details{
+  
   The function calculates a community dissimilarity matrix using
   \code{\link{vegdist}}.  Then it selects all possible subsets of
   environmental variables, \code{\link{scale}}s the variables, and
   calculates Euclidean distances for this subset using
-  \code{\link{dist}}.  Then it finds the correlation between community
-  dissimilarities and environmental distances, and for each size of
-  subsets, saves the best result. 
-  There are \eqn{2^p-1} subsets of \eqn{p} variables, and an exhaustive
-  search may take a very, very, very long time (parameter \code{upto} offers a
-  partial relief). 
+  \code{\link{dist}}.  The function finds the correlation between
+  community dissimilarities and environmental distances, and for each
+  size of subsets, saves the best result.  There are \eqn{2^p-1}
+  subsets of \eqn{p} variables, and an exhaustive search may take a
+  very, very, very long time (parameter \code{upto} offers a partial
+  relief).
 
+  The argument \code{metric} defines distances in the given set of
+  environmental variables.  With \code{metric = "euclidean"}, the
+  variables are scaled to unit variance and Euclidean distances are
+  calculated. With \code{metric = "mahalanobis"}, the Mahalanobis
+  distances are calculated: in addition to scaling to unit variance,
+  the matrix of the current set of environmental variables is also
+  made orthogonal (uncorrelated). With \code{metric = "manhanttan"},
+  the variables are scaled to unit range and Manhattan distances are
+  calculated, so that the distances are sums of differences of
+  environmental variables.  With \code{metric = "gower"}, the Gower
+  distances are calculated using function
+  \code{\link[cluster]{daisy}}. This allows also using factor
+  variables, but with continuous variables the results are equal to
+  \code{metric = "manhattan"}.
+
   The function can be called with a model \code{\link{formula}} where
   the LHS is the data matrix and RHS lists the environmental variables.
   The formula interface is practical in selecting or transforming

Modified: pkg/vegan/man/vegan-internal.Rd
===================================================================
--- pkg/vegan/man/vegan-internal.Rd	2013-04-29 08:00:00 UTC (rev 2498)
+++ pkg/vegan/man/vegan-internal.Rd	2013-05-05 15:58:34 UTC (rev 2499)
@@ -11,6 +11,7 @@
 \alias{ordiArgAbsorber}
 \alias{veganCovEllipse}
 \alias{hierParseFormula}
+\alias{veganMahatrans}
 
 \title{Internal vegan functions}
 
@@ -32,6 +33,7 @@
 permuted.index(n, strata)
 pasteCall(call, prefix = "Call:")
 veganCovEllipse(cov, center = c(0, 0), scale = 1, npoints = 100)
+veganMahatrans(x, s2, tol = 1e-8)
 hierParseFormula(formula, data)
 }
 
@@ -82,6 +84,12 @@
   \code{veganCovEllipse} finds the coordinates for drawing a
   covariance ellipse.
 
+  \code{veganMahatrans} transforms data matrix so that its Euclidean
+  distances are Mahalanobis distances. The input data \code{x} must be
+  a matrix centred by columns, and \code{s2} its covariance matrix. If
+  \code{s2} is not given, covariance matrix is found from \code{x}
+  within the function.
+
   \code{hierParseFormula} returns a list of one matrix (left hand side)
   and a model frame with factors representing hierarchy levels 
   (right hand side) to be used in \code{\link{adipart}},