[Vegan-commits] r2577 - in pkg/vegan: R inst man

Sun Jul 28 07:04:43 CEST 2013

Author: jarioksa
Date: 2013-07-28 07:04:40 +0200 (Sun, 28 Jul 2013)
New Revision: 2577

Modified:
   pkg/vegan/R/oecosimu.R
   pkg/vegan/inst/ChangeLog
   pkg/vegan/man/oecosimu.Rd
Log:
oecosimu can analyse models in batches to avoid memory exhaustion

Modified: pkg/vegan/R/oecosimu.R
===================================================================

--- pkg/vegan/R/oecosimu.R	2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/R/oecosimu.R	2013-07-28 05:04:40 UTC (rev 2577)
@@ -2,12 +2,15 @@
     function(comm, nestfun, method, nsimul=99,
              burnin=0, thin=1, statistic = "statistic",
              alternative = c("two.sided", "less", "greater"),
+             batchsize = NA,
              parallel = getOption("mc.cores"), ...)
 {
     alternative <- match.arg(alternative)
     nestfun <- match.fun(nestfun)
     if (length(statistic) > 1)
         stop("only one 'statistic' is allowed")
+    if (!is.na(batchsize))
+        batchsize <- batchsize * 1024 * 1024
     applynestfun <-
         function(x, fun = nestfun, statistic = "statistic", ...) {
             tmp <- fun(x, ...)
@@ -36,6 +39,23 @@
         }
         method <- nm$commsim$method
     }
+    ## Check the number of batches needed to run the requested number
+    ## of simulations without exceeding arg 'batchsize', and find the
+    ## size of each batch.
+    if (!simmat_in && !is.na(batchsize)) {
+        commsize <- object.size(comm)
+        totsize <- commsize * nsimul
+        if (totsize > batchsize) { 
+            nbatch <- ceiling(unclass(totsize/batchsize))
+            batches <- diff(round(seq(0, nsimul, by = nsimul/nbatch)))
+        } else {
+            nbatch <- 1
+        }
+    } else {
+        nbatch <- 1
+    }
+    if (nbatch == 1)
+        batches <- nsimul
     
     ind <- nestfun(comm, ...)
     indstat <-
@@ -43,9 +63,10 @@
             ind[[statistic]]
         else
             ind
-    if (!simmat_in) {
+    ## burnin of sequential models
+    if (!simmat_in && nm$commsim$isSeq) {
         ## estimate thinning for "tswap" (trial swap)
-        if (method == "tswap") {
+        if (nm$commsim$method == "tswap") {
             checkbrd <-sum(designdist(comm, "(J-A)*(J-B)", 
                                       "binary"))
             M <- nm$ncol
@@ -54,9 +75,11 @@
             thin <- round(thin * checkbrd)
             burnin <- round(burnin * checkbrd)
         }
-        x <- simulate(nm, nsim = nsimul, burnin = burnin, thin = thin)
+        if (burnin > 0)
+            nm <- update(nm, burnin)
     }
-
+    ## start with empty simind
+    simind <- NULL
     ## Go to parallel processing if 'parallel > 1' or 'parallel' could
     ## be a pre-defined socket cluster or 'parallel = NULL' in which
     ## case it could be setDefaultCluster (or a user error)
@@ -68,12 +91,15 @@
     hasClus <- inherits(parallel, "cluster")
     if ((hasClus || parallel > 1)  && require(parallel)) {
         if(.Platform$OS.type == "unix" && !hasClus) {
-            tmp <- mclapply(1:nsimul,
-                            function(i)
-                            applynestfun(x[,,i], fun=nestfun,
-                                         statistic = statistic, ...),
-                            mc.cores = parallel)
-            simind <- do.call(cbind, tmp)
+            for (i in seq_len(nbatch)) {
+                x <- simulate(nm, nsim = batches[i], thin = thin)
+                tmp <- mclapply(seq_len(batches[i]),
+                                function(j)
+                                applynestfun(x[,,j], fun=nestfun,
+                                             statistic = statistic, ...),
+                                mc.cores = parallel)
+                simind <- cbind(simind, do.call(cbind, tmp))
+            }
         } else {
             ## if hasClus, do not set up and stop a temporary cluster
             if (!hasClus) {
@@ -81,22 +107,29 @@
                 ## make vegan functions available: others may be unavailable
                 clusterEvalQ(parallel, library(vegan))
             }
-            simind <- parApply(parallel, x, 3, function(z)
-                               applynestfun(z, fun = nestfun,
-                                            statistic = statistic, ...))
+            for(i in seq_len(nbatch)) {
+                x <- simulate(nm, nsim = batches[i], thin = thin)
+                simind <- cbind(simind,
+                                parApply(parallel, x, 3, function(z)
+                                         applynestfun(z, fun = nestfun,
+                                                      statistic = statistic, ...)))
+            }
             if (!hasClus)
                 stopCluster(parallel)
         }
     } else {
-        simind <- apply(x, 3, applynestfun, fun = nestfun,
-                        statistic = statistic, ...)
+        for(i in seq_len(nbatch)) {
+            x <- simulate(nm, nsim = batches[i], thin = thin)
+            simind <- cbind(simind, apply(x, 3, applynestfun, fun = nestfun,
+                                          statistic = statistic, ...))
+        }
     }
     
     simind <- matrix(simind, ncol = nsimul)
 
     if (attr(x, "isSeq")) {
         attr(simind, "thin") <- attr(x, "thin")
-        attr(simind, "burnin") <- attr(x, "start") - 1L
+        attr(simind, "burnin") <- burnin
     }
     
     sd <- apply(simind, 1, sd, na.rm = TRUE)

Modified: pkg/vegan/inst/ChangeLog
===================================================================
--- pkg/vegan/inst/ChangeLog	2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/inst/ChangeLog	2013-07-28 05:04:40 UTC (rev 2577)
@@ -13,6 +13,14 @@
 	Ecology Progress Series_, 320, 11–27. The basic development was
 	made in github.com and merged to R-Forge.
 
+	* oecosimu: Gained argument 'batchsize' to set the maximum size of
+	simulated nullmodels (in Mb). If a larger object would be
+	produced, the analysis is broken into several batches to avoid
+	exceeding the maximum size. This avoids exhausting memory which
+	can make whole R unresponsive and analysis very, very slow. In
+	general, the argument is needed with large data sets and/or large
+	number of simulations.
+
 	* orditkplot: bmp has been available in unix-alike OSes since
 	2008, or a moth after writing orditkplot. Thanks to Brian Ripley
 	for informing us.

Modified: pkg/vegan/man/oecosimu.Rd
===================================================================
--- pkg/vegan/man/oecosimu.Rd	2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/man/oecosimu.Rd	2013-07-28 05:04:40 UTC (rev 2577)
@@ -29,7 +29,7 @@
 \usage{
 oecosimu(comm, nestfun, method, nsimul = 99, burnin = 0, thin = 1,
    statistic = "statistic", alternative = c("two.sided", "less", "greater"), 
-   parallel = getOption("mc.cores"), ...)
+   batchsize = NA, parallel = getOption("mc.cores"), ...)
 \method{as.ts}{oecosimu}(x, ...)
 \method{as.mcmc}{oecosimu}(x)
 \method{density}{oecosimu}(x, ...)
@@ -75,6 +75,10 @@
     test is approximately two times higher than in the corresponding
     one-sided test (\code{"greater"} or \code{"less"} depending on the
     sign of the difference).}
+  \item{batchsize}{Size in Megabytes of largest simulation object. If
+    a larger structure would be produced, the analysis is broken
+    internally into batches. With default \code{NA} the analysis is
+    not broken into batches.  See Details.}
   \item{parallel}{Number of parallel processes or a predefined socket
     cluster.  With \code{parallel = 1} uses ordinary, non-parallel
     processing. The parallel processing is done with \pkg{parallel}
@@ -135,9 +139,23 @@
       \code{method} and \code{nsimul} are ignored.  
   }
   The last case allows analysing several statistics with the same
-  simulations. However, be aware that the \code{\link{object.size}} of
-  the simulated communities may be large.
+  simulations.
 
+  The function first generates simulations with given
+  \code{\link{nullmodel}} and then analyses these using the
+  \code{nestfun}.  With large data sets and/or large number of
+  simulations, the generated objects can be very large, and if the
+  memory is exhausted, the analysis can become very slow and the
+  system can become unresponsive. The simulation will be broken into
+  several smaller batches if the simulated \code{\link{nullmodel}}
+  objective will be above the set \code{batchsize} to avoid memory
+  problems (see \code{\link{object.size}} for estimating the size of
+  the current data set). The parallel processing still increases the
+  memory needs.  The parallel processing is only used for evaluating
+  \code{nestfun}.  The main load may be in simulation of the
+  \code{\link{nullmodel}}, and \code{parallel} argument does not help
+  there.
+
   Function \code{as.ts} transforms the simulated results of sequential
   methods into a time series or a \code{\link{ts}} object. This allows
   using analytic tools for time series in studying the sequences (see