[Vegan-commits] r2577 - in pkg/vegan: R inst man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Jul 28 07:04:43 CEST 2013
Author: jarioksa
Date: 2013-07-28 07:04:40 +0200 (Sun, 28 Jul 2013)
New Revision: 2577
Modified:
pkg/vegan/R/oecosimu.R
pkg/vegan/inst/ChangeLog
pkg/vegan/man/oecosimu.Rd
Log:
oecosimu can analyse models in batches to avoid memory exhaustion
Modified: pkg/vegan/R/oecosimu.R
===================================================================
--- pkg/vegan/R/oecosimu.R 2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/R/oecosimu.R 2013-07-28 05:04:40 UTC (rev 2577)
@@ -2,12 +2,15 @@
function(comm, nestfun, method, nsimul=99,
burnin=0, thin=1, statistic = "statistic",
alternative = c("two.sided", "less", "greater"),
+ batchsize = NA,
parallel = getOption("mc.cores"), ...)
{
alternative <- match.arg(alternative)
nestfun <- match.fun(nestfun)
if (length(statistic) > 1)
stop("only one 'statistic' is allowed")
+ if (!is.na(batchsize))
+ batchsize <- batchsize * 1024 * 1024
applynestfun <-
function(x, fun = nestfun, statistic = "statistic", ...) {
tmp <- fun(x, ...)
@@ -36,6 +39,23 @@
}
method <- nm$commsim$method
}
+ ## Check the number of batches needed to run the requested number
+ ## of simulations without exceeding arg 'batchsize', and find the
+ ## size of each batch.
+ if (!simmat_in && !is.na(batchsize)) {
+ commsize <- object.size(comm)
+ totsize <- commsize * nsimul
+ if (totsize > batchsize) {
+ nbatch <- ceiling(unclass(totsize/batchsize))
+ batches <- diff(round(seq(0, nsimul, by = nsimul/nbatch)))
+ } else {
+ nbatch <- 1
+ }
+ } else {
+ nbatch <- 1
+ }
+ if (nbatch == 1)
+ batches <- nsimul
ind <- nestfun(comm, ...)
indstat <-
@@ -43,9 +63,10 @@
ind[[statistic]]
else
ind
- if (!simmat_in) {
+ ## burnin of sequential models
+ if (!simmat_in && nm$commsim$isSeq) {
## estimate thinning for "tswap" (trial swap)
- if (method == "tswap") {
+ if (nm$commsim$method == "tswap") {
checkbrd <-sum(designdist(comm, "(J-A)*(J-B)",
"binary"))
M <- nm$ncol
@@ -54,9 +75,11 @@
thin <- round(thin * checkbrd)
burnin <- round(burnin * checkbrd)
}
- x <- simulate(nm, nsim = nsimul, burnin = burnin, thin = thin)
+ if (burnin > 0)
+ nm <- update(nm, burnin)
}
-
+ ## start with empty simind
+ simind <- NULL
## Go to parallel processing if 'parallel > 1' or 'parallel' could
## be a pre-defined socket cluster or 'parallel = NULL' in which
## case it could be setDefaultCluster (or a user error)
@@ -68,12 +91,15 @@
hasClus <- inherits(parallel, "cluster")
if ((hasClus || parallel > 1) && require(parallel)) {
if(.Platform$OS.type == "unix" && !hasClus) {
- tmp <- mclapply(1:nsimul,
- function(i)
- applynestfun(x[,,i], fun=nestfun,
- statistic = statistic, ...),
- mc.cores = parallel)
- simind <- do.call(cbind, tmp)
+ for (i in seq_len(nbatch)) {
+ x <- simulate(nm, nsim = batches[i], thin = thin)
+ tmp <- mclapply(seq_len(batches[i]),
+ function(j)
+ applynestfun(x[,,j], fun=nestfun,
+ statistic = statistic, ...),
+ mc.cores = parallel)
+ simind <- cbind(simind, do.call(cbind, tmp))
+ }
} else {
## if hasClus, do not set up and stop a temporary cluster
if (!hasClus) {
@@ -81,22 +107,29 @@
## make vegan functions available: others may be unavailable
clusterEvalQ(parallel, library(vegan))
}
- simind <- parApply(parallel, x, 3, function(z)
- applynestfun(z, fun = nestfun,
- statistic = statistic, ...))
+ for(i in seq_len(nbatch)) {
+ x <- simulate(nm, nsim = batches[i], thin = thin)
+ simind <- cbind(simind,
+ parApply(parallel, x, 3, function(z)
+ applynestfun(z, fun = nestfun,
+ statistic = statistic, ...)))
+ }
if (!hasClus)
stopCluster(parallel)
}
} else {
- simind <- apply(x, 3, applynestfun, fun = nestfun,
- statistic = statistic, ...)
+ for(i in seq_len(nbatch)) {
+ x <- simulate(nm, nsim = batches[i], thin = thin)
+ simind <- cbind(simind, apply(x, 3, applynestfun, fun = nestfun,
+ statistic = statistic, ...))
+ }
}
simind <- matrix(simind, ncol = nsimul)
if (attr(x, "isSeq")) {
attr(simind, "thin") <- attr(x, "thin")
- attr(simind, "burnin") <- attr(x, "start") - 1L
+ attr(simind, "burnin") <- burnin
}
sd <- apply(simind, 1, sd, na.rm = TRUE)
Modified: pkg/vegan/inst/ChangeLog
===================================================================
--- pkg/vegan/inst/ChangeLog 2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/inst/ChangeLog 2013-07-28 05:04:40 UTC (rev 2577)
@@ -13,6 +13,14 @@
Ecology Progress Series_, 320, 11–27. The basic development was
made in github.com and merged to R-Forge.
+ * oecosimu: Gained argument 'batchsize' to set the maximum size of
+ simulated nullmodels (in Mb). If a larger object would be
+ produced, the analysis is broken into several batches to avoid
+ exceeding the maximum size. This avoids exhausting memory which
+ can make whole R unresponsive and analysis very, very slow. In
+ general, the argument is needed with large data sets and/or large
+ number of simulations.
+
* orditkplot: bmp has been available in unix-alike OSes since
2008, or a moth after writing orditkplot. Thanks to Brian Ripley
for informing us.
Modified: pkg/vegan/man/oecosimu.Rd
===================================================================
--- pkg/vegan/man/oecosimu.Rd 2013-07-22 10:54:03 UTC (rev 2576)
+++ pkg/vegan/man/oecosimu.Rd 2013-07-28 05:04:40 UTC (rev 2577)
@@ -29,7 +29,7 @@
\usage{
oecosimu(comm, nestfun, method, nsimul = 99, burnin = 0, thin = 1,
statistic = "statistic", alternative = c("two.sided", "less", "greater"),
- parallel = getOption("mc.cores"), ...)
+ batchsize = NA, parallel = getOption("mc.cores"), ...)
\method{as.ts}{oecosimu}(x, ...)
\method{as.mcmc}{oecosimu}(x)
\method{density}{oecosimu}(x, ...)
@@ -75,6 +75,10 @@
test is approximately two times higher than in the corresponding
one-sided test (\code{"greater"} or \code{"less"} depending on the
sign of the difference).}
+ \item{batchsize}{Size in Megabytes of largest simulation object. If
+ a larger structure would be produced, the analysis is broken
+ internally into batches. With default \code{NA} the analysis is
+ not broken into batches. See Details.}
\item{parallel}{Number of parallel processes or a predefined socket
cluster. With \code{parallel = 1} uses ordinary, non-parallel
processing. The parallel processing is done with \pkg{parallel}
@@ -135,9 +139,23 @@
\code{method} and \code{nsimul} are ignored.
}
The last case allows analysing several statistics with the same
- simulations. However, be aware that the \code{\link{object.size}} of
- the simulated communities may be large.
+ simulations.
+ The function first generates simulations with given
+ \code{\link{nullmodel}} and then analyses these using the
+ \code{nestfun}. With large data sets and/or large number of
+ simulations, the generated objects can be very large, and if the
+ memory is exhausted, the analysis can become very slow and the
+ system can become unresponsive. The simulation will be broken into
+ several smaller batches if the simulated \code{\link{nullmodel}}
+ objective will be above the set \code{batchsize} to avoid memory
+ problems (see \code{\link{object.size}} for estimating the size of
+ the current data set). The parallel processing still increases the
+ memory needs. The parallel processing is only used for evaluating
+ \code{nestfun}. The main load may be in simulation of the
+ \code{\link{nullmodel}}, and \code{parallel} argument does not help
+ there.
+
Function \code{as.ts} transforms the simulated results of sequential
methods into a time series or a \code{\link{ts}} object. This allows
using analytic tools for time series in studying the sequences (see
More information about the Vegan-commits
mailing list