[Analogue-commits] r262 - in pkg: . R inst man

Sat Apr 14 00:46:53 CEST 2012

Author: gsimpson
Date: 2012-04-14 00:46:53 +0200 (Sat, 14 Apr 2012)
New Revision: 262

Added:
   pkg/R/splitSample.R
   pkg/man/splitSample.Rd
Modified:
   pkg/DESCRIPTION
   pkg/NAMESPACE
   pkg/inst/ChangeLog
Log:
adds splitSample function

Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/DESCRIPTION	2012-04-13 22:46:53 UTC (rev 262)
@@ -1,7 +1,7 @@
 Package: analogue
 Type: Package
 Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.9-0
+Version: 0.9-1
 Date: $Date$
 Depends: R (>= 2.15.0), stats, graphics, vegan (>= 1.17-12), lattice, grid, 
          MASS, princurve

Modified: pkg/NAMESPACE
===================================================================
--- pkg/NAMESPACE	2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/NAMESPACE	2012-04-13 22:46:53 UTC (rev 262)
@@ -45,6 +45,7 @@
        reconPlot,
        residLen,
        RMSEP,
+       splitSample,
        roc,
        smoothSpline,
        Stratiplot,

Added: pkg/R/splitSample.R
===================================================================
--- pkg/R/splitSample.R	                        (rev 0)
+++ pkg/R/splitSample.R	2012-04-13 22:46:53 UTC (rev 262)
@@ -0,0 +1,74 @@
+##--------------------------------------------------------------------##
+##                                                                    ##
+## Split sample a test set along gradient of interest                 ##
+##                                                                    ##
+## Split the gradient into 'chunk' sections and randomly sample       ##
+## observations from within each section to act as the test sample    ##
+##                                                                    ##
+## env    - gradient to sample along                                  ##
+## chunk  - number of chunks to split env into                        ##
+## take   - number of observations to sample in total                 ##
+## nchunk - number of observations per chunk                          ##
+## fill   - order in which chunks receive remainder samples           ##
+##                                                                    ##
+## Returns indices into env; attribute "lengths" gives the number     ##
+## of samples actually drawn from each chunk.                         ##
+##                                                                    ##
+##--------------------------------------------------------------------##
+splitSample <- function(env, chunk = 10, take, nchunk,
+                        fill = c("head","tail","random")) {
+    ## draw up to nchunk[[ind]] indices from chunk ind; index through
+    ## sample.int() because sample(x, .) treats a length-1 x as 1:x
+    sampFun <- function(ind, x, nchunk) {
+        pool <- x[[ind]]
+        pool[sample.int(length(pool), min(length(pool), nchunk[[ind]]))]
+    }
+    if(missing(take)) {
+        stop("'take' must be supplied.")
+    }
+    if(take < chunk) {
+        stop("Number of samples to 'take' < number of 'chunk's")
+    }
+    fill <- match.arg(fill)
+    env <- as.numeric(env)
+    ## indices of env grouped by equal-width interval of the gradient
+    splt <- split(seq_along(env), cut(env, chunk))
+    nsplt <- length(splt)
+    lens <- vapply(splt, length, integer(1))
+    if(missing(nchunk)) {
+        ## base allocation, capped at what each chunk actually holds
+        nchunk <- pmin(rep(floor(take / chunk), nsplt), lens)
+        ## order in which chunks are offered an extra sample; the
+        ## random order is drawn once and reused on every pass
+        ord <- switch(fill,
+                      head = seq_len(nsplt),
+                      tail = rev(seq_len(nsplt)),
+                      random = sample.int(nsplt))
+        ## each pass gives at most one extra sample to every chunk that
+        ## still has unallocated observations; stop when none are left
+        while((short <- take - sum(nchunk)) > 0) {
+            room <- ord[lens[ord] > nchunk[ord]]
+            if(length(room) == 0L)
+                break
+            add <- room[seq_len(min(short, length(room)))]
+            nchunk[add] <- nchunk[add] + 1
+        }
+        if(sum(nchunk) < take)
+            warning("Fewer than 'take' samples are available; ",
+                    "selecting all ", sum(nchunk), " of them.")
+    } else {
+        if(length(nchunk) != nsplt)
+            stop("'nchunk' must have one element per chunk.")
+        if(!isTRUE(all.equal(sum(nchunk), take)))
+            stop("'sum(nchunk)' and 'take' do not match.")
+    }
+    ## sample within each gradient chunk
+    samp <- lapply(seq_along(splt), FUN = sampFun, x = splt,
+                   nchunk = nchunk)
+    ## number of samples actually drawn from each chunk
+    nsamp <- vapply(samp, length, integer(1))
+    ## flatten to a vector of indices, recording the per-chunk counts
+    samp <- unlist(samp, use.names = FALSE)
+    attr(samp, "lengths") <- nsamp
+    samp
+}

Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog	2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/inst/ChangeLog	2012-04-13 22:46:53 UTC (rev 262)
@@ -1,5 +1,12 @@
 analogue Change Log
 
+Version 0.9-1
+
+	* splitSample: new function to sample a test set from across
+	an environmental gradient by breaking gradient into a series
+	of chunks and sampling approximately equally from within each
+	chunk.
+
 Version 0.9-0
 
 	* caterpillarPlot: new function that draws a caterpillar plot

Added: pkg/man/splitSample.Rd
===================================================================
--- pkg/man/splitSample.Rd	                        (rev 0)
+++ pkg/man/splitSample.Rd	2012-04-13 22:46:53 UTC (rev 262)
@@ -0,0 +1,107 @@
+\name{splitSample}
+\alias{splitSample}
+
+\title{
+  Select samples from along an environmental gradient
+}
+\description{
+  Select samples from along an environmental gradient by splitting
+  the gradient into discrete chunks and sampling within each chunk. This
+  allows a test set to be selected which covers the environmental
+  gradient of the training set, for example.
+}
+\usage{
+splitSample(env, chunk = 10, take, nchunk,
+            fill = c("head", "tail", "random"))
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+  \item{env}{numeric; vector of samples representing the gradient values.}
+  \item{chunk}{numeric; number of chunks to split the gradient into.}
+  \item{take}{numeric; how many samples to take from the gradient. Can
+    not be missing.}
+  \item{nchunk}{numeric; number of samples per chunk. Must be a vector
+    of length \code{chunk} and \code{sum(nchunk)} must equal
+    \code{take}. Can be missing (the default), in which case some simple
+    heuristics are used to determine the number of samples chosen per
+    chunk. See Details.}
+  \item{fill}{character; the type of filling of chunks to perform. See
+    Details.}
+}
+\details{
+  The gradient is split into \code{chunk} sections and samples are
+  selected from each chunk to result in a sample of length
+  \code{take}. If \code{take} is divisible by \code{chunk} without
+  remainder then there will be an equal number of samples selected from
+  each chunk. Where \code{take} is not a multiple of \code{chunk} and
+  \code{nchunk} is not specified then extra samples have to be allocated
+  to some of the chunks to reach the required number of samples
+  selected.
+
+  An additional complication is that some chunks of the gradient may
+  have fewer than \code{nchunk} samples and therefore more samples need
+  to be selected from the remaining chunks until \code{take} samples are
+  chosen.
+
+  If \code{nchunk} is supplied, it must be a vector stating exactly how
+  many samples to select from each chunk. If \code{nchunk} is not
+  supplied, then the number of samples per chunk is determined as
+  follows:
+
+  \enumerate{
+    \item An initial allocation of \code{floor(take / chunk)} is assigned
+    to each chunk
+    \item If any chunks have fewer samples than this initial allocation,
+    these elements of \code{nchunk} are reset to the number of  samples
+    in those chunks
+    \item Sequentially an extra sample is allocated to each chunk with
+    sufficient available samples until \code{take} samples are
+    selected.
+  }
+
+  Argument \code{fill} controls the order in which the chunks are
+  filled. \code{fill = "head"} fills from the low to the high end of the
+  gradient, whilst \code{fill = "tail"} fills in the opposite
+  direction. Chunks are filled in random order if \code{fill =
+  "random"}. In all cases no chunk is filled by more than one extra
+  sample until all chunks that can supply one extra sample are
+  filled. In the case of \code{fill = "head"} or \code{fill = "tail"}
+  this entails moving along the gradient from one end to the other
+  allocating an extra sample to available chunks before starting along
+  the gradient again. For \code{fill = "random"}, a random order of
+  chunks to fill is determined, if an extra sample is allocated to each
+  chunk in the random order and \code{take} samples are still not
+  selected, filling begins again using the same random ordering. In
+  other words, the random order of chunks to fill is chosen only once.
+}
+\value{
+  A numeric vector of indices of selected samples. This vector has
+  attribute \code{lengths} which indicates how many samples were
+  actually chosen from each chunk.
+}
+%\references{
+%% ~put references to the literature/web site here ~
+%}
+\author{
+  Gavin L. Simpson
+}
+
+\examples{
+data(swappH)
+
+## take a test set of 20 samples along the pH gradient
+test1 <- splitSample(swappH, chunk = 10, take = 20)
+test1
+swappH[test1]
+
+## take a larger sample where some chunks don't have many samples
+## do random filling
+set.seed(3)
+test2 <- splitSample(swappH, chunk = 10, take = 70, fill = "random")
+test2
+swappH[test2]
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{manip}
+\keyword{utilities}