[Analogue-commits] r262 - in pkg: . R inst man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sat Apr 14 00:46:53 CEST 2012
Author: gsimpson
Date: 2012-04-14 00:46:53 +0200 (Sat, 14 Apr 2012)
New Revision: 262
Added:
pkg/R/splitSample.R
pkg/man/splitSample.Rd
Modified:
pkg/DESCRIPTION
pkg/NAMESPACE
pkg/inst/ChangeLog
Log:
adds splitSample function
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/DESCRIPTION 2012-04-13 22:46:53 UTC (rev 262)
@@ -1,7 +1,7 @@
Package: analogue
Type: Package
Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.9-0
+Version: 0.9-1
Date: $Date$
Depends: R (>= 2.15.0), stats, graphics, vegan (>= 1.17-12), lattice, grid,
MASS, princurve
Modified: pkg/NAMESPACE
===================================================================
--- pkg/NAMESPACE 2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/NAMESPACE 2012-04-13 22:46:53 UTC (rev 262)
@@ -45,6 +45,7 @@
reconPlot,
residLen,
RMSEP,
+ splitSample,
roc,
smoothSpline,
Stratiplot,
Added: pkg/R/splitSample.R
===================================================================
--- pkg/R/splitSample.R (rev 0)
+++ pkg/R/splitSample.R 2012-04-13 22:46:53 UTC (rev 262)
@@ -0,0 +1,87 @@
+##--------------------------------------------------------------------##
+## Select a test set from along a gradient of interest
+##
+## The range of the gradient is cut into 'chunk' sections and
+## observations are sampled at random, without replacement, from
+## within each section so that 'take' observations are selected in
+## total.
+##
+## env    - gradient to sample along; coerced with as.numeric()
+## chunk  - number of sections to cut the gradient into
+## take   - number of observations to sample in total
+## nchunk - optional; number of observations to sample from each
+##          section. Must have length 'chunk' and sum to 'take'. If
+##          missing, an allocation is worked out as described below
+## fill   - order in which sections are offered extra observations:
+##          "head" (first section to last), "tail" (last to first)
+##          or "random" (one random ordering, reused on every pass)
+##
+## When 'nchunk' is missing each section is first allocated
+## floor(take / chunk) observations, or all it holds if that is fewer.
+## Any shortfall is then handed out one observation per section per
+## pass, in 'fill' order, skipping sections with nothing left to give.
+##
+## Returns a vector of indices into 'env' with attribute "lengths"
+## holding the number of observations sampled from each section.
+##--------------------------------------------------------------------##
+splitSample <- function(env, chunk = 10, take, nchunk,
+                        fill = c("head","tail","random")) {
+    ## sample 'size' elements of 'x' (or all of them if fewer). Drawing
+    ## positions with sample.int() stops a length-one 'x' from being
+    ## treated as 1:x, which is what sample(x, size) would do
+    pick <- function(x, size) {
+        x[sample.int(length(x), min(length(x), size))]
+    }
+    if(take < chunk) {
+        stop("Number of samples to 'take' < number of 'chunk's")
+    }
+    fill <- match.arg(fill)
+    env <- as.numeric(env)
+    ## indices of the observations that fall in each section
+    splt <- split(seq_along(env), cut(env, chunk))
+    ## number of observations available in each section
+    lens <- unname(vapply(splt, length, integer(1)))
+    if(missing(nchunk)) {
+        if(take > sum(lens)) {
+            stop("Number of samples to 'take' > number of samples ",
+                 "available in 'env'")
+        }
+        ## even allocation, capped at what each section can supply
+        nchunk <- pmin(floor(take / chunk), lens)
+        ## samples still to allocate; ceiling() keeps a fractional
+        ## 'take' rounding up, as comparing sum(nchunk) < take would
+        short <- ceiling(take) - sum(nchunk)
+        if(short > 0) {
+            ## order in which sections are offered an extra sample
+            ord <- switch(fill,
+                          head = seq_len(chunk),
+                          tail = rev(seq_len(chunk)),
+                          random = sample.int(chunk))
+            ## each pass gives at most one extra sample to every
+            ## section that still has spare observations. 'take' does
+            ## not exceed sum(lens), so some section always has room
+            ## while 'short' is positive and the loop must terminate
+            while(short > 0) {
+                open <- ord[nchunk[ord] < lens[ord]]
+                add <- open[seq_len(min(short, length(open)))]
+                nchunk[add] <- nchunk[add] + 1
+                short <- short - length(add)
+            }
+        }
+    } else {
+        if(!is.numeric(nchunk) || length(nchunk) != chunk)
+            stop("'nchunk' must be a numeric vector of length 'chunk'.")
+        if(!isTRUE(all.equal(sum(nchunk), take)))
+            stop("'sum(nchunk)' and 'take' do not match.")
+    }
+    ## sample within each section in turn
+    samp <- lapply(seq_along(splt),
+                   function(i) pick(splt[[i]], nchunk[[i]]))
+    ## grab the number of samples actually taken in each section
+    lengths <- vapply(samp, length, integer(1))
+    ## turn sample list into a vector
+    samp <- unlist(samp, use.names = FALSE)
+    ## assign lengths as attribute
+    attr(samp, "lengths") <- lengths
+    samp
+}
Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog 2012-04-10 22:15:02 UTC (rev 261)
+++ pkg/inst/ChangeLog 2012-04-13 22:46:53 UTC (rev 262)
@@ -1,5 +1,12 @@
analogue Change Log
+Version 0.9-1
+
+ * splitSample: new function to sample a test set from across
+ an environmental gradient by breaking gradient into a series
+ of chunks and sampling approximately equally from within each
+ chunk.
+
Version 0.9-0
* caterpillarPlot: new function that draws a caterpillar plot
Added: pkg/man/splitSample.Rd
===================================================================
--- pkg/man/splitSample.Rd (rev 0)
+++ pkg/man/splitSample.Rd 2012-04-13 22:46:53 UTC (rev 262)
@@ -0,0 +1,107 @@
+\name{splitSample}
+\alias{splitSample}
+
+\title{
+ Select samples from along an environmental gradient
+}
+\description{
+ Select samples from along an environmental gradient by splitting
+ the gradient into discrete chunks and sample within each chunk. This
+ allows a test set to be selected which covers the environmental
+ gradient of the training set, for example.
+}
+\usage{
+splitSample(env, chunk = 10, take, nchunk,
+ fill = c("head", "tail", "random"))
+}
+%- maybe also 'usage' for other objects documented here.
+\arguments{
+ \item{env}{numeric; vector of samples representing the gradient values.}
+ \item{chunk}{numeric; number of chunks to split the gradient into.}
+ \item{take}{numeric; how many samples to take from the gradient. Can
+ not be missing.}
+ \item{nchunk}{numeric; number of samples per chunk. Must be a vector
+ of length \code{chunk} and \code{sum(nchunk)} must equal
+ \code{take}. Can be missing (the default), in which case some simple
+ heuristics are used to determine the number of samples chosen per
+ chunk. See Details.}
+ \item{fill}{character; the type of filling of chunks to perform. See
+ Details.}
+}
+\details{
+ The gradient is split into \code{chunk} sections and samples are
+ selected from each chunk to result in a sample of length
+ \code{take}. If \code{take} is divisible by \code{chunk} without
+ remainder then there will be an equal number of samples selected from
+ each chunk. Where \code{take} is not a multiple of \code{chunk} and
+ \code{nchunk} is not specified then extra samples have to be allocated
+ to some of the chunks to reach the required number of samples
+ selected.
+
+ An additional complication is that some chunks of the gradient may
+ have fewer than \code{nchunk} samples and therefore more samples need
+ to be selected from the remaining chunks until \code{take} samples are
+ chosen.
+
+ If \code{nchunk} is supplied, it must be a vector stating exactly how
+ many samples to select from each chunk. If \code{nchunk} is not
+ supplied, then the number of samples per chunk is determined as
+ follows:
+
+ \enumerate{
+ \item An initial allocation of \code{floor(take / chunk)} is assigned
+ to each chunk
+ \item If any chunks have fewer samples than this initial allocation,
+ these elements of \code{nchunk} are reset to the number of samples
+ in those chunks
+ \item Sequentially an extra sample is allocated to each chunk with
+ sufficient available samples until \code{take} samples are
+ selected.
+ }
+
+ Argument \code{fill} controls the order in which the chunks are
+ filled. \code{fill = "head"} fills from the low to the high end of the
+ gradient, whilst \code{fill = "tail"} fills in the opposite
+ direction. Chunks are filled in random order if \code{fill =
+ "random"}. In all cases no chunk is filled by more than one extra
+ sample until all chunks that can supply one extra sample are
+ filled. In the case of \code{fill = "head"} or \code{fill = "tail"}
+ this entails moving along the gradient from one end to the other
+ allocating an extra sample to available chunks before starting along
+ the gradient again. For \code{fill = "random"}, a random order of
+ chunks to fill is determined; if an extra sample is allocated to each
+ chunk in the random order and \code{take} samples are still not
+ selected, filling begins again using the same random ordering. In
+ other words, the random order of chunks to fill is chosen only once.
+}
+\value{
+ A numeric vector of indices of selected samples. This vector has
+ attribute \code{lengths} which indicates how many samples were
+ actually chosen from each chunk.
+}
+%\references{
+%% ~put references to the literature/web site here ~
+%}
+\author{
+ Gavin L. Simpson
+}
+
+\examples{
+data(swappH)
+
+## take a test set of 20 samples along the pH gradient
+test1 <- splitSample(swappH, chunk = 10, take = 20)
+test1
+swappH[test1]
+
+## take a larger sample where some chunks don't have many samples
+## do random filling
+set.seed(3)
+test2 <- splitSample(swappH, chunk = 10, take = 70, fill = "random")
+test2
+swappH[test2]
+}
+% Add one or more standard keywords, see file 'KEYWORDS' in the
+% R documentation directory.
+\keyword{manip}
+\keyword{utilities}
More information about the Analogue-commits
mailing list