[Analogue-commits] r223 - in pkg: . R inst man tests/Examples

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Mon May 30 23:26:07 CEST 2011


Author: gsimpson
Date: 2011-05-30 23:26:07 +0200 (Mon, 30 May 2011)
New Revision: 223

Added:
   pkg/R/pcr.R
   pkg/man/pcr.Rd
Modified:
   pkg/DESCRIPTION
   pkg/inst/ChangeLog
   pkg/tests/Examples/analogue-Ex.Rout.save
Log:
add pcr() for principal components regression, bump to 0.7-2

Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION	2011-05-27 09:44:50 UTC (rev 222)
+++ pkg/DESCRIPTION	2011-05-30 21:26:07 UTC (rev 223)
@@ -1,7 +1,7 @@
 Package: analogue
 Type: Package
 Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.7-1
+Version: 0.7-2
 Date: $Date$
 Depends: R (>= 2.10.0), stats, graphics, vegan, lattice, grid, MASS, princurve
 Author: Gavin L. Simpson, Jari Oksanen

Added: pkg/R/pcr.R
===================================================================
--- pkg/R/pcr.R	                        (rev 0)
+++ pkg/R/pcr.R	2011-05-30 21:26:07 UTC (rev 223)
@@ -0,0 +1,173 @@
+## Principal Components Regression Models
+
+## generic
+`pcr` <- function(x, ...) {
+    UseMethod("pcr")
+}
+
+## default
+`pcr.default` <- function(x, y, ncomp, tranFun, ...) {
+    ## convert to matrices for speed
+    x <- data.matrix(x)
+
+    ## dimensions
+    Nx <- NROW(x)
+    Ny <- length(y)
+    Mx <- NCOL(x) ## number of predictor vars
+
+    ## Apply a transformation if specified
+    if(!missing(tranFun)) {
+        FUN <- match.fun(tranFun)
+        x <- tranFun(x)
+        tranParms <- attr(x, "parms")
+    } else {
+        FUN <- NA
+    }
+
+    ## centre x and y
+    xMeans <- colMeans(x)
+    yMean <- mean(y)
+    x <- sweep(x, 2, xMeans, "-")
+    y <- y - yMean
+
+    ## How many components?
+    ncomp <- if(missing(ncomp)) {
+        min(Nx - 1, Mx)
+    } else {
+        if(ncomp < 1 || ncomp > (newcomp <- min(Nx - 1, Mx)))
+            warning("Invalid 'ncomp'. Resetting to max possible.")
+        newcomp
+    }
+    S <- seq_len(ncomp)
+
+    ## Storage objects:
+    ## B model coefficients
+    ## fitted.values
+    B <- matrix(0, nrow = Mx, ncol = ncomp)
+    fitted.values <- matrix(0, nrow = Nx, ncol = ncomp)
+
+    ## SVD
+    SVD <- La.svd(x)
+    D <- SVD$d[S]
+    TT <- SVD$u[, S, drop = FALSE] %*% diag(D, nrow = ncomp)
+    P <- t(SVD$vt[S, , drop = FALSE])
+    tQ <- crossprod(TT, y) / (varExpl <- D^2)
+
+    ## compute coefficients
+    for(b in S) {
+        bS <- seq_len(b)
+        B[, b] <- P[, bS, drop = FALSE] %*% tQ[bS, ]
+        fitted.values[, b] <- TT[, bS, drop = FALSE] %*% tQ[bS, ]
+    }
+
+    ## other model output
+    residuals <- y - fitted.values
+    fitted.values <- sweep(fitted.values, 2, yMean, "+")
+
+    ## model performance
+    Y <- y + yMean
+    performance <- data.frame(R2 = drop(cor(fitted.values, Y)),
+                              avgBias = colMeans(residuals),
+                              maxBias = apply(residuals, 2, maxBias, Y),
+                              RMSEP = sqrt(colMeans(residuals^2)))
+    rownames(performance) <- paste("PC", S, sep = "")
+
+    ## get and fix up the call
+    .call <- match.call()
+    .call[[1]] <- as.name("pcr")
+    ## fix-up the name of the transformation function used,
+    ## needed when formula method called...
+    .call[[which(names(.call) == "tranFun")]] <- as.name(deparse(substitute(tranFun)))
+
+    ## return object
+    Obj <- list(fitted.values = fitted.values,
+                coefficients = coefficients,
+                residuals = residuals,
+                scores = TT,
+                loadings = P,
+                Yloadings = t(tQ),
+                xMeans = xMeans,
+                yMean = yMean,
+                varExpl = varExpl,
+                totvar = sum(x * x),
+                call = .call,
+                tranFun = FUN,
+                tranParms = tranParms,
+                performance = performance,
+                ncomp = ncomp)
+    class(Obj) <- "pcr"
+    Obj
+}
+
+`pcr.formula` <- function(formula, data, subset, na.action,
+                          ..., model = FALSE) {
+    ## the function call
+    .call <- match.call()
+    ## need to reset due to method dispatch
+    .call[[1]] <- as.name("pcr")
+
+    ## keep only the arguments which should go into the model frame
+    mf <- match.call(expand.dots = FALSE)
+    m <- match(c("formula", "data", "subset", "na.action"), names(mf), 0)
+    mf <- mf[c(1, m)]
+    mf$drop.unused.levels <- TRUE
+    mf[[1]] <- as.name("model.frame")
+    mf <- eval.parent(mf)
+    attr(attr(mf, "terms"), "intercept") <- 0
+
+    ## terms objects
+    mt <- attr(mf, "terms")
+    ## model matrices
+    y <- model.response(mf, "numeric")
+    x <- model.matrix(mt, mf)
+
+    ## fit & the PCR
+    Obj <- pcr.default(x = x, y = y, ...)
+    Obj$na.action <- attr(mf, "na.action")
+    Obj$terms <- mt
+    if(model)
+        Obj$model <- mf
+    Obj
+}
+
+`Hellinger` <- function(x, apply = FALSE) {
+    tran(x, method = "hellinger")
+    ## ignore 'apply' as no meta-parameters
+}
+
+`ChiSquare` <- function(x, apply = FALSE) {
+    if(apply) {
+        ## apply pre-computed meta-parameters to transform
+        ## test samples to match training samples
+        parms <- attr(x, "parms")
+        x <- with(parms, sqrt(gsum) * x/outer(rsum, sqrt(csum)))
+    } else {
+        ## perform transformation and preserve the meta-parameters
+        if (any(x < 0, na.rm = TRUE)) {
+            k <- min(x, na.rm = TRUE)
+            warning("'x'contains negative entries: result may be nonsense\n")
+        } else {
+            k <- .Machine$double.eps
+        }
+        gsum <- sum(x, na.rm = TRUE)
+        rsum <- pmax(k, rowSums(x, na.rm = TRUE))
+        csum <- colSums(x, na.rm = TRUE)
+        x <- sqrt(gsum) * x/outer(rsum, sqrt(csum))
+        attr(x, "parms") <- list(gsum = gsum, rsum = rsum, csum = csum)
+    }
+    x
+}
+
+`print.pcr` <- function(x, digits = min(getOption("digits"), 4), ...) {
+    cat("\n")
+    writeLines(strwrap("Principal Component Regression Model", prefix = "\t"),
+               sep = "\n\n")
+    writeLines(strwrap("Call:"))
+    print(x$call)
+    cat("\n")
+    writeLines(strwrap(paste("No. of Components:", x$ncomp)), sep = "\n\n")
+    writeLines("RMSEP (Apparent):")
+    perf <- x$performance[, "RMSEP"]
+    names(perf) <- rownames(x$performance)
+    print(perf)
+}

Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog	2011-05-27 09:44:50 UTC (rev 222)
+++ pkg/inst/ChangeLog	2011-05-30 21:26:07 UTC (rev 223)
@@ -1,5 +1,12 @@
 analogue Change Log
 
+Version 0.7-2
+
+	* pcr: new function pcr() performs principal components
+	regression. Designed to allow transformations in the spirit of
+	Legendre & Gallagher (2001) that allow PCA to be usefully
+	applied to species data.
+
 Version 0.7-1
 
 	* crossval: new function to perform leave-one-out, k-fold,

Added: pkg/man/pcr.Rd
===================================================================
--- pkg/man/pcr.Rd	                        (rev 0)
+++ pkg/man/pcr.Rd	2011-05-30 21:26:07 UTC (rev 223)
@@ -0,0 +1,135 @@
+\name{pcr}
+\alias{pcr}
+\alias{pcr.default}
+\alias{pcr.formula}
+\alias{print.pcr}
+\alias{Hellinger}
+\alias{ChiSquare}
+\title{Prinicpal component regression transfer function models}
+\description{
+  Fits a palaeoecological transfer function model using principal
+  component regression, using an optional transformation of the matrix
+  of predictor variables when these are species abundance data.
+}
+\usage{
+
+\method{pcr}{default}(x, y, ncomp, tranFun, ...)
+
+\method{pcr}{formula}(formula, data, subset, na.action, ..., model = FALSE)
+
+Hellinger(x, apply = FALSE)
+
+ChiSquare(x, apply = FALSE)
+}
+\arguments{
+  \item{x}{Matrix or data frame of predictor variables. Usually species
+    composition or abundance data for transfer function models.}
+  \item{y}{Numeric vector; the response variable to be modelled.}
+  \item{ncomp}{numeric; number of principal components to build models
+    for. If not supplied the largest possible number of components is
+    determined.}
+  \item{tranFun}{function; a function or name of a function that
+    performs a transformation of the predictor variables \code{x}. The
+    function must be self-contained as no arguments are passed to the
+    function when it is applied. See Details for more information.}
+  \item{formula}{a model formula.}
+  \item{data}{an optional data frame, list or environment (or object
+    coercible by \code{\link{as.data.frame}} to a data frame) containing
+    the variables specified on the RHS of the model formula. If not found in
+    \code{data}, the  variables are taken from
+    \code{environment(formula)}, typically the environment from which
+    \code{pcr} is called.}
+  \item{subset}{an optional vector specifying a subset of observations
+    to be used in the fitting process.}
+  \item{na.action}{a function which indicates what should happen when
+    the data contain \code{NA}s.  The default is set by the
+    \code{na.action} setting of \code{options}, and is \code{na.fail} if
+    that is unset.  The 'factory-fresh' default is \code{na.omit}.
+    Another possible value is \code{NULL}, no action. Value
+    \code{na.exclude} can be useful.}
+  \item{model}{logical; If \code{TRUE} the model frame is returned?}
+  \item{apply}{logical; should an existing tranformation, using
+    pre-computed meta-parameters, be applied?}
+  \item{\dots}{Arguments passed to other methods.}
+}
+\details{
+  When applying cross-validation (CV) to transfer function models, any
+  transformation of the predictors must be applied separately during
+  each iteration of the CV procedure to the part of the data used in
+  fitting the model. In the same way, any samples to be predicted from
+  the model must use any meta-parameters derived from the training data
+  only. For examle, centring is appled to the training data only and the
+  variables means used to centre the training data are used to centre
+  the test samples. The variable means should not be computed on a
+  combination of the training and test samples.
+
+  When using PCR, we might wish to apply a transformation to the species
+  data predictor variables such that the PCA of those data preserves a
+  dissimilarity coefficient other than the Euclidean distance. This
+  transformation is applied to allow PCA to better describe patterns in
+  the species data (Legendre & Gallagher 2001).
+
+  How this is handled in \code{pcr} is to take a user-supplied function
+  that takes a single argument, the matrix of predictor variables. The
+  function should return a matrix of the same dimension as the input. If
+  any meta-parameters are required for subsequent use in prediction,
+  these should be returned as attribute \code{"parms"}, attached to the
+  matrix.
+
+  Two example transformation functions are provided implementing the
+  Hellinger and Chi Square transformations of Legendre & Gallagher
+  (2001). Users can base their transformation functions on
+  these. \code{ChiSquare()} illustrates how meta-parameters should be
+  returned as the attribute \code{"parms"}.
+}
+\value{
+  Returns an object of class \code{"pcr"}, a list with the
+  following components:
+
+  \item{fitted.values}{matrix; the PCR estimates of the response. The
+    columns contain fitted values using C components, where C is the Cth
+    column of the matrix.}
+  \item{coefficients}{matrix; regression coefficients for the
+    PCR. Columns as per \code{fitted} above.}
+  \item{residuals}{matrix; residuals, where the Cth column represents a
+    PCR model using C components.}
+  \item{scores}{}
+  \item{loadings}{}
+  \item{Yloadings}{}
+  \item{xMeans}{numeric; means of the predictor variables in the
+    training data.}
+  \item{yMean}{numeric; mean of the response variable in the training
+    data.}
+  \item{varExpl}{numeric; variance explained by the PCR model. These are
+    the squares of the singular values.}
+  \item{totvar}{numeric; total variance in the training data}
+  \item{call}{the matched call.}
+  \item{tranFun}{transformation function used. \code{NA} if none
+    supplied/used.}
+  \item{tranParms}{list; meta parameters used to computed the
+    transformed training data.}
+  \item{performance}{data frame; cross-validation performance statistics
+    for the model.}
+  \item{ncomp}{numeric; number of principal components computed}
+}
+%\references{TO DO}
+\author{Gavin L. Simpson}
+%\note{
+%}
+\seealso{\code{\link{wa}}}
+\examples{
+## Load the Imbrie & Kipp data and
+## summer sea-surface temperatures
+data(ImbrieKipp)
+data(SumSST)
+
+## normal interface and apply Hellinger transformation
+mod <- pcr(ImbrieKipp, SumSST, tranFun = Hellinger)
+mod
+
+## formula interface, but as above
+mod2 <- pcr(SumSST ~ ., data = ImbrieKipp, tranFun = Hellinger)
+mod2
+
+}
+\keyword{methods}

Modified: pkg/tests/Examples/analogue-Ex.Rout.save
===================================================================
--- pkg/tests/Examples/analogue-Ex.Rout.save	2011-05-27 09:44:50 UTC (rev 222)
+++ pkg/tests/Examples/analogue-Ex.Rout.save	2011-05-30 21:26:07 UTC (rev 223)
@@ -28,7 +28,7 @@
 Loading required package: grid
 Loading required package: MASS
 Loading required package: princurve
-This is analogue 0.7-0
+This is analogue 0.7-2
 > 
 > assign(".oldSearch", search(), pos = 'CheckExEnv')
 > cleanEx()
@@ -5536,6 +5536,69 @@
 > 
 > 
 > cleanEx()
+> nameEx("pcr")
+> ### * pcr
+> 
+> flush(stderr()); flush(stdout())
+> 
+> ### Name: pcr
+> ### Title: Prinicpal component regression transfer function models
+> ### Aliases: pcr pcr.default pcr.formula print.pcr Hellinger ChiSquare
+> ### Keywords: methods
+> 
+> ### ** Examples
+> 
+> ## Load the Imbrie & Kipp data and
+> ## summer sea-surface temperatures
+> data(ImbrieKipp)
+> data(SumSST)
+> 
+> ## normal interface and apply Hellinger transformation
+> mod <- pcr(ImbrieKipp, SumSST, tranFun = Hellinger)
+> mod
+
+	Principal Component Regression Model
+
+Call:
+pcr(x = ImbrieKipp, y = SumSST, tranFun = Hellinger)
+
+No. of Components: 27
+
+RMSEP (Apparent):
+     PC1      PC2      PC3      PC4      PC5      PC6      PC7      PC8 
+2.381215 1.707588 1.680896 1.679774 1.608903 1.535145 1.507995 1.496939 
+     PC9     PC10     PC11     PC12     PC13     PC14     PC15     PC16 
+1.496908 1.432142 1.426444 1.405155 1.391348 1.349172 1.349172 1.315284 
+    PC17     PC18     PC19     PC20     PC21     PC22     PC23     PC24 
+1.313187 1.311801 1.291201 1.206484 1.188438 1.187503 1.171215 1.170947 
+    PC25     PC26     PC27 
+1.170380 1.162497 1.162355 
+> 
+> ## formula interface, but as above
+> mod2 <- pcr(SumSST ~ ., data = ImbrieKipp, tranFun = Hellinger)
+> mod2
+
+	Principal Component Regression Model
+
+Call:
+pcr(x = x, y = y, tranFun = Hellinger)
+
+No. of Components: 27
+
+RMSEP (Apparent):
+     PC1      PC2      PC3      PC4      PC5      PC6      PC7      PC8 
+2.381215 1.707588 1.680896 1.679774 1.608903 1.535145 1.507995 1.496939 
+     PC9     PC10     PC11     PC12     PC13     PC14     PC15     PC16 
+1.496908 1.432142 1.426444 1.405155 1.391348 1.349172 1.349172 1.315284 
+    PC17     PC18     PC19     PC20     PC21     PC22     PC23     PC24 
+1.313187 1.311801 1.291201 1.206484 1.188438 1.187503 1.171215 1.170947 
+    PC25     PC26     PC27 
+1.170380 1.162497 1.162355 
+> 
+> 
+> 
+> 
+> cleanEx()
 > nameEx("performance")
 > ### * performance
 > 
@@ -7729,7 +7792,7 @@
 > ### * <FOOTER>
 > ###
 > cat("Time elapsed: ", proc.time() - get("ptime", pos = 'CheckExEnv'),"\n")
-Time elapsed:  19.584 0.559 20.318 0 0 
+Time elapsed:  19.381 0.603 20.167 0 0 
 > grDevices::dev.off()
 null device 
           1 



More information about the Analogue-commits mailing list