[Analogue-commits] r388 - in pkg: R man

Wed Dec 11 06:11:28 CET 2013

Author: gsimpson
Date: 2013-12-11 06:11:28 +0100 (Wed, 11 Dec 2013)
New Revision: 388

Modified:
   pkg/R/crossval.pcr.R
   pkg/R/predict.pcr.R
   pkg/man/predict.pcr.Rd
Log:
updated CV methods

Modified: pkg/R/crossval.pcr.R
===================================================================

--- pkg/R/crossval.pcr.R	2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/R/crossval.pcr.R	2013-12-11 05:11:28 UTC (rev 388)
@@ -21,6 +21,8 @@
             if(ncomp < 1 || ncomp > (newcomp <- min(nr - 1, M))) {
                 warning("'ncomp' inappropriate for LOO CV. Resetting to max possible.")
                 newcomp
+            } else {
+                ncomp
             }
         }
         ## matrix of predictions
@@ -54,7 +56,6 @@
                 pred[i, j] <- Xi %*% FIT$B[, j, drop = FALSE] + B0
             }
         }
-        ##pred <- rowMeans(pred, na.rm = TRUE)
     }
     if(identical(method, "kfold")) {
         ## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
@@ -65,6 +66,8 @@
             if(ncomp < 1 || ncomp > maxComp) {
                 warning("'ncomp' inappropriate for k-fold CV. Resetting to max possible.")
                 maxComp
+            } else {
+                ncomp
             }
         }
         pred <- array(NA, dim = c(N, ncomp, folds))
@@ -108,17 +111,18 @@
                 }
             }
         }
-        pred <- rowMeans(pred, na.rm = TRUE, dims = 2)
     }
     if(identical(method, "bootstrap")) {
-        ## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
+        ## form ncomp, as if this was a standard training set setting.
         maxComp <- min(N - 1, M)
         ncomp <- if(missing(ncomp)) {
-            maxComp## uses nr which already has 1 removed
+            maxComp
         } else {
             if(ncomp < 1 || ncomp > maxComp) {
                 warning("'ncomp' inappropriate for bootstrap CV. Resetting to max possible.")
                 maxComp
+            } else {
+                ncomp
             }
         }
         pred <- array(NA, dim = c(N, ncomp, nboot))
@@ -136,28 +140,78 @@
             bSamp <- sample.int(N, N, replace = TRUE)
             sel <- which(!ind %in% bSamp) ## need indices!!!
             N.oob <- NROW(x[sel, , drop = FALSE])
-            N.mod <- N - N.oob ## not sure I need this
+            ##N.mod <- N - N.oob ## not sure I need this
             ## apply transformation to X[-sel, ]
-            TRAN <- obj$tranFun(x[-sel, , drop = FALSE])
+            TRAN <- obj$tranFun(x[bSamp, , drop = FALSE])
             X <- TRAN$data
             ## apply transformation to OOB samples, using parms from above
             Xi <- obj$tranFun(x[sel, , drop = FALSE], apply = TRUE,
                               parms = TRAN$parms)$data
             ## centre the training data
             Xbar <- colMeans(X)
-            ybar <- mean(y[-sel])
+            ybar <- mean(y[bSamp])
             X <- sweep(X, 2, Xbar, "-")
-            Y <- y[-sel] - ybar
+            Y <- y[bSamp] - ybar
             ## fit model to subset
-            FIT <- fitPCR(X = X, Y = Y, ncomp = ncomp, n = N.mod, m = M)
+            FIT <- fitPCR(X = X, Y = Y, ncomp = ncomp, n = N, m = M)
             ## predict for 1:ncomps components
             for(j in nc) {
                 B0 <- obj$yMean - obj$xMeans %*% FIT$B[, j, drop = FALSE]
-                pred[sel, j, i] <- Xi %*% FIT$B[, j, drop = FALSE] + rep(B0, N.oob)
+                pred[sel, j, i] <- Xi %*% FIT$B[, j, drop = FALSE] +
+                    rep(B0, N.oob)
             }
         }
-        pred <- rowMeans(pred, na.rm = TRUE, dims = 2)
     }
-    class(pred) <- "crossval"
-    pred
+
+    ## fitted values and derived stats
+    if(identical(method, "none")) {
+        fitted <- pred
+    } else if(method %in% c("","")) {
+        rowMeans(pred, na.rm = TRUE, dims = 2)
+    } else {
+        fitted <- rowMeans(pred)
+    }
+    residuals <- y - fitted                               ## residuals
+    maxBias <- apply(residuals, 2, maxBias, y, n = 10)    ## maxBias
+    avgBias <- colMeans(residuals)                        ## avgBias
+    r2 <- apply(fitted, cor, y)
+    ## s1 & s2 components for model and training set
+    ns <- rowSums(!is.na(pred), dims = 2)
+    s1.train <- t(sqrt(rowSums((pred - as.vector(fitted))^2,
+                               na.rm = TRUE, dims = 2) / as.vector(ns - 1)))
+    s1 <- sqrt(colMeans(s1.train^2))
+    s2 <- sqrt(colMeans(residuals^2, na.rm = TRUE))
+    s2.train <- sweep(pred, c(2,1), y, "-")
+    s2.train <- sqrt(t(rowMeans(s2.train^2, na.rm = TRUE, dims = 2)))
+
+    ## RMSEP
+    rmsep.train <- sqrt(s1.train^2 + s2.train^2)
+    rmsep <- sqrt(s1^2 + s2^2)
+    rmsep2 <- sqrt(mean(residuals^2))
+    performance <- data.frame(comp = ncomp,
+                              R2 = r2,
+                              avgBias = avgBias,
+                              maxBias = maxBias,
+                              RMSEP = rmsep,
+                              RMSEP2 = rmsep2,
+                              s1 = s1,
+                              s2 = s2)
+
+    ## more additions to the call
+    .call <- match.call()
+    .call[[1]] <- as.name("crossval")
+
+    ## return object
+    out <- list(fitted.values = fitted,
+                residuals = residuals,
+                rmsep = rmsep.train,
+                s1 = s1.train,
+                s2 = s2.train,
+                performance = performance,
+                call = .call,
+                CVparams = list(method = method, nboot = nboot,
+                nfold = nfold, folds = folds))
+
+    class(out) <- "crossval"
+    out
 }

Modified: pkg/R/predict.pcr.R
===================================================================
--- pkg/R/predict.pcr.R	2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/R/predict.pcr.R	2013-12-11 05:11:28 UTC (rev 388)
@@ -1,7 +1,12 @@
 `predict.pcr` <- function(object, newdata, ncomp = seq_along(object$ncomp),
-                          CV = c("none", "LOO", "bootstrap", "nfold"),
-                          verbose = FALSE, nboot = 100, nfold = 5,
-                          ...) {
+                          CV = c("none", "LOO", "bootstrap", "kfold"),
+                          verbose = FALSE, nboot = 100, kfold = 10,
+                          folds = 5, ...) {
+    takeData <- function(x, new) {
+        want <- (spp.names <- colnames(x$data$x)) %in% colnames(new)
+        want <- spp.names[want]
+        new[, want, drop = FALSE]
+    }
     if(missing(newdata))
         return(fitted(object))
     ## store names of new samples
@@ -10,28 +15,165 @@
     if (missing(CV))
         CV <- "none"
     CV <- match.arg(CV)
-    Np <- NROW(newdata)
-    B <- coef(object)
+    Nnew <- NROW(newdata)
+    N <- nrow(object$data$x)
+    M <- ncol(object$data$x)
+
+    ind <- seq_len(N) ## indicator for samples
+
+    ## extract the training data & transformation fun
+    trainX <- object$data$x
+    trainY <- object$data$y
+    tranFun <- object$tranFun
+
     if(identical(CV, "none")) {
-        want <- (spp.names <- colnames(object$data$x)) %in% colnames(newdata)
-        want <- spp.names[want]
-        newdata <- newdata[, want, drop = FALSE]
+        B <- coef(object)
+        newdata <- takeData(object, newdata)
         ## apply transformation to newdata
         tf <- object$tranFun(newdata)
         newdata <- tf$data
         ## do predictions
         ## matrix of predictions
-        pred <- matrix(ncol = length(ncomp), nrow = Np)
-        for(j in seq_along(ncomp)) {
-            comp <- ncomp[j]
-            B0 <- object$yMean - object$xMeans %*% B[, comp]
-            pred[, j] <- newdata %*% B[, comp] + rep(B0, Np)
+        pred <- matrix(ncol = ncomp, nrow = Nnew)
+        for(j in seq_len(ncomp)) {
+            B0 <- object$yMean - object$xMeans %*% B[, j]
+            pred[, j] <- newdata %*% B[, j] + rep(B0, Nnew)
         }
-    } else {
-        stop("Other methods of crossvalidation not yet implemented")
+    } else if (identical(CV, "LOO")) {
+        nr <- N-1 ## for LOO
+        M <- ncol(object$data$x)
+        ## form ncomp, as LOO we have potentially 1 less component than usual
+        ncomp <- if(missing(ncomp)) {
+            min(nr - 1, M) ## uses nr which already has 1 removed
+        } else {
+            if(ncomp < 1 || ncomp > (newcomp <- min(nr - 1, M))) {
+                warning("'ncomp' inappropriate for LOO CV.
+Resetting to max possible.")
+                newcomp
+            } else {
+                ncomp
+            }
+        }
+
+        pred <- array(NA, dim = c(Nnew, ncomp, N))
+        comps <- seq_len(ncomp)
+
+        for (i in seq_len(N)) {
+            ## which samples are in the training set
+            loo <- ind[-i]
+
+            ## do the heavy lifting
+            pred[, , i] <-
+                pcrCVfit(trainX, trainY, tf = tranFun, newdata,
+                         parray = pred[, , i], take = loo,
+                         N = nr, Nnew = Nnew, M = M, ncomp = ncomp,
+                         comps = comps)
+        }
+        fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+        ## other computations on `pred` needed for SEs etc
+        pred <- fitted
+    } else if (identical(CV, "bootstrap")) {
+        ## form ncomp, as if this was a standard training set setting.
+        maxComp <- min(N - 1, M)
+        ncomp <- if(missing(ncomp)) {
+            maxComp
+        } else {
+            if(ncomp < 1 || ncomp > maxComp) {
+                warning("'ncomp' inappropriate for bootstrap CV.
+Resetting to max possible.")
+                maxComp
+            } else {
+                ncomp
+            }
+        }
+
+        pred <- array(NA, dim = c(Nnew, ncomp, nboot))
+        comps <- seq_len(ncomp)
+
+        for (i in seq_len(nboot)) {
+            bSamp <- sample.int(N, N, replace = TRUE)
+
+            ## do the heavy lifting
+            pred[, , i] <-
+                pcrCVfit(trainX, trainY, tf = tranFun, newdata,
+                         parray = pred[, , i], take = bSamp,
+                         N = N, Nnew = Nnew, M = M, ncomp = ncomp,
+                         comps = comps)
+        }
+        fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+        ## other computations on `pred` needed for SEs etc
+        pred <- fitted
+    } else if (identical(CV, "kfold")) {
+        ## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
+        maxComp <- min((N - ceiling(N / kfold)) - 1, M)
+        ncomp <- if(missing(ncomp)) {
+            maxComp## uses nr which already has 1 removed
+        } else {
+            if(ncomp < 1 || ncomp > maxComp) {
+                warning("'ncomp' inappropriate for k-fold CV.
+Resetting to max possible.")
+                maxComp
+            } else {
+                ncomp
+            }
+        }
+
+        comps <- seq_len(ncomp)
+
+        ## array for prediction
+        pred <- array(NA, dim = c(Nnew, ncomp, folds, kfold))
+        ind <- rep(seq_len(kfold), length = N) ## k-fold group indicator
+
+        ## this is the n in n k-fold CV, allowing n repeated k-folds
+        ##ii <- 0
+        for(i in seq_len(folds)) {
+            ## do a k-fold CV
+            pind <- ind[sample.int(N, N, replace = FALSE)]
+            ## the main k-fold CV loop
+            for(k in seq_len(kfold)) {
+                ##ii <- ii + 1
+                kSamp <- pind != k
+                Nk <- sum(kSamp) ## number of samples in training set
+                kSamp <- which(kSamp)
+
+                ## do the heavy lifting
+                pred[, , i, k] <-
+                    pcrCVfit(trainX, trainY, tf = tranFun,
+                             newdata, parray = pred[, , i, k],
+                             take = kSamp, N = Nk, Nnew = Nnew,
+                             M = M, ncomp = ncomp, comps = comps)
+            }
+        }
+        fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+        ## other computations on `pred` needed for SEs etc
+        pred <- fitted
+    }else {
+        stop("Unknown crossvalidation method.")
     }
     rownames(pred) <- newSamp
-    colnames(pred) <- paste0("PC", ncomp)
+    colnames(pred) <- paste0("PC", seq_len(ncomp))
     pred
 }
 
+## take is a vector of indices for samples to include in the training
+## set during prediction
+`pcrCVfit` <- function(X, Y, tf, newdata, parray, take, N, Nnew, M,
+                       ncomp, comps) {
+    ## apply transformation to training data
+    TRAN <- tf(X[take, , drop = FALSE])
+    X <- TRAN$data
+    ## apply transformation to newdata, using parms from above
+    Xnew <- tf(newdata, apply = TRUE, parms = TRAN$parms)$data
+    ## centre the training data
+    Xbar <- colMeans(X)
+    ybar <- mean(Y[take])
+    X <- sweep(X, 2, Xbar, "-")
+    Y <- Y[take] - ybar
+    ## fit model to subset
+    FIT <- analogue:::fitPCR(X = X, Y = Y, ncomp = ncomp, n = N, m = M)
+    for(j in comps) {
+        B0 <- ybar - Xbar %*% FIT$B[, j]
+        parray[ , j] <- Xnew %*% FIT$B[, j] + rep(B0, Nnew)
+    }
+    parray
+}

Modified: pkg/man/predict.pcr.Rd
===================================================================
--- pkg/man/predict.pcr.Rd	2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/man/predict.pcr.Rd	2013-12-11 05:11:28 UTC (rev 388)
@@ -5,14 +5,15 @@
 
 \description{
   Calculates predicted values from a fitted principal components
-  regression model. Leave-one-out, bootstrap of n-fold crossvalidated
-  predictions are also intended (but not yet implemented).
+  regression model. Leave-one-out, bootstrap or n k-fold crossvalidated
+  predictions are also implemented.
 }
 
 \usage{
 \method{predict}{pcr}(object, newdata, ncomp = seq_along(object$ncomp),
-        CV = c("none", "LOO", "bootstrap", "nfold"),
-        verbose = FALSE, nboot = 100, nfold = 5, \dots)
+        CV = c("none", "LOO", "bootstrap", "kfold"),
+        verbose = FALSE, nboot = 100, kfold = 10, folds = 5,
+        \dots)
 }
 
 \arguments{
@@ -21,16 +22,15 @@
   \item{newdata}{data frame of new observations for which predictions
     are sought.}
   \item{ncomp}{numeric; the PCR components for which predictions are
-    sought. Can be a vector in which case predictions for multiple
-    components are computed.}
+    sought. If \code{ncomp = c}, predictions for components \code{1:c}
+    are produced.}
   \item{CV}{character; the type of crossvalidation required. Currently,
     no crossvalidation methods are implemented.}
   \item{verbose}{logical; should progress on crossvalidation be printed
     to the console?}
-  \item{nboot}{numeric; the number of bootstrap samples to draw, or in
-    the case of \code{CV = "nfold"} the number of repeats of n-fold CV
-    to perform.}
-  \item{nfold}{numeric; the number of folds to split data into.}
+  \item{nboot}{numeric; the number of bootstrap samples to draw.}
+  \item{kfold}{numeric; the number of folds to split data into.}
+  \item{folds}{numeric; the number of repetitions of k-fold CV.}
   \item{\dots}{arguments passed to other methods.}
 }
 
@@ -63,7 +63,12 @@
 mod <- pcr(ImbrieKipp[-take, ], SumSST[-take], tranFun = Hellinger)
 
 ## predictions
-predict(mod, ImbrieKipp[take, ], ncomp = 1:4)
+predict(mod, ImbrieKipp[take, ], ncomp = 4)
 
+## predictions
+set.seed(123)
+predict(mod, ImbrieKipp[take, ], ncomp = 4, CV = "bootstrap",
+        nboot = 100)
+
 }
 \keyword{methods}
\ No newline at end of file