[Analogue-commits] r388 - in pkg: R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Dec 11 06:11:28 CET 2013
Author: gsimpson
Date: 2013-12-11 06:11:28 +0100 (Wed, 11 Dec 2013)
New Revision: 388
Modified:
pkg/R/crossval.pcr.R
pkg/R/predict.pcr.R
pkg/man/predict.pcr.Rd
Log:
updated CV methods
Modified: pkg/R/crossval.pcr.R
===================================================================
--- pkg/R/crossval.pcr.R 2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/R/crossval.pcr.R 2013-12-11 05:11:28 UTC (rev 388)
@@ -21,6 +21,8 @@
if(ncomp < 1 || ncomp > (newcomp <- min(nr - 1, M))) {
warning("'ncomp' inappropriate for LOO CV. Resetting to max possible.")
newcomp
+ } else {
+ ncomp
}
}
## matrix of predictions
@@ -54,7 +56,6 @@
pred[i, j] <- Xi %*% FIT$B[, j, drop = FALSE] + B0
}
}
- ##pred <- rowMeans(pred, na.rm = TRUE)
}
if(identical(method, "kfold")) {
## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
@@ -65,6 +66,8 @@
if(ncomp < 1 || ncomp > maxComp) {
warning("'ncomp' inappropriate for k-fold CV. Resetting to max possible.")
maxComp
+ } else {
+ ncomp
}
}
pred <- array(NA, dim = c(N, ncomp, folds))
@@ -108,17 +111,18 @@
}
}
}
- pred <- rowMeans(pred, na.rm = TRUE, dims = 2)
}
if(identical(method, "bootstrap")) {
- ## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
+ ## form ncomp, as if this was a standard training set setting.
maxComp <- min(N - 1, M)
ncomp <- if(missing(ncomp)) {
- maxComp## uses nr which already has 1 removed
+ maxComp
} else {
if(ncomp < 1 || ncomp > maxComp) {
warning("'ncomp' inappropriate for bootstrap CV. Resetting to max possible.")
maxComp
+ } else {
+ ncomp
}
}
pred <- array(NA, dim = c(N, ncomp, nboot))
@@ -136,28 +140,78 @@
bSamp <- sample.int(N, N, replace = TRUE)
sel <- which(!ind %in% bSamp) ## need indices!!!
N.oob <- NROW(x[sel, , drop = FALSE])
- N.mod <- N - N.oob ## not sure I need this
+ ##N.mod <- N - N.oob ## not sure I need this
## apply transformation to X[-sel, ]
- TRAN <- obj$tranFun(x[-sel, , drop = FALSE])
+ TRAN <- obj$tranFun(x[bSamp, , drop = FALSE])
X <- TRAN$data
## apply transformation to OOB samples, using parms from above
Xi <- obj$tranFun(x[sel, , drop = FALSE], apply = TRUE,
parms = TRAN$parms)$data
## centre the training data
Xbar <- colMeans(X)
- ybar <- mean(y[-sel])
+ ybar <- mean(y[bSamp])
X <- sweep(X, 2, Xbar, "-")
- Y <- y[-sel] - ybar
+ Y <- y[bSamp] - ybar
## fit model to subset
- FIT <- fitPCR(X = X, Y = Y, ncomp = ncomp, n = N.mod, m = M)
+ FIT <- fitPCR(X = X, Y = Y, ncomp = ncomp, n = N, m = M)
## predict for 1:ncomps components
for(j in nc) {
B0 <- obj$yMean - obj$xMeans %*% FIT$B[, j, drop = FALSE]
- pred[sel, j, i] <- Xi %*% FIT$B[, j, drop = FALSE] + rep(B0, N.oob)
+ pred[sel, j, i] <- Xi %*% FIT$B[, j, drop = FALSE] +
+ rep(B0, N.oob)
}
}
- pred <- rowMeans(pred, na.rm = TRUE, dims = 2)
}
- class(pred) <- "crossval"
- pred
+
+ ## fitted values and derived stats
+ if(identical(method, "none")) {
+ fitted <- pred
+ } else if(method %in% c("","")) {
+ rowMeans(pred, na.rm = TRUE, dims = 2)
+ } else {
+ fitted <- rowMeans(pred)
+ }
+ residuals <- y - fitted ## residuals
+ maxBias <- apply(residuals, 2, maxBias, y, n = 10) ## maxBias
+ avgBias <- colMeans(residuals) ## avgBias
+ r2 <- apply(fitted, cor, y)
+ ## s1 & s2 components for model and training set
+ ns <- rowSums(!is.na(pred), dims = 2)
+ s1.train <- t(sqrt(rowSums((pred - as.vector(fitted))^2,
+ na.rm = TRUE, dims = 2) / as.vector(ns - 1)))
+ s1 <- sqrt(colMeans(s1.train^2))
+ s2 <- sqrt(colMeans(residuals^2, na.rm = TRUE))
+ s2.train <- sweep(pred, c(2,1), y, "-")
+ s2.train <- sqrt(t(rowMeans(s2.train^2, na.rm = TRUE, dims = 2)))
+
+ ## RMSEP
+ rmsep.train <- sqrt(s1.train^2 + s2.train^2)
+ rmsep <- sqrt(s1^2 + s2^2)
+ rmsep2 <- sqrt(mean(residuals^2))
+ performance <- data.frame(comp = ncomp,
+ R2 = r2,
+ avgBias = avgBias,
+ maxBias = maxBias,
+ RMSEP = rmsep,
+ RMSEP2 = rmsep2,
+ s1 = s1,
+ s2 = s2)
+
+ ## more additions to the call
+ .call <- match.call()
+ .call[[1]] <- as.name("crossval")
+
+ ## return object
+ out <- list(fitted.values = fitted,
+ residuals = residuals,
+ rmsep = rmsep.train,
+ s1 = s1.train,
+ s2 = s2.train,
+ performance = performance,
+ call = .call,
+ CVparams = list(method = method, nboot = nboot,
+ nfold = nfold, folds = folds))
+
+ class(out) <- "crossval"
+ out
}
Modified: pkg/R/predict.pcr.R
===================================================================
--- pkg/R/predict.pcr.R 2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/R/predict.pcr.R 2013-12-11 05:11:28 UTC (rev 388)
@@ -1,7 +1,12 @@
`predict.pcr` <- function(object, newdata, ncomp = seq_along(object$ncomp),
- CV = c("none", "LOO", "bootstrap", "nfold"),
- verbose = FALSE, nboot = 100, nfold = 5,
- ...) {
+ CV = c("none", "LOO", "bootstrap", "kfold"),
+ verbose = FALSE, nboot = 100, kfold = 10,
+ folds = 5, ...) {
+ takeData <- function(x, new) {
+ want <- (spp.names <- colnames(x$data$x)) %in% colnames(new)
+ want <- spp.names[want]
+ new[, want, drop = FALSE]
+ }
if(missing(newdata))
return(fitted(object))
## store names of new samples
@@ -10,28 +15,165 @@
if (missing(CV))
CV <- "none"
CV <- match.arg(CV)
- Np <- NROW(newdata)
- B <- coef(object)
+ Nnew <- NROW(newdata)
+ N <- nrow(object$data$x)
+ M <- ncol(object$data$x)
+
+ ind <- seq_len(N) ## indicator for samples
+
+ ## extract the training data & transformation fun
+ trainX <- object$data$x
+ trainY <- object$data$y
+ tranFun <- object$tranFun
+
if(identical(CV, "none")) {
- want <- (spp.names <- colnames(object$data$x)) %in% colnames(newdata)
- want <- spp.names[want]
- newdata <- newdata[, want, drop = FALSE]
+ B <- coef(object)
+ newdata <- takeData(object, newdata)
## apply transformation to newdata
tf <- object$tranFun(newdata)
newdata <- tf$data
## do predictions
## matrix of predictions
- pred <- matrix(ncol = length(ncomp), nrow = Np)
- for(j in seq_along(ncomp)) {
- comp <- ncomp[j]
- B0 <- object$yMean - object$xMeans %*% B[, comp]
- pred[, j] <- newdata %*% B[, comp] + rep(B0, Np)
+ pred <- matrix(ncol = ncomp, nrow = Nnew)
+ for(j in seq_len(ncomp)) {
+ B0 <- object$yMean - object$xMeans %*% B[, j]
+ pred[, j] <- newdata %*% B[, j] + rep(B0, Nnew)
}
- } else {
- stop("Other methods of crossvalidation not yet implemented")
+ } else if (identical(CV, "LOO")) {
+ nr <- N-1 ## for LOO
+ M <- ncol(object$data$x)
+ ## form ncomp, as LOO we have potentially 1 less component than usual
+ ncomp <- if(missing(ncomp)) {
+ min(nr - 1, M) ## uses nr which already has 1 removed
+ } else {
+ if(ncomp < 1 || ncomp > (newcomp <- min(nr - 1, M))) {
+ warning("'ncomp' inappropriate for LOO CV.
+Resetting to max possible.")
+ newcomp
+ } else {
+ ncomp
+ }
+ }
+
+ pred <- array(NA, dim = c(Nnew, ncomp, N))
+ comps <- seq_len(ncomp)
+
+ for (i in seq_len(N)) {
+ ## which samples are in the training set
+ loo <- ind[-i]
+
+ ## do the heavy lifting
+ pred[, , i] <-
+ pcrCVfit(trainX, trainY, tf = tranFun, newdata,
+ parray = pred[, , i], take = loo,
+ N = nr, Nnew = Nnew, M = M, ncomp = ncomp,
+ comps = comps)
+ }
+ fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+ ## other computations on `pred` needed for SEs etc
+ pred <- fitted
+ } else if (identical(CV, "bootstrap")) {
+ ## form ncomp, as if this was a standard training set setting.
+ maxComp <- min(N - 1, M)
+ ncomp <- if(missing(ncomp)) {
+ maxComp
+ } else {
+ if(ncomp < 1 || ncomp > maxComp) {
+ warning("'ncomp' inappropriate for bootstrap CV.
+Resetting to max possible.")
+ maxComp
+ } else {
+ ncomp
+ }
+ }
+
+ pred <- array(NA, dim = c(Nnew, ncomp, nboot))
+ comps <- seq_len(ncomp)
+
+ for (i in seq_len(nboot)) {
+ bSamp <- sample.int(N, N, replace = TRUE)
+
+ ## do the heavy lifting
+ pred[, , i] <-
+ pcrCVfit(trainX, trainY, tf = tranFun, newdata,
+ parray = pred[, , i], take = bSamp,
+ N = N, Nnew = Nnew, M = M, ncomp = ncomp,
+ comps = comps)
+ }
+ fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+ ## other computations on `pred` needed for SEs etc
+ pred <- fitted
+ } else if (identical(CV, "kfold")) {
+ ## form ncomp, as k-fold we have ceiling(N / nfold) fewer sites
+ maxComp <- min((N - ceiling(N / kfold)) - 1, M)
+ ncomp <- if(missing(ncomp)) {
+ maxComp## uses nr which already has 1 removed
+ } else {
+ if(ncomp < 1 || ncomp > maxComp) {
+ warning("'ncomp' inappropriate for k-fold CV.
+Resetting to max possible.")
+ maxComp
+ } else {
+ ncomp
+ }
+ }
+
+ comps <- seq_len(ncomp)
+
+ ## array for prediction
+ pred <- array(NA, dim = c(Nnew, ncomp, folds, kfold))
+ ind <- rep(seq_len(kfold), length = N) ## k-fold group indicator
+
+ ## this is the n in n k-fold CV, allowing n repeated k-folds
+ ##ii <- 0
+ for(i in seq_len(folds)) {
+ ## do a k-fold CV
+ pind <- ind[sample.int(N, N, replace = FALSE)]
+ ## the main k-fold CV loop
+ for(k in seq_len(kfold)) {
+ ##ii <- ii + 1
+ kSamp <- pind != k
+ Nk <- sum(kSamp) ## number of samples in training set
+ kSamp <- which(kSamp)
+
+ ## do the heavy lifting
+ pred[, , i, k] <-
+ pcrCVfit(trainX, trainY, tf = tranFun,
+ newdata, parray = pred[, , i, k],
+ take = kSamp, N = Nk, Nnew = Nnew,
+ M = M, ncomp = ncomp, comps = comps)
+ }
+ }
+ fitted <- rowMeans(pred, na.rm = TRUE, dims = 2)
+ ## other computations on `pred` needed for SEs etc
+ pred <- fitted
+ }else {
+ stop("Unknown crossvalidation method.")
}
rownames(pred) <- newSamp
- colnames(pred) <- paste0("PC", ncomp)
+ colnames(pred) <- paste0("PC", seq_len(ncomp))
pred
}
+## take is a vector of indices for samples to include in the training
+## set during prediction
+`pcrCVfit` <- function(X, Y, tf, newdata, parray, take, N, Nnew, M,
+ ncomp, comps) {
+ ## apply transformation to training data
+ TRAN <- tf(X[take, , drop = FALSE])
+ X <- TRAN$data
+ ## apply transformation to newdata, using parms from above
+ Xnew <- tf(newdata, apply = TRUE, parms = TRAN$parms)$data
+ ## centre the training data
+ Xbar <- colMeans(X)
+ ybar <- mean(Y[take])
+ X <- sweep(X, 2, Xbar, "-")
+ Y <- Y[take] - ybar
+ ## fit model to subset
+ FIT <- analogue:::fitPCR(X = X, Y = Y, ncomp = ncomp, n = N, m = M)
+ for(j in comps) {
+ B0 <- ybar - Xbar %*% FIT$B[, j]
+ parray[ , j] <- Xnew %*% FIT$B[, j] + rep(B0, Nnew)
+ }
+ parray
+}
Modified: pkg/man/predict.pcr.Rd
===================================================================
--- pkg/man/predict.pcr.Rd 2013-11-27 15:45:48 UTC (rev 387)
+++ pkg/man/predict.pcr.Rd 2013-12-11 05:11:28 UTC (rev 388)
@@ -5,14 +5,15 @@
\description{
Calculates predicted values from a fitted principal components
- regression model. Leave-one-out, bootstrap of n-fold crossvalidated
- predictions are also intended (but not yet implemented).
+ regression model. Leave-one-out, bootstrap or n k-fold crossvalidated
+ predictions are also implemented.
}
\usage{
\method{predict}{pcr}(object, newdata, ncomp = seq_along(object$ncomp),
- CV = c("none", "LOO", "bootstrap", "nfold"),
- verbose = FALSE, nboot = 100, nfold = 5, \dots)
+ CV = c("none", "LOO", "bootstrap", "kfold"),
+ verbose = FALSE, nboot = 100, kfold = 10, folds = 5,
+ \dots)
}
\arguments{
@@ -21,16 +22,15 @@
\item{newdata}{data frame of new observations for which predictions
are sought.}
\item{ncomp}{numeric; the PCR components for which predictions are
- sought. Can be a vector in which case predictions for multiple
- components are computed.}
+ sought. If \code{ncomp = c}, predictions for components \code{1:c}
+ are produced.}
\item{CV}{character; the type of crossvalidation required. Currently,
no crossvalidation methods are implemented.}
\item{verbose}{logical; should progress on crossvalidation be printed
to the console?}
- \item{nboot}{numeric; the number of bootstrap samples to draw, or in
- the case of \code{CV = "nfold"} the number of repeats of n-fold CV
- to perform.}
- \item{nfold}{numeric; the number of folds to split data into.}
+ \item{nboot}{numeric; the number of bootstrap samples to draw.}
+ \item{kfold}{numeric; the number of folds to split data into.}
+ \item{folds}{numeric; the number of repetitions of k-fold CV.}
\item{\dots}{arguments passed to other methods.}
}
@@ -63,7 +63,12 @@
mod <- pcr(ImbrieKipp[-take, ], SumSST[-take], tranFun = Hellinger)
## predictions
-predict(mod, ImbrieKipp[take, ], ncomp = 1:4)
+predict(mod, ImbrieKipp[take, ], ncomp = 4)
+## predictions
+set.seed(123)
+predict(mod, ImbrieKipp[take, ], ncomp = 4, CV = "bootstrap",
+ nboot = 100)
+
}
\keyword{methods}
\ No newline at end of file
More information about the Analogue-commits
mailing list