[Coxflexboost-commits] r11 - in pkg: R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Mar 15 19:47:14 CET 2009
Author: hofner
Date: 2009-03-15 19:47:14 +0100 (Sun, 15 Mar 2009)
New Revision: 11
Added:
pkg/R/crossvalidation.R
pkg/man/crossvalidation.Rd
Modified:
pkg/R/cfboost.R
pkg/R/methods.R
pkg/R/mstop.R
pkg/man/CoxFlexBoost-package.Rd
pkg/man/boost_control.Rd
pkg/man/mstop.Rd
pkg/man/risk.Rd
Log:
added function cv for cross validation
Modified: pkg/R/cfboost.R
===================================================================
--- pkg/R/cfboost.R 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/cfboost.R 2009-03-15 18:47:14 UTC (rev 11)
@@ -11,8 +11,9 @@
cfboost.formula <- function(formula, data = list(), weights = NULL, na.action = na.omit, control = boost_control(), ...) {
## construct design matrix etc.
object <- boost_dpp(formula, data, weights, na.action)
- ## fit the ensemble
object$input <- object$menv at get("input")
+ object$data <- data
+ object$formula <- formula
if (!is.null(weights))
object$oob$input <- object$oob$menv at get("input")
RET <- cfboost_fit(object, control = control, data = data, weights = weights, ...)
@@ -65,7 +66,7 @@
fit <- fit_oob <- offset <- getoffset(y, which.offset)
if (trace)
- cat("Offset: ", offset, "\n")
+ cat("Offset: ", offset, "\n\n")
mstart <- 1
hSi <- 1 # number of iterations in the repeat loop
@@ -73,14 +74,14 @@
## compute df2lambda which depends on the offset and on y
if (trace)
- cat("compute df2lambda .")
+ cat("Compute df2lambda .")
for (i in 1:length(x)){
if (!is.null( attr(x[[i]], "df"))){
attr(x[[i]], "df2lambda")(y, offset)
if (trace) cat(".")
}
}
- if (trace) cat("\n")
+ if (trace) cat("\n\n")
##################################
#### start boosting iteration ####
@@ -88,7 +89,7 @@
repeat{
for (m in mstart:mstop) {
if (trace)
- cat("Step ", m, "; Progress .")
+ cat("Step ", m, "; Progress .", sep="")
## fit MLE component-wise
for (i in 1:length(x)) {
@@ -141,7 +142,7 @@
fit_oob <- fit_oob + nu * oob$x[[xselect]] %*% coefs[[xselect]]
}
}
- if (hardStop | risk != "oobag" | which.min(mrisk) < (mstop - mstop * 0.2/hSi) | hSi == 5) break
+ if (hardStop || risk != "oobag" || which.min(mrisk) < (mstop - mstop * 0.2/hSi) || hSi == 5) break
## else if minimum is at the end of the boosting algorithm,
## don't stop but proceed with boosting: Therefore, increase mstop
Added: pkg/R/crossvalidation.R
===================================================================
--- pkg/R/crossvalidation.R (rev 0)
+++ pkg/R/crossvalidation.R 2009-03-15 18:47:14 UTC (rev 11)
@@ -0,0 +1,69 @@
+cv <- function(object, ...)
+ UseMethod("cv")
+
+cv.cfboost <- function(object, folds, grid = c(1:mstop(object, opt=FALSE)), ...){
+
+ oobrisk <- matrix(0, nrow = ncol(folds), ncol = length(grid))
+
+ ctrl <- object$control
+
+ ctrl$risk <- "oobag"
+ # fehlt noch: ctrl$savedata <- FALSE
+ # fehlt noch: ctrl$saveensss <- FALSE
+
+ if (is.null(object$data))
+ stop(sQuote("object"), " does not contain data. Estimate model with option ", sQuote("savedata = TRUE"))
+
+ call <- deparse(object$call)
+ data <- object$data$data
+ formula <- object$data$formula
+
+ ## free memory
+ rm("object")
+
+ i <- 0
+
+ dummyfct <- function(weights, control, data, formula, grid){
+ i <<- i + 1
+ cat("\n>>> Fold ", i, "\n\n")
+ model <- cfboost(formula, data = data, control = control, weights = weights)
+ ret <- risk(model)[grid]
+ rm("model")
+ ret
+ }
+
+ oobrisk <- apply(folds, 2, dummyfct, control = ctrl, data = data, formula = formula, grid = grid)
+ oobrisk <- t(oobrisk)
+ oobrisk <- oobrisk/colSums(folds == 0)
+ colnames(oobrisk) <- grid
+ rownames(oobrisk) <- 1:nrow(oobrisk)
+ attr(oobrisk, "call") <- call
+ attr(oobrisk, "mstop") <- grid
+ attr(oobrisk, "risk") <- "empirical risk (neg. log likelihood)"
+ class(oobrisk) <- "cv"
+ oobrisk
+}
+
+
+print.cv <- function(x, ...) {
+ cat("\n\t Cross-validated risk \n\t Call:",
+ attr(x, "call"), "\n\n")
+ print(colMeans(x))
+ cat("\n\t Optimal number of boosting iterations:", mstop(x), "\n")
+ return(invisible(x))
+}
+
+plot.cv <- function(x, ylab = attr(x, "risk"), ylim = range(x),
+ main = attr(x, "call"), ...) {
+
+ cm <- colMeans(x)
+ plot(1:ncol(x), cm, ylab = ylab, ylim = ylim,
+ type = "n", lwd = 2,
+ xlab = "Number of boosting iterations",
+ main = main, ...)
+ out <- apply(x, 1, function(y) lines(1:ncol(x),y, col = "lightgrey"))
+ rm(out)
+ ms <- which.min(cm)
+ lines(c(ms, ms), c(min(c(0, ylim[1] * ifelse(ylim[1] < 0, 2, 0.5))), cm[ms]), lty = 2)
+ lines(1:ncol(x), cm, type = "l")
+}
Property changes on: pkg/R/crossvalidation.R
___________________________________________________________________
Name: svn:executable
+ *
Modified: pkg/R/methods.R
===================================================================
--- pkg/R/methods.R 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/methods.R 2009-03-15 18:47:14 UTC (rev 11)
@@ -37,7 +37,7 @@
print.cfboost <- function(x, ...) {
cat("\n")
cat("\t CoxFlexBoost: \n")
- cat("\t Additive Survival Models with Time-Varying Effects\n")
+ cat("\t Structured Survival Models (with Time-Varying Effects)\n")
cat("\t Fitted via Likelihood-Based Boosting\n")
cat("\n")
if (!is.null(x$call))
Modified: pkg/R/mstop.R
===================================================================
--- pkg/R/mstop.R 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/mstop.R 2009-03-15 18:47:14 UTC (rev 11)
@@ -16,3 +16,7 @@
return(mr$iteration)
}
}
+
+mstop.cv <- function(object, ...)
+ attr(object, "mstop")[which.min(colSums(object))]
+
Modified: pkg/man/CoxFlexBoost-package.Rd
===================================================================
--- pkg/man/CoxFlexBoost-package.Rd 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/CoxFlexBoost-package.Rd 2009-03-15 18:47:14 UTC (rev 11)
@@ -27,10 +27,12 @@
}
\seealso{
- Important functions include: \code{\link{rSurvTime}} for simulations,
+ Important functions include: \code{\link{rSurvTime}} for simulations
+ of survival times,
\code{\link{cfboost}} for the boosting algorithm, \code{\link{bols}}
and \code{\link{bbs}} for the base-learners and
- \code{\link[CoxFlexBoost]{methods}} for methods to be used with results.
+ \code{\link[CoxFlexBoost]{methods}} for methods to be used with
+ results.
}
\references{
Modified: pkg/man/boost_control.Rd
===================================================================
--- pkg/man/boost_control.Rd 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/boost_control.Rd 2009-03-15 18:47:14 UTC (rev 11)
@@ -16,10 +16,7 @@
\item{risk}{ character. Determines how the empirical risk should be
computed. It can take the values \code{risk = "inbag"}, i.e., the risk is
computed for the learning sample, \code{risk = "oobag"}, i.e., the risk is
- computed for the validation sample and \code{risk = "none"}. At the moment
- it is only reasonable to use the option \code{risk = "oobag"} as no other
- means of determining the (optimal) stopping iteration is implemented
- so far.}
+ computed for the validation sample and \code{risk = "none"}.}
\item{which.offset}{ character. Indictating the choice of offset. This
can be either the maximum likelihood estimator of the null model
(\code{which.offset = "mle"}) or zero (\code{which.offset = "zero"}).}
Added: pkg/man/crossvalidation.Rd
===================================================================
--- pkg/man/crossvalidation.Rd (rev 0)
+++ pkg/man/crossvalidation.Rd 2009-03-15 18:47:14 UTC (rev 11)
@@ -0,0 +1,107 @@
+\name{crossvalidation}
+\alias{cv}
+\alias{cv.cfboost}
+\alias{print.cv}
+\alias{plot.cv}
+
+\title{ Cross-Validation}
+\description{
+ Function for the estimation of \code{\link{mstop}} via
+ cross-validation and (generic) functions to print or plot the resuls.
+}
+\usage{
+
+\method{cv}{cfboost}(object, folds,
+ grid = c(1:mstop(object, opt=FALSE)), ...)
+\method{print}{cv}(x, ...)
+\method{plot}{cv}(x, ylab = attr(x, "risk"), ylim = range(x),
+ main = attr(x, "call"), ...)
+}
+\arguments{
+ \item{object}{ an object of class \code{\link{cfboost}}.}
+ \item{folds}{ a weight matrix with number of rows equal to the number
+ of observations. The number of columns corresponds to
+ the number of cross-validation runs.}
+ \item{grid}{ a vector of iterations the empirical risk
+ is to be evaluated for. Per default the empirical
+ risks for all iterations \code{1:mstop} are computed and returned.}
+ \item{x}{ an object of class \code{cv} }
+ \item{ylab}{ A title for the y axis. }
+ \item{ylim}{ the y limits of the plot. }
+ \item{main}{ the main title of the plot. }
+ \item{\dots}{ additional arguments to be passed to \code{\link{plot}}. }
+}
+\details{
+ The number of boosting iterations is a hyper-parameter of the
+ boosting algorithms. Cross-validated estimates of the empirical risk
+ for different values of \code{mstop} (as given by
+ \code{\link{grid}}) are computed, which are used to choose the
+ appropriate number of boosting iterations to be applied.
+
+ Different forms of cross-validation can be applied, for example,
+ 5-fold or 10-fold cross-validation. Bootstrapping is not implemented
+ so far. The \code{weights} are defined via the \code{folds} matrix.
+ A.t.m. they can only be used to specify a learning
+ sample which consists of observations with \code{weights == 1} and
+ and an out-of-bag sample with \code{weights == 0}. The latter
+ is used to determine the empirical risk (negative log likelihood).
+}
+\value{
+ \code{cv} returns an object of class \code{cv}, which consists of
+ a matrix of empirical risks and some further attributes.
+}
+\seealso{ \code{\link{cfboost}} for model fitting.
+ See \code{\link{risk}} for methods to extract the inbag and
+ out-of-bag risk and \code{\link{mstop}} for functions to
+ extract the (optimal) stopping iteration (based on cross-validation
+ or on the inbag and out-of-bag risk).}
+\examples{
+## fit a model with all observations first
+
+\dontrun{
+## (as this takes some minutes)
+set.seed(1234)
+## sample covariates first
+X <- matrix(NA, nrow=400, ncol=3)
+X[,1] <- runif(400, -1, 1)
+X[,2] <- runif(400, -1, 1)
+X[,3] <- runif(400, -1, 1)
+
+## time-dependent hazard rate
+lambda <- function(time, x){
+ exp(0 * time + 0.7 * x[1] + x[2]^2)
+}
+
+## specify censoring function
+cens_fct <- function(time, mean_cens){
+ censor_time <- rexp(n = length(time), rate = 1/mean_cens)
+ event <- (time <= censor_time)
+ t_obs <- apply(cbind(time, censor_time), 1, min)
+ return(cbind(t_obs, event))
+}
+daten <- rSurvTime(lambda, X, cens_fct, mean_cens = 5)
+
+ctrl <- boost_control( mstop = 100, risk="none")
+## fit (a simple) model
+model <- cfboost(Surv(time, event) ~ bbs(x.1) + bbs(x.2) + bbs(x.3),
+ control = ctrl, data = daten)
+}
+
+## 5 -fold cross-validation
+
+\dontrun{
+## (as this takes some minutes)
+n <- nrow(daten)
+k <- 5
+ntest <- floor(n / k)
+cv5f <- matrix(c(rep(c(rep(0, ntest), rep(1, n)), k - 1),
+ rep(0, n * k - (k - 1) * (n + ntest))), nrow = n)
+cvm <- cv(model, folds = cv5f)
+print(cvm)
+plot(cvm)
+mstop(cvm)
+}
+}
+
+\keyword{ misc }
+\keyword{ methods }
Modified: pkg/man/mstop.Rd
===================================================================
--- pkg/man/mstop.Rd 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/mstop.Rd 2009-03-15 18:47:14 UTC (rev 11)
@@ -1,6 +1,7 @@
\name{mstop}
\alias{mstop}
\alias{mstop.cfboost}
+\alias{mstop.cv}
\title{ Function to Extract (Optimal) Stopping Iteration }
\description{
@@ -10,37 +11,47 @@
\usage{
mstop(object, ...)
\method{mstop}{cfboost}(object, opt = TRUE, ...)
+\method{mstop}{cv}(object, ...)
}
\arguments{
- \item{object}{ object of class \code{cfboost}. }
+ \item{object}{ object of class \code{cfboost} of of class \code{cv}. }
\item{opt}{ logic. If \code{opt = FALSE} the pre-defined stopping
iteration is returned, if \code{opt = TRUE} the optimal stopping
iteration (in the given range of iterations) is returned. }
\item{\dots}{ (not used a.t.m.) }
}
\details{
- The calculated risk is negative maximum log likelihood for each
- boosting step. The function \code{mstop} can be used to extract the
- number of boosting iterations that were performed (\code{opt = FALSE})
- or the optimal number of boosting iterations \code{opt = TRUE} with
- respect to the in-bag risk or out-of-bag risk. In the first case, the
- risk is computed on the learning sample, in the latter case it is
- computed on the validation sample. The samples can be specified by
- \code{weights} that are given to \code{\link{cfboost}}. The risk that
- is to be computed is specified with the call to \code{\link{cfboost}},
- to be more precise by the \code{\link{boost_control}} function. If an
- out-of-bag sample is specified, the risk that is to be computed can be
+ The calculated risk is the negative maximum log likelihood for each
+ boosting step.
+
+ Applied to a model object of class \code{\link{cfboost}} the function
+ \code{mstop} can be used to extract the number of boosting iterations
+ that were performed (\code{opt = FALSE}) or the optimal number of
+ boosting iterations \code{opt = TRUE} with respect to the in-bag risk
+ or out-of-bag risk. In the first case, the risk is computed on the
+ learning sample, in the latter case it is computed on the validation
+ sample.
+
+ The samples can be specified by \code{weights} that are given to
+ \code{\link{cfboost}}. The risk that is to be computed is specified
+ in the call of \code{\link{cfboost}}, to be more precise by the
+ \code{\link{boost_control}} function. If an out-of-bag sample is
+ specified, the risk that is to be computed can be
set to both alternatives, if not, only the inbag risk is appropriate.
+
+ The function \code{\link{mstop.cv}} is used to extract the
+ optimal boosting iteration from a cross-validated model (as
+ returned by \code{\link{cv}}.
}
\value{
The (optimal) number of boosting iterations is returned.
}
\seealso{ \code{\link{boost_control}} for the specification of
risk-type and \code{\link{cfboost}} for the specification of
- validation samples. }
+ validation samples. See \code{\link{cv}} for cross-validation. }
\examples{
-## see for example ?cfboost for usage of mstop()
+## see for example ?cfboost and ?cv for usage of mstop()
}
\keyword{ methods }
Modified: pkg/man/risk.Rd
===================================================================
--- pkg/man/risk.Rd 2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/risk.Rd 2009-03-15 18:47:14 UTC (rev 11)
@@ -57,7 +57,9 @@
\code{print} returns the given \code{object} invvisble.
}
-\seealso{ for other methods for \code{cfboost} objects see \code{\link[CoxFlexBoost]{methods}}. }
+\seealso{ for other methods for \code{cfboost} objects see
+ \code{\link[CoxFlexBoost]{methods}}.
+ See \code{\link{cv}} for cross-validated risk.}
\examples{
## see for example ?CoxFlexBoost-package
## for the usage of risk() and plot(risk()) etc.
More information about the Coxflexboost-commits
mailing list