[Coxflexboost-commits] r11 - in pkg: R man

Sun Mar 15 19:47:14 CET 2009

Author: hofner
Date: 2009-03-15 19:47:14 +0100 (Sun, 15 Mar 2009)
New Revision: 11

Added:
   pkg/R/crossvalidation.R
   pkg/man/crossvalidation.Rd
Modified:
   pkg/R/cfboost.R
   pkg/R/methods.R
   pkg/R/mstop.R
   pkg/man/CoxFlexBoost-package.Rd
   pkg/man/boost_control.Rd
   pkg/man/mstop.Rd
   pkg/man/risk.Rd
Log:
added function cv for cross validation

Modified: pkg/R/cfboost.R
===================================================================

--- pkg/R/cfboost.R	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/cfboost.R	2009-03-15 18:47:14 UTC (rev 11)
@@ -11,8 +11,9 @@
 cfboost.formula <- function(formula, data = list(), weights = NULL, na.action = na.omit,  control = boost_control(), ...) {
     ## construct design matrix etc.
     object <- boost_dpp(formula, data, weights, na.action)
-    ## fit the ensemble
     object$input <- object$menv at get("input")
+    object$data <- data
+    object$formula <- formula
     if (!is.null(weights))
         object$oob$input <- object$oob$menv at get("input")
     RET <- cfboost_fit(object, control = control, data = data, weights = weights, ...)
@@ -65,7 +66,7 @@
 
     fit <- fit_oob <- offset <- getoffset(y, which.offset)
     if (trace)
-        cat("Offset: ", offset, "\n")
+        cat("Offset: ", offset, "\n\n")
 
     mstart <- 1
     hSi <- 1     # number of iterations in the repeat loop
@@ -73,14 +74,14 @@
 
     ## compute df2lambda which depends on the offset and on y
     if (trace)
-        cat("compute df2lambda .")
+        cat("Compute df2lambda .")
     for (i in 1:length(x)){
         if (!is.null( attr(x[[i]], "df"))){
             attr(x[[i]], "df2lambda")(y, offset)
             if (trace) cat(".")
         }
     }
-    if (trace) cat("\n")
+    if (trace) cat("\n\n")
 
     ##################################
     #### start boosting iteration ####
@@ -88,7 +89,7 @@
     repeat{
       for (m in mstart:mstop) {
         if (trace)
-          cat("Step ", m, "; Progress .")
+          cat("Step ", m, "; Progress .", sep="")
 
         ## fit MLE component-wise
         for (i in 1:length(x)) {
@@ -141,7 +142,7 @@
             fit_oob <- fit_oob + nu *  oob$x[[xselect]] %*% coefs[[xselect]]
         }
       }
-      if (hardStop | risk != "oobag" | which.min(mrisk) < (mstop - mstop * 0.2/hSi) | hSi == 5) break
+      if (hardStop || risk != "oobag" || which.min(mrisk) < (mstop - mstop * 0.2/hSi) || hSi == 5) break
 
       ## else if minimum is at the end of the boosting algorithm,
       ## don't stop but proceed with boosting: Therefore, increase mstop

Added: pkg/R/crossvalidation.R
===================================================================
--- pkg/R/crossvalidation.R	                        (rev 0)
+++ pkg/R/crossvalidation.R	2009-03-15 18:47:14 UTC (rev 11)
@@ -0,0 +1,69 @@
+cv <- function(object, ...)
+    UseMethod("cv")
+
+cv.cfboost <- function(object, folds, grid = c(1:mstop(object, opt=FALSE)), ...){
+
+    oobrisk <- matrix(0, nrow = ncol(folds), ncol = length(grid))
+
+    ctrl <- object$control
+
+    ctrl$risk <- "oobag"
+    # fehlt noch: ctrl$savedata <- FALSE
+    # fehlt noch: ctrl$saveensss <- FALSE
+
+    if (is.null(object$data))
+        stop(sQuote("object"), " does not contain data. Estimate model with option ", sQuote("savedata = TRUE"))
+
+    call <- deparse(object$call)
+    data <- object$data$data
+    formula <- object$data$formula
+
+    ## free memory
+    rm("object")
+
+    i <- 0
+
+    dummyfct <- function(weights, control, data, formula, grid){
+        i <<- i + 1
+        cat("\n>>> Fold ", i, "\n\n")
+        model <- cfboost(formula, data = data, control = control, weights = weights)
+        ret <- risk(model)[grid]
+        rm("model")
+        ret
+    }
+
+    oobrisk <- apply(folds, 2, dummyfct, control = ctrl, data = data, formula = formula, grid = grid)
+    oobrisk <- t(oobrisk)
+    oobrisk <- oobrisk/colSums(folds == 0)
+    colnames(oobrisk) <- grid
+    rownames(oobrisk) <- 1:nrow(oobrisk)
+    attr(oobrisk, "call") <- call
+    attr(oobrisk, "mstop") <- grid
+    attr(oobrisk, "risk") <- "empirical risk (neg. log likelihood)"
+    class(oobrisk) <- "cv"
+    oobrisk
+}
+
+
+print.cv <- function(x, ...) {
+    cat("\n\t Cross-validated risk \n\t Call:",
+              attr(x, "call"), "\n\n")
+    print(colMeans(x))
+    cat("\n\t Optimal number of boosting iterations:", mstop(x), "\n")
+    return(invisible(x))
+}
+
+plot.cv <- function(x, ylab = attr(x, "risk"), ylim = range(x),
+                        main = attr(x, "call"), ...) {
+
+    cm <- colMeans(x)
+    plot(1:ncol(x), cm, ylab = ylab, ylim = ylim,
+         type = "n", lwd = 2,
+         xlab = "Number of boosting iterations",
+         main = main, ...)
+    out <- apply(x, 1, function(y) lines(1:ncol(x),y, col = "lightgrey"))
+    rm(out)
+    ms <- which.min(cm)
+    lines(c(ms, ms), c(min(c(0, ylim[1] * ifelse(ylim[1] < 0, 2, 0.5))), cm[ms]), lty = 2)
+    lines(1:ncol(x), cm, type = "l")
+}


Property changes on: pkg/R/crossvalidation.R
___________________________________________________________________
Name: svn:executable
   + *

Modified: pkg/R/methods.R
===================================================================
--- pkg/R/methods.R	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/methods.R	2009-03-15 18:47:14 UTC (rev 11)
@@ -37,7 +37,7 @@
 print.cfboost <- function(x, ...) {
     cat("\n")
     cat("\t CoxFlexBoost: \n")
-    cat("\t Additive Survival Models with Time-Varying Effects\n")
+    cat("\t Structured Survival Models (with Time-Varying Effects)\n")
     cat("\t Fitted via Likelihood-Based Boosting\n")
     cat("\n")
     if (!is.null(x$call))

Modified: pkg/R/mstop.R
===================================================================
--- pkg/R/mstop.R	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/R/mstop.R	2009-03-15 18:47:14 UTC (rev 11)
@@ -16,3 +16,7 @@
         return(mr$iteration)
     }
 }
+
+mstop.cv <- function(object, ...)
+    attr(object, "mstop")[which.min(colSums(object))]
+

Modified: pkg/man/CoxFlexBoost-package.Rd
===================================================================
--- pkg/man/CoxFlexBoost-package.Rd	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/CoxFlexBoost-package.Rd	2009-03-15 18:47:14 UTC (rev 11)
@@ -27,10 +27,12 @@
 }
 
 \seealso{
-  Important functions include: \code{\link{rSurvTime}} for simulations,
+  Important functions include: \code{\link{rSurvTime}} for simulations
+  of survival times,
   \code{\link{cfboost}} for the boosting algorithm, \code{\link{bols}}
   and \code{\link{bbs}} for the base-learners and
-  \code{\link[CoxFlexBoost]{methods}} for methods to be used with results.
+  \code{\link[CoxFlexBoost]{methods}} for methods to be used with
+  results.
 }
 
 \references{

Modified: pkg/man/boost_control.Rd
===================================================================
--- pkg/man/boost_control.Rd	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/boost_control.Rd	2009-03-15 18:47:14 UTC (rev 11)
@@ -16,10 +16,7 @@
   \item{risk}{ character. Determines how the empirical risk should be
     computed. It can take the values \code{risk = "inbag"}, i.e., the risk is
     computed for the learning sample, \code{risk = "oobag"}, i.e., the risk is
-    computed for the validation sample and \code{risk = "none"}. At the moment
-    it is only reasonable to use the option \code{risk = "oobag"} as no other
-    means of determining the (optimal) stopping iteration is implemented
-    so far.}
+    computed for the validation sample and \code{risk = "none"}.}
   \item{which.offset}{ character. Indictating the choice of offset. This
     can be either the maximum likelihood estimator of the null model
     (\code{which.offset = "mle"}) or zero (\code{which.offset = "zero"}).}

Added: pkg/man/crossvalidation.Rd
===================================================================
--- pkg/man/crossvalidation.Rd	                        (rev 0)
+++ pkg/man/crossvalidation.Rd	2009-03-15 18:47:14 UTC (rev 11)
@@ -0,0 +1,107 @@
+\name{crossvalidation}
+\alias{cv}
+\alias{cv.cfboost}
+\alias{print.cv}
+\alias{plot.cv}
+
+\title{ Cross-Validation}
+\description{
+  Function for the estimation of \code{\link{mstop}} via
+  cross-validation and (generic) functions to print or plot the resuls.
+}
+\usage{
+
+\method{cv}{cfboost}(object, folds,
+   grid = c(1:mstop(object, opt=FALSE)), ...)
+\method{print}{cv}(x, ...)
+\method{plot}{cv}(x, ylab = attr(x, "risk"), ylim = range(x),
+     main = attr(x, "call"), ...)
+}
+\arguments{
+  \item{object}{ an object of class \code{\link{cfboost}}.}
+  \item{folds}{ a weight matrix with number of rows equal to the number
+                of observations. The number of columns corresponds to
+                the number of cross-validation runs.}
+  \item{grid}{ a vector of iterations the empirical risk
+               is to be evaluated for. Per default the empirical
+               risks for all iterations \code{1:mstop} are computed and returned.}
+  \item{x}{ an object of class \code{cv} }
+  \item{ylab}{ A title for the y axis. }
+  \item{ylim}{ the y limits of the plot. }
+  \item{main}{ the main title of the plot. }
+  \item{\dots}{ additional arguments to be passed to \code{\link{plot}}. }
+}
+\details{
+  The number of boosting iterations is a hyper-parameter of the
+  boosting algorithms. Cross-validated estimates of the empirical risk
+  for different values of \code{mstop} (as given by
+  \code{\link{grid}}) are computed, which are used to choose the
+  appropriate number of boosting iterations to be applied.
+
+  Different forms of cross-validation can be applied, for example,
+  5-fold or 10-fold cross-validation. Bootstrapping is not implemented
+  so far. The \code{weights} are defined via the \code{folds} matrix.
+  A.t.m. they can only be used to specify a learning
+  sample which consists of observations with \code{weights == 1} and
+  and an out-of-bag sample with \code{weights == 0}. The latter
+  is used to determine the empirical risk (negative log likelihood).
+}
+\value{
+  \code{cv} returns an object of class \code{cv}, which consists of
+  a matrix of empirical risks and some further attributes.
+}
+\seealso{ \code{\link{cfboost}} for model fitting.
+   See \code{\link{risk}} for methods to extract the inbag and
+   out-of-bag risk and \code{\link{mstop}} for functions to
+   extract the (optimal) stopping iteration (based on cross-validation
+   or on the inbag and out-of-bag risk).}
+\examples{
+## fit a model with all observations first
+
+\dontrun{
+## (as this takes some minutes)
+set.seed(1234)
+## sample covariates first
+X <- matrix(NA, nrow=400, ncol=3)
+X[,1] <- runif(400, -1, 1)
+X[,2] <- runif(400, -1, 1)
+X[,3] <- runif(400, -1, 1)
+
+## time-dependent hazard rate
+lambda <- function(time, x){
+   exp(0 * time + 0.7 * x[1] + x[2]^2)
+}
+
+## specify censoring function
+cens_fct <- function(time, mean_cens){
+  censor_time <- rexp(n = length(time), rate = 1/mean_cens)
+  event <- (time <= censor_time)
+  t_obs <- apply(cbind(time, censor_time), 1, min)
+  return(cbind(t_obs, event))
+}
+daten <- rSurvTime(lambda, X, cens_fct, mean_cens = 5)
+
+ctrl <- boost_control( mstop = 100, risk="none")
+## fit (a simple) model
+model <- cfboost(Surv(time, event) ~ bbs(x.1) + bbs(x.2) + bbs(x.3),
+                 control = ctrl, data = daten)
+}
+
+## 5 -fold cross-validation
+
+\dontrun{
+## (as this takes some minutes)
+n <- nrow(daten)
+k <- 5
+ntest <- floor(n / k)
+cv5f <- matrix(c(rep(c(rep(0, ntest), rep(1, n)), k - 1),
+                       rep(0, n * k - (k - 1) * (n + ntest))), nrow = n)
+cvm <- cv(model, folds = cv5f)
+print(cvm)
+plot(cvm)
+mstop(cvm)
+}
+}
+
+\keyword{ misc }
+\keyword{ methods }

Modified: pkg/man/mstop.Rd
===================================================================
--- pkg/man/mstop.Rd	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/mstop.Rd	2009-03-15 18:47:14 UTC (rev 11)
@@ -1,6 +1,7 @@
 \name{mstop}
 \alias{mstop}
 \alias{mstop.cfboost}
+\alias{mstop.cv}
 
 \title{ Function to Extract (Optimal) Stopping Iteration }
 \description{
@@ -10,37 +11,47 @@
 \usage{
 mstop(object, ...)
 \method{mstop}{cfboost}(object, opt = TRUE, ...)
+\method{mstop}{cv}(object, ...)
 }
 
 \arguments{
-  \item{object}{ object of class \code{cfboost}. }
+  \item{object}{ object of class \code{cfboost} of of class \code{cv}. }
   \item{opt}{ logic. If \code{opt = FALSE} the pre-defined stopping
     iteration is returned, if \code{opt = TRUE} the optimal stopping
     iteration (in the given range of iterations) is returned. }
   \item{\dots}{ (not used a.t.m.) }
 }
 \details{
-  The calculated risk is negative maximum log likelihood for each
-  boosting step. The function \code{mstop} can be used to extract the
-  number of boosting iterations that were performed (\code{opt = FALSE})
-  or the optimal number of boosting iterations \code{opt = TRUE} with
-  respect to the in-bag risk or out-of-bag risk. In the first case, the
-  risk is computed on the learning sample, in the latter case it is
-  computed on the validation sample. The samples can be specified by
-  \code{weights} that are given to \code{\link{cfboost}}. The risk that
-  is to be computed is specified with the call to \code{\link{cfboost}},
-  to be more precise by the \code{\link{boost_control}} function. If an
-  out-of-bag sample is specified, the risk that is to be computed can be
+  The calculated risk is the negative maximum log likelihood for each
+  boosting step.
+
+  Applied to a model object of class \code{\link{cfboost}} the function
+  \code{mstop} can be used to extract the number of boosting iterations
+  that were performed (\code{opt = FALSE}) or the optimal number of
+  boosting iterations \code{opt = TRUE} with respect to the in-bag risk
+  or out-of-bag risk. In the first case, the risk is computed on the
+  learning sample, in the latter case it is computed on the validation
+  sample.
+
+  The samples can be specified by \code{weights} that are given to
+  \code{\link{cfboost}}. The risk that is to be computed is specified
+  in the call of \code{\link{cfboost}}, to be more precise by the
+  \code{\link{boost_control}} function. If an out-of-bag sample is
+  specified, the risk that is to be computed can be
   set to both alternatives, if not, only the inbag risk is appropriate.
+
+  The function \code{\link{mstop.cv}} is used to extract the
+  optimal boosting iteration from a cross-validated model (as
+  returned by \code{\link{cv}}.
 }
 \value{
   The (optimal) number of boosting iterations is returned.
 }
 \seealso{ \code{\link{boost_control}} for the specification of
   risk-type and \code{\link{cfboost}} for the specification of
-  validation samples. }
+  validation samples. See \code{\link{cv}} for cross-validation. }
 \examples{
-## see for example ?cfboost for usage of mstop()
+## see for example ?cfboost and ?cv for usage of mstop()
 }
 
 \keyword{ methods }

Modified: pkg/man/risk.Rd
===================================================================
--- pkg/man/risk.Rd	2009-03-15 18:46:20 UTC (rev 10)
+++ pkg/man/risk.Rd	2009-03-15 18:47:14 UTC (rev 11)
@@ -57,7 +57,9 @@
 
   \code{print} returns the given \code{object} invvisble.
 }
-\seealso{ for other methods for \code{cfboost} objects see \code{\link[CoxFlexBoost]{methods}}. }
+\seealso{ for other methods for \code{cfboost} objects see
+  \code{\link[CoxFlexBoost]{methods}}.
+  See \code{\link{cv}} for cross-validated risk.}
 \examples{
 ## see for example ?CoxFlexBoost-package
 ## for the usage of risk() and plot(risk()) etc.