[Analogue-commits] r142 - in pkg: . R inst man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Mon Aug 10 01:31:24 CEST 2009


Author: gsimpson
Date: 2009-08-10 01:31:23 +0200 (Mon, 10 Aug 2009)
New Revision: 142

Modified:
   pkg/DESCRIPTION
   pkg/R/roc.R
   pkg/inst/ChangeLog
   pkg/man/roc.Rd
Log:
implements thinning in roc for large problems.

Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION	2009-08-09 10:36:32 UTC (rev 141)
+++ pkg/DESCRIPTION	2009-08-09 23:31:23 UTC (rev 142)
@@ -1,7 +1,7 @@
 Package: analogue
 Type: Package
 Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.6-15
+Version: 0.6-16
 Date: $Date$
 Depends: R (>= 2.5.0), stats, graphics, vegan, lattice, MASS
 Author: Gavin L. Simpson, Jari Oksanen

Modified: pkg/R/roc.R
===================================================================
--- pkg/R/roc.R	2009-08-09 10:36:32 UTC (rev 141)
+++ pkg/R/roc.R	2009-08-09 23:31:23 UTC (rev 142)
@@ -8,10 +8,25 @@
 
 `roc` <- function(object, groups, k = 1, ...) UseMethod("roc")
 
-`roc.default` <- function(object, groups, k = 1, ...) {
-    calcROC <- function(IN, OUT) {
+`roc.default` <- function(object, groups, k = 1,
+                          thin = FALSE, max.len = 10000, ...) {
+    calcROC <- function(IN, OUT, thin = FALSE, max.len = 10000) {
         n.IN <- length(IN)
         n.OUT <- length(OUT)
+        if(thin) {
+            ## thin
+            ratio <- n.IN/n.OUT
+            largest <- which.max(c(n.IN, n.OUT))
+            seq.vars <- c(max.len * ratio, max.len)
+            if(largest == 1)
+                seq.vars <- rev(seq.vars)
+            IN <- quantile(IN,
+                           probs = seq(0, 1, length.out = seq.vars[1]))
+            OUT <- quantile(OUT,
+                            probs = seq(0, 1, length.out = 1/seq.vars[2]))
+            n.IN <- length(IN)
+            n.OUT <- length(OUT)
+        }
         g <- rep(c(TRUE, FALSE), times = c(n.IN, n.OUT))
         tab <- table(c(IN, OUT), g)
         TPF <- cumsum(tab[, 2])/sum(tab[, 2])
@@ -56,16 +71,17 @@
                                function(x, k) {x[order(x)[k]]}, k = k))
         OUT <- as.numeric(apply(object[inds, !inds], 2,
                                 function(x, k) {x[order(x)[k]]}, k = k))
-        ROC <- calcROC(IN, OUT)
-        within[[l]] <- IN
-        without[[l]] <- OUT
+        ROC <- calcROC(IN, OUT, thin = thin, max.len = max.len)
+        within[[l]] <- ROC$analogue$yes
+        without[[l]] <- ROC$analogue$no
         statistics[l, ] <- with(ROC, data.frame(n.in, n.out, optimal, AUC,
                                                 se.fit, p.value))
         roc[[l]] <- ROC
     }
     IN <- do.call(c, within)
     OUT <- do.call(c, without)
-    roc[["Combined"]] <- ROC <- calcROC(IN, OUT)
+    roc[["Combined"]] <- ROC <- calcROC(IN, OUT, thin = thin,
+                                        max.len = max.len)
     statistics["Combined", ] <- with(ROC, data.frame(n.in, n.out, optimal,
                                                      AUC, se.fit, p.value))
     retval <- list(statistics = statistics, roc = roc)

Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog	2009-08-09 10:36:32 UTC (rev 141)
+++ pkg/inst/ChangeLog	2009-08-09 23:31:23 UTC (rev 142)
@@ -1,5 +1,14 @@
 analogue Change Log
 
+Version 0.6-16
+
+	* roc: For large problems the calculation of AUC and its standard
+	error could overflow the largest number R currently handles. roc()
+	now has two new arguments, 'thin' and 'max.len', which allow the
+	number of points on the ROC curve to be thinned to a smaller number,
+	which should allow the computations to be performed. The original
+	problem was reported by Diana Stralberg.
+
 Version 0.6-15
 
 	* tran: new 'formula' method allows simple selection or exclusion

Modified: pkg/man/roc.Rd
===================================================================
--- pkg/man/roc.Rd	2009-08-09 10:36:32 UTC (rev 141)
+++ pkg/man/roc.Rd	2009-08-09 23:31:23 UTC (rev 142)
@@ -19,6 +19,9 @@
 \usage{
 roc(object, groups, k = 1, ...)
 
+\method{roc}{default}(object, groups, k = 1, thin = FALSE,
+    max.len = 10000, ...)
+
 \method{roc}{mat}(object, groups, k = 1, ...)
 
 \method{roc}{analog}(object, groups, k = 1, ...)
@@ -31,6 +34,10 @@
     coerced to one if supplied vecvtor is not a factor.}
   \item{k}{numeric; the \code{k} closest analogues to use to calculate
     ROC curves.}
+  \item{thin}{logical; should the points on the ROC curve be thinned?
+    See Details, below.}
+  \item{max.len}{numeric; maximum length of the analogue and non-analogue
+    vectors. Used as the limit to which points on the ROC curve are thinned.}
   \item{\dots}{arguments passed to/from other methods.}
 }
 \details{
@@ -46,6 +53,17 @@
   the basis of assemblage dissimilarity, then the dissimilarities
   between samples within a group will be small compared to the
   dissimilarities between group members and non group members.
+
+  \code{thin} is useful for large problems, where the number of analogue
+  and non-analogue distances can be large enough that computing the AUC
+  and its standard error overflows the largest number R can work with.
+  Thinning also speeds up computations for large problems. If
+  \code{thin == TRUE}, the larger of the two sets of distances (analogue
+  or non-analogue) is thinned to a maximum length of \code{max.len} and
+  the smaller set is scaled proportionally. Thinning approximates each
+  distribution of distances by taking \code{max.len} (or a fraction of
+  \code{max.len} for the smaller set) equally-spaced probability
+  quantiles of that distribution as the new set of distances.
 }
 \value{
   A list with two components; i, \code{statistics}, a summary of ROC



More information about the Analogue-commits mailing list