[Analogue-commits] r133 - in pkg: . R inst man src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Jun 7 15:34:45 CEST 2009


Author: gsimpson
Date: 2009-06-07 15:34:37 +0200 (Sun, 07 Jun 2009)
New Revision: 133

Added:
   pkg/R/fixUpTol.R
   pkg/src/
   pkg/src/distx.c
   pkg/src/distxy.c
   pkg/src/wastats.c
Modified:
   pkg/DESCRIPTION
   pkg/R/internal.R
   pkg/R/predict.wa.R
   pkg/R/wa.R
   pkg/R/wa.formula.R
   pkg/R/zzz.R
   pkg/inst/ChangeLog
   pkg/man/analogue-internal.Rd
   pkg/man/wa.Rd
Log:
Incremental improvements in wa() and predict.wa() for tolerance down-weighting models. LOO CV now works with tol.dw = TRUE; code tidied a lot to re-use new internal utility functions. New C code for internal utility function w.tol().

Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/DESCRIPTION	2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,7 @@
 Package: analogue
 Type: Package
 Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.6-9
+Version: 0.6-10
 Date: $Date$
 Depends: R (>= 2.5.0), stats, graphics, vegan, lattice, MASS
 Author: Gavin L. Simpson, Jari Oksanen

Added: pkg/R/fixUpTol.R
===================================================================
--- pkg/R/fixUpTol.R	                        (rev 0)
+++ pkg/R/fixUpTol.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,26 @@
+## fix-up tolerances for use in TF computations
+fixUpTol <- function(tol, na.tol, small.tol, min.tol, f, env) {
+    ## first missing tolerances
+    if(any(NA.TOL <- is.na(tol))) {
+        tol[NA.TOL] <-
+            switch(na.tol,
+                   min = min(tol, na.rm = TRUE),
+                   mean = mean(tol, na.rm = TRUE),
+                   max = max(tol, na.rm = TRUE))
+    }
+    ## second, replace tol < min.tol
+    if(!is.null(min.tol) && any(MIN.TOL <- tol < min.tol)) {
+        small.tol <- match.arg(small.tol)
+        ## min.tol must be in or on extremesof range(env)
+        if(min.tol < min(env) || min.tol > max(env))
+            stop("'min.tol' must be >= min(env) and <= max(env)")
+        if(small.tol == "fraction")
+            frac <- f * diff(range(env))
+        tol[MIN.TOL] <-
+            switch(small.tol,
+                   fraction = frac,
+                   absolute = min.tol,
+                   min = min(tol[tol >= min.tol], na.rm = TRUE))
+    }
+    return(tol)
+}

Modified: pkg/R/internal.R
===================================================================
--- pkg/R/internal.R	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/internal.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -227,7 +227,14 @@
     ## x   = species abundances
     ## env = vector of response var
     ## opt = weighted average optima
-    tol <- sqrt(ColSums(x * outer(env, opt, "-")^2) / ColSums(x))
+    ##tol <- sqrt(ColSums(x * outer(env, opt, "-")^2) / ColSums(x))
+    nr <- NROW(x)
+    nc <- NCOL(x)
+    tol <- .C("WTOL", x = as.double(env), w = as.double(x),
+              opt = as.double(opt),
+              nr = as.integer(nr), nc = as.integer(nc),
+              stat = double(nc), NAOK = FALSE,
+              PACKAGE="analogue")$stat
     if(useN2)
         tol <- tol / sqrt(1 - (1 / sppN2(x)))
     names(tol) <- colnames(x)
@@ -245,6 +252,3 @@
     1/H
 }
 
-## data(ImbrieKipp)
-## data(SumSST)
-## mod <- wa(SumSST ~ ., data = ImbrieKipp, tol.dw = TRUE)

Modified: pkg/R/predict.wa.R
===================================================================
--- pkg/R/predict.wa.R	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/predict.wa.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,6 @@
 `predict.wa` <- function(object, newdata,
                          CV = c("none","LOO","bootstrap", "nfold"),
-                         verbose = FALSE,
-                         n.boot = 100, nfold = 5,
+                         verbose = FALSE, n.boot = 100, nfold = 5,
                          ...) {
     if(missing(newdata))
         return(fitted(object))
@@ -24,14 +23,13 @@
                            none = no.deshrink)
     X <- object$orig.x
     ENV <- object$orig.env
+    ## tolerance options from model
+    O <- object$options.tol
     ## Doing CV?
     if(identical(CV, "none")) {
         want <- names(object$wa.optima) %in%
         colnames(newdata)
         want <- names(object$wa.optima)[want]
-        #pred <- colSums(t(newdata[,want]) *
-        #                object$wa.optima[want]) /
-        #                    rowSums(newdata[,want])
         if(object$tol.dw) {
             pred <- WATpred(newdata[,want], object$wa.optima[want],
                             object$model.tol[want])
@@ -45,37 +43,42 @@
             loo.pred <- matrix(0, ncol = n.train,
                                nrow = n.fossil)
             mod.pred <- length(n.train)
+            useN2 <- object$options.tol$useN2
+            want <- names(object$wa.optima) %in% colnames(newdata)
+            want <- names(object$wa.optima)[want]
             for(i in seq_len(n.train)) {
                 if(verbose && ((i %% 10) == 0)) {
                     cat(paste("Leave one out sample", i, "\n"))
                     flush.console()
                 }
                 wa.optima <- w.avg(X[-i,], ENV[-i])
-                ## do the model bits
-                ones <- rep(1, length = length(wa.optima))
-                miss <- is.na(wa.optima)
-                ones[miss] <- 0
-                wa.optima[miss] <- 0
-                rowsum <- X[-i,] %*% ones
-                wa.env <- (X[-i,]%*% wa.optima) / rowsum
+                tol <- w.tol(X[-i, ], ENV[-i], wa.optima,
+                             useN2 = useN2)
+                ## fix up problematic tolerances
+                tol <- fixUpTol(tol, O$na.tol, O$small.tol,
+                                O$min.tol, O$f, ENV[-i])
+                ## CV for the training set
+                if(object$tol.dw) {
+                    wa.env <- WATpred(X[-i,], wa.optima, tol)
+                    mod.pred[i] <- WATpred(X[i,,drop=FALSE],
+                                           wa.optima, tol)
+                } else {
+                    wa.env <- WApred(X[-i,], wa.optima)
+                    mod.pred[i] <- WApred(X[i,,drop=FALSE],
+                                          wa.optima)
+                }
                 deshrink.mod <- deshrink.fun(ENV[-i], wa.env)
                 wa.env <- deshrink.mod$env
-                coefs <- coef(deshrink.mod) #$coef
+                coefs <- coef(deshrink.mod)
                 ## LOO model predictions
-                rowsum <- X[i,,drop=FALSE] %*% ones
-                mod.pred[i] <- (X[i,,drop=FALSE] %*%
-                                wa.optima) /
-                                    rowsum
                 mod.pred[i] <- deshrink.pred(mod.pred[i], coefs)
                 ## newdata predictions
-                want <- names(wa.optima) %in% colnames(newdata)
-                want <- names(wa.optima)[want]
-                ones <- rep(1, length = length(want))
-                miss <- miss[want]
-                ones[miss] <- 0
-                rowsum <- newdata[,want] %*% ones
-                pred <- (newdata[,want] %*% wa.optima[want]) /
-                    rowsum
+                pred <- if(object$tol.dw) {
+                    WATpred(newdata[,want], wa.optima[want],
+                            tol[want])
+                } else {
+                    WApred(newdata[,want], wa.optima[want])
+                }
                 loo.pred[,i] <- deshrink.pred(pred, coefs)
             }
             ## average the LOO predictions
@@ -226,13 +229,28 @@
 }
 
 WApred <- function(X, optima) {
-    ((X %*% optima) / rowSums(X))[,1, drop = TRUE]
+    ones <- rep.int(1, length(optima))
+    miss <- is.na(optima)
+    ones[miss] <- 0
+    optima[miss] <- 0
+    rsum <- X %*% ones
+    ((X %*% optima) / rsum)
 }
 
 WATpred <- function(X, optima, tol) {
+    ones <- rep.int(1, length(optima))
+    miss <- is.na(optima)
+    ones[miss] <- 0
+    optima[miss] <- 0
+    tol[miss] <- 1
     tol2 <- tol^2
-    tmp <- sweep(X, 2, optima, "*", check.margin = FALSE)
-    tmp <- rowSums(sweep(tmp, 2, tol2, "/",
-                         check.margin = FALSE))
-    tmp / rowSums(sweep(X, 2, tol2, "/", check.margin = FALSE))
+    #tmp <- sweep(X, 2, optima, "*", check.margin = FALSE)
+    ##tmp <- RowSums(t(t(X) * optima / tol2))
+    #tmp <- RowSums(sweep(tmp, 2, tol2, "/",
+    #                     check.margin = FALSE))
+    #tmp / RowSums(sweep(X, 2, tol2, "/",
+    #                    check.margin = FALSE)[,!miss, drop = FALSE])
+    #tmp / (sweep(X, 2, tol2, "/",check.margin = FALSE) %*% ones)
+    ##tmp / (t(t(X) / tol2) %*% ones)
+    RowSums(t(t(X) * optima / tol2)) / (t(t(X) / tol2) %*% ones)
 }

Modified: pkg/R/wa.R
===================================================================
--- pkg/R/wa.R	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/wa.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -27,34 +27,18 @@
     if(missing(na.tol))
         na.tol <- "min"
     na.tol <- match.arg(na.tol)
-    ## first missing tolerances
-    if(any(NA.TOL <- is.na(tol))) {
-        tol[NA.TOL] <- switch(na.tol,
-                              min = min(tol, na.rm = TRUE),
-                              mean = mean(tol, na.rm = TRUE),
-                              max = max(tol, na.rm = TRUE))
+    if(missing(small.tol))
+        small.tol <- "min"
+    small.tol <- match.arg(small.tol)
+    if(small.tol == "fraction") {
+        if(!(f > 0 && f < 1))
+            stop("'f' must be 0 < f < 1")
+        frac <- f * diff(range(env))
+        if(frac < min.tol)
+            warning("Requested fraction of gradient is < minimum tolerance.")
     }
-    ## second, replace tol < min.tol
-    if(!is.null(min.tol) && any(MIN.TOL <- tol < min.tol)) {
-        if(missing(small.tol))
-            small.tol <- "min"
-        small.tol <- match.arg(small.tol)
-        ## min.tol must be in or on extremesof range(env)
-        if(min.tol < min(env) || min.tol > max(env))
-            stop("'min.tol' must be >= min(env) and <= max(env)")
-        ## warn if "fraction" and f * diff(range(env))) < min.tol
-        if(small.tol == "fraction") {
-            if(!(f > 0 && f < 1))
-                stop("'f' must be 0 < f < 1")
-            frac <- f * diff(range(env))
-            if(frac < min.tol)
-                warning("Requested fraction of gradient is < minimum tolerance.")
-        }
-        tol[MIN.TOL] <- switch(small.tol,
-                               fraction = frac,
-                               absolute = min.tol,
-                               min = min(tol[tol >= min.tol], na.rm = TRUE))
-    }
+    tol <- fixUpTol(tol, na.tol = na.tol, small.tol = small.tol,
+                    min.tol = min.tol, f = f, env = env)
     ## calculate WA estimate of env for each site
     if(tol.dw) {
         wa.env <- WATpred(x, wa.optima, tol)

Modified: pkg/R/wa.formula.R
===================================================================
--- pkg/R/wa.formula.R	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/wa.formula.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -1,6 +1,10 @@
 `wa.formula` <- function(formula, data, subset, na.action,
                          deshrink = c("inverse", "classical", "expanded", "none"),
-                         tol.dw = FALSE, ..., model = FALSE) {
+                         tol.dw = FALSE, useN2 = TRUE,
+                         na.tol = c("min","mean","max"),
+                         small.tol = c("min","fraction","absolute"),
+                         min.tol = NULL, f = 0.1, ...,
+                         model = FALSE) {
     ## set default deshrinking to inverse if no supplied
     if(missing(deshrink))
         deshrink <- "inverse"

Modified: pkg/R/zzz.R
===================================================================
--- pkg/R/zzz.R	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/zzz.R	2009-06-07 13:34:37 UTC (rev 133)
@@ -1,5 +1,5 @@
 .First.lib <- function(lib, pkg)  {
-    ##library.dynam("analogue", pkg, lib)
+    library.dynam("analogue", pkg, lib)
     packageStartupMessage("This is analogue ",
                           utils::packageDescription("analogue",
                                                     field="Version"),

Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/inst/ChangeLog	2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,19 @@
 analogue Change Log
 
-Version 0.6-9 (Opened Wed 27 May 2009)
+Version 0.6-10
 
+	* predict.wa: Predictions with tolerance DW now works for CV = "LOO"
+
+	* wa.formula: Argument list updated to match wa.default.
+
+	* fixUpTol: New internal utility function the encapsulates code to
+	modify working tolerances within WA model fitting.
+
+	* w.tol: Internal function w.tol now uses a faster C version of the
+	code.
+	
+Version 0.6-9 (Closed Sun 7 June 2009)
+
 	* predict.wa: Predictions without CV can now be made for WA models
 	fitted using tolerance DW.
 

Modified: pkg/man/analogue-internal.Rd
===================================================================
--- pkg/man/analogue-internal.Rd	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/man/analogue-internal.Rd	2009-06-07 13:34:37 UTC (rev 133)
@@ -17,6 +17,7 @@
 \alias{w.tol}
 \alias{WApred}
 \alias{WATpred}
+\alias{fixUpTol}
 \title{Internal analogue Functions}
 \description{
   Internal analogue functions

Modified: pkg/man/wa.Rd
===================================================================
--- pkg/man/wa.Rd	2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/man/wa.Rd	2009-06-07 13:34:37 UTC (rev 133)
@@ -23,7 +23,9 @@
 
 \method{wa}{formula}(formula, data, subset, na.action,
    deshrink = c("inverse", "classical", "expanded", "none"),
-   tol.dw = FALSE, ..., model = FALSE)
+   tol.dw = FALSE, useN2 = TRUE, na.tol = c("min","mean","max"),
+   small.tol = c("min","fraction","absolute"), min.tol = NULL,
+   f = 0.1,..., model = FALSE)
 
 \method{fitted}{wa}(object, \dots)
 

Added: pkg/src/distx.c
===================================================================
--- pkg/src/distx.c	                        (rev 0)
+++ pkg/src/distx.c	2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,547 @@
+/* 
+ * Functions to compute dissimilarity for a single matrix
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xy_distance
+ * Some are called via direct methods, but we include the 
+ * indices here to allow the pattern matching to work 
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double xx_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = x[i1] - x[i2];
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double xx_sq_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = x[i1] - x[i2];
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Chord distance */
+double xx_chord(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = sqrt(x[i1]) - sqrt(x[i2]);
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double xx_sq_chord(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = sqrt(x[i1]) - sqrt(x[i2]);
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/*  Bray-Curtis */
+double xx_bray(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, total;
+    int count, j;
+    
+    total = 0.0;
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dist += fabs(x[i1] - x[i2]);
+	    total += x[i1] + x[i2];
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count==0) return NA_REAL;
+    dist /= total;
+    return dist;
+}
+
+/*  chi square */
+double xx_chi_square(double *x, int nr,  int nc, int i1, int i2)
+{
+    double dist, dev, nomin, denomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    if(x[i1] != 0 || x[i2] != 0) {
+		dev = x[i1] - x[i2];
+		nomin = dev*dev;
+		denomin = x[i1] + x[i2];
+		dist += nomin / denomin;
+		count++;
+	    }
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count==0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/*  square chi square */
+double xx_sq_chi_square(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev, nomin, denomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    if(x[i1] != 0 || x[i2] != 0) {
+		dev = x[i1] - x[i2];
+		nomin = dev*dev;
+		denomin = x[i1] + x[i2];
+		dist += nomin / denomin;
+		count++;
+	    }
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count==0) return NA_REAL;
+    return dist;
+}
+
+/* information statistic */
+double xx_information(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, XY, A, B, Adist, Bdist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    Adist = 0.0;
+    Bdist = 0.0;
+    A = 0.0;
+    B = 0.0;
+    for(j=0; j<nc; j++) {
+	if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    XY = x[i1] + x[i2];
+	    A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+	    B += x[i2] * (log((2 * x[i2]) / XY)/log(2));
+	    if(R_FINITE(A)) {
+		Adist += A;
+	    }
+	    if(R_FINITE(B)) {
+		Bdist += B;
+	    }
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if(count==0) return NA_REAL;
+    dist = A + B;
+    return dist;
+}
+
+/*  chi square distance*/
+/* currently not correct */
+double xx_chi_distance(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist, dev, nomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = x[i1] - x[i2];
+	    nomin += dev*dev;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count==0) return NA_REAL;
+    return dist;
+}
+
+/* Manhattan metric */
+double xx_manhattan(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dist += fabs( x[i1] - x[i2] );
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+   in the R wrapper */
+double xx_gower(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dist += fabs( x[i1] - x[i2] );
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+   in the R wrapper */
+double xx_alt_gower(double *x, int nr, int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dist += fabs( x[i1] - x[i2] );
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(2 * dist);
+}
+
+/* Driver */
+
+static double (*distfun)(double*, int, int, int, int);
+
+void xx_distance(double *x, int *nr, int *nc, double *d, 
+		 int *diag, int *method)
+{
+    int dc, i, j, ij;
+    switch(*method) {
+    case EUCLIDEAN:
+	distfun = xx_euclidean;
+	break;
+    case SQEUCLIDEAN:
+	distfun = xx_sq_euclidean;
+	break;
+    case CHORD:
+	distfun = xx_chord;
+	break;
+    case SQCHORD:
+	distfun = xx_sq_chord;
+	break;
+    case BRAY:
+	distfun = xx_bray;
+	break;
+    case CHISQUARE:
+	distfun = xx_chi_square;
+	break;
+    case SQCHISQUARE:
+	distfun = xx_sq_chi_square;
+	break;
+    case INFORMATION:
+	distfun = xx_information;
+	break;
+    case MANHATTAN:
+	distfun = xx_manhattan;
+	break;
+    case GOWER:
+	distfun = xx_gower;
+	break;
+    case ALTGOWER:
+	distfun = xx_alt_gower;
+	break;
+    default:
+	error("Unknown distance in the internal C function");
+    }
+    
+    dc = (*diag) ? 0 : 1;
+    ij = 0;
+    for (j=0; j <= *nr; j++)
+	for (i=j+dc; i < *nr; i++) {
+	    d[ij++] = distfun(x, *nr, *nc, i, j);
+	}
+}
+
+/*
+ * Kendall's coefficient
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * maxi: the max abundance for each species
+ *
+ */
+double xx_KENDALL(double *x, int nr, int nc, int i1, int i2,
+		  double *maxi)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = (x[i1] >= x[i2]) ? x[i2] : x[i1];
+	    dist += maxi[j] - dev;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+void xx_kendall(double *x, int *nr, int *nc, double *d, 
+		double *maxi)
+{
+    int i, j, ij;
+    double dist;
+
+    dist = 0.0;
+    
+    ij = 0;
+    for(j=0; j <= *nr; j++) {
+	for(i=0; i < *nr; i++) {
+	    d[ij++] = xx_KENDALL(x, *nr, *nc, i, j, maxi); 
+	}
+    }
+}
+
+/*
+ * Chi square distance
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * csum: species sums
+ *
+ */
+double xx_CHISQ_DIST(double *x, int nr, int nc, int i1, int i2,
+		     double *csum, double ccsum)
+{
+    double dist, dev, denom, nomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    dev = x[i1] - x[i2];
+	    nomin = dev*dev;
+	    denom = csum[j] / ccsum;
+	    dist += nomin / denom;
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+void xx_chisq_dist(double *x, int *nr, int *nc, double *d, 
+		   double *csum)
+{
+    int i, j, k, ij;
+    double ccsum;
+
+    ccsum = 0.0;
+    
+    ij = 0;
+
+    for(k=0; k < *nc; k++) {
+	ccsum += csum[k];
+    }
+
+    for(j=0; j < *nr; j++) {
+	for(i=0; i < *nr; i++) {
+	    d[ij++] = xx_CHISQ_DIST(x, *nr, *nc, i, j, 
+				    csum, ccsum);
+	}
+    }
+}
+
+/*
+ * Gower's coefficient for mixed data
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * vtype  : variable type
+ *          1 == Symmetric Binary
+ *          2 == Asymmetric Binary
+ *          3 == Nominal (class/factor)
+ *          4 == Ordinal (ordered factor)
+ *          5 == Quantitative
+ * weights: variable weights
+ * R      : variable range (max - min)
+ *
+ */
+double xx_MIXED(double *x, int nr, int nc, int i1, int i2, 
+		int *vtype, double *weights, double *R, 
+		double wsum)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+	    if(vtype[j] == 1) {
+		dev = (x[i1] == x[i2]) ? 1 : 0;
+		dist += dev * weights[j];
+	    }
+	    if(vtype[j] == 2) {
+		dev = (x[i1] == x[i2]) ?   : 0;
+		dist += dev * weights[j];
+	    }
+	    if(vtype[j] == 3) {
+		dev = (x[i1] == x[i2]) ? 1 : 0;
+		dist += dev * weights[j];
+	    }
+	    if(vtype[j] == 4) {
+		/* ordinal data current not handled 
+		 * so don't call this yet
+		 */
+	    }
+	    if(vtype[j] == 5) {
+		dev = 1 - (fabs(x[i1] - x[i2]) / R[j]);
+		dist += dev * weights[j];
+	    }
+	    count++;
+	}
+	i1 += nr;
+	i2 += nr;
+    }
+    if (count == 0) return NA_REAL;
+    return 1 - (dist / wsum);
+}
+
+void xx_mixed(double *x, int *nr, int *nc, double *d, 
+	      int *vtype, double *weights, double *R)
+{
+    int i, j, k, ij;
+    double wsum;
+
+    wsum = 0.0;
+    
+    ij = 0;
+
+    for(k=0; k <*nc; k++) {
+	wsum += weights[k];
+    }
+
+    for(j=0; j < *nr; j++) {
+	for(i=0; i < *nr; i++) {
+	    d[ij++] = xx_MIXED(x, *nr, *nc, i, j, vtype,
+			       weights, R, wsum);
+	}
+    }
+}

Added: pkg/src/distxy.c
===================================================================
--- pkg/src/distxy.c	                        (rev 0)
+++ pkg/src/distxy.c	2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,540 @@
+/* 
+ * Functions to compute dissimilarity between two matrices
+ * X and Y.
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xy_distance
+ * Some are called via direct methods, but we include the 
+ * indices here to allow the pattern matching to work 
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double xy_euclidean(double *x, double *y, int nr1, int nr2, 
+		    int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = x[i1] - y[i2];
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double xy_sq_euclidean(double *x, double *y, int nr1, int nr2, 
+		       int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = x[i1] - y[i2];
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Chord distance */
+double xy_chord(double *x, double *y, int nr1, int nr2, 
+		int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = sqrt(x[i1]) - sqrt(y[i2]);
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double xy_sq_chord(double *x, double *y, int nr1, int nr2, 
+		   int nc, int i1, int i2)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = sqrt(x[i1]) - sqrt(y[i2]);
+	    dist += dev*dev;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/*  Bray-Curtis */
+double xy_bray(double *x, double *y, int nr1, int nr2, 
+       	       int nc, int i1, int i2)
+{
+    double dist, total;
+    int count, j;
+    
+    total = 0.0;
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dist += fabs(x[i1] - y[i2]);
+	    total += x[i1] + y[i2];
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count==0) return NA_REAL;
+    dist /= total;
+    return dist;
+}
+
+/*  chi square */
+double xy_chi_square(double *x, double *y, int nr1, int nr2, 
+		     int nc, int i1, int i2)
+{
+    double dist, dev, nomin, denomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    if(x[i1] != 0 || y[i2] != 0) {
+		dev = x[i1] - y[i2];
+		nomin = dev*dev;
+		denomin = x[i1] + y[i2];
+		dist += nomin / denomin;
+		count++;
+	    }
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count==0) return NA_REAL;
+    return sqrt(dist);
+}
+
+/*  square chi square */
+double xy_sq_chi_square(double *x, double *y, int nr1, int nr2, 
+			int nc, int i1, int i2)
+{
+    double dist, dev, nomin, denomin;
+    int count, j;
+	
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    if(x[i1] != 0 || y[i2] != 0) {
+		dev = x[i1] - y[i2];
+		nomin = dev*dev;
+		denomin = x[i1] + y[i2];
+		dist += nomin / denomin;
+		count++;
+	    }
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count==0) return NA_REAL;
+    return dist;
+}
+
+/* information statistic */
+double xy_information(double *x, double *y, int nr1, int nr2, 
+		      int nc, int i1, int i2)
+{
+    double dist, XY, A, B, Adist, Bdist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    Adist = 0.0;
+    Bdist = 0.0;
+    A = 0.0;
+    B = 0.0;
+    for(j=0; j<nc; j++) {
+	if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    XY = x[i1] + y[i2];
+	    A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+	    B += y[i2] * (log((2 * y[i2]) / XY)/log(2));
+	    if(R_FINITE(A)) {
+		Adist += A;
+	    }
+	    if(R_FINITE(B)) {
+		Bdist += B;
+	    }
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if(count==0) return NA_REAL;
+    dist = A + B;
+    return dist;
+}
+
+/* Manhattan metric */
+double xy_manhattan(double *x, double *y, int nr1, int nr2, 
+		    int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dist += fabs( x[i1] - y[i2] );
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+   in the R wrapper */
+double xy_gower(double *x, double *y, int nr1, int nr2, 
+		int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dist += fabs( x[i1] - y[i2] );
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+   in the R wrapper */
+double xy_alt_gower(double *x, double *y, int nr1, int nr2, 
+		    int nc, int i1, int i2)
+{
+    double dist;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dist += fabs( x[i1] - y[i2] );
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(2 * dist);
+}
+
+/* Driver */
+
+static double (*distfun)(double*, double*, int, int, int, 
+			 int, int);
+
+void xy_distance(double *x, double *y, int *nr1, int *nr2,
+		 int *nc, double *d, int *method)
+{
+    int i, j, ij;
+    switch(*method) {
+    case EUCLIDEAN:
+	distfun = xy_euclidean;
+	break;
+    case SQEUCLIDEAN:
+	distfun = xy_sq_euclidean;
+	break;
+    case CHORD:
+	distfun = xy_chord;
+	break;
+    case SQCHORD:
+	distfun = xy_sq_chord;
+	break;
+    case BRAY:
+	distfun = xy_bray;
+	break;
+    case CHISQUARE:
+	distfun = xy_chi_square;
+	break;
+    case SQCHISQUARE:
+	distfun = xy_sq_chi_square;
+	break;
+    case INFORMATION:
+	distfun = xy_information;
+	break;
+    case MANHATTAN:
+	distfun = xy_manhattan;
+	break;
+    case GOWER:
+	distfun = xy_gower;
+	break;
+    case ALTGOWER:
+	distfun = xy_alt_gower;
+	break;
+    default:
+	error("Unknown distance in the internal C function");
+    }
+    
+    ij = 0;
+    for (j=0; j < *nr1; j++)
+	for (i=0; i < *nr2; i++) {
+	    d[ij++] = distfun(x, y, *nr1, *nr2, *nc, j, i);
+	}
+}
+
+/*
+ * Kendall's coefficient
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * maxi: the max abundance for each species
+ *
+ */
+double xy_KENDALL(double *x, double *y, int nr1, int nr2, 
+		  int nc, int i1, int i2, double *maxi)
+{
+    double dist, dev;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = (x[i1] >= y[i2]) ? y[i2] : x[i1];
+	    dist += maxi[j] - dev;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return dist;
+}
+
+void xy_kendall(double *x, double *y, int *nr1, int *nr2,
+		int *nc, double *d, double *maxi)
+{
+    int i, j, ij;
+    
+    ij = 0;
+    for(j=0; j < *nr1; j++) {
+	for(i=0; i < *nr2; i++) {
+	    d[ij++] = xy_KENDALL(x, y, *nr1, *nr2, *nc, j, i,
+				 maxi);
+	}
+    }
+}
+
+/*
+ * Chi square distance
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * Needs to be pre processed in R
+ *
+ * csum: species sums
+ *
+ */
+double xy_CHISQ_DIST(double *x, double *y, int nr1, int nr2, 
+		     int nc, int i1, int i2, double *csum,
+		     double ccsum)
+{
+    double dist, dev, denom, nomin;
+    int count, j;
+    
+    count = 0;
+    dist = 0.0;
+    for (j=0; j<nc; j++) {
+	if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+	    dev = x[i1] - y[i2];
+	    nomin = dev*dev;
+	    denom = csum[j] / ccsum;
+	    dist += nomin / denom;
+	    count++;
+	}
+	i1 += nr1;
+	i2 += nr2;
+    }
+    if (count == 0) return NA_REAL;
+    return sqrt(dist);
+}
+
+void xy_chisq_dist(double *x, double *y, int *nr1, int *nr2,
+		   int *nc, double *d, double *csum)
+{
+    int i, j, k, ij;
+    double ccsum;
+
+    ccsum = 0.0;
+    
+    ij = 0;
+
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/analogue -r 133


More information about the Analogue-commits mailing list