[Analogue-commits] r133 - in pkg: . R inst man src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Jun 7 15:34:45 CEST 2009
Author: gsimpson
Date: 2009-06-07 15:34:37 +0200 (Sun, 07 Jun 2009)
New Revision: 133
Added:
pkg/R/fixUpTol.R
pkg/src/
pkg/src/distx.c
pkg/src/distxy.c
pkg/src/wastats.c
Modified:
pkg/DESCRIPTION
pkg/R/internal.R
pkg/R/predict.wa.R
pkg/R/wa.R
pkg/R/wa.formula.R
pkg/R/zzz.R
pkg/inst/ChangeLog
pkg/man/analogue-internal.Rd
pkg/man/wa.Rd
Log:
Incremental improvements in wa() and predict.wa() for tolerance down-weighting models. LOO CV now works with tol.dw = TRUE; code tidied a lot to re-use new internal utility functions. New C code for internal utility function w.tol().
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/DESCRIPTION 2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,7 @@
Package: analogue
Type: Package
Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.6-9
+Version: 0.6-10
Date: $Date$
Depends: R (>= 2.5.0), stats, graphics, vegan, lattice, MASS
Author: Gavin L. Simpson, Jari Oksanen
Added: pkg/R/fixUpTol.R
===================================================================
--- pkg/R/fixUpTol.R (rev 0)
+++ pkg/R/fixUpTol.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,26 @@
+## fix-up tolerances for use in TF computations
+fixUpTol <- function(tol, na.tol, small.tol, min.tol, f, env) {
+ ## first missing tolerances
+ if(any(NA.TOL <- is.na(tol))) {
+ tol[NA.TOL] <-
+ switch(na.tol,
+ min = min(tol, na.rm = TRUE),
+ mean = mean(tol, na.rm = TRUE),
+ max = max(tol, na.rm = TRUE))
+ }
+ ## second, replace tol < min.tol
+ if(!is.null(min.tol) && any(MIN.TOL <- tol < min.tol)) {
+ small.tol <- match.arg(small.tol)
+ ## min.tol must be in or on extremesof range(env)
+ if(min.tol < min(env) || min.tol > max(env))
+ stop("'min.tol' must be >= min(env) and <= max(env)")
+ if(small.tol == "fraction")
+ frac <- f * diff(range(env))
+ tol[MIN.TOL] <-
+ switch(small.tol,
+ fraction = frac,
+ absolute = min.tol,
+ min = min(tol[tol >= min.tol], na.rm = TRUE))
+ }
+ return(tol)
+}
Modified: pkg/R/internal.R
===================================================================
--- pkg/R/internal.R 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/internal.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -227,7 +227,14 @@
## x = species abundances
## env = vector of response var
## opt = weighted average optima
- tol <- sqrt(ColSums(x * outer(env, opt, "-")^2) / ColSums(x))
+ ##tol <- sqrt(ColSums(x * outer(env, opt, "-")^2) / ColSums(x))
+ nr <- NROW(x)
+ nc <- NCOL(x)
+ tol <- .C("WTOL", x = as.double(env), w = as.double(x),
+ opt = as.double(opt),
+ nr = as.integer(nr), nc = as.integer(nc),
+ stat = double(nc), NAOK = FALSE,
+ PACKAGE="analogue")$stat
if(useN2)
tol <- tol / sqrt(1 - (1 / sppN2(x)))
names(tol) <- colnames(x)
@@ -245,6 +252,3 @@
1/H
}
-## data(ImbrieKipp)
-## data(SumSST)
-## mod <- wa(SumSST ~ ., data = ImbrieKipp, tol.dw = TRUE)
Modified: pkg/R/predict.wa.R
===================================================================
--- pkg/R/predict.wa.R 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/predict.wa.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,6 @@
`predict.wa` <- function(object, newdata,
CV = c("none","LOO","bootstrap", "nfold"),
- verbose = FALSE,
- n.boot = 100, nfold = 5,
+ verbose = FALSE, n.boot = 100, nfold = 5,
...) {
if(missing(newdata))
return(fitted(object))
@@ -24,14 +23,13 @@
none = no.deshrink)
X <- object$orig.x
ENV <- object$orig.env
+ ## tolerance options from model
+ O <- object$options.tol
## Doing CV?
if(identical(CV, "none")) {
want <- names(object$wa.optima) %in%
colnames(newdata)
want <- names(object$wa.optima)[want]
- #pred <- colSums(t(newdata[,want]) *
- # object$wa.optima[want]) /
- # rowSums(newdata[,want])
if(object$tol.dw) {
pred <- WATpred(newdata[,want], object$wa.optima[want],
object$model.tol[want])
@@ -45,37 +43,42 @@
loo.pred <- matrix(0, ncol = n.train,
nrow = n.fossil)
mod.pred <- length(n.train)
+ useN2 <- object$options.tol$useN2
+ want <- names(object$wa.optima) %in% colnames(newdata)
+ want <- names(object$wa.optima)[want]
for(i in seq_len(n.train)) {
if(verbose && ((i %% 10) == 0)) {
cat(paste("Leave one out sample", i, "\n"))
flush.console()
}
wa.optima <- w.avg(X[-i,], ENV[-i])
- ## do the model bits
- ones <- rep(1, length = length(wa.optima))
- miss <- is.na(wa.optima)
- ones[miss] <- 0
- wa.optima[miss] <- 0
- rowsum <- X[-i,] %*% ones
- wa.env <- (X[-i,]%*% wa.optima) / rowsum
+ tol <- w.tol(X[-i, ], ENV[-i], wa.optima,
+ useN2 = useN2)
+ ## fix up problematic tolerances
+ tol <- fixUpTol(tol, O$na.tol, O$small.tol,
+ O$min.tol, O$f, ENV[-i])
+ ## CV for the training set
+ if(object$tol.dw) {
+ wa.env <- WATpred(X[-i,], wa.optima, tol)
+ mod.pred[i] <- WATpred(X[i,,drop=FALSE],
+ wa.optima, tol)
+ } else {
+ wa.env <- WApred(X[-i,], wa.optima)
+ mod.pred[i] <- WApred(X[i,,drop=FALSE],
+ wa.optima)
+ }
deshrink.mod <- deshrink.fun(ENV[-i], wa.env)
wa.env <- deshrink.mod$env
- coefs <- coef(deshrink.mod) #$coef
+ coefs <- coef(deshrink.mod)
## LOO model predictions
- rowsum <- X[i,,drop=FALSE] %*% ones
- mod.pred[i] <- (X[i,,drop=FALSE] %*%
- wa.optima) /
- rowsum
mod.pred[i] <- deshrink.pred(mod.pred[i], coefs)
## newdata predictions
- want <- names(wa.optima) %in% colnames(newdata)
- want <- names(wa.optima)[want]
- ones <- rep(1, length = length(want))
- miss <- miss[want]
- ones[miss] <- 0
- rowsum <- newdata[,want] %*% ones
- pred <- (newdata[,want] %*% wa.optima[want]) /
- rowsum
+ pred <- if(object$tol.dw) {
+ WATpred(newdata[,want], wa.optima[want],
+ tol[want])
+ } else {
+ WApred(newdata[,want], wa.optima[want])
+ }
loo.pred[,i] <- deshrink.pred(pred, coefs)
}
## average the LOO predictions
@@ -226,13 +229,28 @@
}
WApred <- function(X, optima) {
- ((X %*% optima) / rowSums(X))[,1, drop = TRUE]
+ ones <- rep.int(1, length(optima))
+ miss <- is.na(optima)
+ ones[miss] <- 0
+ optima[miss] <- 0
+ rsum <- X %*% ones
+ ((X %*% optima) / rsum)
}
WATpred <- function(X, optima, tol) {
+ ones <- rep.int(1, length(optima))
+ miss <- is.na(optima)
+ ones[miss] <- 0
+ optima[miss] <- 0
+ tol[miss] <- 1
tol2 <- tol^2
- tmp <- sweep(X, 2, optima, "*", check.margin = FALSE)
- tmp <- rowSums(sweep(tmp, 2, tol2, "/",
- check.margin = FALSE))
- tmp / rowSums(sweep(X, 2, tol2, "/", check.margin = FALSE))
+ #tmp <- sweep(X, 2, optima, "*", check.margin = FALSE)
+ ##tmp <- RowSums(t(t(X) * optima / tol2))
+ #tmp <- RowSums(sweep(tmp, 2, tol2, "/",
+ # check.margin = FALSE))
+ #tmp / RowSums(sweep(X, 2, tol2, "/",
+ # check.margin = FALSE)[,!miss, drop = FALSE])
+ #tmp / (sweep(X, 2, tol2, "/",check.margin = FALSE) %*% ones)
+ ##tmp / (t(t(X) / tol2) %*% ones)
+ RowSums(t(t(X) * optima / tol2)) / (t(t(X) / tol2) %*% ones)
}
Modified: pkg/R/wa.R
===================================================================
--- pkg/R/wa.R 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/wa.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -27,34 +27,18 @@
if(missing(na.tol))
na.tol <- "min"
na.tol <- match.arg(na.tol)
- ## first missing tolerances
- if(any(NA.TOL <- is.na(tol))) {
- tol[NA.TOL] <- switch(na.tol,
- min = min(tol, na.rm = TRUE),
- mean = mean(tol, na.rm = TRUE),
- max = max(tol, na.rm = TRUE))
+ if(missing(small.tol))
+ small.tol <- "min"
+ small.tol <- match.arg(small.tol)
+ if(small.tol == "fraction") {
+ if(!(f > 0 && f < 1))
+ stop("'f' must be 0 < f < 1")
+ frac <- f * diff(range(env))
+ if(frac < min.tol)
+ warning("Requested fraction of gradient is < minimum tolerance.")
}
- ## second, replace tol < min.tol
- if(!is.null(min.tol) && any(MIN.TOL <- tol < min.tol)) {
- if(missing(small.tol))
- small.tol <- "min"
- small.tol <- match.arg(small.tol)
- ## min.tol must be in or on extremesof range(env)
- if(min.tol < min(env) || min.tol > max(env))
- stop("'min.tol' must be >= min(env) and <= max(env)")
- ## warn if "fraction" and f * diff(range(env))) < min.tol
- if(small.tol == "fraction") {
- if(!(f > 0 && f < 1))
- stop("'f' must be 0 < f < 1")
- frac <- f * diff(range(env))
- if(frac < min.tol)
- warning("Requested fraction of gradient is < minimum tolerance.")
- }
- tol[MIN.TOL] <- switch(small.tol,
- fraction = frac,
- absolute = min.tol,
- min = min(tol[tol >= min.tol], na.rm = TRUE))
- }
+ tol <- fixUpTol(tol, na.tol = na.tol, small.tol = small.tol,
+ min.tol = min.tol, f = f, env = env)
## calculate WA estimate of env for each site
if(tol.dw) {
wa.env <- WATpred(x, wa.optima, tol)
Modified: pkg/R/wa.formula.R
===================================================================
--- pkg/R/wa.formula.R 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/wa.formula.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -1,6 +1,10 @@
`wa.formula` <- function(formula, data, subset, na.action,
deshrink = c("inverse", "classical", "expanded", "none"),
- tol.dw = FALSE, ..., model = FALSE) {
+ tol.dw = FALSE, useN2 = TRUE,
+ na.tol = c("min","mean","max"),
+ small.tol = c("min","fraction","absolute"),
+ min.tol = NULL, f = 0.1, ...,
+ model = FALSE) {
## set default deshrinking to inverse if no supplied
if(missing(deshrink))
deshrink <- "inverse"
Modified: pkg/R/zzz.R
===================================================================
--- pkg/R/zzz.R 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/R/zzz.R 2009-06-07 13:34:37 UTC (rev 133)
@@ -1,5 +1,5 @@
.First.lib <- function(lib, pkg) {
- ##library.dynam("analogue", pkg, lib)
+ library.dynam("analogue", pkg, lib)
packageStartupMessage("This is analogue ",
utils::packageDescription("analogue",
field="Version"),
Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/inst/ChangeLog 2009-06-07 13:34:37 UTC (rev 133)
@@ -1,7 +1,19 @@
analogue Change Log
-Version 0.6-9 (Opened Wed 27 May 2009)
+Version 0.6-10
+ * predict.wa: Predictions with tolerance DW now works for CV = "LOO"
+
+ * wa.formula: Argument list updated to match wa.default.
+
+ * fixUpTol: New internal utility function the encapsulates code to
+ modify working tolerances within WA model fitting.
+
+ * w.tol: Internal function w.tol now uses a faster C version of the
+ code.
+
+Version 0.6-9 (Closed Sun 7 June 2009)
+
* predict.wa: Predictions without CV can now be made for WA models
fitted using tolerance DW.
Modified: pkg/man/analogue-internal.Rd
===================================================================
--- pkg/man/analogue-internal.Rd 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/man/analogue-internal.Rd 2009-06-07 13:34:37 UTC (rev 133)
@@ -17,6 +17,7 @@
\alias{w.tol}
\alias{WApred}
\alias{WATpred}
+\alias{fixUpTol}
\title{Internal analogue Functions}
\description{
Internal analogue functions
Modified: pkg/man/wa.Rd
===================================================================
--- pkg/man/wa.Rd 2009-06-05 18:38:07 UTC (rev 132)
+++ pkg/man/wa.Rd 2009-06-07 13:34:37 UTC (rev 133)
@@ -23,7 +23,9 @@
\method{wa}{formula}(formula, data, subset, na.action,
deshrink = c("inverse", "classical", "expanded", "none"),
- tol.dw = FALSE, ..., model = FALSE)
+ tol.dw = FALSE, useN2 = TRUE, na.tol = c("min","mean","max"),
+ small.tol = c("min","fraction","absolute"), min.tol = NULL,
+ f = 0.1,..., model = FALSE)
\method{fitted}{wa}(object, \dots)
Added: pkg/src/distx.c
===================================================================
--- pkg/src/distx.c (rev 0)
+++ pkg/src/distx.c 2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,547 @@
+/*
+ * Functions to compute dissimilarity for a single matrix
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xy_distance
+ * Some are called via direct methods, but we include the
+ * indices here to allow the pattern matching to work
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double xx_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double xx_sq_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Chord distance */
+double xx_chord(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = sqrt(x[i1]) - sqrt(x[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double xx_sq_chord(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = sqrt(x[i1]) - sqrt(x[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Bray-Curtis */
+double xx_bray(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, total;
+ int count, j;
+
+ total = 0.0;
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs(x[i1] - x[i2]);
+ total += x[i1] + x[i2];
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ dist /= total;
+ return dist;
+}
+
+/* chi square */
+double xx_chi_square(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(x[i1] != 0 || x[i2] != 0) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + x[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* square chi square */
+double xx_sq_chi_square(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(x[i1] != 0 || x[i2] != 0) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + x[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* information statistic */
+double xx_information(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, XY, A, B, Adist, Bdist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ Adist = 0.0;
+ Bdist = 0.0;
+ A = 0.0;
+ B = 0.0;
+ for(j=0; j<nc; j++) {
+ if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ XY = x[i1] + x[i2];
+ A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+ B += x[i2] * (log((2 * x[i2]) / XY)/log(2));
+ if(R_FINITE(A)) {
+ Adist += A;
+ }
+ if(R_FINITE(B)) {
+ Bdist += B;
+ }
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if(count==0) return NA_REAL;
+ dist = A + B;
+ return dist;
+}
+
+/* chi square distance*/
+/* currently not correct */
+double xx_chi_distance(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ nomin += dev*dev;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* Manhattan metric */
+double xx_manhattan(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double xx_gower(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double xx_alt_gower(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(2 * dist);
+}
+
+/* Driver */
+
+static double (*distfun)(double*, int, int, int, int);
+
+void xx_distance(double *x, int *nr, int *nc, double *d,
+ int *diag, int *method)
+{
+ int dc, i, j, ij;
+ switch(*method) {
+ case EUCLIDEAN:
+ distfun = xx_euclidean;
+ break;
+ case SQEUCLIDEAN:
+ distfun = xx_sq_euclidean;
+ break;
+ case CHORD:
+ distfun = xx_chord;
+ break;
+ case SQCHORD:
+ distfun = xx_sq_chord;
+ break;
+ case BRAY:
+ distfun = xx_bray;
+ break;
+ case CHISQUARE:
+ distfun = xx_chi_square;
+ break;
+ case SQCHISQUARE:
+ distfun = xx_sq_chi_square;
+ break;
+ case INFORMATION:
+ distfun = xx_information;
+ break;
+ case MANHATTAN:
+ distfun = xx_manhattan;
+ break;
+ case GOWER:
+ distfun = xx_gower;
+ break;
+ case ALTGOWER:
+ distfun = xx_alt_gower;
+ break;
+ default:
+ error("Unknown distance in the internal C function");
+ }
+
+ dc = (*diag) ? 0 : 1;
+ ij = 0;
+ for (j=0; j <= *nr; j++)
+ for (i=j+dc; i < *nr; i++) {
+ d[ij++] = distfun(x, *nr, *nc, i, j);
+ }
+}
+
+/*
+ * Kendall's coefficient
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * maxi: the max abundance for each species
+ *
+ */
+double xx_KENDALL(double *x, int nr, int nc, int i1, int i2,
+ double *maxi)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = (x[i1] >= x[i2]) ? x[i2] : x[i1];
+ dist += maxi[j] - dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+void xx_kendall(double *x, int *nr, int *nc, double *d,
+ double *maxi)
+{
+ int i, j, ij;
+ double dist;
+
+ dist = 0.0;
+
+ ij = 0;
+ for(j=0; j <= *nr; j++) {
+ for(i=0; i < *nr; i++) {
+ d[ij++] = xx_KENDALL(x, *nr, *nc, i, j, maxi);
+ }
+ }
+}
+
+/*
+ * Chi square distance
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * csum: species sums
+ *
+ */
+double xx_CHISQ_DIST(double *x, int nr, int nc, int i1, int i2,
+ double *csum, double ccsum)
+{
+ double dist, dev, denom, nomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denom = csum[j] / ccsum;
+ dist += nomin / denom;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+void xx_chisq_dist(double *x, int *nr, int *nc, double *d,
+ double *csum)
+{
+ int i, j, k, ij;
+ double ccsum;
+
+ ccsum = 0.0;
+
+ ij = 0;
+
+ for(k=0; k < *nc; k++) {
+ ccsum += csum[k];
+ }
+
+ for(j=0; j < *nr; j++) {
+ for(i=0; i < *nr; i++) {
+ d[ij++] = xx_CHISQ_DIST(x, *nr, *nc, i, j,
+ csum, ccsum);
+ }
+ }
+}
+
+/*
+ * Gower's coefficient for mixed data
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * vtype : variable type
+ * 1 == Symmetric Binary
+ * 2 == Asymmetric Binary
+ * 3 == Nominal (class/factor)
+ * 4 == Ordinal (ordered factor)
+ * 5 == Quantitative
+ * weights: variable weights
+ * R : variable range (max - min)
+ *
+ */
+double xx_MIXED(double *x, int nr, int nc, int i1, int i2,
+ int *vtype, double *weights, double *R,
+ double wsum)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(vtype[j] == 1) {
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ }
+ if(vtype[j] == 2) {
+ dev = (x[i1] == x[i2]) ? : 0;
+ dist += dev * weights[j];
+ }
+ if(vtype[j] == 3) {
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ }
+ if(vtype[j] == 4) {
+ /* ordinal data current not handled
+ * so don't call this yet
+ */
+ }
+ if(vtype[j] == 5) {
+ dev = 1 - (fabs(x[i1] - x[i2]) / R[j]);
+ dist += dev * weights[j];
+ }
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return 1 - (dist / wsum);
+}
+
+void xx_mixed(double *x, int *nr, int *nc, double *d,
+ int *vtype, double *weights, double *R)
+{
+ int i, j, k, ij;
+ double wsum;
+
+ wsum = 0.0;
+
+ ij = 0;
+
+ for(k=0; k <*nc; k++) {
+ wsum += weights[k];
+ }
+
+ for(j=0; j < *nr; j++) {
+ for(i=0; i < *nr; i++) {
+ d[ij++] = xx_MIXED(x, *nr, *nc, i, j, vtype,
+ weights, R, wsum);
+ }
+ }
+}
Added: pkg/src/distxy.c
===================================================================
--- pkg/src/distxy.c (rev 0)
+++ pkg/src/distxy.c 2009-06-07 13:34:37 UTC (rev 133)
@@ -0,0 +1,540 @@
+/*
+ * Functions to compute dissimilarity between two matrices
+ * X and Y.
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xy_distance
+ * Some are called via direct methods, but we include the
+ * indices here to allow the pattern matching to work
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double xy_euclidean(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = x[i1] - y[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double xy_sq_euclidean(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = x[i1] - y[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Chord distance */
+double xy_chord(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = sqrt(x[i1]) - sqrt(y[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double xy_sq_chord(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = sqrt(x[i1]) - sqrt(y[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Bray-Curtis */
+double xy_bray(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, total;
+ int count, j;
+
+ total = 0.0;
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs(x[i1] - y[i2]);
+ total += x[i1] + y[i2];
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ dist /= total;
+ return dist;
+}
+
+/* chi square */
+double xy_chi_square(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ if(x[i1] != 0 || y[i2] != 0) {
+ dev = x[i1] - y[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + y[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* square chi square */
+double xy_sq_chi_square(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ if(x[i1] != 0 || y[i2] != 0) {
+ dev = x[i1] - y[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + y[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* information statistic */
+double xy_information(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, XY, A, B, Adist, Bdist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ Adist = 0.0;
+ Bdist = 0.0;
+ A = 0.0;
+ B = 0.0;
+ for(j=0; j<nc; j++) {
+ if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ XY = x[i1] + y[i2];
+ A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+ B += y[i2] * (log((2 * y[i2]) / XY)/log(2));
+ if(R_FINITE(A)) {
+ Adist += A;
+ }
+ if(R_FINITE(B)) {
+ Bdist += B;
+ }
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if(count==0) return NA_REAL;
+ dist = A + B;
+ return dist;
+}
+
+/* Manhattan metric */
+double xy_manhattan(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double xy_gower(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double xy_alt_gower(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(2 * dist);
+}
+
+/* Driver */
+
+static double (*distfun)(double*, double*, int, int, int,
+ int, int);
+
+void xy_distance(double *x, double *y, int *nr1, int *nr2,
+ int *nc, double *d, int *method)
+{
+ int i, j, ij;
+ switch(*method) {
+ case EUCLIDEAN:
+ distfun = xy_euclidean;
+ break;
+ case SQEUCLIDEAN:
+ distfun = xy_sq_euclidean;
+ break;
+ case CHORD:
+ distfun = xy_chord;
+ break;
+ case SQCHORD:
+ distfun = xy_sq_chord;
+ break;
+ case BRAY:
+ distfun = xy_bray;
+ break;
+ case CHISQUARE:
+ distfun = xy_chi_square;
+ break;
+ case SQCHISQUARE:
+ distfun = xy_sq_chi_square;
+ break;
+ case INFORMATION:
+ distfun = xy_information;
+ break;
+ case MANHATTAN:
+ distfun = xy_manhattan;
+ break;
+ case GOWER:
+ distfun = xy_gower;
+ break;
+ case ALTGOWER:
+ distfun = xy_alt_gower;
+ break;
+ default:
+ error("Unknown distance in the internal C function");
+ }
+
+ ij = 0;
+ for (j=0; j < *nr1; j++)
+ for (i=0; i < *nr2; i++) {
+ d[ij++] = distfun(x, y, *nr1, *nr2, *nc, j, i);
+ }
+}
+
+/*
+ * Kendall's coefficient
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * maxi: the max abundance for each species
+ *
+ */
+double xy_KENDALL(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2, double *maxi)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = (x[i1] >= y[i2]) ? y[i2] : x[i1];
+ dist += maxi[j] - dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+void xy_kendall(double *x, double *y, int *nr1, int *nr2,
+ int *nc, double *d, double *maxi)
+{
+ int i, j, ij;
+
+ ij = 0;
+ for(j=0; j < *nr1; j++) {
+ for(i=0; i < *nr2; i++) {
+ d[ij++] = xy_KENDALL(x, y, *nr1, *nr2, *nc, j, i,
+ maxi);
+ }
+ }
+}
+
+/*
+ * Chi square distance
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * Needs to be pre processed in R
+ *
+ * csum: species sums
+ *
+ */
+double xy_CHISQ_DIST(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2, double *csum,
+ double ccsum)
+{
+ double dist, dev, denom, nomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = x[i1] - y[i2];
+ nomin = dev*dev;
+ denom = csum[j] / ccsum;
+ dist += nomin / denom;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+void xy_chisq_dist(double *x, double *y, int *nr1, int *nr2,
+ int *nc, double *d, double *csum)
+{
+ int i, j, k, ij;
+ double ccsum;
+
+ ccsum = 0.0;
+
+ ij = 0;
+
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/analogue -r 133
More information about the Analogue-commits
mailing list