[Analogue-commits] r315 - in pkg: R inst src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Mar 21 23:11:50 CET 2013
Author: gsimpson
Date: 2013-03-21 23:11:50 +0100 (Thu, 21 Mar 2013)
New Revision: 315
Added:
pkg/R/distance_c.R
pkg/src/c_distx.c
pkg/src/c_distxy.c
Modified:
pkg/inst/ChangeLog
Log:
first commit of the new C code for dissimilarities and the R interface distanceX()
Added: pkg/R/distance_c.R
===================================================================
--- pkg/R/distance_c.R (rev 0)
+++ pkg/R/distance_c.R 2013-03-21 22:11:50 UTC (rev 315)
@@ -0,0 +1,108 @@
+distanceX <- function(x, y, method = "euclidean", weights = NULL, R = NULL,
+ as.dist = FALSE, ...) {
+ ## Euclid?an could be spelled variously
+ if(!is.na(pmatch(method, "euclidian")))
+ method <- "euclidean"
+ METHODS <- c("euclidean", "SQeuclidean", "chord", "SQchord",
+ "bray", "chi.square", "SQchi.square",
+ "information","chi.distance", "manhattan",
+ "kendall", "gower", "alt.gower", "mixed")
+ DCOEF <- pmatch(method, METHODS)
+ if(missing(y)) { ## only a single matrix
+ ## variables
+ nr <- nrow(x)
+ nc <- ncol(x)
+ ## object names (row names)
+ x.names <- rownames(x)
+ ## some preprocessing steps required for some coefs
+ ## so dealt with separately
+ if(method %in% c("chi.distance", "gower", "alt.gower",
+ "mixed", "kendall")) {
+ if(method == "chi.distance") {
+ x <- data.matrix(x)
+ csum <- colSums(x)
+ x <- x / rowSums(x)
+ d <- .Call("Cchisqdistxx", x, csum, PACKAGE = "analogue")
+ }
+ if(method == "kendall") {
+ x <- data.matrix(x)
+ maxi <- apply(x, 2, max)
+ d <- .Call("Ckendallxx", x, maxi, PACKAGE = "analogue")
+ }
+ if(method %in% c("gower", "alt.gower")) {
+ if(is.null(R)) {
+ x <- data.matrix(x)
+ maxi <- apply(x, 2, max, na.rm = TRUE)
+ mini <- apply(x, 2, min, na.rm = TRUE)
+ R <- maxi - mini
+ } else {
+ if(length(R) != nc)
+ stop("'R' must be of length 'ncol(x)'")
+ }
+ ## pre-process here for gower and alt.gower
+ ## but note we call the main driver Cdistxx
+ x <- sweep(x, 2, R, "/")
+ d <- .Call("Cdistxx", x, DCOEF, PACKAGE = "analogue")
+ }
+ } else {
+ ## must be one of the DC's handled by xy_distance
+ x <- data.matrix(x)
+ d <- .Call("Cdistxx", x, DCOEF, PACKAGE = "analogue")
+ }
+ attr(d, "Size") <- nr
+ attr(d, "Labels") <- x.names
+ attr(d, "Diag") <- FALSE
+ attr(d, "Upper") <- FALSE
+ attr(d, "method") <- method
+ attr(d, "call") <- match.call()
+ class(d) <- "dist"
+ if(!as.dist) {
+ d <- as.matrix(d)
+ attr(d, "method") <- method
+ class(d) <- c("distance","matrix")
+ }
+ } else { ## two matrices
+ ## check x and y have same columns
+ if(!isTRUE(all.equal(names(x), names(y))))
+ stop("'x' and 'y' appear to have different variables.")
+ if(!isTRUE(all.equal((n.vars <- ncol(x)), ncol(y))))
+ stop("'x' and 'y' have different numbers of columns.")
+ ## variables
+ nrx <- nrow(x)
+ nry <- nrow(y)
+ ## object names (row names)
+ x.names <- rownames(x)
+ y.names <- rownames(y)
+ ## some preprocessing steps required for some coefs
+ ## so dealt with separately
+ if(method %in% c("chi.distance", "gower", "alt.gower",
+ "mixed", "kendall")) {
+ if(method == "chi.distance") {
+ x <- data.matrix(x)
+ y <- data.matrix(y)
+ csum <- colSums(rbind(x, y))
+ y <- y / rowSums(y)
+ x <- x / rowSums(x)
+ d <- .C("xy_chisq_dist", x = as.double(x), y = as.double(y),
+ nr1 = as.integer(nrx), nr2 = as.integer(nry),
+ nc = as.integer(n.vars), d = as.double(d),
+ csum = as.double(csum), NAOK = as.integer(FALSE),
+ PACKAGE = "analogue")$d
+ ##d <- .Call("Cchisqdistxy", x, y, )
+ }
+ } else {
+ ## must be one of the DC's handled by xy_distance
+ x <- data.matrix(x)
+ y <- data.matrix(y)
+ d <- .Call("Cdistxy", x, y, DCOEF, PACKAGE = "analogue")
+ }
+
+ ## convert d to a matrix
+ d <- matrix(d, ncol = nry, byrow = TRUE)
+ colnames(d) <- y.names
+ rownames(d) <- x.names
+ attr(d, "method") <- method
+ class(d) <- c("distance","matrix")
+ }
+ d
+}
Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog 2013-03-21 22:07:02 UTC (rev 314)
+++ pkg/inst/ChangeLog 2013-03-21 22:11:50 UTC (rev 315)
@@ -6,6 +6,15 @@
site scores for the base ordination. The latter is the default
to maintain backwards compatability.
+ * distanceX: experimental replacement for distance() which uses
+ fast C code for computing dissimilarities via a .Call interface
+ based on base::dist().
+
+ Currently only the single matrix code has an R interface and
+ `method = "mixed"` is not hooked up at the moment. It is intended
+ that the next version of analogue (0.12-x) will have this version
+ of the code replace the old mainly R-based distance().
+
Version 0.11-1
* scores.prcurve: new function to extract "axis" scores for
Added: pkg/src/c_distx.c
===================================================================
--- pkg/src/c_distx.c (rev 0)
+++ pkg/src/c_distx.c 2013-03-21 22:11:50 UTC (rev 315)
@@ -0,0 +1,650 @@
+/*
+ * Functions to compute dissimilarity between two matrices
+ * X and Y.
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009-2012 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xx_distance
+ * Some are called via direct methods, but we include the
+ * indices here to allow the pattern matching to work
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double c_xx_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double c_xx_sq_euclidean(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Chord distance */
+double c_xx_chord(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = sqrt(x[i1]) - sqrt(x[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double c_xx_sq_chord(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = sqrt(x[i1]) - sqrt(x[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Bray-Curtis */
+double c_xx_bray(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, total;
+ int count, j;
+
+ total = 0.0;
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs(x[i1] - x[i2]);
+ total += x[i1] + x[i2];
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ dist /= total;
+ return dist;
+}
+
+/* chi square */
+double c_xx_chi_square(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(x[i1] != 0 || x[i2] != 0) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + x[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* square chi square */
+double c_xx_sq_chi_square(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(x[i1] != 0 || x[i2] != 0) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + x[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* information statistic */
+double c_xx_information(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, XY, A, B, Adist, Bdist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ Adist = 0.0;
+ Bdist = 0.0;
+ A = 0.0;
+ B = 0.0;
+ for(j=0; j<nc; j++) {
+ if(R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ XY = x[i1] + x[i2];
+ A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+ B += x[i2] * (log((2 * x[i2]) / XY)/log(2));
+ if(R_FINITE(A)) {
+ Adist += A;
+ }
+ if(R_FINITE(B)) {
+ Bdist += B;
+ }
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if(count==0) return NA_REAL;
+ dist = A + B;
+ return dist;
+}
+
+/* chi square distance*/
+/* currently not correct */
+double c_xx_chi_distance(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist, dev, nomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ nomin = 0.0;
+
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ nomin += dev*dev;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* Manhattan metric */
+double c_xx_manhattan(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double c_xx_gower(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double c_xx_alt_gower(double *x, int nr, int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dist += fabs( x[i1] - x[i2] );
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(2 * dist);
+}
+
+/* Driver */
+
+static double (*distfun)(double*, int, int, int, int);
+
+void c_xx_distance(double *x, int *nr, int *nc, double *d,
+ int *diag, int *method)
+{
+ int dc, i, j, ij;
+ switch(*method) {
+ case EUCLIDEAN:
+ distfun = c_xx_euclidean;
+ break;
+ case SQEUCLIDEAN:
+ distfun = c_xx_sq_euclidean;
+ break;
+ case CHORD:
+ distfun = c_xx_chord;
+ break;
+ case SQCHORD:
+ distfun = c_xx_sq_chord;
+ break;
+ case BRAY:
+ distfun = c_xx_bray;
+ break;
+ case CHISQUARE:
+ distfun = c_xx_chi_square;
+ break;
+ case SQCHISQUARE:
+ distfun = c_xx_sq_chi_square;
+ break;
+ case INFORMATION:
+ distfun = c_xx_information;
+ break;
+ case MANHATTAN:
+ distfun = c_xx_manhattan;
+ break;
+ case GOWER:
+ distfun = c_xx_gower;
+ break;
+ case ALTGOWER:
+ distfun = c_xx_alt_gower;
+ break;
+ default:
+ error("Unknown distance in the internal C function");
+ }
+
+ dc = (*diag) ? 0 : 1;
+ ij = 0;
+ for (j=0; j <= *nr; j++)
+ for (i=j+dc; i < *nr; i++) {
+ d[ij++] = distfun(x, *nr, *nc, i, j);
+ }
+}
+
+/* .Call interface */
+
+/* Based on code from distance.c in R Sources*/
+
+/*
+ * R : A Computer Language for Statistical Data Analysis
+ * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
+ * Copyright (C) 1998-2012 The R Core Team
+ * Copyright (C) 2002, 2004 The R Foundation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, a copy is available at
+ * http://www.r-project.org/Licenses/
+ */
+
+#include <Rinternals.h>
+
+SEXP Cdistxx(SEXP x, SEXP smethod)
+{
+ SEXP ans;
+ int nr = nrows(x), nc = ncols(x), method = asInteger(smethod);
+ int N, diag = 0;
+ N = (double)nr * (nr-1)/2; /* avoid overflow for N ~ 50,000 */
+ PROTECT(ans = allocVector(REALSXP, N));
+ c_xx_distance(REAL(x), &nr, &nc, REAL(ans), &diag, &method);
+ UNPROTECT(1);
+ return ans;
+}
+
+/* Kendall's coefficient -------------------------------------------
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * maxi: the max abundance for each species
+ *
+*/
+
+/* inner-most loop */
+double c_xx_KENDALL(double *x, int nr, int nc, int i1, int i2,
+ double *maxi)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = (x[i1] >= x[i2]) ? x[i2] : x[i1];
+ dist += maxi[j] - dev;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* driver, called by .call wrapper Ckendallxx */
+void c_xx_kendall(double *x, int *nr, int *nc, double *d,
+ int *diag, double *maxi)
+{
+int dc, i, j, ij;
+
+dc = (*diag) ? 0 : 1;
+ij = 0;
+ for(j=0; j <= *nr; j++) {
+ for(i=j+dc; i < *nr; i++) {
+ d[ij++] = c_xx_KENDALL(x, *nr, *nc, i, j, maxi);
+ }
+ }
+}
+
+/* .call wrapper */
+SEXP Ckendallxx(SEXP x, SEXP smaxi)
+{
+ SEXP ans;
+ int nr = nrows(x), nc = ncols(x);
+ int N, diag = 0;
+ double maxi = asReal(smaxi);
+ N = (double)nr * (nr-1)/2; /* avoid overflow for N ~ 50,000 */
+ PROTECT(ans = allocVector(REALSXP, N));
+ c_xx_kendall(REAL(x), &nr, &nc, REAL(ans), &diag, &maxi);
+ UNPROTECT(1);
+ return ans;
+}
+
+/* Chi square distance ------------------------------------------------
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * csum: species sums
+ *
+ */
+
+double c_xx_CHISQ_DIST(double *x, int nr, int nc, int i1, int i2,
+ double *csum, double ccsum)
+{
+ double dist, dev, denom, nomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ dev = x[i1] - x[i2];
+ nomin = dev*dev;
+ denom = csum[j] / ccsum;
+ dist += nomin / denom;
+ count++;
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+void c_xx_chisq_dist(double *x, int *nr, int *nc, double *d,
+ int *diag, double *csum)
+{
+ int dc, i, j, k, ij;
+ double ccsum;
+
+ ccsum = 0.0;
+
+ ij = 0;
+ dc = (*diag) ? 0 : 1;
+ for(k=0; k < *nc; k++) {
+ ccsum += csum[k];
+ }
+
+ for(j=0; j < *nr; j++) {
+ for(i=j+dc; i < *nr; i++) {
+ d[ij++] = c_xx_CHISQ_DIST(x, *nr, *nc, i, j,
+ csum, ccsum);
+ }
+ }
+}
+
+/* .call wrapper */
+SEXP Cchisqdistxx(SEXP x, SEXP scsum)
+{
+ SEXP ans;
+ int nr = nrows(x), nc = ncols(x);
+ int N, diag = 0;
+ double csum = asReal(scsum);
+ N = (double)nr * (nr-1)/2; /* avoid overflow for N ~ 50,000 */
+ PROTECT(ans = allocVector(REALSXP, N));
+ c_xx_chisq_dist(REAL(x), &nr, &nc, REAL(ans), &diag, &csum);
+ UNPROTECT(1);
+ return ans;
+}
+
+
+/* Gower's coefficient for mixed data ---------------------------------
+ *
+ * Should be called separately from the underlying R code,
+ * not via xy_distance.
+ *
+ * vtype : variable type
+ * 1 == Symmetric Binary
+ * 2 == Asymmetric Binary
+ * 3 == Nominal (class/factor)
+ * 4 == Ordinal (ordered factor)
+ * 5 == Quantitative
+ * weights: variable weights
+ * R : variable range (max - min)
+ *
+ */
+double c_xx_MIXED(double *x, int nr, int nc, int i1, int i2,
+ int *vtype, double *weights, double *R,
+ double wsum)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ wsum = 0.0;
+
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(x[i2])) {
+ if(vtype[j] == 1) {
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ }
+ if(vtype[j] == 2) { // Asymmetric binary
+ /*dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j]; */
+ if((x[i1] != 0) || (x[i2] != 0)) {
+ // both x1 and x2 not zero for this variables
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ } else {
+ /* set jth current weight to zero and do not
+ increment dist as ignoring double zero
+ We actually subtract the weight as it gets added
+ later on.
+ */
+ wsum -= weights[j];
+ }
+ }
+ if(vtype[j] == 3) { // Nominal
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ }
+ if(vtype[j] == 4) { // Ordinal
+ /* ordinal data currently handled as Nominal */
+ dev = (x[i1] == x[i2]) ? 1 : 0;
+ dist += dev * weights[j];
+ break;
+ }
+ if(vtype[j] == 5) {
+ dev = 1 - (fabs(x[i1] - x[i2]) / R[j]);
+ dist += dev * weights[j];
+ }
+ count++;
+ // only summing weights for non-NA comparisons
+ wsum += weights[j];
+ }
+ i1 += nr;
+ i2 += nr;
+ }
+ if (count == 0) return NA_REAL;
+ return 1 - (dist / wsum);
+}
+
+void c_xx_mixed(double *x, int *nr, int *nc, double *d,
+ int *diag, int *vtype, double *weights, double *R)
+{
+ int dc, i, j, k, ij;
+ double wsum;
+
+ wsum = 0.0;
+
+ ij = 0;
+
+ dc = (*diag) ? 0 : 1;
+
+ for(k=0; k <*nc; k++) {
+ wsum += weights[k];
+ }
+
+ for(j=0; j < *nr; j++) {
+ for(i=j+dc; i < *nr; i++) {
+ d[ij++] = c_xx_MIXED(x, *nr, *nc, i, j, vtype,
+ weights, R, wsum);
+ }
+ }
+}
+
+/* .call wrapper */
+SEXP Cmixedxx(SEXP x, SEXP svtype, SEXP sweights, SEXP sR)
+{
+ SEXP ans;
+ int nr = nrows(x), nc = ncols(x), vtype = asInteger(svtype);
+ int N, diag = 0;
+ double weights = asReal(sweights), R = asReal(sR);
+ N = (double)nr * (nr-1)/2; /* avoid overflow for N ~ 50,000 */
+ PROTECT(ans = allocVector(REALSXP, N));
+ c_xx_mixed(REAL(x), &nr, &nc, REAL(ans), &diag, &vtype, &weights, &R);
+ UNPROTECT(1);
+ return ans;
+}
Added: pkg/src/c_distxy.c
===================================================================
--- pkg/src/c_distxy.c (rev 0)
+++ pkg/src/c_distxy.c 2013-03-21 22:11:50 UTC (rev 315)
@@ -0,0 +1,397 @@
+/*
+ * Functions to compute dissimilarity between two matrices
+ * X and Y.
+ *
+ * Based on code from vegdist by Jari Oksanen:
+ *
+ * (C) 2001-2009, Jari Oksanen
+ * (C) 2009-2012 Gavin L. Simpson
+ *
+ * Licene: GPL 2
+ */
+
+/* Standard R headers */
+
+#include <R.h>
+#include <Rmath.h>
+#include <math.h>
+
+/* Indices */
+/* Note we don't actually call all of these via xy_distance
+ * Some are called via direct methods, but we include the
+ * indices here to allow the pattern matching to work
+ * correctly
+ */
+#define EUCLIDEAN 1
+#define SQEUCLIDEAN 2
+#define CHORD 3
+#define SQCHORD 4
+#define BRAY 5
+#define CHISQUARE 6
+#define SQCHISQUARE 7
+#define INFORMATION 8
+#define CHIDISTANCE 9
+#define MANHATTAN 10
+#define KENDALL 11
+#define GOWER 12
+#define ALTGOWER 13
+#define MIXED 14
+
+/* Distance functions */
+
+/* Euclidean distance */
+double c_xy_euclidean(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = x[i1] - y[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Euclidean distance */
+double c_xy_sq_euclidean(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = x[i1] - y[i2];
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Chord distance */
+double c_xy_chord(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = sqrt(x[i1]) - sqrt(y[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* Squared Chord distance */
+double c_xy_sq_chord(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dev = sqrt(x[i1]) - sqrt(y[i2]);
+ dist += dev*dev;
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Bray-Curtis */
+double c_xy_bray(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, total;
+ int count, j;
+
+ total = 0.0;
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs(x[i1] - y[i2]);
+ total += x[i1] + y[i2];
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ dist /= total;
+ return dist;
+}
+
+/* chi square */
+double c_xy_chi_square(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ if(x[i1] != 0 || y[i2] != 0) {
+ dev = x[i1] - y[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + y[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ return sqrt(dist);
+}
+
+/* square chi square */
+double c_xy_sq_chi_square(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, dev, nomin, denomin;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ if(x[i1] != 0 || y[i2] != 0) {
+ dev = x[i1] - y[i2];
+ nomin = dev*dev;
+ denomin = x[i1] + y[i2];
+ dist += nomin / denomin;
+ count++;
+ }
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count==0) return NA_REAL;
+ return dist;
+}
+
+/* information statistic */
+double c_xy_information(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist, XY, A, B, Adist, Bdist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ Adist = 0.0;
+ Bdist = 0.0;
+ A = 0.0;
+ B = 0.0;
+ for(j=0; j<nc; j++) {
+ if(R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ XY = x[i1] + y[i2];
+ A += x[i1] * (log((2 * x[i1]) / XY)/log(2));
+ B += y[i2] * (log((2 * y[i2]) / XY)/log(2));
+ if(R_FINITE(A)) {
+ Adist += A;
+ }
+ if(R_FINITE(B)) {
+ Bdist += B;
+ }
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if(count==0) return NA_REAL;
+ dist = A + B;
+ return dist;
+}
+
+/* Manhattan metric */
+double c_xy_manhattan(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double c_xy_gower(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return dist;
+}
+
+/* Alternative Gower's distance */
+/* Needs to be preprocessed by dividing by Maxi - Mini
+ in the R wrapper */
+double c_xy_alt_gower(double *x, double *y, int nr1, int nr2,
+ int nc, int i1, int i2)
+{
+ double dist;
+ int count, j;
+
+ count = 0;
+ dist = 0.0;
+ for (j=0; j<nc; j++) {
+ if (R_FINITE(x[i1]) && R_FINITE(y[i2])) {
+ dist += fabs( x[i1] - y[i2] );
+ count++;
+ }
+ i1 += nr1;
+ i2 += nr2;
+ }
+ if (count == 0) return NA_REAL;
+ return sqrt(2 * dist);
+}
+
+/* Driver */
+
+void c_xy_distance(double *x, double *y, int *nrx, int *nry,
+ int *nc, double *d, int *method)
+{
+ int i, j;
+ size_t ij; /* can exceed 2^31 - 1 */
+ double (*distfun)(double*, double*, int, int, int, int, int) = NULL;
+ switch(*method) {
+ case EUCLIDEAN:
+ distfun = c_xy_euclidean;
+ break;
+ case SQEUCLIDEAN:
+ distfun = c_xy_sq_euclidean;
+ break;
+ case CHORD:
+ distfun = c_xy_chord;
+ break;
+ case SQCHORD:
+ distfun = c_xy_sq_chord;
+ break;
+ case BRAY:
+ distfun = c_xy_bray;
+ break;
+ case CHISQUARE:
+ distfun = c_xy_chi_square;
+ break;
+ case SQCHISQUARE:
+ distfun = c_xy_sq_chi_square;
+ break;
+ case INFORMATION:
+ distfun = c_xy_information;
+ break;
+ case MANHATTAN:
+ distfun = c_xy_manhattan;
+ break;
+ case GOWER:
+ distfun = c_xy_gower;
+ break;
+ case ALTGOWER:
+ distfun = c_xy_alt_gower;
+ break;
+ default:
+ error("Unknown distance in the internal C function");
+ }
+
+ ij = 0;
+ for (j=0; j < *nrx; j++)
+ for (i=0; i < *nry; i++) {
+ d[ij++] = distfun(x, y, *nrx, *nry, *nc, j, i);
+ }
+}
+
+/* .Call interface */
+
+/* Based on code from distance.c in R Sources*/
+
+/*
+ * R : A Computer Language for Statistical Data Analysis
+ * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
+ * Copyright (C) 1998-2012 The R Core Team
+ * Copyright (C) 2002, 2004 The R Foundation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, a copy is available at
+ * http://www.r-project.org/Licenses/
+ */
+
+#include <Rinternals.h>
+
+SEXP Cdistxy(SEXP x, SEXP y, SEXP smethod)
+{
+ SEXP ans;
+ int nrx = nrows(x), nry = nrows(y), nc = ncols(x), method = asInteger(smethod);
+ int N;
+ N = (double)nrx * (double)nry; /* avoid overflow for N ~ 50,000 */
+ PROTECT(ans = allocVector(REALSXP, N));
+ c_xy_distance(REAL(x), REAL(y), &nrx, &nry, &nc, REAL(ans), &method);
+ UNPROTECT(1);
+ return ans;
+}
More information about the Analogue-commits
mailing list