[Analogue-commits] r294 - pkg/R

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Jan 3 13:07:57 CET 2013


Author: gsimpson
Date: 2013-01-03 13:07:56 +0100 (Thu, 03 Jan 2013)
New Revision: 294

Modified:
   pkg/R/distance2.R
Log:
adds a new version of distance that calls C code

Modified: pkg/R/distance2.R
===================================================================
--- pkg/R/distance2.R	2013-01-03 12:06:00 UTC (rev 293)
+++ pkg/R/distance2.R	2013-01-03 12:07:56 UTC (rev 294)
@@ -13,6 +13,9 @@
                                 type = list(),
                                 ordinal = c("gower","rank","metric"), ...) {
     pColl <- function(n) paste(n, collapse = ", ")
+    ## Euclid?an could be spelled variously
+    if(!is.na(pmatch(method, "euclidian")))
+	method <- "euclidean"
     if (missing(method))
         method <- "euclidean"
     METHODS <- c("euclidean", "SQeuclidean", "chord", "SQchord",
@@ -240,3 +243,175 @@
 ##         outfoo[j, i] <- sum(resfoo)
 ##     }
 ## }
+
+`distance3` <- function(x, ...)
+    UseMethod("distance3")
+
+`distance3.default` <-
+    function(x, y, method = "euclidean",
+             weights = NULL, R = NULL,
+             ...)
+{
+    METHODS <- c("euclidean", "SQeuclidean", "chord", "SQchord",
+                 "bray", "chi.square", "SQchi.square", "information",
+                 "chi.distance", "manhattan", "kendall", "gower", "alt.gower",
+                 "mixed")
+    pColl <- function(n) paste(n, collapse = ", ")
+    ## Euclid?an could be spelled variously
+    if(!is.na(pmatch(method, "euclidian")))
+	method <- "euclidean"
+    METHODS <- c("euclidean", "SQeuclidean", "chord", "SQchord",
+                 "bray", "chi.square", "SQchi.square",
+                 "information","chi.distance", "manhattan",
+                 "kendall", "gower", "alt.gower", "mixed")
+    DCOEF <- pmatch(method, METHODS)
+    if(missing(y)) { ## only a single matrix
+        ## TODO
+    } else { ## two matrices
+        ## check x and y have same columns
+        if(!isTRUE(all.equal(names(x), names(y))))
+            stop("'x' and 'y' appear to have different variables.")
+        if(!isTRUE(all.equal((n.vars <- ncol(x)), ncol(y))))
+            stop("'x' and 'y' have different numbers of columns.")
+        ## variables
+        nrx <- nrow(x)
+        nry <- nrow(y)
+        d <- numeric(length = nrx * nry)
+        ## object names (row names)
+        x.names <- rownames(x)
+        y.names <- rownames(y)
+        ## some preprocessing steps required for some coefs
+        ## so dealt with separately
+        if(method %in% c("chi.distance", "gower", "alt.gower",
+                         "mixed", "kendall")) {
+            if(method == "chi.distance") {
+                x <- data.matrix(x)
+                y <- data.matrix(y)
+                csum <- colSums(rbind(x, y))
+                y <- y / rowSums(y)
+                x <- x / rowSums(x)
+                d <- .C("xy_chisq_dist", x = as.double(x), y = as.double(y),
+                        nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                        nc = as.integer(n.vars), d = as.double(d),
+                        csum = as.double(csum), NAOK = as.integer(FALSE),
+                        PACKAGE = "analogue")$d
+            }
+            if(method %in% c("gower", "alt.gower", "mixed")) {
+                if(method == "mixed") {
+                    if(is.null(weights))
+                        weights <- rep(1, n.vars)
+                    else {
+                        if(length(weights) != n.vars)
+                            stop("'weights' must be of length 'ncol(x)'")
+                    }
+                    ## process vtypes
+                    if(is.data.frame(x)) {
+                        type2x <- sapply(x, data.class, USE.NAMES = FALSE)
+                        ##x <- data.matrix(x)
+                    } else {
+                        type2x <- rep("numeric", n.vars)
+                        names(type2x) <- colnames(x)
+                    }
+                    if(is.data.frame(y)) {
+                        type2y <- sapply(y, data.class, USE.NAMES = FALSE)
+                        ##y <- data.matrix(y)
+                    } else {
+                        type2y <- rep("numeric", n.vars)
+                        names(type2y) <- colnames(y)
+                    }
+                    ## x and y should have same column types
+                    if(!isTRUE(all.equal(type2x, type2y)))
+                        stop("Variable types in 'x' and 'y' differ.
+Did you forget  to 'join' 'x' and 'y' before calling 'distance'?")
+
+                    ## Record the variable types
+                    type2x[tI <- type2x %in% c("numeric", "integer")] <- "Q"
+                    ## save which are ordinal for rank conversion below - TODO
+                    type2x[(ordinal <- type2x == "ordered")] <- "O"
+                    type2x[type2x == "factor"] <- "N"
+                    type2x[type2x == "logical"] <- "A"
+                    typeCodes <- c("A", "S", "N", "O", "Q", "I", "T")
+                    type3 <- match(type2x, typeCodes)
+                    if (any(ina <- is.na(type3)))
+                        stop("invalid type ", type2x[ina], " for column numbers ",
+                             pColl(which(ina)))
+
+                    ## Convert to matrices from now on
+                    ## also takes care of ordinal == metric as all factors
+                    ## are converted to internal numeric codes
+                    x <- data.matrix(x)
+                    y <- data.matrix(y)
+
+                    ## Compute range Rj
+                    XY <- rbind(x, y)
+                    if(is.null(R)) {
+                        maxi <- apply(XY, 2, max, na.rm = TRUE)
+                        mini <- apply(XY, 2, min, na.rm = TRUE)
+                        R <- maxi - mini
+                    } else {
+                        if(length(R) != n.vars)
+                            stop("'R' must be of length 'ncol(x)'")
+                    }
+
+                    ## call the C code
+                    d <- .C("xy_mixed", x = as.double(x), y = as.double(y),
+                            nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                            nc = as.integer(n.vars), d = as.double(d),
+                            vtype = as.integer(type3),
+                            weights = as.double(weights), R = as.double(R),
+                            NAOK = as.integer(TRUE),
+                            PACKAGE = "analogue")$d
+                } else {
+                    if(is.null(R)) {
+                        XY <- rbind(x, y)
+                        maxi <- apply(XY, 2, max, na.rm = TRUE)
+                        mini <- apply(XY, 2, min, na.rm = TRUE)
+                        R <- maxi - mini
+                    } else {
+                        if(length(R) != n.vars)
+                            stop("'R' must be of length 'ncol(x)'")
+                    }
+                    x <- data.matrix(x)
+                    y <- data.matrix(y)
+
+                    ## pre-process for gower and alt gower
+                    ## but these handled by xy_distance below
+                    x <- sweep(x, 2, R, "/")
+                    y <- sweep(y, 2, R, "/")
+                    d <- .C("xy_distance", x = as.double(x), y = as.double(y),
+                            nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                            nc = as.integer(n.vars), d = as.double(d),
+                            method = as.integer(DCOEF), NAOK = as.integer(FALSE),
+                            PACKAGE = "analogue")$d
+                }
+            }
+            if(method == "kendall") {
+                x <- data.matrix(x)
+                y <- data.matrix(y)
+                XY <- rbind(x, y)
+                maxi <- apply(XY, 2, max)
+                d <- .C("xy_kendall", x = as.double(x), y = as.double(y),
+                        nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                        nc = as.integer(n.vars), d = as.double(d),
+                        maxi = as.double(maxi), NAOK = as.integer(FALSE),
+                        PACKAGE = "analogue")$d
+            }
+        } else {
+            ## must be one of the DC's handled by xy_distance
+            x <- data.matrix(x)
+            y <- data.matrix(y)
+            d <- .C("xy_distance", x = as.double(x), y = as.double(y),
+                    nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                    nc = as.integer(n.vars), d = as.double(d),
+                    method = as.integer(DCOEF), NAOK = as.integer(FALSE),
+                    PACKAGE = "analogue")$d
+        }
+        ## convert d to a matrix
+        d <- matrix(d, ncol = nry, byrow = TRUE)
+        colnames(d) <- y.names
+        rownames(d) <- x.names
+        attr(d, "method") <- method
+        class(d) <- c("distance","matrix")
+    }
+    return(d)
+}



More information about the Analogue-commits mailing list