[Analogue-commits] r240 - pkg/R

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Sep 11 20:10:14 CEST 2011


Author: gsimpson
Date: 2011-09-11 20:10:14 +0200 (Sun, 11 Sep 2011)
New Revision: 240

Added:
   pkg/R/distance2.R
Log:
start on a new function to replace distance that uses compiled code, most functionality works, with main exception being a compiled code version of gower's coefficient

Added: pkg/R/distance2.R
===================================================================
--- pkg/R/distance2.R	                        (rev 0)
+++ pkg/R/distance2.R	2011-09-11 18:10:14 UTC (rev 240)
@@ -0,0 +1,242 @@
+`distance2` <- function(x, ...)
+    UseMethod("distance2")
+
+`distance2.default` <- function(x, y,
+                                method = c("euclidean",
+                                "SQeuclidean", "chord",
+                                "SQchord", "bray", "chi.square",
+                                "SQchi.square", "information",
+                                "chi.distance", "manhattan",
+                                "kendall", "gower", "alt.gower",
+                                "mixed"),
+                                weights = NULL, R = NULL,
+                                type = list(),
+                                ordinal = c("gower","rank","metric"), ...) {
+    pColl <- function(n) paste(n, collapse = ", ")
+    if (missing(method))
+        method <- "euclidean"
+    METHODS <- c("euclidean", "SQeuclidean", "chord", "SQchord",
+                 "bray", "chi.square", "SQchi.square",
+                 "information","chi.distance", "manhattan",
+                 "kendall", "gower", "alt.gower", "mixed")
+    method <- match.arg(method)
+    DCOEF <- pmatch(method, METHODS)
+    ordinal <- match.arg(ordinal)
+    ORDTYPES <- c("gower","rank","metric")
+    if(missing(y)) { ## only a single matrix
+        ## TODO
+    } else { ## two matrices
+        ## check x and y have same columns
+        if(!isTRUE(all.equal(names(x), names(y))))
+            stop("'x' and 'y' appear to have different variables.")
+        if(!isTRUE(all.equal((n.vars <- ncol(x)), ncol(y))))
+            stop("'x' and 'y' have different numbers of columns.")
+        ## variables
+        nrx <- nrow(x)
+        nry <- nrow(y)
+        d <- numeric(length = nrx * nry)
+        ## object names (row names)
+        x.names <- rownames(x)
+        y.names <- rownames(y)
+        ## some preprocessing steps required for some coefs
+        ## so dealt with separately
+        if(method %in% c("chi.distance", "gower", "alt.gower",
+                         "mixed", "kendall")) {
+            if(method == "chi.distance") {
+                x <- data.matrix(x)
+                y <- data.matrix(y)
+                csum <- colSums(rbind(x, y))
+                y <- y / rowSums(y)
+                x <- x / rowSums(x)
+                d <- .C("xy_chisq_dist", x = as.double(x), y = as.double(y),
+                        nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                        nc = as.integer(n.vars), d = as.double(d),
+                        csum = as.double(csum), NAOK = as.integer(FALSE),
+                        PACKAGE = "analogue")$d
+            }
+            if(method %in% c("gower", "alt.gower", "mixed")) {
+                if(method == "mixed") {
+                    if(is.null(weights))
+                        weights <- rep(1, n.vars)
+                    else {
+                        if(length(weights) != n.vars)
+                            stop("'weights' must be of length 'ncol(x)'")
+                    }
+                    ## process vtypes
+                    if(length(type)) {
+                        ## if 'type's supplied, validate
+                    }
+                    ## TODO
+                    if(is.data.frame(x)) {
+                        type2x <- sapply(x, data.class, USE.NAMES = FALSE)
+                        ##x <- data.matrix(x)
+                    } else {
+                        type2x <- rep("numeric", n.vars)
+                        names(type2x) <- colnames(x)
+                    }
+                    if(is.data.frame(y)) {
+                        type2y <- sapply(y, data.class, USE.NAMES = FALSE)
+                        ##y <- data.matrix(y)
+                    } else {
+                        type2y <- rep("numeric", n.vars)
+                        names(type2y) <- colnames(y)
+                    }
+                    ## x and y should have same column types
+                    if(!isTRUE(all.equal(type2x, type2y)))
+                        stop("Variable types in 'x' and 'y' differ.
+Did you forget  to 'join' 'x' and 'y' before calling 'distance'?")
+                    type2x[tI <- type2x %in% c("numeric", "integer")] <- "Q"
+                    ## save which are ordinal for rank conversion below
+                    type2x[(ordinal <- type2x == "ordered")] <- "O"
+                    type2x[type2x == "factor"] <- "N"
+                    type2x[type2x == "logical"] <- "A"
+                    typeCodes <- c("A", "S", "N", "O", "Q", "I", "T")
+                    type3 <- match(type2x, typeCodes)
+                    if (any(ina <- is.na(type3)))
+                        stop("invalid type ", type2x[ina], " for column numbers ",
+                             pColl(which(ina)))
+
+                    ## Convert to matrices from now on
+                    ## also takes care of ordinal == metric as all factors
+                    ## are converted to internal numeric codes
+                    x <- data.matrix(x)
+                    y <- data.matrix(y)
+
+                    ## Convert ordinal variables to ranks or numerics
+                    ## implemented as per Podani 1999. Only do ranks here as
+                    ## conversion to matrices above handled the standard case
+                    x[, ordinal] <- apply(x[, ordinal], 2, rank, na.last = "keep")
+                    y[, ordinal] <- apply(y[, ordinal], 2, rank, na.last = "keep")
+
+                    ## Compute range Rj
+                    XY <- rbind(x, y)
+                    if(is.null(R)) {
+                        maxi <- apply(XY, 2, max, na.rm = TRUE)
+                        mini <- apply(XY, 2, min, na.rm = TRUE)
+                        R <- maxi - mini
+                    } else {
+                        if(length(R) != n.vars)
+                            stop("'R' must be of length 'ncol(x)'")
+                    }
+
+                    ## For Ordinal we need TiMin and TiMax
+                    ## compute over all variables so they have same length as
+                    ## everything else
+                    doT <- function(X, which) {
+                        val <- if(which == "min") {
+                            min(X, na.rm = TRUE)
+                        } else {
+                            max(X, na.rm = TRUE)
+                        }
+                        nas <- is.na(X)
+                        length(which(X[!nas] == val))
+                    }
+                    tmin <- apply(XY, 2, doT, which = "min")
+                    tmax <- apply(XY, 2, doT, which = "max")
+
+                    ## How do we want to handle ordinals - convert to interger code
+                    ## for use in C
+                    podani <- match(ordinal, ORDTYPES)
+
+                    ## call the C code
+                    d <- .C("xy_mixed", x = as.double(x), y = as.double(y),
+                            nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                            nc = as.integer(n.vars), d = as.double(d),
+                            vtype = as.integer(type3),
+                            weights = as.double(weights), R = as.double(R),
+                            tmin = as.integer(tmin), tmax = as.integer(tmax),
+                            podani = as.integer(podani),
+                            NAOK = as.integer(TRUE),
+                            PACKAGE = "analogue")$d
+                } else {
+                    if(is.null(R)) {
+                        XY <- rbind(x, y)
+                        maxi <- apply(XY, 2, max, na.rm = TRUE)
+                        mini <- apply(XY, 2, min, na.rm = TRUE)
+                        R <- maxi - mini
+                    } else {
+                        if(length(R) != n.vars)
+                            stop("'R' must be of length 'ncol(x)'")
+                    }
+                    x <- data.matrix(x)
+                    y <- data.matrix(y)
+
+                    ## pre-process for gower and alt gower
+                    ## but these handled by xy_distance below
+                    x <- sweep(x, 2, R, "/")
+                    y <- sweep(y, 2, R, "/")
+                    d <- .C("xy_distance", x = as.double(x), y = as.double(y),
+                            nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                            nc = as.integer(n.vars), d = as.double(d),
+                            method = as.integer(DCOEF), NAOK = as.integer(FALSE),
+                            PACKAGE = "analogue")$d
+                }
+            }
+            if(method == "kendall") {
+                x <- data.matrix(x)
+                y <- data.matrix(y)
+                XY <- rbind(x, y)
+                maxi <- apply(XY, 2, max)
+                d <- .C("xy_kendall", x = as.double(x), y = as.double(y),
+                        nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                        nc = as.integer(n.vars), d = as.double(d),
+                        maxi = as.double(maxi), NAOK = as.integer(FALSE),
+                        PACKAGE = "analogue")$d
+            }
+        } else {
+            ## must be one of the DC's handled by xy_distance
+            x <- data.matrix(x)
+            y <- data.matrix(y)
+            d <- .C("xy_distance", x = as.double(x), y = as.double(y),
+                    nr1 = as.integer(nrx), nr2 = as.integer(nry),
+                    nc = as.integer(n.vars), d = as.double(d),
+                    method = as.integer(DCOEF), NAOK = as.integer(FALSE),
+                    PACKAGE = "analogue")$d
+        }
+        ## convert d to a matrix
+        d <- matrix(d, ncol = n.vars, byrow = TRUE)
+        colnames(d) <- y.names
+        rownames(d) <- x.names
+        attr(d, "method") <- method
+        class(d) <- c("distance","matrix")
+    }
+    return(d)
+}
+
+## set.seed(1)
+## bar <- matrix(sample(3, 9, replace = TRUE), ncol = 3)
+## foo <- matrix(sample(3, 9, replace = TRUE), ncol = 3)
+## foobar <- rbind(bar, foo)
+## out <- matrix(ncol = ncol(bar), nrow = nrow(foobar))
+## res <- numeric(length = nrow(foobar))
+## for(i in seq_len(nrow(foobar))) {
+##     for(j in seq_len(ncol(foobar))) {
+##         for(k in seq_along(foobar[,j])) {
+##             res[k] <- foobar[k,j] == foobar[i,j]
+##         }
+##         out[i, j] <- sum(res)
+##     }
+## }
+
+## set.seed(1)
+## bar <- matrix(sample(3, 9, replace = TRUE), ncol = 3)
+## foo <- matrix(sample(3, 9, replace = TRUE), ncol = 3)
+## outbar <- matrix(0, ncol = ncol(bar), nrow = nrow(bar))
+## outfoo <- matrix(0, ncol = ncol(foo), nrow = nrow(foo))
+## resbar <- numeric(length = nrow(bar))# + nrow(foo))
+## resfoo <- numeric(length = nrow(bar))# + nrow(foo))
+
+## for(i in seq_len(ncol(bar))) {
+##     for(j in seq_len(nrow(bar))) {
+##         for(k in seq_len(nrow(bar))) {
+##             resbar[k] <- bar[k, i] == bar[j, i]
+##         }
+##         outbar[j, i] <- sum(resbar)
+##     }
+##     for(j in seq_len(nrow(foo))) {
+##         for(k in seq_len(nrow(foo))) {
+##             resfoo[k] <- foo[k, i] == bar[j, i]
+##         }
+##         outfoo[j, i] <- sum(resfoo)
+##     }
+## }



More information about the Analogue-commits mailing list