[Analogue-commits] r146 - in pkg: . R inst man

Sun Aug 16 13:52:48 CEST 2009

Author: gsimpson
Date: 2009-08-16 13:52:47 +0200 (Sun, 16 Aug 2009)
New Revision: 146

Modified:
   pkg/DESCRIPTION
   pkg/R/join.R
   pkg/inst/ChangeLog
   pkg/man/join.Rd
Log:
adds left join to 'join' and new argument to supply the value using in replacing missing values.

Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2009-08-15 21:25:23 UTC (rev 145)
+++ pkg/DESCRIPTION	2009-08-16 11:52:47 UTC (rev 146)
@@ -1,7 +1,7 @@
 Package: analogue
 Type: Package
 Title: Analogue and weighted averaging methods for palaeoecology
-Version: 0.6-17
+Version: 0.6-18
 Date: $Date$
 Depends: R (>= 2.5.0), stats, graphics, vegan, lattice, MASS
 Author: Gavin L. Simpson, Jari Oksanen

Modified: pkg/R/join.R
===================================================================
--- pkg/R/join.R	2009-08-15 21:25:23 UTC (rev 145)
+++ pkg/R/join.R	2009-08-16 11:52:47 UTC (rev 146)
@@ -4,8 +4,8 @@
 ##                                                                       ##
 ## Created       : 17-Apr-2006                                           ##
 ## Author        : Gavin Simpson                                         ##
-## Version       : 0.2-0                                                 ##
-## Last modified : 23-Jul-2007                                           ##
+## Version       : 0.5-0                                                 ##
+## Last modified : 16-Aug-2009                                           ##
 ##                                                                       ##
 ## ARGUMENTS:                                                            ##
 ## ...                  - the data frames to be merged                   ##
@@ -35,37 +35,62 @@
 ##                             class data.frame, not that it is that     ##
 ##                             class. This allows join to work with the  ##
 ##                             results of join(..., split = FALSE        ##
+## 16-Aug-2009 - GLS - 0.5-0 * now has left outer as well as outer join  ##
+##                           * replacement value can now be specified    ##
 ##                                                                       ##
 ###########################################################################
-join <- function(..., verbose = FALSE, na.replace = TRUE, split = TRUE)
-  {
+join <- function(..., verbose = FALSE, na.replace = TRUE, split = TRUE,
+                  value = 0, type = c("outer","left"))
+{
+    outerJoin <- function(X) {
+        ## From code provided by Sundar Dorai-Raj in R-Help posting:
+        ## http://article.gmane.org/gmane.comp.lang.r.general/63042/
+        cn <- unique(unlist(lapply(X, colnames)))
+        for(i in seq(along = X)) {
+            if(any(m <- !cn %in% colnames(X[[i]]))) {
+                na <- matrix(NA, nrow(X[[i]]), sum(m))
+                dimnames(na) <- list(rownames(X[[i]]), cn[m])
+                X[[i]] <- cbind(X[[i]], na)
+            }
+        }
+        joined <- do.call(rbind, X)
+        colnames(joined) <- cn
+        return(joined)
+    }
+    leftJoin <- function(X) {
+        cn <- unique(unlist(lapply(X, colnames)[[1]]))
+        ## if more than 2 df in X, merge all bar first
+        if(length(X) > 2)
+            dfs <- outerJoin(X[-1])
+        else
+            dfs <- X[[2]]
+        ## matched column names
+        mcn <- match(colnames(dfs), cn)
+        mcn <- mcn[!is.na(mcn)]
+        joined <- matrix(NA, ncol = dims[1,2], nrow = sum(dims[,1]))
+        joined[1:dims[1,1], ] <- data.matrix(X[[1]])
+        joined[(dims[1,1]+1):NROW(joined), mcn] <- data.matrix(dfs[, mcn])
+        colnames(joined) <- cn
+        return(joined)
+    }
     x <- list(...)
-    ##if(any(sapply(x, class) != "data.frame"))
     if(any(!sapply(x, inherits, "data.frame")))
-      stop("\nall objects to be merged must be data frames.")
+        stop("\nall objects to be merged must be data frames.")
     dims <- do.call(rbind, lapply(x, dim))
     n.joined <- nrow(dims)
-    ## From code provided by Sundar Dorai-Raj in R-Help posting:
-    ## http://article.gmane.org/gmane.comp.lang.r.general/63042/match=merging
-    cn <- unique(unlist(lapply(x, colnames)))
-    for(i in seq(along = x)) {
-      if(any(m <- !cn %in% colnames(x[[i]]))) {
-        na <- matrix(NA, nrow(x[[i]]), sum(m))
-        dimnames(na) <- list(rownames(x[[i]]), cn[m])
-        x[[i]] <- cbind(x[[i]], na)
-      }
+    if(missing(type))
+        type <- "outer"
+    type <- match.arg(type)
+    if(type == "outer") {
+        joined <- outerJoin(x)
+    } else if(type == "left") {
+        joined <- leftJoin(x)
     }
-    joined <- do.call(rbind, x)
-    ## End Sundar code
     if(na.replace) {
-      dim.names <- dimnames(joined)
-      ##joined <- sapply(joined, function(x) {x[is.na(x)] <- 0; x})
-      joined <- data.frame(lapply(joined,
-                                  function(x) {x[is.na(x)] <- 0; x}))
-      dimnames(joined) <- dim.names
+        joined[is.na(joined)] <- value
     }
-    if(verbose)
-      {
+    rn <- lapply(x, rownames)
+    if(verbose) {
         stats <- rbind(dims, dim(joined))
         rownames(stats) <- c(paste("Data set ", c(1:n.joined), ":", sep = ""),
                              "Merged:")
@@ -74,20 +99,21 @@
         printCoefmat(stats, digits = max(3, getOption("digits") - 3),
                      na.print = "")
         cat("\n")
-      }
+    }
     if(split) {
-      retval <- vector(mode = "list", length = n.joined)
-      ends<- cumsum(dims[,1])
-      start <- c(1, ends[-n.joined] + 1)
-      for(i in 1:n.joined) {
-        retval[[i]] <- as.data.frame(joined[start[i]:ends[i], ])
-        names(retval) <- as.character(match.call())[c(2, 2 + (n.joined-1))]
-      }
-      class(retval) <- "join"
-      return(retval)
+        retval <- vector(mode = "list", length = n.joined)
+        ends<- cumsum(dims[,1])
+        start <- c(1, ends[-n.joined] + 1)
+        for(i in 1:n.joined) {
+            retval[[i]] <- as.data.frame(joined[start[i]:ends[i], ])
+            rownames(retval[[i]]) <- rn[[i]]
+        }
+        names(retval) <- as.character(match.call())[2:(n.joined+1)]
+        class(retval) <- "join"
     } else {
-      retval <- as.data.frame(joined, row.names = rownames(joined))
-      class(retval) <- c("join", class(retval))
-      return(retval)
+        retval <- as.data.frame(joined, row.names = rownames(joined))
+        class(retval) <- c("join", class(retval))
     }
-  }
+    attr(retval, "type") <- type
+    retval
+}

Modified: pkg/inst/ChangeLog
===================================================================
--- pkg/inst/ChangeLog	2009-08-15 21:25:23 UTC (rev 145)
+++ pkg/inst/ChangeLog	2009-08-16 11:52:47 UTC (rev 146)
@@ -1,5 +1,19 @@
 analogue Change Log
 
+Version 0.6-18
+
+	* join: new arguments 'type' and 'value'.
+
+	'type' controls which join is performed. Options are (currently)
+	"outer" (default) and "left". The left join is used to prepare two
+	or more data sets for ordinating the first and subsequently
+	passively projecting the other data sets into this ordination.
+	The outer join is used to prepare data for transfer functions such
+	as MAT and WA.
+
+	'value' allows the user to supply a numeric value to be used to
+	replace 'NA's.
+
 Version 0.6-17
 
 	* predict.wa: deshrinking method was not being honoured when

Modified: pkg/man/join.Rd
===================================================================
--- pkg/man/join.Rd	2009-08-15 21:25:23 UTC (rev 145)
+++ pkg/man/join.Rd	2009-08-16 11:52:47 UTC (rev 146)
@@ -11,7 +11,8 @@
   data sets with respect to training set samples. 
 }
 \usage{
-join(\dots, verbose = FALSE, na.replace = TRUE, split = TRUE)
+join(\dots, verbose = FALSE, na.replace = TRUE, split = TRUE, value = 0,
+     type = c("outer", "left"))
 
 \method{head}{join}(x, \dots)
 
@@ -37,9 +38,33 @@
   \item{split}{logical; should the merged data sets samples be split
     back into individual data frames, but now with common columns
     (i.e. species)?}
+  \item{value}{numeric; value to replace \code{NA} with if
+    \code{na.replace} is \code{TRUE}.}
+  \item{type}{logical; type of join to perform. \code{"outer"} returns
+    the \emph{union} of the variables in data frames to be merged, such
+    that the resulting objects have columns for all variables found
+    across all the data frames to be merged. \code{"left"} returns the
+    left outer (or the left) join, such that the merged data frames
+    contain the set of variables found in the first supplied data
+    frame. See Details.}
   \item{x}{an object of class \code{"join"}, usually the result of a
     call to \code{\link{join}}.}
 }
+\details{
+  When merging multiple data frames the set of variables in the merged
+  data can be determined via a number of routes. \code{join} provides
+  for two (currently) join types; the \emph{outer} join and the
+  \emph{left outer} (or simply the \emph{left}) join. Which type of join
+  is performed is determined by the argument \code{type}.
+
+  The \emph{outer} join returns the union of the set of variables found
+  in the data frames to be merged. This means that the resulting data
+  frame(s) contain columns for all the variable observed across all the
+  data frames supplied for merging.
+
+  With the \emph{left outer} join the resulting data frame(s) contain
+  only the set of variables found in the first data frame provided.
+}
 \value{
   If \code{split = TRUE}, an object of class \code{"join"}, a list of
   data frames, with as many components as the number of data frames
@@ -84,6 +109,13 @@
 
 ## show just the last few lines of each data set
 tail(dat, n = 4)
+
+## merge training and test set using left join
+head(join(ImbrieKipp, V12.122, verbose = TRUE, type = "left"))
+
+## merge training and test set using outer join and replace
+## NA with -99.9
+head(join(ImbrieKipp, V12.122, verbose = TRUE, value = -99.9))
 }
 \keyword{multivariate}% at least one, from doc/KEYWORDS
 \keyword{manip}% __ONLY ONE__ keyword per line