[Phylobase-commits] r669 - in branches/slot-mods: R data inst/unitTests man

Mon Sep 28 07:34:05 CEST 2009

Author: regetz
Date: 2009-09-28 07:34:04 +0200 (Mon, 28 Sep 2009)
New Revision: 669

Modified:
   branches/slot-mods/R/class-phylo4d.R
   branches/slot-mods/R/formatData.R
   branches/slot-mods/data/geospiza.rda
   branches/slot-mods/inst/unitTests/runit.class-phylo4d.R
   branches/slot-mods/man/formatData.Rd
Log:
Revised formatData to return data rows for *all* nodes (unless otherwise
specified), and simplified .phylo4Data to match. Row names now stored
internally as numeric, not character. Updated geospiza/tests to match.


Modified: branches/slot-mods/R/class-phylo4d.R
===================================================================

--- branches/slot-mods/R/class-phylo4d.R	2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/R/class-phylo4d.R	2009-09-28 05:34:04 UTC (rev 669)
@@ -48,31 +48,31 @@
              "tip.data and node.data column names")
     }
 
-    ## identify common columns to merge if merging, or rename otherwise
-    colnamesToMerge <- intersect(names(tip.data), names(node.data))
-    if (merge.data==FALSE) {
-        names(tip.data)[names(tip.data) %in% colnamesToMerge] <-
-            paste(colnamesToMerge, "tip", sep=".")
-        names(node.data)[names(node.data) %in% colnamesToMerge] <-
-            paste(colnamesToMerge, "node", sep=".")
-        colnamesToMerge <- NULL
+    ## combine common columns and move into all.data if merging,
+    ## otherwise rename them
+    colsToMerge <- intersect(names(tip.data), names(node.data))
+    if (merge.data && length(colsToMerge)>0) {
+        ##TODO could really just index rows directly on 1:nTip and
+        ## (nTip+1):(nTip+nNode) in the next two statements for speed,
+        ## but this is more robust to changes in node numbering rules
+        tip.rows <- tip.data[match(nodeId(x, "tip"),
+            row.names(tip.data)), colsToMerge, drop=FALSE]
+        node.rows <- node.data[match(nodeId(x, "internal"),
+            row.names(tip.data)), colsToMerge, drop=FALSE]
+        merge.data <- rbind(tip.rows, node.rows)
+        all.data <- data.frame(all.data, merge.data)
+    } else {
+        names(tip.data)[names(tip.data) %in% colsToMerge] <-
+            paste(colsToMerge, "tip", sep=".")
+        names(node.data)[names(node.data) %in% colsToMerge] <-
+            paste(colsToMerge, "node", sep=".")
     }
-    ## now separate tip.only, node.only, and common columns
+    ## now separate tips-only and nodes-only data
     tip.only.data <- tip.data[setdiff(names(tip.data), names(node.data))]
     node.only.data <- node.data[setdiff(names(node.data), names(tip.data))]
-    common.data <- rbind(tip.data[colnamesToMerge], node.data[colnamesToMerge])
 
-    ## merge data common to tips and nodes
-    all.common.data <- merge(all.data, common.data, by=0, all=TRUE,
-        sort=FALSE)
-    ## merge data that apply only to tips or nodes
-    all.separate.data <- merge(tip.only.data, node.only.data, by=0,
-        all=TRUE, sort=FALSE)
-    ## merge everything together and clean up
-    complete.data <- merge(all.common.data, all.separate.data,
-        by="Row.names", all=TRUE, sort=FALSE)
-    row.names(complete.data) <- complete.data[["Row.names"]]
-    complete.data <- subset(complete.data, select=-Row.names)
+    ## combine all data
+    complete.data <- data.frame(all.data, tip.only.data, node.only.data)
 
     ## drop any rows that only contain NAs
     if (ncol(complete.data)==0) {

Modified: branches/slot-mods/R/formatData.R
===================================================================
--- branches/slot-mods/R/formatData.R	2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/R/formatData.R	2009-09-28 05:34:04 UTC (rev 669)
@@ -2,138 +2,123 @@
                        match.data=TRUE, rownamesAsLabels=FALSE,
                        label.type=c("rownames", "column"),
                        label.column=1, missing.data=c("fail", "warn", "OK"),
-                       extra.data=c("warn", "OK", "fail")
+                       extra.data=c("warn", "OK", "fail"), keep.all=TRUE
                        ) {
 
-    ## coerce vector data to data.frame
-    if (is.vector(dt)) {
-        dt <- as.data.frame(dt)
+    ## determine whether to return rows for all nodes, or just 'type'
+    type <- match.arg(type)
+    if (keep.all) {
+        ids.out <- nodeId(phy, "all")
+    } else {
+        ids.out <- nodeId(phy, type)
     }
+
     ## if null, return empty data frame with node numbers as row names
     if (is.null(dt)) {
-        return(data.frame(row.names=nodeId(phy, type)))
+        return(data.frame(row.names=ids.out))
     }
+    ## if vector, coerce to data.frame
+    if (is.vector(dt)) {
+        dt <- as.data.frame(dt)
+    }
     ## before proceeding, make sure that data provided are a data frame
     if (!is.data.frame(dt)) {
         nmSomeData <- substitute(dt)
         stop(paste(nmSomeData, "must be a vector or a data frame"))
     }
-    ## if null or lacking rows or columns, return a placeholder data
-    ## frame with node numbers as row names
+    ## if lacking rows or columns, return a placeholder data frame with
+    ## node numbers as row names
     if (any(dim(dt)==0)) {
-        return(data.frame(row.names=nodeId(phy, type)))
+        return(data.frame(row.names=ids.out))
     }
         
-    type <- match.arg(type)
     label.type <- match.arg(label.type)
     stopifnot(label.column %in% 1:ncol(dt))
     missing.data <- match.arg(missing.data)
     extra.data <- match.arg(extra.data)
 
-    nr <- switch(type,
-                 tip = nTips(phy),
-                 internal = nNodes(phy),
-                 all = nTips(phy)+nNodes(phy))
-
-    tmpDt <- array(, dim=c(nr, ncol(dt)),
-                   dimnames=list(nodeId(phy, type), colnames(dt)))
-    tmpDt <- data.frame(tmpDt)
-
     if(match.data) {
-        ## extract node-matching vector
+        ## extract values to be matched to nodes
         ndNames <- switch(label.type,
                           rownames = rownames(dt),
                           column = dt[,label.column])
         ## either force matching on labels, or match on node
         ## numbers for any number-like elements and labels otherwise
         if (rownamesAsLabels) {
-            ndDt <- getNode(phy, as.character(ndNames), missing="OK")
+            ids.in <- getNode(phy, as.character(ndNames), missing="OK")
         } else {
-            ndDt <- as.numeric(rep(NA, length(ndNames)))
+            ids.in <- as.numeric(rep(NA, length(ndNames)))
             treatAsNumber <- nchar(gsub("[0-9]", "", ndNames))==0
-            ndDt[treatAsNumber] <- getNode(phy,
+            ids.in[treatAsNumber] <- getNode(phy,
                 as.integer(ndNames[treatAsNumber]), missing="OK")
-            ndDt[!treatAsNumber] <- getNode(phy,
+            ids.in[!treatAsNumber] <- getNode(phy,
                 as.character(ndNames[!treatAsNumber]), missing="OK")
         }
+
         ## Make sure that data are matched to appropriate nodes
-        if(type != "all") {
-            switch(type,
-                   tip = {
-                     ## BMB: don't bother trying to match NAs
-                       if(any(na.omit(ndDt) %in% nodeId(phy, "internal")))
-                           stop("Your tip data are being matched to internal ",
-                                "nodes. Make sure that your data identifiers ",
-                                "are correct.")
-                   },
-                   internal = {
-                       if(any(na.omit(ndDt) %in% nodeId(phy, "tip")))
-                           stop("Your node data are being matched to tip ",
-                                "nodes. Make sure that your data identifiers ",
-                                "are correct.")
-                   })
+        if (type=="tip" && any(na.omit(ids.in) %in% nodeId(phy,
+            "internal"))) {
+            stop("Your tip data are being matched to internal ",
+                "nodes. Make sure that your data identifiers ",
+                "are correct.")
         }
+        if (type=="internal" && any(na.omit(ids.in) %in% nodeId(phy,
+            "tip"))) {
+            stop("Your node data are being matched to tip ",
+                "nodes. Make sure that your data identifiers ",
+                "are correct.")
+        }
 
         ## Check differences
-        extra <- ndNames[is.na(ndDt)]
-        mssng <- nodeId(phy, type)[! nodeId(phy, type) %in% ndDt]
-
+        mssng <- setdiff(nodeId(phy, type), ids.in)
         if(length(mssng) > 0 && missing.data != "OK") {
+            ## provide label if it exists and node number otherwise
+            mssng <- getNode(phy, mssng)
+            mssng <- ifelse(is.na(names(mssng)), mssng, names(mssng))
             msg <- "The following nodes are not found in the dataset: "
-
-            ## provides label if it exists and node number otherwise
-            mssng <- sapply(mssng, function(m) {
-                m <- getNode(phy, m)
-                if (is.na(names(m)) || is.null(names(m)))
-                    m
-                else
-                    names(m)
-            })
-
             msg <- paste(msg, paste(mssng, collapse=", "))
             switch(missing.data,
                    warn = warning(msg),
                    fail = stop(msg))
         }
-
+        extra <- ndNames[is.na(ids.in)]
         if(length(extra) > 0 && extra.data != "OK") {
             msg <- "The following names are not found in the tree: "
             msg <- paste(msg, paste(extra, collapse=", "))
             switch(extra.data,
                    warn = warning(msg),
                    fail = stop(msg))
-
         }
         ## Format data to have correct dimensions
-        dt <- dt[!is.na(ndDt) ,, drop=FALSE]
-        rownames(dt) <- ndDt[!is.na(ndDt)]
-        tmpDt[,] <- dt[match(rownames(tmpDt), rownames(dt)) ,, drop=FALSE]
-        if(label.type == "column") tmpDt <- tmpDt[, -label.column, drop=FALSE]
-    }
-    else {
-        ## Remove rownames in data provided
-        rownames(dt) <- NULL
+        dt <- dt[!is.na(ids.in), , drop=FALSE]
+        rownames(dt) <- ids.in[!is.na(ids.in)]
+        dt.out <- dt[match(ids.out, rownames(dt)), , drop=FALSE]
+        rownames(dt.out) <- ids.out
+        if(label.type == "column") dt.out <- dt.out[, -label.column, drop=FALSE]
 
-        ## Tips before internal nodes for all.data
-        if (type == "all")
-            rownames(tmpDt) <- 1:nr
+    } else {
 
-        ## Check differences between dataset and tree
-        diffNr <- nrow(dt) - nr
-        if(diffNr > 0 && extra.data != "OK") {
+        ## Check if too many or not enough rows in input data
+        expected.nrow <- length(nodeId(phy, type))
+        diffNr <- nrow(dt) - expected.nrow
+        if(nrow(dt) > expected.nrow && extra.data != "OK") {
             msg <- paste("There are", diffNr, "extra rows.")
             switch(extra.data,
                    warn = warning(msg),
                    fail = stop(msg))
         }
-        if(diffNr < 0 && missing.data != "OK") {
+        if(nrow(dt) < expected.nrow && missing.data != "OK") {
             msg <- paste("There are", abs(diffNr), "missing rows.")
             switch(missing.data,
                    warn = warning(msg),
                    fail = stop(msg))
         }
-        tmpDt[,] <- dt[1:min(nrow(dt), nr) ,, drop = FALSE]
+        ## truncate rows of input data frame if necessary
+        dt <- dt[1:min(nrow(dt), expected.nrow) ,, drop = FALSE]
+        rownames(dt) <- nodeId(phy, type)[seq_len(nrow(dt))]
+        dt.out <- dt[match(ids.out, rownames(dt)) ,, drop=FALSE]
+        rownames(dt.out) <- ids.out
     }
 
-    tmpDt
+    dt.out
 }

Modified: branches/slot-mods/data/geospiza.rda
===================================================================
(Binary files differ)

Modified: branches/slot-mods/inst/unitTests/runit.class-phylo4d.R
===================================================================
--- branches/slot-mods/inst/unitTests/runit.class-phylo4d.R	2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/inst/unitTests/runit.class-phylo4d.R	2009-09-28 05:34:04 UTC (rev 669)
@@ -59,21 +59,21 @@
     ## brute force: no matching; with tip data
     phyd <- phylo4d(phy.alt, tip.data=tipDt, match.data=FALSE)
     checkIdentical(phyd at data, data.frame(tipDt,
-        row.names=as.character(nid.tip)))
+        row.names=nid.tip))
     checkIdentical(tdata(phyd, "tip"), data.frame(tipDt,
         row.names=lab.tip))
 
     ## brute force: no matching; with node data
     phyd <- phylo4d(phy.alt, node.data=nodDt, match.data=FALSE)
     checkIdentical(phyd at data, data.frame(nodDt,
-        row.names=as.character(nid.int)))
+        row.names=nid.int))
     checkIdentical(tdata(phyd, "internal"), data.frame(nodDt,
         row.names=lab.int))
 
     ## brute force: no matching; with all.data
     phyd <- phylo4d(phy.alt, all.data=allDt, match.data=FALSE)
     checkIdentical(phyd at data, data.frame(allDt,
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "all"), data.frame(allDt,
         row.names=lab.all))
 
@@ -83,7 +83,7 @@
         match.data=FALSE)
     checkIdentical(phyd at data, data.frame(rbind(data.frame(tipDt["d"],
         e=NA_real_), data.frame(d=NA_real_, nodDt["e"])),
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "tip"), data.frame(tipDt["d"], e=NA_real_,
         row.names=lab.tip))
     checkIdentical(tdata(phyd, "internal"), data.frame(d=NA_real_, nodDt["e"],
@@ -94,7 +94,7 @@
     phyd <- phylo4d(phy.alt, tip.data=tipDt["c"], node.data=nodDt["c"],
         match.data=FALSE)
     checkIdentical(phyd at data, data.frame(rbind(tipDt["c"], nodDt["c"]),
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "tip"), data.frame(c=factor(tipDt$c,
         levels=letters[nid.all]), row.names=lab.tip))
     checkIdentical(tdata(phyd, "internal"), data.frame(c=factor(nodDt$c,
@@ -108,14 +108,14 @@
     ## match with node numbers, tip data
     phyd <- phylo4d(phy.alt, tip.data=tipDt)
     checkIdentical(phyd at data, data.frame(tipDt[order(nid.tip.r),],
-        row.names=as.character(nid.tip)))
+        row.names=nid.tip))
     checkIdentical(tdata(phyd, "tip"), data.frame(tipDt[order(nid.tip.r),],
         row.names=lab.tip))
 
     ## match with node numbers, node data
     phyd <- phylo4d(phy.alt, node.data=nodDt)
     checkIdentical(phyd at data, data.frame(nodDt[order(nid.int.r),],
-        row.names=as.character(nid.int)))
+        row.names=nid.int))
     checkIdentical(tdata(phyd, "internal"), data.frame(nodDt[order(nid.int.r),],
         row.names=lab.int))
 
@@ -124,7 +124,7 @@
     checkIdentical(phyd at data, data.frame(rbind(data.frame(
         d=tipDt[order(nid.tip.r), "d"], e=NA_real_),
         data.frame(d=NA_real_, e=nodDt[order(nid.int.r), "e"])),
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "tip"), data.frame(d=tipDt[order(nid.tip.r), "d"],
         e=NA_real_, row.names=lab.tip))
     checkIdentical(tdata(phyd, "internal"), data.frame(d=NA_real_,
@@ -134,14 +134,14 @@
     phyd <- phylo4d(phy.alt, tip.data=tipDt, all.data=allDt)
     merged <- data.frame(merge(allDt[order(nid.all.r),],
         tipDt[order(nid.tip.r),], all=TRUE, by=0)[-1])
-    checkIdentical(phyd at data, data.frame(merged, row.names=as.character(nid.all)))
+    checkIdentical(phyd at data, data.frame(merged, row.names=nid.all))
     checkIdentical(tdata(phyd, "all"), data.frame(merged, row.names=lab.all))
 
     ## match with node numbers, node & all data
     phyd <- phylo4d(phy.alt, node.data=nodDt, all.data=allDt)
     merged <- data.frame(merge(allDt[order(nid.all.r),],
         nodDt[order(nid.int.r),], all=TRUE, by=0)[-1])
-    checkIdentical(phyd at data, data.frame(merged, row.names=as.character(nid.all)))
+    checkIdentical(phyd at data, data.frame(merged, row.names=nid.all))
     checkIdentical(tdata(phyd, "all"), data.frame(merged, row.names=lab.all))
 
     ## match with node numbers, tip, node & all data
@@ -154,7 +154,7 @@
     # ...now merge these together
     merged <- data.frame(merge(m1, m2, by=0)[-1])
     checkIdentical(phyd at data, data.frame(merged,
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "tip"), data.frame(merged[nid.tip,],
         row.names=lab.tip, check.names=FALSE))
     checkIdentical(tdata(phyd, "internal"), data.frame(merged[nid.int,],
@@ -168,7 +168,7 @@
         suffix=c(".tip", ".node"))[-1])
     merged <- data.frame(merge(allDt, m3, by=0)[-1])
     checkIdentical(phyd at data, data.frame(merged,
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
     checkIdentical(tdata(phyd, "tip"), data.frame(merged[nid.tip,],
         row.names=lab.tip, check.names=FALSE))
     checkIdentical(tdata(phyd, "internal"), data.frame(merged[nid.int,],
@@ -182,7 +182,7 @@
     ## match with names, tip data
     phyd <- phylo4d(phy.alt, tip.data=tipDt)
     checkIdentical(phyd at data, data.frame(tipDt[order(nid.tip.r),],
-        row.names=as.character(nid.tip)))
+        row.names=nid.tip))
     checkIdentical(tdata(phyd, "tip"), data.frame(tipDt[order(nid.tip.r),],
         row.names=lab.tip))
 
@@ -200,7 +200,7 @@
     checkIdentical(tdata(phyd, "internal"), data.frame(allDt[match(nid.int,
         nid.all.r),], row.names=lab.int))
     checkIdentical(phyd at data, data.frame(allDt[match(nid.all, nid.all.r),],
-        row.names=as.character(nid.all)))
+        row.names=nid.all))
 
 }
 

Modified: branches/slot-mods/man/formatData.Rd
===================================================================
--- branches/slot-mods/man/formatData.Rd	2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/man/formatData.Rd	2009-09-28 05:34:04 UTC (rev 669)
@@ -13,7 +13,7 @@
               match.data=TRUE, rownamesAsLabels=FALSE,
               label.type=c("rownames", "column"),
               label.column=1, missing.data=c("fail", "warn", "OK"),
-              extra.data=c("warn", "OK", "fail"))                       
+              extra.data=c("warn", "OK", "fail"), keep.all=TRUE)
 }
 
 \arguments{
@@ -34,7 +34,9 @@
   \item{rownamesAsLabels}{(logical), in the case of number-like labels
     should the row names of the data provided be considered as labels
     (TRUE) or node numbers (FALSE and default)}
-  
+  \item{keep.all}{(logical), should the data have rows for all nodes
+    (with NA values for internal rows when type='tip', and vice versa) 
+    (TRUE and default) or only rows corresponding to type argument}
 }
 
 \value{
@@ -58,7 +60,7 @@
   
   \code{formatData} (1) converts labels provided in the data into node
   numbers, (2) makes sure that the data are appropriately matched
-  against  tip or nodes, (3) checks for differences between data and
+  against tip or nodes, (3) checks for differences between data and
   tree, (4) creates a data frame with the correct dimensions given a
   tree.