[Phylobase-commits] r669 - in branches/slot-mods: R data inst/unitTests man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Mon Sep 28 07:34:05 CEST 2009
Author: regetz
Date: 2009-09-28 07:34:04 +0200 (Mon, 28 Sep 2009)
New Revision: 669
Modified:
branches/slot-mods/R/class-phylo4d.R
branches/slot-mods/R/formatData.R
branches/slot-mods/data/geospiza.rda
branches/slot-mods/inst/unitTests/runit.class-phylo4d.R
branches/slot-mods/man/formatData.Rd
Log:
Revised formatData to return data rows for *all* nodes (unless otherwise
specified), and simplified .phylo4Data to match. Row names now stored
internally as numeric, not character. Updated geospiza/tests to match.
Modified: branches/slot-mods/R/class-phylo4d.R
===================================================================
--- branches/slot-mods/R/class-phylo4d.R 2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/R/class-phylo4d.R 2009-09-28 05:34:04 UTC (rev 669)
@@ -48,31 +48,31 @@
"tip.data and node.data column names")
}
- ## identify common columns to merge if merging, or rename otherwise
- colnamesToMerge <- intersect(names(tip.data), names(node.data))
- if (merge.data==FALSE) {
- names(tip.data)[names(tip.data) %in% colnamesToMerge] <-
- paste(colnamesToMerge, "tip", sep=".")
- names(node.data)[names(node.data) %in% colnamesToMerge] <-
- paste(colnamesToMerge, "node", sep=".")
- colnamesToMerge <- NULL
+ ## combine common columns and move into all.data if merging,
+ ## otherwise rename them
+ colsToMerge <- intersect(names(tip.data), names(node.data))
+ if (merge.data && length(colsToMerge)>0) {
+ ##TODO could really just index rows directly on 1:nTip and
+ ## (nTip+1):(nTip+nNode) in the next two statements for speed,
+ ## but this is more robust to changes in node numbering rules
+ tip.rows <- tip.data[match(nodeId(x, "tip"),
+ row.names(tip.data)), colsToMerge, drop=FALSE]
+ node.rows <- node.data[match(nodeId(x, "internal"),
+ row.names(tip.data)), colsToMerge, drop=FALSE]
+ merge.data <- rbind(tip.rows, node.rows)
+ all.data <- data.frame(all.data, merge.data)
+ } else {
+ names(tip.data)[names(tip.data) %in% colsToMerge] <-
+ paste(colsToMerge, "tip", sep=".")
+ names(node.data)[names(node.data) %in% colsToMerge] <-
+ paste(colsToMerge, "node", sep=".")
}
- ## now separate tip.only, node.only, and common columns
+ ## now separate tips-only and nodes-only data
tip.only.data <- tip.data[setdiff(names(tip.data), names(node.data))]
node.only.data <- node.data[setdiff(names(node.data), names(tip.data))]
- common.data <- rbind(tip.data[colnamesToMerge], node.data[colnamesToMerge])
- ## merge data common to tips and nodes
- all.common.data <- merge(all.data, common.data, by=0, all=TRUE,
- sort=FALSE)
- ## merge data that apply only to tips or nodes
- all.separate.data <- merge(tip.only.data, node.only.data, by=0,
- all=TRUE, sort=FALSE)
- ## merge everything together and clean up
- complete.data <- merge(all.common.data, all.separate.data,
- by="Row.names", all=TRUE, sort=FALSE)
- row.names(complete.data) <- complete.data[["Row.names"]]
- complete.data <- subset(complete.data, select=-Row.names)
+ ## combine all data
+ complete.data <- data.frame(all.data, tip.only.data, node.only.data)
## drop any rows that only contain NAs
if (ncol(complete.data)==0) {
Modified: branches/slot-mods/R/formatData.R
===================================================================
--- branches/slot-mods/R/formatData.R 2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/R/formatData.R 2009-09-28 05:34:04 UTC (rev 669)
@@ -2,138 +2,123 @@
match.data=TRUE, rownamesAsLabels=FALSE,
label.type=c("rownames", "column"),
label.column=1, missing.data=c("fail", "warn", "OK"),
- extra.data=c("warn", "OK", "fail")
+ extra.data=c("warn", "OK", "fail"), keep.all=TRUE
) {
- ## coerce vector data to data.frame
- if (is.vector(dt)) {
- dt <- as.data.frame(dt)
+ ## determine whether to return rows for all nodes, or just 'type'
+ type <- match.arg(type)
+ if (keep.all) {
+ ids.out <- nodeId(phy, "all")
+ } else {
+ ids.out <- nodeId(phy, type)
}
+
## if null, return empty data frame with node numbers as row names
if (is.null(dt)) {
- return(data.frame(row.names=nodeId(phy, type)))
+ return(data.frame(row.names=ids.out))
}
+ ## if vector, coerce to data.frame
+ if (is.vector(dt)) {
+ dt <- as.data.frame(dt)
+ }
## before proceeding, make sure that data provided are a data frame
if (!is.data.frame(dt)) {
nmSomeData <- substitute(dt)
stop(paste(nmSomeData, "must be a vector or a data frame"))
}
- ## if null or lacking rows or columns, return a placeholder data
- ## frame with node numbers as row names
+ ## if lacking rows or columns, return a placeholder data frame with
+ ## node numbers as row names
if (any(dim(dt)==0)) {
- return(data.frame(row.names=nodeId(phy, type)))
+ return(data.frame(row.names=ids.out))
}
- type <- match.arg(type)
label.type <- match.arg(label.type)
stopifnot(label.column %in% 1:ncol(dt))
missing.data <- match.arg(missing.data)
extra.data <- match.arg(extra.data)
- nr <- switch(type,
- tip = nTips(phy),
- internal = nNodes(phy),
- all = nTips(phy)+nNodes(phy))
-
- tmpDt <- array(, dim=c(nr, ncol(dt)),
- dimnames=list(nodeId(phy, type), colnames(dt)))
- tmpDt <- data.frame(tmpDt)
-
if(match.data) {
- ## extract node-matching vector
+ ## extract values to be matched to nodes
ndNames <- switch(label.type,
rownames = rownames(dt),
column = dt[,label.column])
## either force matching on labels, or match on node
## numbers for any number-like elements and labels otherwise
if (rownamesAsLabels) {
- ndDt <- getNode(phy, as.character(ndNames), missing="OK")
+ ids.in <- getNode(phy, as.character(ndNames), missing="OK")
} else {
- ndDt <- as.numeric(rep(NA, length(ndNames)))
+ ids.in <- as.numeric(rep(NA, length(ndNames)))
treatAsNumber <- nchar(gsub("[0-9]", "", ndNames))==0
- ndDt[treatAsNumber] <- getNode(phy,
+ ids.in[treatAsNumber] <- getNode(phy,
as.integer(ndNames[treatAsNumber]), missing="OK")
- ndDt[!treatAsNumber] <- getNode(phy,
+ ids.in[!treatAsNumber] <- getNode(phy,
as.character(ndNames[!treatAsNumber]), missing="OK")
}
+
## Make sure that data are matched to appropriate nodes
- if(type != "all") {
- switch(type,
- tip = {
- ## BMB: don't bother trying to match NAs
- if(any(na.omit(ndDt) %in% nodeId(phy, "internal")))
- stop("Your tip data are being matched to internal ",
- "nodes. Make sure that your data identifiers ",
- "are correct.")
- },
- internal = {
- if(any(na.omit(ndDt) %in% nodeId(phy, "tip")))
- stop("Your node data are being matched to tip ",
- "nodes. Make sure that your data identifiers ",
- "are correct.")
- })
+ if (type=="tip" && any(na.omit(ids.in) %in% nodeId(phy,
+ "internal"))) {
+ stop("Your tip data are being matched to internal ",
+ "nodes. Make sure that your data identifiers ",
+ "are correct.")
}
+ if (type=="internal" && any(na.omit(ids.in) %in% nodeId(phy,
+ "tip"))) {
+ stop("Your node data are being matched to tip ",
+ "nodes. Make sure that your data identifiers ",
+ "are correct.")
+ }
## Check differences
- extra <- ndNames[is.na(ndDt)]
- mssng <- nodeId(phy, type)[! nodeId(phy, type) %in% ndDt]
-
+ mssng <- setdiff(nodeId(phy, type), ids.in)
if(length(mssng) > 0 && missing.data != "OK") {
+ ## provide label if it exists and node number otherwise
+ mssng <- getNode(phy, mssng)
+ mssng <- ifelse(is.na(names(mssng)), mssng, names(mssng))
msg <- "The following nodes are not found in the dataset: "
-
- ## provides label if it exists and node number otherwise
- mssng <- sapply(mssng, function(m) {
- m <- getNode(phy, m)
- if (is.na(names(m)) || is.null(names(m)))
- m
- else
- names(m)
- })
-
msg <- paste(msg, paste(mssng, collapse=", "))
switch(missing.data,
warn = warning(msg),
fail = stop(msg))
}
-
+ extra <- ndNames[is.na(ids.in)]
if(length(extra) > 0 && extra.data != "OK") {
msg <- "The following names are not found in the tree: "
msg <- paste(msg, paste(extra, collapse=", "))
switch(extra.data,
warn = warning(msg),
fail = stop(msg))
-
}
## Format data to have correct dimensions
- dt <- dt[!is.na(ndDt) ,, drop=FALSE]
- rownames(dt) <- ndDt[!is.na(ndDt)]
- tmpDt[,] <- dt[match(rownames(tmpDt), rownames(dt)) ,, drop=FALSE]
- if(label.type == "column") tmpDt <- tmpDt[, -label.column, drop=FALSE]
- }
- else {
- ## Remove rownames in data provided
- rownames(dt) <- NULL
+ dt <- dt[!is.na(ids.in), , drop=FALSE]
+ rownames(dt) <- ids.in[!is.na(ids.in)]
+ dt.out <- dt[match(ids.out, rownames(dt)), , drop=FALSE]
+ rownames(dt.out) <- ids.out
+ if(label.type == "column") dt.out <- dt.out[, -label.column, drop=FALSE]
- ## Tips before internal nodes for all.data
- if (type == "all")
- rownames(tmpDt) <- 1:nr
+ } else {
- ## Check differences between dataset and tree
- diffNr <- nrow(dt) - nr
- if(diffNr > 0 && extra.data != "OK") {
+ ## Check if too many or not enough rows in input data
+ expected.nrow <- length(nodeId(phy, type))
+ diffNr <- nrow(dt) - expected.nrow
+ if(nrow(dt) > expected.nrow && extra.data != "OK") {
msg <- paste("There are", diffNr, "extra rows.")
switch(extra.data,
warn = warning(msg),
fail = stop(msg))
}
- if(diffNr < 0 && missing.data != "OK") {
+ if(nrow(dt) < expected.nrow && missing.data != "OK") {
msg <- paste("There are", abs(diffNr), "missing rows.")
switch(missing.data,
warn = warning(msg),
fail = stop(msg))
}
- tmpDt[,] <- dt[1:min(nrow(dt), nr) ,, drop = FALSE]
+ ## truncate rows of input data frame if necessary
+ dt <- dt[1:min(nrow(dt), expected.nrow) ,, drop = FALSE]
+ rownames(dt) <- nodeId(phy, type)[seq_len(nrow(dt))]
+ dt.out <- dt[match(ids.out, rownames(dt)) ,, drop=FALSE]
+ rownames(dt.out) <- ids.out
}
- tmpDt
+ dt.out
}
Modified: branches/slot-mods/data/geospiza.rda
===================================================================
(Binary files differ)
Modified: branches/slot-mods/inst/unitTests/runit.class-phylo4d.R
===================================================================
--- branches/slot-mods/inst/unitTests/runit.class-phylo4d.R 2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/inst/unitTests/runit.class-phylo4d.R 2009-09-28 05:34:04 UTC (rev 669)
@@ -59,21 +59,21 @@
## brute force: no matching; with tip data
phyd <- phylo4d(phy.alt, tip.data=tipDt, match.data=FALSE)
checkIdentical(phyd at data, data.frame(tipDt,
- row.names=as.character(nid.tip)))
+ row.names=nid.tip))
checkIdentical(tdata(phyd, "tip"), data.frame(tipDt,
row.names=lab.tip))
## brute force: no matching; with node data
phyd <- phylo4d(phy.alt, node.data=nodDt, match.data=FALSE)
checkIdentical(phyd at data, data.frame(nodDt,
- row.names=as.character(nid.int)))
+ row.names=nid.int))
checkIdentical(tdata(phyd, "internal"), data.frame(nodDt,
row.names=lab.int))
## brute force: no matching; with all.data
phyd <- phylo4d(phy.alt, all.data=allDt, match.data=FALSE)
checkIdentical(phyd at data, data.frame(allDt,
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "all"), data.frame(allDt,
row.names=lab.all))
@@ -83,7 +83,7 @@
match.data=FALSE)
checkIdentical(phyd at data, data.frame(rbind(data.frame(tipDt["d"],
e=NA_real_), data.frame(d=NA_real_, nodDt["e"])),
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "tip"), data.frame(tipDt["d"], e=NA_real_,
row.names=lab.tip))
checkIdentical(tdata(phyd, "internal"), data.frame(d=NA_real_, nodDt["e"],
@@ -94,7 +94,7 @@
phyd <- phylo4d(phy.alt, tip.data=tipDt["c"], node.data=nodDt["c"],
match.data=FALSE)
checkIdentical(phyd at data, data.frame(rbind(tipDt["c"], nodDt["c"]),
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "tip"), data.frame(c=factor(tipDt$c,
levels=letters[nid.all]), row.names=lab.tip))
checkIdentical(tdata(phyd, "internal"), data.frame(c=factor(nodDt$c,
@@ -108,14 +108,14 @@
## match with node numbers, tip data
phyd <- phylo4d(phy.alt, tip.data=tipDt)
checkIdentical(phyd at data, data.frame(tipDt[order(nid.tip.r),],
- row.names=as.character(nid.tip)))
+ row.names=nid.tip))
checkIdentical(tdata(phyd, "tip"), data.frame(tipDt[order(nid.tip.r),],
row.names=lab.tip))
## match with node numbers, node data
phyd <- phylo4d(phy.alt, node.data=nodDt)
checkIdentical(phyd at data, data.frame(nodDt[order(nid.int.r),],
- row.names=as.character(nid.int)))
+ row.names=nid.int))
checkIdentical(tdata(phyd, "internal"), data.frame(nodDt[order(nid.int.r),],
row.names=lab.int))
@@ -124,7 +124,7 @@
checkIdentical(phyd at data, data.frame(rbind(data.frame(
d=tipDt[order(nid.tip.r), "d"], e=NA_real_),
data.frame(d=NA_real_, e=nodDt[order(nid.int.r), "e"])),
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "tip"), data.frame(d=tipDt[order(nid.tip.r), "d"],
e=NA_real_, row.names=lab.tip))
checkIdentical(tdata(phyd, "internal"), data.frame(d=NA_real_,
@@ -134,14 +134,14 @@
phyd <- phylo4d(phy.alt, tip.data=tipDt, all.data=allDt)
merged <- data.frame(merge(allDt[order(nid.all.r),],
tipDt[order(nid.tip.r),], all=TRUE, by=0)[-1])
- checkIdentical(phyd at data, data.frame(merged, row.names=as.character(nid.all)))
+ checkIdentical(phyd at data, data.frame(merged, row.names=nid.all))
checkIdentical(tdata(phyd, "all"), data.frame(merged, row.names=lab.all))
## match with node numbers, node & all data
phyd <- phylo4d(phy.alt, node.data=nodDt, all.data=allDt)
merged <- data.frame(merge(allDt[order(nid.all.r),],
nodDt[order(nid.int.r),], all=TRUE, by=0)[-1])
- checkIdentical(phyd at data, data.frame(merged, row.names=as.character(nid.all)))
+ checkIdentical(phyd at data, data.frame(merged, row.names=nid.all))
checkIdentical(tdata(phyd, "all"), data.frame(merged, row.names=lab.all))
## match with node numbers, tip, node & all data
@@ -154,7 +154,7 @@
# ...now merge these together
merged <- data.frame(merge(m1, m2, by=0)[-1])
checkIdentical(phyd at data, data.frame(merged,
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "tip"), data.frame(merged[nid.tip,],
row.names=lab.tip, check.names=FALSE))
checkIdentical(tdata(phyd, "internal"), data.frame(merged[nid.int,],
@@ -168,7 +168,7 @@
suffix=c(".tip", ".node"))[-1])
merged <- data.frame(merge(allDt, m3, by=0)[-1])
checkIdentical(phyd at data, data.frame(merged,
- row.names=as.character(nid.all)))
+ row.names=nid.all))
checkIdentical(tdata(phyd, "tip"), data.frame(merged[nid.tip,],
row.names=lab.tip, check.names=FALSE))
checkIdentical(tdata(phyd, "internal"), data.frame(merged[nid.int,],
@@ -182,7 +182,7 @@
## match with names, tip data
phyd <- phylo4d(phy.alt, tip.data=tipDt)
checkIdentical(phyd at data, data.frame(tipDt[order(nid.tip.r),],
- row.names=as.character(nid.tip)))
+ row.names=nid.tip))
checkIdentical(tdata(phyd, "tip"), data.frame(tipDt[order(nid.tip.r),],
row.names=lab.tip))
@@ -200,7 +200,7 @@
checkIdentical(tdata(phyd, "internal"), data.frame(allDt[match(nid.int,
nid.all.r),], row.names=lab.int))
checkIdentical(phyd at data, data.frame(allDt[match(nid.all, nid.all.r),],
- row.names=as.character(nid.all)))
+ row.names=nid.all))
}
Modified: branches/slot-mods/man/formatData.Rd
===================================================================
--- branches/slot-mods/man/formatData.Rd 2009-09-28 04:06:24 UTC (rev 668)
+++ branches/slot-mods/man/formatData.Rd 2009-09-28 05:34:04 UTC (rev 669)
@@ -13,7 +13,7 @@
match.data=TRUE, rownamesAsLabels=FALSE,
label.type=c("rownames", "column"),
label.column=1, missing.data=c("fail", "warn", "OK"),
- extra.data=c("warn", "OK", "fail"))
+ extra.data=c("warn", "OK", "fail"), keep.all=TRUE)
}
\arguments{
@@ -34,7 +34,9 @@
\item{rownamesAsLabels}{(logical), in the case of number-like labels
should the row names of the data provided be considered as labels
(TRUE) or node numbers (FALSE and default)}
-
+ \item{keep.all}{(logical), should the data have rows for all nodes
+ (with NA values for internal rows when type='tip', and vice versa)
+ (TRUE and default) or only rows corresponding to type argument}
}
\value{
@@ -58,7 +60,7 @@
\code{formatData} (1) converts labels provided in the data into node
numbers, (2) makes sure that the data are appropriately matched
- against tip or nodes, (3) checks for differences between data and
+ against tip or nodes, (3) checks for differences between data and
tree, (4) creates a data frame with the correct dimensions given a
tree.
More information about the Phylobase-commits
mailing list