[Phylobase-commits] r694 - in pkg: R man src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Oct 15 00:20:00 CEST 2009


Author: regetz
Date: 2009-10-15 00:19:59 +0200 (Thu, 15 Oct 2009)
New Revision: 694

Added:
   pkg/src/ancestors.c
Modified:
   pkg/R/treewalk.R
   pkg/man/treewalk.Rd
   pkg/src/descendants.c
Log:
vectorized ancestors and descendants, both using C functions for
performance; augmented documentation


Modified: pkg/R/treewalk.R
===================================================================
--- pkg/R/treewalk.R	2009-10-14 20:56:12 UTC (rev 693)
+++ pkg/R/treewalk.R	2009-10-14 22:19:59 UTC (rev 694)
@@ -70,47 +70,55 @@
 }
 
 ## get descendants [recursively]
-descendants <- function (phy, node, type=c("tips","children","all"))
-{
-    ## FIXME: allow vector of nodes? (or just let people lapply?)
+descendants <- function (phy, node, type=c("tips","children","all")) {
     type <- match.arg(type)
-    if (type=="children") {
-        return(children(phy, node))
-    }
-    node <- getNode(phy, node)
-    if (is.na(node)) stop("node ", node, " not found in tree")
-    n <- nTips(phy)
-    if (node <= n) {
-        return(node)
-    }
 
-    ## edge matrix must be in preorder for the C function!
-    if (phy at order=="preorder") {
-        edge <- phy at edge
+    ## look up nodes, warning about and excluding invalid nodes
+    oNode <- node
+    node <- getNode(phy, node, missing="warn")
+    isValid <- !is.na(node)
+    node <- as.integer(node[isValid])
+
+    if (type == "children") {
+        res <- lapply(node, function(x) children(phy, x))
     } else {
-        edge <- reorder(phy, order="preorder")@edge
-    }
-    ## deal with NA root node
-    edge[is.na(edge)] <- 0
-    ## extract edge columns
-    ancestor <- as.integer(edge[, 1])
-    descendant <- as.integer(edge[, 2])
-    ## create indicator vector and seed it based on children
-    isChild <- rep(0L, length=nrow(edge))
-    isChild[ancestor==node] <- 1L
-    numEdges <- as.integer(length(isChild))
+        ## edge matrix must be in preorder for the C function!
+        if (phy at order=="preorder") {
+            edge <- phy at edge
+        } else {
+            edge <- reorder(phy, order="preorder")@edge
+        }
+        ## extract edge columns
+        ancestor <- as.integer(edge[, 1])
+        descendant <- as.integer(edge[, 2])
 
-    ## returned value of isChild will indicate *all* descendants 
-    isDescendant <- .C("descendants", isChild, ancestor, descendant,
-        numEdges)[[1]]
+        ## return indicator matrix of ALL descendants (including self)
+        isDes <- .Call("descendants", node, ancestor, descendant)
+        storage.mode(isDes) <- "logical"
 
-    l <- descendant[as.logical(isDescendant)]
-    if (type=="tips") {
-        l <- l[l<=n]
+        ## for internal nodes only, drop self (not sure why this rule?)
+        int.node <- intersect(node, nodeId(phy, "internal"))
+        isDes[cbind(match(int.node, descendant),
+            match(int.node, node))] <- FALSE
+        ## if only tips desired, drop internal nodes
+        if (type=="tips") {
+            isDes[descendant %in% nodeId(phy, "internal"),] <- FALSE
+        }
+        res <- lapply(seq_along(node), function(n) getNode(phy,
+            descendant[isDes[,n]]))
     }
+    names(res) <- as.character(oNode[isValid])
 
+    ## if just a single node, return as a single vector
+    if (length(res)==1) res <- res[[1]]
+    res
+
     ## Original pure R implementation of the above
     ## (note that it does not require preorder ordering)
+    ##n <- nTips(phy)
+    ##if (node <= n) {
+    ##    return(node)
+    ##}
     ##l <- numeric()
     ##d <- children(phy, node)
     ##for (j in d) {
@@ -120,8 +128,6 @@
     ##               descendants(phy,j,type="all"))
     ##    else l <- c(l, descendants(phy,j,type=type))
     ##}
-
-    return(getNode(phy, l))
 }
 
 siblings <- function(phy, node, include.self=FALSE) {
@@ -131,26 +137,57 @@
 }
 
 ## get ancestors (all nodes)
-ancestors <- function (phy, node, type=c("all","parent","ALL"))
-{
+ancestors <- function (phy, node, type=c("all","parent","ALL")) {
+
     type <- match.arg(type)
-    if (type=="parent") return(ancestor(phy,node))
-    oNode <- node <- getNode(phy,node)
-    if (is.na(node)) stop("node ",node," not found in tree")
-    res <- numeric(0)
-    n <- nTips(phy)
 
-    ## correct behavior when node==root
-    if(node == rootNode(phy)) return(NULL)
+    ## look up nodes, warning about and excluding invalid nodes
+    oNode <- node
+    node <- getNode(phy, node, missing="warn")
+    isValid <- !is.na(node)
+    node <- as.integer(node[isValid])
 
-    repeat {
-        anc <- ancestor(phy,node)
-        res <- c(res,anc)
-        node <- anc
-        if (anc==n+1) break
+    if (type == "parent") {
+        res <- lapply(node, function(x) ancestor(phy, x))
+    } else {
+        ## edge matrix must be in postorder for the C function!
+        if (phy at order=="postorder") {
+            edge <- phy at edge
+        } else {
+            edge <- reorder(phy, order="postorder")@edge
+        }
+        ## extract edge columns
+        ancestor <- as.integer(edge[, 1])
+        descendant <- as.integer(edge[, 2])
+
+        ## return indicator matrix of ALL ancestors (including self)
+        isAnc <- .Call("ancestors", node, ancestor, descendant)
+        storage.mode(isAnc) <- "logical"
+
+        ## drop self if needed
+        if (type=="all") {
+            isAnc[cbind(match(node, descendant), seq_along(node))] <- FALSE
+        }
+        res <- lapply(seq_along(node), function(n) getNode(phy,
+            descendant[isAnc[,n]]))
     }
-    if(type == "ALL") res <- c(oNode, res)
-    return(getNode(phy,res))
+    names(res) <- as.character(oNode[isValid])
+
+    ## if just a single node, return as a single vector
+    if (length(res)==1) res <- res[[1]]
+    res
+
+    ## Original pure R implementation of the above
+    ## (note that it does not require preorder ordering)
+    ##if (node == rootNode(phy))
+    ##    return(NULL)
+    ##repeat {
+    ##    anc <- ancestor(phy, node)
+    ##    res <- c(res, anc)
+    ##    node <- anc
+    ##    if (anc == n + 1)
+    ##        break
+    ##}
 }
 
 MRCA <- function(phy, ...) {

Modified: pkg/man/treewalk.Rd
===================================================================
--- pkg/man/treewalk.Rd	2009-10-14 20:56:12 UTC (rev 693)
+++ pkg/man/treewalk.Rd	2009-10-14 22:19:59 UTC (rev 694)
@@ -19,7 +19,7 @@
 siblings(phy,node,include.self=FALSE)
 children(phy, node)
 descendants(phy, node, type=c("tips","children","all"))
-MRCA(phy,\dots)
+MRCA(phy, \dots)
 shortestPath(phy, node1, node2)
 \S4method{sumEdgeLength}{phylo4}(x, node)
 }
@@ -30,7 +30,10 @@
   \item{x}{a \linkS4class{phylo4} object (or one inheriting from
     \linkS4class{phylo4}, e.g. a \linkS4class{phylo4d} object)
   }
-  \item{node}{a node ID number (or name)}
+  \item{node}{either an integer corresponding to a node ID number, or a
+    character corresponding to a node label; for \code{ancestors} and
+    \code{descendants}, this may be a vector of multiple node numbers or
+    names}
   \item{type}{(\code{ancestors}) specify whether to return just direct
     ancestor ("parent"), all ancestor nodes ("all"), or all ancestor
     nodes including self ("ALL"); (\code{descendants})
@@ -43,14 +46,35 @@
   \item{node1}{a node number (or name)}
   \item{node2}{a node number (or name)}
 }
+\details{
+  \code{ancestors} and \code{descendants} can take \code{node} vectors
+  of arbitrary length, returning a list of output vectors if the number
+  of valid input nodes is greater than one. List element names are taken
+  directly from the input node vector.
+
+  If any supplied nodes are not found in the tree, the behavior
+  currently varies across functions. Invalid nodes are automatically
+  omitted by \code{ancestors} and \code{descendants}, with a warning.
+  \code{ancestor} will return \code{NA} for any invalid nodes, with
+  a warning. Both \code{children} and \code{siblings} will return an empty
+  vector, again with a warning. In contrast, \code{MRCA} and
+  \code{shortestPath} will throw an immediate error if any input nodes
+  are invalid.
+}
 \value{
   \item{\code{ancestors} and \code{descendants}}{
-    return named vectors of the ancestors and descendants of a node; \code{ancestor} is a synonym 
-    for \code{ancestors(\dots, type="parent")} (i.e. direct ancestor only), while \code{children} 
-    is a synonym for \code{descendants(\dots, type="children")} (i.e. direct descendants only)}
+    return a named vector (or a list of such vectors in the case of
+    multiple input nodes) of the ancestors and descendants of a node}
+  \item{\code{ancestor} and \code{children}}{
+    \code{ancestor} is analogous to \code{ancestors(\dots,
+    type="parent")} (i.e. direct ancestor only), but returns a single
+    concatenated vector in the case of multiple input nodes;
+    \code{children} is analogous to \code{descendants(\dots,
+    type="children")} (i.e. direct descendants only), but is not
+    currently intended to be used with multiple input nodes }
   \item{\code{siblings}}{
     returns sibling nodes (children of the same parent)}
-  \item{\code{mrca}}{
+  \item{\code{MRCA}}{
     returns the most recent common ancestor of two or more nodes}
   \item{\code{shortestPath}}{
     returns the nodes of the shortest path from one node to another (excluding \code{node1} and 
@@ -65,18 +89,18 @@
 \examples{
   data(geospiza)
   nodeLabels(geospiza) <- LETTERS[1:nNodes(geospiza)]
-  plot(as(geospiza,"phylo4"), show.node.label=TRUE)
-  ancestor(geospiza,"E")
-  children(geospiza,"C")
-  descendants(geospiza,"D",type="tips")
-  descendants(geospiza,"D",type="all")
-  ancestors(geospiza,"D")
-  MRCA(geospiza,"conirostris","difficilis","fuliginosa")
-  MRCA(geospiza,"olivacea","conirostris")
+  plot(as(geospiza, "phylo4"), show.node.label=TRUE)
+  ancestor(geospiza, "E")
+  children(geospiza, "C")
+  descendants(geospiza, "D", type="tips")
+  descendants(geospiza, "D", type="all")
+  ancestors(geospiza, "D")
+  MRCA(geospiza, "conirostris", "difficilis", "fuliginosa")
+  MRCA(geospiza, "olivacea", "conirostris")
 
   ## shortest path between 2 nodes
-  shortestPath(geospiza, "fortis","fuliginosa")
-  shortestPath(geospiza, "F","L")
+  shortestPath(geospiza, "fortis", "fuliginosa")
+  shortestPath(geospiza, "F", "L")
 
   ## FIXME
   ## if(require(ape)){ edgelabels() }

Added: pkg/src/ancestors.c
===================================================================
--- pkg/src/ancestors.c	                        (rev 0)
+++ pkg/src/ancestors.c	2009-10-14 22:19:59 UTC (rev 694)
@@ -0,0 +1,53 @@
+/*
+  ancestors.c:
+    Identify all ancestors of each node in the input vector. Function
+  inputs are derived from a phylo4 edge matrix, which *must* be in
+  postorder order. The isAncestor output is an indicator matrix of
+  which nodes (rows, corresponding to the decendant vector) are
+  ancestors of each input node (columns, corresponding to the nodes
+  vector). It will contain 1 for each ancestor of the node, *including
+  itself*, and 0 for all other nodes.
+
+  Jim Regetz (NCEAS)
+*/
+
+#include <R.h>
+#include <Rinternals.h>
+
+SEXP ancestors(SEXP nod, SEXP anc, SEXP des) {
+
+    int numEdges = length(anc);
+    int numNodes = length(nod);
+
+    int* nodes = INTEGER(nod);
+    int* ancestor = INTEGER(anc);
+    int* descendant = INTEGER(des);
+
+    int parent = 0;
+    SEXP isAncestor;
+
+    PROTECT(isAncestor = allocMatrix(INTSXP, numEdges, numNodes));
+    for (int n=0; n<numNodes; n++) {
+        for (int i=0; i<numEdges; i++) {
+            if (nodes[n]==descendant[i]) {
+                INTEGER(isAncestor)[i + n*numEdges] = 1;
+            } else {
+                INTEGER(isAncestor)[i + n*numEdges] = 0;
+            }
+        }
+    }
+    for (int n=0; n<numNodes; n++) {
+        for (int i=0; i<numEdges; i++) {
+            if (INTEGER(isAncestor)[i + n*numEdges]==1) {
+                parent = ancestor[i];
+                for (int j=i+1; j<numEdges; j++) {
+                    if (descendant[j]==parent) {
+                        INTEGER(isAncestor)[j + n*numEdges]=1; 
+                    }
+                }
+            }
+        }
+    }
+    UNPROTECT(1);
+    return isAncestor;
+}


Property changes on: pkg/src/ancestors.c
___________________________________________________________________
Name: svn:eol-style
   + native

Modified: pkg/src/descendants.c
===================================================================
--- pkg/src/descendants.c	2009-10-14 20:56:12 UTC (rev 693)
+++ pkg/src/descendants.c	2009-10-14 22:19:59 UTC (rev 694)
@@ -1,24 +1,53 @@
 /*
   descendants.c:
-    Identify all descendants of a given node. Function inputs are
-  derived from a phylo4 edge matrix, which *must* be in preorder order.
-  The isDescendant input vector should contain 1 for the immediate
-  children of the node, and 0 otherwise. The function returns this
-  vector updated to include all further descendants.
+    Identify all descendants of each node in the input vector. Function
+  inputs are derived from a phylo4 edge matrix, which *must* be in
+  preorder order. The isDescendant output is an indicator matrix of
+  which nodes (rows, corresponding to the decendant vector) are
+  descendants of each input node (columns, corresponding to the nodes
+  vector). It will contain 1 for each descendant of the node, *including
+  itself*, and 0 for all other nodes. 
+
+  Jim Regetz (NCEAS)
 */
 
 #include <R.h>
+#include <Rinternals.h>
 
-void descendants(int *isDescendant, int *ancestor, int *descendant, 
-    int *numEdges) {
+SEXP descendants(SEXP nod, SEXP anc, SEXP des) {
 
-    int child=0;
-    for (int i=0; i<*numEdges; i++) {
-        if (isDescendant[i]==1) {
-            child = descendant[i];
-            for (int j=i+1; j<*numEdges; j++) {
-                if (ancestor[j]==child) isDescendant[j]=1; 
+    int numEdges = length(anc);
+    int numNodes = length(nod);
+
+    int* nodes = INTEGER(nod);
+    int* ancestor = INTEGER(anc);
+    int* descendant = INTEGER(des);
+
+    int child = 0;
+    SEXP isDescendant;
+
+    PROTECT(isDescendant = allocMatrix(INTSXP, numEdges, numNodes));
+    for (int n=0; n<numNodes; n++) {
+        for (int i=0; i<numEdges; i++) {
+            if (nodes[n]==descendant[i]) {
+                INTEGER(isDescendant)[i + n*numEdges] = 1;
+            } else {
+                INTEGER(isDescendant)[i + n*numEdges] = 0;
             }
         }
     }
+    for (int n=0; n<numNodes; n++) {
+        for (int i=0; i<numEdges; i++) {
+            if (INTEGER(isDescendant)[i + n*numEdges]==1) {
+                child = descendant[i];
+                for (int j=i+1; j<numEdges; j++) {
+                    if (ancestor[j]==child) {
+                        INTEGER(isDescendant)[j + n*numEdges] = 1; 
+                    }
+                }
+            }
+        }
+    }
+    UNPROTECT(1);
+    return isDescendant;
 }



More information about the Phylobase-commits mailing list