[Mattice-commits] r67 - in pkg: R man misc

Tue Dec 2 21:18:21 CET 2008

Author: andrew_hipp
Date: 2008-12-02 21:18:21 +0100 (Tue, 02 Dec 2008)
New Revision: 67

Added:
   pkg/misc/batchHansenVignette.Rnw
Modified:
   pkg/R/summarizingAnalyses.R
   pkg/man/runBatchHansen.Rd
Log:
confirmed that node support is calculated correctly even with phylogenetic uncertainty

Modified: pkg/R/summarizingAnalyses.R
===================================================================

--- pkg/R/summarizingAnalyses.R	2008-12-02 18:27:35 UTC (rev 66)
+++ pkg/R/summarizingAnalyses.R	2008-12-02 20:18:21 UTC (rev 67)
@@ -25,9 +25,9 @@
   for(tree in 1:ntrees) {
     aicList <- icObject[[tree]]$AICwi
     aiccList <- icObject[[tree]]$AICcwi
-    bicList <- icObject[[tree]]$BICwi
-    modelsMatrix[[tree]] <- cbind(aicList, aiccList, bicList) # value: modelsMatrix
-    temp <- modelsMatrix[[tree]]; temp[!is.na(temp)] <- 1
+    bicList <- icObject[[tree]]$BICwi 
+    modelsMatrix[[tree]] <- cbind(aicList, aiccList, bicList) # grab IC weights for each model in the tree and stick them all into modelsMatrix
+    temp <- modelsMatrix[[tree]]; temp[!is.na(temp)] <- 1 # 
     nmodelsMatrix <- nmodelsMatrix + replace.matrix(temp, NA, 0) # nmodelsMatrix gets a 1 for each model present, a 0 for each model not present
     outMatrix <- outMatrix + replace.matrix(modelsMatrix[[tree]], NA, 0) # replace NAs with 0 so that sum works correctly at the end
     sigmaSqVector[tree] <- weighted.mean(hansenBatch$hansens[[tree]][, 'sigma.squared'], bicList, na.rm = T)
@@ -58,7 +58,7 @@
                                                # equal to zero in any trees that lack that model. Weights sum to 1.0, and they
                                                # factor in the posterior probability (or bootstrap proportion) for each model. 
                                                # Which to use? they are both informative, but this one has the desirable property of
-                                               # acting being a proper probability distribution, albeit one that confounds clade support
+                                               # being a proper probability distribution, albeit one that confounds clade support
                                                # with model support.
   
   # sum over nodes
@@ -92,7 +92,7 @@
   modelAvgAlpha <- mean(alphaVector, na.rm = T)
   modelAvgSigmaSq <- mean(sigmaSqVector, na.rm = T)
   warning('the K-matrix as currently implemented is calculated over the all-nodes (normalized) weights.')
-  outdata <- list(modelsMatrix = modelsMatrix, weightsMatrix = list(unnormalized = weightsMatrix.unnormalized, allNodes = weightsMatrix.allNodes), nodeWeightsMatrix = list(unnormalized = nodeWeightsMatrix.unnormalized, allNodes = nodeWeightsMatrix.allNodes), kMatrix = kMatrix, modelAvgAlpha = modelAvgAlpha, modelAvgSigmaSq = modelAvgSigmaSq, thetaMatrix = thetaMatrix)
+  outdata <- list(modelsMatrix = modelsMatrix, nmodelsMatrix = nmodelsMatrix, weightsMatrix = list(unnormalized = weightsMatrix.unnormalized, allNodes = weightsMatrix.allNodes), nodeWeightsMatrix = list(unnormalized = nodeWeightsMatrix.unnormalized, allNodes = nodeWeightsMatrix.allNodes), kMatrix = kMatrix, modelAvgAlpha = modelAvgAlpha, modelAvgSigmaSq = modelAvgSigmaSq, thetaMatrix = thetaMatrix)
   class(outdata) <- 'hansenSummary'
   return(outdata)
 }
@@ -105,16 +105,15 @@
 
 print.hansenSummary <- function(hansenSummary) {
 ## This just formats a hansenSummary object so that it is readable on the screen; you can still store the summary object and extract elements as needed
-  message(paste("Summarizing hansenBatch analyses over", length(hansenSummary$modelsMatrix), "trees and", dim(hansenSummary$modelsMatrix[[1]])[1], "models"))
-  message("ESTIMATED SUPPORT FOR CHANGES OCCURRING AT DESIGNATED NODES\nThese estimates are only valid if (1) the maximum number of regimes permitted approximates the actual maximum;\n(2) nodes at which changes actually occurred were included among the nodes being tested; and\n(3) any matrix you may have utilized to conduct analysis was balanced, such that all nodes are present in the same number of models.\n\n")
-  message("Averaged only over models containing that node:")
-  print(hansenSummary$nodeWeightsMatrix$unnormalized)
-  message("\nAveraged over all nodes, thus multiplying the clade distribution by the mean support for the model:")
+  message(paste("\nSummarizing hansenBatch analyses over", length(hansenSummary$modelsMatrix), "trees and", dim(hansenSummary$modelsMatrix[[1]])[1], "models"))
+  message("-----------------------------------------------------------")
+  message("ESTIMATED SUPPORT FOR CHANGES OCCURRING AT DESIGNATED NODES")
+  message("Averaged over all trees, thus multiplying the clade distribution by the mean support for the model:")
   print(hansenSummary$nodeWeightsMatrix$allNodes)
   message("\nESTIMATED SUPPORT FOR NUMBER OF PARAMETERS IN THE MODEL")
   message("The properties of this support value have not been studied and are likely to be biased strongly toward the median value of\nK, as K is largest at the median values (they are distributed according to Stirling numbers of the first kind).")
   print(hansenSummary$kMatrix)
-  message("\nMODEL AVERAGED PARAMETERS")
+  message("\nMODEL-AVERAGED PARAMETERS")
   message(paste("alpha =", hansenSummary$modelAvgAlpha))
   message(paste("sigma^2 =", hansenSummary$modelAvgSigmaSq))
   if(any(dim(hansenSummary$thetaMatrix) > 12)) message(paste("theta matrix is too long to display; access through the summary object"))

Modified: pkg/man/runBatchHansen.Rd
===================================================================
--- pkg/man/runBatchHansen.Rd	2008-12-02 18:27:35 UTC (rev 66)
+++ pkg/man/runBatchHansen.Rd	2008-12-02 20:18:21 UTC (rev 67)
@@ -51,6 +51,12 @@
 \details{
     This function is the primary function for estimating the probability of a change in with changes in regime at the nodes
     specified, with changes corresponding to permutations of the nodes fed to the function. See vignette for additional details.
+    
+    Note: summary estimates of the probability of change on a branch are only valid if 
+    (1) the maximum number of regimes permitted approximates the actual maximum;
+    (2) nodes at which changes actually occurred were included among the nodes being tested; and
+    (3) any matrix you may have utilized to conduct analysis was balanced, 
+        such that all nodes are present in the same number of models.
 }
 \value{
   XXX

Added: pkg/misc/batchHansenVignette.Rnw
===================================================================
--- pkg/misc/batchHansenVignette.Rnw	                        (rev 0)
+++ pkg/misc/batchHansenVignette.Rnw	2008-12-02 20:18:21 UTC (rev 67)
@@ -0,0 +1,103 @@
+\documentclass[a4paper]{article}
+\usepackage{fancyvrb}
+\usepackage{color}
+
+\author{Andrew Hipp}
+\title{How Should I Interpret Node Probabilities When there is Phylogenetic Uncertainty?}
+
+\begin{document}
+
+\maketitle
+
+\section{Introduction}
+
+\code{mattice} summarizes the support for change occurring at a node in two ways. The first is to summarize
+
+
+<<>>=
+x <- rnorm(1000)
+x[1:10]
+@ 
+
+Both input and output are printed. Suppose we want only the output:
+
+<<echo=false>>=
+y <- rnorm(1000)
+y[1:10]
+ls()
+@ 
+
+The options are specified within the `< < > > =' starting the R chunck. A
+related option is `quiet=true' which suppresses the input, so some R
+commands can be executed silently:
+
+<<echo=false,quiet=true>>=
+z <- rnorm(1000)
+@ 
+
+Let's check that it has run with a normal R chunck:
+
+<<>>=
+ls()
+@ 
+
+\section{Including Figures}
+
+To have a plot included, we must add the option `fig=true' in the R
+chunck header:
+
+<<fig=true>>=
+hist(x)
+@ 
+
+This will create two files in the working directory: a PDF version of
+the plot, and an EPS one. The names of these files are created
+automatically by Sweave. In case you don't know what is your R working directory:
+
+<<>>=
+getwd()
+@ 
+
+If you want to make many plots, it's useful to work in a distinct (sub)directory.
+
+You can't really have several plot commands within the same R chunck,
+but you can use `layout' to make several plots on the same figures:
+
+<<fig=true>>=
+layout(matrix(c(1, 2, 1, 3), 2))
+plot(x, y)
+hist(x)
+hist(y)
+@ 
+
+Any R command can be put in the R chuncks. Let's read some data with ape:
+
+<<>>=
+library(ape)
+x <- read.dna("../Projets/ape/dev/ape/data/woodmouse.txt")
+x
+@ 
+
+The nice thing with Sweave is that you know exactly which data file
+has been used since the code has been executed at the same time than
+the document is printed!
+
+Just for fun, let's plot the NJ tree done with a K80 distance:
+
+<<fig=true>>=
+plot(nj(dist.dna(x)))
+add.scale.bar()
+@ 
+
+Not the proper way to do this kind of analysis, of course ;)
+
+\section{Conclusion}
+
+I hope this starter will give you the feel to write many Sweave
+documents. A final word about LaTeX:    it
+ignores                               extra
+white                spaces
+           between
+words.
+
+\end{document}