[Adephylo-commits] r180 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Sep 26 19:27:19 CEST 2012
Author: jombart
Date: 2012-09-26 19:27:19 +0200 (Wed, 26 Sep 2012)
New Revision: 180
Added:
pkg/R/dibas.R
Removed:
pkg/R/tree.group.R
Log:
New version of the method I had left on the side.
Working. Need to publish this.
Copied: pkg/R/dibas.R (from rev 179, pkg/R/tree.group.R)
===================================================================
--- pkg/R/dibas.R (rev 0)
+++ pkg/R/dibas.R 2012-09-26 17:27:19 UTC (rev 180)
@@ -0,0 +1,370 @@
+########
+## dibas ('distance-based group assignment')
+#########
+
+dibas <- function (x, ...) UseMethod("dibas")
+
+
+
+
+
+################
+## dibas.matrix
+################
+dibas.matrix <- function(x, grp, method=c("default","leaveOneOut"), ...){
+ method <- match.arg(method)
+ ## DECLARE SOME VARIABLES, HANDLE ARGUMENTS ##
+ grp <- factor(grp)
+ K <- length(LEV <- levels(grp))
+ N <- nrow(x)
+
+
+ ## AUXILIARY FUNCTIONS ##
+ ## COMPUTE LOG AND AVOIDS -INF
+ logprob <- function(prob){
+ res <- log(prob)
+ res[res< -1e20] <- -1e20
+ return(res)
+ }
+
+ ## FUNCTION TO GET A VECTOR OF PAIRWISE DISTANCE WITHIN ONE GROUP
+ ## M: matrix of distances
+ ## fac: factor
+ ## val: level of the chosen group
+ getdist.within.grp <- function(M, fac, val){ # val is one level of fac
+ temp <- M[fac==val,fac==val]
+ return(temp[lower.tri(temp)])
+ }
+
+
+ ## FUNCTION TO GET LIST OF VECTORS OF PAIRWISE DISTANCES WITHIN GROUP, FOR EVERY GROUP
+ getdist.within.allgrp <- function(M, fac){
+ res <- lapply(LEV, function(e) getdist.within.grp(M, fac, e))
+ names(res) <- LEV
+ return(res)
+ }
+
+
+ ## FUNCTION TO GET THE DISTANCES OF AN INDIVIDUAL TO THE GROUPS
+ getdist.indiv <- function(i){
+ return(split(x[i,-i],grp[-i]))
+ }
+
+ ## FUNCTION TO COMPUTE MEMBERSHIP PROBA FOR ONE INDIV
+ ## i: index of an individual
+ ## distrib.param[1,]: vector of the means of with-grp distance distributions
+ ## distrib.param[2,]: vector of the sds of with-grp distance distributions
+ getproba.ind <- function(i, distrib.param){
+ temp <- getdist.indiv(i)
+ out <- sapply(1:K, function(k) mean(logprob(dnorm(temp[[k]], distrib.param[1,k],distrib.param[1,k]))))
+ return(exp(out)/sum(exp(out)))
+ }
+
+
+
+ ## CORE COMPUTATIONS ##
+ ## DEFAULT: DENSITY BASED ON ENTIRE SAMPLE ##
+ if(method=="default"){
+ ## get distance data for each group
+ temp <- getdist.within.allgrp(x, grp)
+
+ ## parameter of the group distributions
+ ## matrix of within-group dist: col=grp, row1=mean,row2=sd
+ distrib.param <- sapply(temp, function(e) return(c(mean(e,na.rm=TRUE),sd(e,na.rm=TRUE))))
+
+ ## result: row=indiv, col=groups, values=membership proba
+ prob <- t(sapply(1:N, getproba.ind, distrib.param))
+ }
+
+
+ ## LEAVEONEOUT: DENSITY EXCLUDES THE INDIV FOR WHICH PROBA IS SEEKED ##
+ if(method=="leaveOneOut"){
+ ## get within-group distance data for each individual
+ temp <- lapply(1:N, function(i) getdist.within.allgrp(x[-i,-i], grp[-i])) # grp density data for each individual
+
+ ## parameter of the group distributions
+ ## list of matrices, one per individual
+ ## matrix of within-group dist: col=grp, row1=mean,row2=sd
+ distrib.param <- lapply(1:N, function(i) sapply(temp[[i]], function(e) return(c(mean(e,na.rm=TRUE),sd(e,na.rm=TRUE)))))
+
+ ## result: row=indiv, col=groups, values=membership proba
+ prob <- t(sapply(1:N, function(i) getproba.ind(i, distrib.param[[i]])))
+ }
+
+
+ ## SHAPE MEMBERSHIP PROBABILITIES MATRIX ##
+ colnames(prob) <- LEV
+ rownames(prob) <- rownames(x)
+
+
+ ## FIND GROUP ASSIGNMENTS ##
+ temp <- factor(colnames(prob)[apply(prob,1, which.max)])
+ annot <- rep(" ", N)
+ annot[as.character(grp)!=as.character(temp)] <- "!"
+ groups <- data.frame(observed=grp, inferred=temp, annot=annot)
+ ##rownames(groups) <- rownames(prob)
+
+
+ ## BUILD / RETURN RESULT ##
+ ## get proportion of correct assignment
+ propcorrect <- mean(annot==" ")
+ propcorrect.bygroup <- tapply(annot==" ", grp, mean)
+
+ ## get summary of assignments
+ grp.tab <- table(observed=groups[,1], assigned=groups[,2])
+
+ ## get assignability
+ ## i.e. how many times better than at random is assignment?
+ ## 0 = grp very unlikely
+ ## 1 = assignment no better than at random
+ ## >1 = better than random (e.g. 2 = twice as better as at random)
+ temp <- table(grp)/N
+ probActualGrp <- sapply(1:N, function(i) prob[i, as.character(grp[i])])
+ assign.idx <- probActualGrp / as.numeric(temp[as.character(grp)])
+ assignStat <- list(assign.idx=assign.idx, mean=mean(assign.idx), grp.mean=tapply(assign.idx,grp,mean))
+
+
+ ##res <- list(prob=prob,groups=groups, mean.correct=propcorrect, prop.correct=propcorrect.bygroup)
+ res <- list(prob=prob, groups=groups, mean.ok=propcorrect, grp.tab=grp.tab, assignStat=assignStat)
+
+ return(res)
+} # end dibas.matrix
+
+
+
+
+
+
+###############
+## dibas.phylo
+###############
+dibas.phylo <- function(x, grp, method=c("default","leaveOneOut"), ...){
+ if(!require(ape)) stop("ape package is required")
+ if(!inherits(x,"phylo")) stop("x is not a phylo object")
+
+ res <- dibas.matrix(cophenetic.phylo(x), grp, method)
+
+ return(res)
+} # end dibas.phylo
+
+
+
+
+
+
+
+##############
+## dibas.dist
+##############
+dibas.dist <- function(x, grp, method=c("default","leaveOneOut"), ...){
+
+ res <- dibas.matrix(as.matrix(x), grp, method)
+
+ return(res)
+} # end dibas.phylo
+
+
+
+
+
+
+
+##############################
+## simulate data with groups
+##############################
+simDatGroups <- function(k=2, p=1000, n=10, mu=0, sigma=1){
+ ## RECYCLE ARGUMENTS ##
+ n <- rep(n, length=k)
+ mu <- rep(mu, length=k)
+ sigma <- rep(sigma, length=k)
+
+
+ ## GENERATE DATA ##
+ dat <- list()
+ for(i in 1:k){
+ dat[[i]] <- replicate(p, rnorm(n[i], mu[i], sigma[i]))
+ }
+
+ dat <- Reduce(rbind,dat)
+ rownames(dat) <- paste("ind", 1:nrow(dat))
+
+ ## SHAPE OUTPUT ##
+ grp <- factor(paste("grp", rep(1:k, n)))
+ res <- list(dat=dat, grp=grp)
+ return(res)
+} # end simDatGroups
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+########## OLD CODE, USING A DIFFERENT APPROACH ###########
+## THIS WAS USING A KERNEL APPROX OF THE DISTRIBUTION OF
+## WITHIN GROUP DISTANCES. NOT WORKING BECAUSE THERE COULD BE
+## MORE THAN ONE MODE, SO GROUPS COULD BE PRETTY SPLIT ACROSS
+## THE PHYLOGENY
+##
+## ##############
+## ## dibas
+## ##############
+## dibas <- function(x, grp, method=c("default","leaveOneOut","bootstrap"), n.dens=4096, plot=TRUE,
+## warn.lab=FALSE, dat=NULL, FUN=NULL, n.boot=10, ...){
+## if(!require(ape)) stop("ape package is required")
+## if(!inherits(x,"phylo")) stop("x is not a phylo object")
+## method <- match.arg(method)
+
+## if(method=="bootstrap" && (is.null(dat) || is.null(FUN))) stop("dat and FUN must be provided for the bootstrap procedure")
+## if(warn.lab && !is.null(dat) && !identical(x$tip.label,rownames(dat))) warning("Tip labels in x and rownames of dat differ \nplease make sure the same order is used in x, grp, and dat")
+
+## ## DECLARE SOME VARIABLES, HANDLE ARGUMENTS ##
+## grp <- factor(grp)
+## K <- length(LEV <- levels(grp))
+## N <- length(x$tip.label)
+## D <- cophenetic.phylo(x)
+## THRES <- 1e-320 # densities < THRES will be set to THRES to avoid log(x)=-Inf
+
+
+## ## RE-ORDER GRP AND DATA MATRIX AFTER TIP LABELS ##
+## if(!is.null(dat)){
+## if(is.null(rownames(dat))) rownames(dat) <- x$tip.label
+## if(!all(x$tip.label %in% rownames(dat))) stop("some tips do not have data matching their label")
+## grp <- grp[match(x$tip.label, rownames(dat))] # grp is assumed to be in the same order as 'dat'
+## dat <- dat[x$tip.label,,drop=FALSE]
+## }
+
+## #### AUXILIARY FUNCTIONS ####
+## ## FUNCTION TO ESTIMATE A DENSITY AT A SERIES OF POINTS ##
+## compute.dens <- function(dens, values){
+## pred.y <- double(n <- length(values))
+## return(.C("predict_density", dens$x, dens$y, length(dens$x), as.double(values), pred.y, n, PACKAGE="adephylo")[[5]])
+## }
+
+
+## ## FUNCTION TO GET A VECTOR OF PAIRWISE DISTANCE WITHIN ONE GROUP ##
+## getdist.within.grp <- function(M, fac, val){ # val is one level of fac
+## temp <- M[fac==val,fac==val]
+## return(temp[lower.tri(temp)])
+## }
+
+
+## ## FUNCTION TO GET A VECTOR OF PAIRWISE DISTANCES WITHIN GROUP, FOR ALL GROUPS ##
+## getdist.within.allgrp <- function(M, fac){
+## res <- lapply(LEV, function(e) getdist.within.grp(M, fac, e))
+## names(res) <- LEV
+## return(res)
+## }
+
+
+## ## FUNCTION TO GET PROBA FOR ONE INDIV / ONE GROUP ##
+## getprob.ind <- function(i, g, dens.per.grp){ # i: idx of indiv; g: idx of a group
+## temp <- 1:ncol(D)
+## dens <- compute.dens(dens.per.grp[[g]], D[i,grp==LEV[g] & temp!=i])
+## dens[dens < THRES] <- THRES
+## res <- exp(mean(log(dens)))
+## return(res)
+## }
+
+
+## ## FUNCTION TO GET PROBA FOR ALL INDIV / ONE GROUP ##
+## if(method=="leaveOneOut"){
+## getprob.grp <- function(g, dens.per.ind.grp){ # g: idx of a group; dens.per.ind.grp is a list giving grp density for each indiv
+## return(sapply(1:N, function(i) getprob.ind(i,g,dens.per.ind.grp[[i]])))
+## }
+## } else {
+## getprob.grp <- function(g, dens.per.grp){ # g: idx of a group
+## return(sapply(1:N, function(i) getprob.ind(i,g,dens.per.grp)))
+## }
+## }
+
+
+## ## FUNCTION TO GET A BOOTSTRAPPED TREE AND MATCHING GRP ##
+## getboot.tree.grp <- function(){
+## samp <- sample(1:N,replace=TRUE)
+## tre <- FUN(dat[samp,,drop=FALSE])
+## newgrp <- factor(grp[samp])
+## return(list(tre=tre, grp=newgrp))
+## }
+
+
+## #### CORE COMPUTATIONS ####
+## ## DEFAULT: DENSITY BASED ON SAMPLE ##
+## if(method=="default"){
+## dens.dat <- getdist.within.allgrp(D, grp) # density data for each group
+## list.dens <- lapply(dens.dat, density, n=n.dens, ...) # densities for each group
+## }
+
+
+## ## LEAVEONEOUT: GRP DENSITY EXCLUDES THE INDIV FOR WHICH PROBA IS SEEKED ##
+## if(method=="leaveOneOut"){
+## dens.dat <- lapply(1:N, function(i) getdist.within.allgrp(D[-i,-i], grp[-i])) # grp density data for each individual
+## list.dens <- lapply(1:N, function(i) lapply(dens.dat[[i]], density, n=n.dens, ...)) # densities for each group
+## }
+
+
+## ## BOOTSTRAP: DENSITY BASED ON BOOTSTRAPPIN INDIVIDUALS ##
+## if(method=="bootstrap"){
+## ## GET BOOTSTRAPPED TREES ##
+## list.trees.grp <- lapply(1:n.boot, function(i) getboot.tree.grp())
+
+
+## ## GET WITHIN-GROUP DISTANCES FOR EACH BOOTSTRAP SAMPLE ##
+## list.D <- lapply(list.trees.grp, function(e) cophenetic.phylo(e$tre))
+## temp <- lapply(1:n.boot, function(i) getdist.within.allgrp(list.D[[i]], list.trees.grp[[i]]$grp)) # for each replicate, list of distances within for each grp
+
+
+## ## GET DENSITIES FOR EACH GROUP ##
+## dens.dat <- lapply(LEV, function(onegroup) unlist(lapply(temp, function(e) e[[onegroup]]))) # density data for each group
+## list.dens <- lapply(dens.dat, density, n=n.dens, ...) # densities for each group
+## }
+
+
+## ## PLOT DENSITIES ##
+## if(method != "leaveOneOut" && plot){
+## find.mfrow <- function(i) {
+## nrow <- ceiling(sqrt(i))
+## ncol <- ceiling(i/ceiling(sqrt(i)))
+## return(c(nrow,ncol))
+## }
+## par(mfrow = find.mfrow(K))
+## for(i in 1:K){
+## plot(list.dens[[i]], main=paste("Group:",LEV[i]),xlab="Within-group pairwise distance",ylab="Density", col="blue")
+## points(dens.dat[[i]], rep(0,length(dens.dat[[i]])), pch="|", col="blue")
+## }
+## }
+
+
+## ## COMPUTE MEMBERSHIP PROBABILITIES ##
+## prob <- matrix(unlist(lapply(1:K, getprob.grp, list.dens)), ncol=K)
+## prob <- prop.table(prob,1)
+## colnames(prob) <- LEV
+## rownames(prob) <- x$tip.label
+
+
+## ## FIND GROUP ASSIGNMENTS ##
+## temp <- factor(colnames(prob)[apply(prob,1, which.max)])
+## annot <- rep(" ", N)
+## annot[as.character(grp)!=as.character(temp)] <- "!"
+## groups <- data.frame(observed=grp, inferred=temp, annot=annot)
+## ##rownames(groups) <- rownames(prob)
+
+
+## ## BUILD / RETURN RESULT ##
+## propcorrect <- mean(annot==" ")
+## ## propcorrect.bygroup <- tapply(annot==" ", grp, mean)
+## assignability <- mean((apply(prob,1,max)-.5)/.5)
+## ##res <- list(prob=prob,groups=groups, mean.correct=propcorrect, prop.correct=propcorrect.bygroup)
+## res <- list(prob=prob, groups=groups, assigndex=assignability, mean.correct=propcorrect)
+
+## return(res)
+## } # end dibas
Deleted: pkg/R/tree.group.R
===================================================================
--- pkg/R/tree.group.R 2012-02-16 20:12:07 UTC (rev 179)
+++ pkg/R/tree.group.R 2012-09-26 17:27:19 UTC (rev 180)
@@ -1,190 +0,0 @@
-##############
-## treeGroup
-##############
-treeGroup <- function(x, grp, method=c("default","leaveOneOut","bootstrap"), n.dens=4096, plot=TRUE,
- warn.lab=FALSE, dat=NULL, FUN=NULL, n.boot=10, ...){
- if(!require(ape)) stop("ape package is required")
- if(!inherits(x,"phylo")) stop("x is not a phylo object")
- method <- match.arg(method)
-
- if(method=="bootstrap" && (is.null(dat) || is.null(FUN))) stop("dat and FUN must be provided for the bootstrap procedure")
- if(warn.lab && !is.null(dat) && !identical(x$tip.label,rownames(dat))) warning("Tip labels in x and rownames of dat differ \nplease make sure the same order is used in x, grp, and dat")
-
- ## DECLARE SOME VARIABLES, HANDLE ARGUMENTS ##
- grp <- factor(grp)
- K <- length(LEV <- levels(grp))
- N <- length(x$tip.label)
- D <- cophenetic.phylo(x)
- THRES <- 1e-320 # densities < THRES will be set to THRES to avoid log(x)=-Inf
-
-
- ## RE-ORDER GRP AND DATA MATRIX AFTER TIP LABELS ##
- if(!is.null(dat)){
- if(is.null(rownames(dat))) rownames(dat) <- x$tip.label
- if(!all(x$tip.label %in% rownames(dat))) stop("some tips do not have data matching their label")
- grp <- grp[match(x$tip.label, rownames(dat))] # grp is assumed to be in the same order as 'dat'
- dat <- dat[x$tip.label,,drop=FALSE]
- }
-
- #### AUXILIARY FUNCTIONS ####
- ## FUNCTION TO ESTIMATE A DENSITY AT A SERIES OF POINTS ##
- compute.dens <- function(dens, values){
- pred.y <- double(n <- length(values))
- return(.C("predict_density", dens$x, dens$y, length(dens$x), as.double(values), pred.y, n, PACKAGE="adephylo")[[5]])
- }
-
-
- ## FUNCTION TO GET A VECTOR OF PAIRWISE DISTANCE WITHIN ONE GROUP ##
- getdist.within.grp <- function(M, fac, val){ # val is one level of fac
- temp <- M[fac==val,fac==val]
- return(temp[lower.tri(temp)])
- }
-
-
- ## FUNCTION TO GET A VECTOR OF PAIRWISE DISTANCES WITHIN GROUP, FOR ALL GROUPS ##
- getdist.within.allgrp <- function(M, fac){
- res <- lapply(LEV, function(e) getdist.within.grp(M, fac, e))
- names(res) <- LEV
- return(res)
- }
-
-
- ## FUNCTION TO GET PROBA FOR ONE INDIV / ONE GROUP ##
- getprob.ind <- function(i, g, dens.per.grp){ # i: idx of indiv; g: idx of a group
- temp <- 1:ncol(D)
- dens <- compute.dens(dens.per.grp[[g]], D[i,grp==LEV[g] & temp!=i])
- dens[dens < THRES] <- THRES
- res <- exp(mean(log(dens)))
- return(res)
- }
-
-
- ## FUNCTION TO GET PROBA FOR ALL INDIV / ONE GROUP ##
- if(method=="leaveOneOut"){
- getprob.grp <- function(g, dens.per.ind.grp){ # g: idx of a group; dens.per.ind.grp is a list giving grp density for each indiv
- return(sapply(1:N, function(i) getprob.ind(i,g,dens.per.ind.grp[[i]])))
- }
- } else {
- getprob.grp <- function(g, dens.per.grp){ # g: idx of a group
- return(sapply(1:N, function(i) getprob.ind(i,g,dens.per.grp)))
- }
- }
-
-
- ## FUNCTION TO GET A BOOTSTRAPPED TREE AND MATCHING GRP ##
- getboot.tree.grp <- function(){
- samp <- sample(1:N,replace=TRUE)
- tre <- FUN(dat[samp,,drop=FALSE])
- newgrp <- factor(grp[samp])
- return(list(tre=tre, grp=newgrp))
- }
-
-
- #### CORE COMPUTATIONS ####
- ## DEFAULT: DENSITY BASED ON SAMPLE ##
- if(method=="default"){
- dens.dat <- getdist.within.allgrp(D, grp) # density data for each group
- list.dens <- lapply(dens.dat, density, n=n.dens, ...) # densities for each group
- }
-
-
- ## LEAVEONEOUT: GRP DENSITY EXCLUDES THE INDIV FOR WHICH PROBA IS SEEKED ##
- if(method=="leaveOneOut"){
- dens.dat <- lapply(1:N, function(i) getdist.within.allgrp(D[-i,-i], grp[-i])) # grp density data for each individual
- list.dens <- lapply(1:N, function(i) lapply(dens.dat[[i]], density, n=n.dens, ...)) # densities for each group
- }
-
-
- ## BOOTSTRAP: DENSITY BASED ON BOOTSTRAPPIN INDIVIDUALS ##
- if(method=="bootstrap"){
- ## GET BOOTSTRAPPED TREES ##
- list.trees.grp <- lapply(1:n.boot, function(i) getboot.tree.grp())
-
-
- ## GET WITHIN-GROUP DISTANCES FOR EACH BOOTSTRAP SAMPLE ##
- list.D <- lapply(list.trees.grp, function(e) cophenetic.phylo(e$tre))
- temp <- lapply(1:n.boot, function(i) getdist.within.allgrp(list.D[[i]], list.trees.grp[[i]]$grp)) # for each replicate, list of distances within for each grp
-
-
- ## GET DENSITIES FOR EACH GROUP ##
- dens.dat <- lapply(LEV, function(onegroup) unlist(lapply(temp, function(e) e[[onegroup]]))) # density data for each group
- list.dens <- lapply(dens.dat, density, n=n.dens, ...) # densities for each group
- }
-
-
- ## PLOT DENSITIES ##
- if(method != "leaveOneOut" && plot){
- find.mfrow <- function(i) {
- nrow <- ceiling(sqrt(i))
- ncol <- ceiling(i/ceiling(sqrt(i)))
- return(c(nrow,ncol))
- }
- par(mfrow = find.mfrow(K))
- for(i in 1:K){
- plot(list.dens[[i]], main=paste("Group:",LEV[i]),xlab="Within-group pairwise distance",ylab="Density", col="blue")
- points(dens.dat[[i]], rep(0,length(dens.dat[[i]])), pch="|", col="blue")
- }
- }
-
-
- ## COMPUTE MEMBERSHIP PROBABILITIES ##
- prob <- matrix(unlist(lapply(1:K, getprob.grp, list.dens)), ncol=K)
- prob <- prop.table(prob,1)
- colnames(prob) <- LEV
- rownames(prob) <- x$tip.label
-
-
- ## FIND GROUP ASSIGNMENTS ##
- temp <- factor(colnames(prob)[apply(prob,1, which.max)])
- annot <- rep(" ", N)
- annot[as.character(grp)!=as.character(temp)] <- "!"
- groups <- data.frame(observed=grp, inferred=temp, annot=annot)
- ##rownames(groups) <- rownames(prob)
-
-
- ## BUILD / RETURN RESULT ##
- propcorrect <- mean(annot==" ")
- ## propcorrect.bygroup <- tapply(annot==" ", grp, mean)
- assignability <- mean((apply(prob,1,max)-.5)/.5)
- ##res <- list(prob=prob,groups=groups, mean.correct=propcorrect, prop.correct=propcorrect.bygroup)
- res <- list(prob=prob, groups=groups, assigndex=assignability, mean.correct=propcorrect)
-
- return(res)
-} # end treeGroup
-
-
-
-
-
-
-
-
-##############################
-## simulate data with groups
-##############################
-simDatGroups <- function(k=2, p=1000, n=10, mu=0, sigma=1){
- ## RECYCLE ARGUMENTS ##
- n <- rep(n, length=k)
- mu <- rep(mu, length=k)
- sigma <- rep(sigma, length=k)
-
-
- ## GENERATE DATA ##
- dat <- list()
- for(i in 1:k){
- dat[[i]] <- replicate(p, rnorm(n[i], mu[i], sigma[i]))
- }
-
- dat <- Reduce(rbind,dat)
- rownames(dat) <- paste("ind", 1:nrow(dat))
-
- ## SHAPE OUTPUT ##
- grp <- factor(paste("grp", rep(1:k, n)))
- res <- list(dat=dat, grp=grp)
- return(res)
-}
-
-
-
-
-
More information about the Adephylo-commits
mailing list