[adegenet-commits] r838 - in pkg: . R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Mar 4 13:35:04 CET 2011
Author: jombart
Date: 2011-03-04 13:35:04 +0100 (Fri, 04 Mar 2011)
New Revision: 838
Modified:
pkg/DESCRIPTION
pkg/R/dapc.R
pkg/R/glFunctions.R
pkg/man/glPca.Rd
Log:
Added an example for glPca.
Added possibility to return/provide glPca with matrix of dot product - essential to avoid wasting time in DAPC for genlight.
Modified: pkg/DESCRIPTION
===================================================================
--- pkg/DESCRIPTION 2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/DESCRIPTION 2011-03-04 12:35:04 UTC (rev 838)
@@ -10,6 +10,6 @@
Suggests: ade4, genetics, hierfstat, spdep, tripack, ape, pegas, graph, RBGL, seqinr, multicore
Depends: methods, MASS
Description: Classes and functions for genetic data analysis within the multivariate framework.
-Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R SNPbin.R glHandle.R glPlot.R glFunctions.R zzz.R
+Collate: classes.R basicMethods.R handling.R auxil.R setAs.R SNPbin.R glHandle.R glFunctions.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R glPlot.R zzz.R
License: GPL (>=2)
LazyLoad: yes
Modified: pkg/R/dapc.R
===================================================================
--- pkg/R/dapc.R 2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/R/dapc.R 2011-03-04 12:35:04 UTC (rev 838)
@@ -3,9 +3,9 @@
########
dapc <- function (x, ...) UseMethod("dapc")
-#################
+###################
## dapc.data.frame
-#################
+###################
dapc.data.frame <- function(x, grp, n.pca=NULL, n.da=NULL,
center=TRUE, scale=FALSE, var.contrib=TRUE,
pca.select=c("nbEig","percVar"), perc.pca=NULL, ..., dudi=NULL){
@@ -23,9 +23,9 @@
## SOME GENERAL VARIABLES
N <- nrow(x)
+ REDUCEDIM <- is.null(dudi)
-
- if(is.null(dudi)){ # if no dudi provided
+ if(REDUCEDIM){ # if no dudi provided
## PERFORM PCA ##
maxRank <- min(dim(x))
pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
@@ -173,15 +173,91 @@
-##################
-# Function dapc.dudi
-##################
+######################
+## Function dapc.dudi
+######################
dapc.dudi <- function(x, grp, ...){
return(dapc.data.frame(x$li, grp, dudi=x, ...))
}
+
+
+#################
+## dapc.genlight
+#################
+dapc.genlight <- function(x, pop=NULL, n.pca=NULL, n.da=NULL,
+ scale=FALSE, var.contrib=TRUE,
+ pca.select=c("nbEig","percVar"), perc.pca=NULL, glPca=NULL, ...){
+ ## FIRST CHECKS
+ if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+ if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+
+ if(!inherits(x, "genlight")) stop("x must be a genlight object.")
+
+ if(is.null(pop)) {
+ pop.fac <- pop(x)
+ } else {
+ pop.fac <- pop
+ }
+
+ if(is.null(pop.fac)) stop("x does not include pre-defined populations, and `pop' is not provided")
+
+
+
+ ## PERFORM PCA ##
+ maxRank <- min(c(nInd(x), nLoc(x)))
+
+ if(REDUCEDIM){ # if no dudi provided
+ maxRank <- min(dim(x))
+ pcaX <- glPca(x, center = center, scale = scale, nf=maxRank, loadings=FALSE, ...)
+ } else { # else use the provided dudi
+ pcaX <- glPca
+ }
+ cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+
+ ## select the number of retained PC for PCA
+ if(is.null(n.pca) & pca.select=="nbEig"){
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
+ cat("Choose the number PCs to retain (>=1): ")
+ n.pca <- as.integer(readLines(n = 1))
+ }
+
+ if(is.null(perc.pca) & pca.select=="percVar"){
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
+ cat("Choose the percentage of variance to retain (0-100): ")
+ nperc.pca <- as.numeric(readLines(n = 1))
+ }
+
+ ## get n.pca from the % of variance to conserve
+ if(!is.null(perc.pca)){
+ n.pca <- min(which(cumVar >= perc.pca))
+ if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+ if(n.pca<1) n.pca <- 1
+ }
+
+
+ ## recompute PCA with loadings
+ if(REDUCEDIM){ # if no dudi provided
+ maxRank <- min(dim(x))
+ pcaX <- glPca(x, center = center, scale = scale, nf=n.pca, loadings=TRUE, ...)
+ }
+
+
+
+
+ res$call <- match.call()
+
+ return(res)
+
+} # end dapc.genlight
+
+
+
+
+
+
######################
# Function print.dapc
######################
@@ -216,7 +292,7 @@
if(!is.null(x$var.contr)){
sumry <- array("", c(6, 4), list(1:6, c("data.frame", "nrow", "ncol", "content")))
} else {
- sumry <- array("", c(5, 4), list(1:5, c("data.frame", "nrow", "ncol", "content")))
+ sumry <- array("", c(5, 4), list(1:5, c("data.frame", "nrow", "ncol", "content")))
}
sumry[1, ] <- c("$tab", nrow(x$tab), ncol(x$tab), "retained PCs of PCA")
sumry[2, ] <- c("$loadings", nrow(x$loadings), ncol(x$loadings), "loadings of variables")
Modified: pkg/R/glFunctions.R
===================================================================
--- pkg/R/glFunctions.R 2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/R/glFunctions.R 2011-03-04 12:35:04 UTC (rev 838)
@@ -225,7 +225,8 @@
## PCA for genlight objects
##
glPca <- function(x, center=TRUE, scale=FALSE, nf=NULL, loadings=TRUE, alleleAsUnit=FALSE,
- useC=TRUE, multicore=require("multicore"), n.cores=NULL){
+ useC=TRUE, multicore=require("multicore"), n.cores=NULL,
+ returnDotProd=FALSE, matDotProd=NULL){
if(!inherits(x, "genlight")) stop("x is not a genlight object")
## COMPUTE MEANS AND VARIANCES ##
@@ -242,95 +243,100 @@
myPloidy <- ploidy(x)
+ ## NEED TO COMPUTE DOT PRODUCTS ##
+ if(is.null(matDotProd)){
- ## == if non-C code is used ==
- if(!useC){
- if(multicore && !require(multicore)) stop("multicore package requested but not installed")
- if(multicore && is.null(n.cores)){
- n.cores <- multicore:::detectCores()
- }
+ ## == if non-C code is used ==
+ if(!useC){
+ if(multicore && !require(multicore)) stop("multicore package requested but not installed")
+ if(multicore && is.null(n.cores)){
+ n.cores <- multicore:::detectCores()
+ }
- ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
- ## to be fast, a particular function is defined for each case of centring/scaling
+ ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
+ ## to be fast, a particular function is defined for each case of centring/scaling
- ## NO CENTRING / NO SCALING
- if(!center & !scale){
- dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
- a <- as.integer(a) / ploid.a
- a[is.na(a)] <- 0
- b <- as.integer(b) / ploid.b
- b[is.na(b)] <- 0
- return(sum( a*b, na.rm=TRUE))
+ ## NO CENTRING / NO SCALING
+ if(!center & !scale){
+ dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+ a <- as.integer(a) / ploid.a
+ a[is.na(a)] <- 0
+ b <- as.integer(b) / ploid.b
+ b[is.na(b)] <- 0
+ return(sum( a*b, na.rm=TRUE))
+ }
}
- }
- ## CENTRING / NO SCALING
- if(center & !scale){
- dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
- a <- as.integer(a) / ploid.a
- a[is.na(a)] <- vecMeans[is.na(a)]
- b <- as.integer(b) / ploid.b
- b[is.na(b)] <- vecMeans[is.na(b)]
- return(sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE) )
+ ## CENTRING / NO SCALING
+ if(center & !scale){
+ dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+ a <- as.integer(a) / ploid.a
+ a[is.na(a)] <- vecMeans[is.na(a)]
+ b <- as.integer(b) / ploid.b
+ b[is.na(b)] <- vecMeans[is.na(b)]
+ return(sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE) )
+ }
}
- }
- ## NO CENTRING / SCALING (odd option...)
- if(!center & scale){
- dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
- a <- as.integer(a) / ploid.a
- a[is.na(a)] <- 0
- b <- as.integer(b) / ploid.b
- b[is.na(b)] <- 0
- return(sum( (a*b)/vecVar, na.rm=TRUE))
+ ## NO CENTRING / SCALING (odd option...)
+ if(!center & scale){
+ dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+ a <- as.integer(a) / ploid.a
+ a[is.na(a)] <- 0
+ b <- as.integer(b) / ploid.b
+ b[is.na(b)] <- 0
+ return(sum( (a*b)/vecVar, na.rm=TRUE))
+ }
}
- }
- ## CENTRING / SCALING
- if(center & scale){
- dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
- a <- as.integer(a) / ploid.a
- a[is.na(a)] <- vecMeans[is.na(a)]
- b <- as.integer(b) / ploid.b
- a[is.na(a)] <- vecMeans[is.na(a)]
- return( sum( ((a-vecMeans)*(b-vecMeans))/vecVar, na.rm=TRUE ) )
+ ## CENTRING / SCALING
+ if(center & scale){
+ dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+ a <- as.integer(a) / ploid.a
+ a[is.na(a)] <- vecMeans[is.na(a)]
+ b <- as.integer(b) / ploid.b
+ a[is.na(a)] <- vecMeans[is.na(a)]
+ return( sum( ((a-vecMeans)*(b-vecMeans))/vecVar, na.rm=TRUE ) )
+ }
}
- }
- ## COMPUTE ALL POSSIBLE DOT PRODUCTS (XX^T / n) ##
- allComb <- combn(1:nInd(x), 2)
- if(multicore){
- allProd <- unlist(mclapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]),
- mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))
- } else {
- allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
- }
- allProd <- allProd / nInd(x) # assume uniform weights
+ ## COMPUTE ALL POSSIBLE DOT PRODUCTS (XX^T / n) ##
+ allComb <- combn(1:nInd(x), 2)
+ if(multicore){
+ allProd <- unlist(mclapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]),
+ mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))
+ } else {
+ allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
+ }
+ allProd <- allProd / nInd(x) # assume uniform weights
- ## shape result as a matrix
- attr(allProd,"Size") <- nInd(x)
- attr(allProd,"Diag") <- FALSE
- attr(allProd,"Upper") <- FALSE
- class(allProd) <- "dist"
- allProd <- as.matrix(allProd)
+ ## shape result as a matrix
+ attr(allProd,"Size") <- nInd(x)
+ attr(allProd,"Diag") <- FALSE
+ attr(allProd,"Upper") <- FALSE
+ class(allProd) <- "dist"
+ allProd <- as.matrix(allProd)
- ## compute the diagonal
- if(multicore){
- temp <- unlist(mclapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]),
- mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))/nInd(x)
- } else {
- temp <- unlist(lapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]) ))/nInd(x)
+ ## compute the diagonal
+ if(multicore){
+ temp <- unlist(mclapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]),
+ mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))/nInd(x)
+ } else {
+ temp <- unlist(lapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]) ))/nInd(x)
+ }
+ diag(allProd) <- temp
+ } else { # === use C computations ====
+ allProd <- glDotProd(x, center=center, scale=scale, alleleAsUnit=alleleAsUnit)/nInd(x)
}
- diag(allProd) <- temp
- } else { # === use C computations ====
- allProd <- glDotProd(x, center=center, scale=scale, alleleAsUnit=alleleAsUnit)/nInd(x)
+ } else { # END NEED TO COMPUTE DOTPROD
+ if(!all(dim(matDotProd)==nInd(x))) stop("matDotProd has wrong dimensions.")
+ allProd <- matDotProd
}
-
## PERFORM THE ANALYSIS ##
## eigenanalysis
eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
@@ -403,6 +409,11 @@
}
}
+ if(returnDotProd){
+ res$dotProd <- allProd
+ rownames(res$dotProd) <- colnames(res$dotProd) <- indNames(x)
+ }
+
res$call <- match.call()
class(res) <- "glPca"
@@ -429,7 +440,10 @@
if(!is.null(x$loadings)){
cat("\nPrincipal axes ($loadings):\n matrix with", nrow(x$loadings), "rows (SNPs) and", ncol(x$loadings), "columns (axes)", "\n")
}
- cat("\n")
+ if(!is.null(x$dotProd)){
+ cat("\nDot products between individuals ($dotProd):\n matrix with", nrow(x$dotProd), "rows and", ncol(x$dotProd), "columns", "\n")
+ }
+ cat("\n")
}
@@ -502,6 +516,23 @@
+
+###############
+## .glPca2dudi
+###############
+.glPca2dudi <- function(x){
+ if(!inherits(x,"glPca")) stop("x is not a glPca object")
+ old.names <- names(x)
+ new.names <- sub("scores","li", old.names)
+ new.names <- sub("loadings","c1", new.names)
+ names(x) <- new.names
+ class(x) <- c("pca","dudi")
+ return(x)
+} # end glPca2dudi
+
+
+
+
## TESTING ##
## x <- new("genlight", list(c(0,0,1,1,0), c(1,1,1,0,0,1), c(2,1,1,1,1,NA)))
## as.matrix(x)
Modified: pkg/man/glPca.Rd
===================================================================
--- pkg/man/glPca.Rd 2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/man/glPca.Rd 2011-03-04 12:35:04 UTC (rev 838)
@@ -27,7 +27,8 @@
}
\usage{
glPca(x, center = TRUE, scale = FALSE, nf = NULL, loadings = TRUE,
- alleleAsUnit = FALSE, useC = TRUE, multicore = require("multicore"), n.cores = NULL)
+ alleleAsUnit = FALSE, useC = TRUE, multicore = require("multicore"),
+ n.cores = NULL, returnDotProd=FALSE, matDotProd=NULL)
\method{print}{glPca}(x, \dots)
@@ -72,6 +73,14 @@
\item{n.cores}{if \code{multicore} is TRUE, the number of cores to be
used in the computations; if NULL, then the maximum number of cores
available on the computer is used.}
+ \item{returnDotProd}{a logical indicating whether the matrix of dot
+ products between individuals should be returned (TRUE) or not
+ (FALSE, default).}
+ \item{matDotProd}{an optional matrix of dot products between
+ individuals, NULL by default. This option is used internally to
+ speed up computation time when re-running the same PCA several
+ times. Leave this argument as NULL unless you really know what you
+ are doing.}
\item{\dots}{further arguments to be passed to other functions.}
\item{xax,yax}{\code{integers} specifying which principal components
@@ -153,4 +162,23 @@
}
\author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
+\examples{
+## simulate a toy dataset
+x <- glSim(50,4e3, 50, ploidy=2)
+x
+plot(x)
+
+## perform PCA
+pca1 <- glPca(x, nf=2)
+
+## plot eigenvalues
+barplot(pca1$eig, main="eigenvalues", col=heat.colors(length(pca1$eig)))
+
+## basic plot
+scatter(pca1, ratio=.2)
+
+## plot showing groups
+s.class(pca1$scores, pop(x), col=colors()[c(131,134)])
+add.scatter.eig(pca1$eig,2,1,2)
+}
\keyword{multivariate}
\ No newline at end of file
More information about the adegenet-commits
mailing list