[adegenet-commits] r838 - in pkg: . R man

Fri Mar 4 13:35:04 CET 2011

Author: jombart
Date: 2011-03-04 13:35:04 +0100 (Fri, 04 Mar 2011)
New Revision: 838

Modified:
   pkg/DESCRIPTION
   pkg/R/dapc.R
   pkg/R/glFunctions.R
   pkg/man/glPca.Rd
Log:
Added an example for glPca.
Added possibility to return/provide glPca with matrix of dot product - essential to avoid wasting time in DAPC for genlight.


Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/DESCRIPTION	2011-03-04 12:35:04 UTC (rev 838)
@@ -10,6 +10,6 @@
 Suggests: ade4, genetics, hierfstat, spdep, tripack, ape, pegas, graph, RBGL, seqinr, multicore
 Depends: methods, MASS
 Description: Classes and functions for genetic data analysis within the multivariate framework.
-Collate: classes.R basicMethods.R handling.R auxil.R setAs.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R  colorplot.R  monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R  inbreeding.R SNPbin.R glHandle.R glPlot.R glFunctions.R zzz.R
+Collate: classes.R basicMethods.R handling.R auxil.R setAs.R SNPbin.R glHandle.R glFunctions.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R glPlot.R zzz.R
 License: GPL (>=2)
 LazyLoad: yes

Modified: pkg/R/dapc.R
===================================================================
--- pkg/R/dapc.R	2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/R/dapc.R	2011-03-04 12:35:04 UTC (rev 838)
@@ -3,9 +3,9 @@
 ########
 dapc <- function (x, ...) UseMethod("dapc")
 
-#################
+###################
 ## dapc.data.frame
-#################
+###################
 dapc.data.frame <- function(x, grp, n.pca=NULL, n.da=NULL,
                             center=TRUE, scale=FALSE, var.contrib=TRUE,
                             pca.select=c("nbEig","percVar"), perc.pca=NULL, ..., dudi=NULL){
@@ -23,9 +23,9 @@
 
     ## SOME GENERAL VARIABLES
     N <- nrow(x)
+    REDUCEDIM <- is.null(dudi)
 
-
-    if(is.null(dudi)){ # if no dudi provided
+    if(REDUCEDIM){ # if no dudi provided
         ## PERFORM PCA ##
         maxRank <- min(dim(x))
         pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
@@ -173,15 +173,91 @@
 
 
 
-##################
-# Function dapc.dudi
-##################
+######################
+## Function dapc.dudi
+######################
 dapc.dudi <- function(x, grp, ...){
     return(dapc.data.frame(x$li, grp, dudi=x, ...))
 }
 
 
 
+
+
+#################
+## dapc.genlight
+#################
+dapc.genlight <- function(x, pop=NULL, n.pca=NULL, n.da=NULL,
+                          scale=FALSE,  var.contrib=TRUE,
+                          pca.select=c("nbEig","percVar"), perc.pca=NULL, glPca=NULL, ...){
+    ## FIRST CHECKS
+    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+
+    if(!inherits(x, "genlight")) stop("x must be a genlight object.")
+
+    if(is.null(pop)) {
+        pop.fac <- pop(x)
+    } else {
+        pop.fac <- pop
+    }
+
+    if(is.null(pop.fac)) stop("x does not include pre-defined populations, and `pop' is not provided")
+
+
+
+    ## PERFORM PCA ##
+    maxRank <- min(c(nInd(x), nLoc(x)))
+
+    if(REDUCEDIM){ # if no dudi provided
+        maxRank <- min(dim(x))
+        pcaX <- glPca(x, center = center, scale = scale, nf=maxRank, loadings=FALSE, ...)
+    } else { # else use the provided dudi
+        pcaX <- glPca
+    }
+    cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+
+    ## select the number of retained PC for PCA
+    if(is.null(n.pca) & pca.select=="nbEig"){
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
+        cat("Choose the number PCs to retain (>=1): ")
+        n.pca <- as.integer(readLines(n = 1))
+    }
+
+    if(is.null(perc.pca) & pca.select=="percVar"){
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
+        cat("Choose the percentage of variance to retain (0-100): ")
+        nperc.pca <- as.numeric(readLines(n = 1))
+    }
+
+    ## get n.pca from the % of variance to conserve
+    if(!is.null(perc.pca)){
+        n.pca <- min(which(cumVar >= perc.pca))
+        if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+        if(n.pca<1) n.pca <- 1
+    }
+
+
+    ## recompute PCA with loadings
+    if(REDUCEDIM){ # if no dudi provided
+        maxRank <- min(dim(x))
+        pcaX <- glPca(x, center = center, scale = scale, nf=n.pca, loadings=TRUE, ...)
+    }
+
+
+
+
+    res$call <- match.call()
+
+    return(res)
+
+} # end dapc.genlight
+
+
+
+
+
+
 ######################
 # Function print.dapc
 ######################
@@ -216,7 +292,7 @@
     if(!is.null(x$var.contr)){
         sumry <- array("", c(6, 4), list(1:6, c("data.frame", "nrow", "ncol", "content")))
     } else {
-        sumry <- array("", c(5, 4), list(1:5, c("data.frame", "nrow", "ncol", "content")))     
+        sumry <- array("", c(5, 4), list(1:5, c("data.frame", "nrow", "ncol", "content")))
     }
     sumry[1, ] <- c("$tab", nrow(x$tab), ncol(x$tab), "retained PCs of PCA")
     sumry[2, ] <- c("$loadings", nrow(x$loadings), ncol(x$loadings), "loadings of variables")

Modified: pkg/R/glFunctions.R
===================================================================
--- pkg/R/glFunctions.R	2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/R/glFunctions.R	2011-03-04 12:35:04 UTC (rev 838)
@@ -225,7 +225,8 @@
 ## PCA for genlight objects
 ##
 glPca <- function(x, center=TRUE, scale=FALSE, nf=NULL, loadings=TRUE, alleleAsUnit=FALSE,
-                  useC=TRUE, multicore=require("multicore"), n.cores=NULL){
+                  useC=TRUE, multicore=require("multicore"), n.cores=NULL,
+                  returnDotProd=FALSE, matDotProd=NULL){
     if(!inherits(x, "genlight")) stop("x is not a genlight object")
 
     ## COMPUTE MEANS AND VARIANCES ##
@@ -242,95 +243,100 @@
 
     myPloidy <- ploidy(x)
 
+    ## NEED TO COMPUTE DOT PRODUCTS ##
+    if(is.null(matDotProd)){
 
-    ## == if non-C code is used ==
-    if(!useC){
-        if(multicore && !require(multicore)) stop("multicore package requested but not installed")
-        if(multicore && is.null(n.cores)){
-            n.cores <- multicore:::detectCores()
-        }
+        ## == if non-C code is used ==
+        if(!useC){
+            if(multicore && !require(multicore)) stop("multicore package requested but not installed")
+            if(multicore && is.null(n.cores)){
+                n.cores <- multicore:::detectCores()
+            }
 
 
-        ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
-        ## to be fast, a particular function is defined for each case of centring/scaling
+            ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
+            ## to be fast, a particular function is defined for each case of centring/scaling
 
-        ## NO CENTRING / NO SCALING
-        if(!center & !scale){
-            dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
-                a <- as.integer(a) / ploid.a
-                a[is.na(a)] <- 0
-                b <- as.integer(b) / ploid.b
-                b[is.na(b)] <- 0
-                return(sum( a*b, na.rm=TRUE))
+            ## NO CENTRING / NO SCALING
+            if(!center & !scale){
+                dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+                    a <- as.integer(a) / ploid.a
+                    a[is.na(a)] <- 0
+                    b <- as.integer(b) / ploid.b
+                    b[is.na(b)] <- 0
+                    return(sum( a*b, na.rm=TRUE))
+                }
             }
-        }
 
-        ## CENTRING / NO SCALING
-        if(center & !scale){
-            dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
-                a <- as.integer(a) / ploid.a
-                a[is.na(a)] <- vecMeans[is.na(a)]
-                b <- as.integer(b) / ploid.b
-                b[is.na(b)] <- vecMeans[is.na(b)]
-                return(sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE) )
+            ## CENTRING / NO SCALING
+            if(center & !scale){
+                dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+                    a <- as.integer(a) / ploid.a
+                    a[is.na(a)] <- vecMeans[is.na(a)]
+                    b <- as.integer(b) / ploid.b
+                    b[is.na(b)] <- vecMeans[is.na(b)]
+                    return(sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE) )
+                }
             }
-        }
 
 
-        ## NO CENTRING / SCALING (odd option...)
-        if(!center & scale){
-            dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
-                a <- as.integer(a) / ploid.a
-                a[is.na(a)] <- 0
-                b <- as.integer(b) / ploid.b
-                b[is.na(b)] <- 0
-                return(sum( (a*b)/vecVar, na.rm=TRUE))
+            ## NO CENTRING / SCALING (odd option...)
+            if(!center & scale){
+                dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+                    a <- as.integer(a) / ploid.a
+                    a[is.na(a)] <- 0
+                    b <- as.integer(b) / ploid.b
+                    b[is.na(b)] <- 0
+                    return(sum( (a*b)/vecVar, na.rm=TRUE))
+                }
             }
-        }
 
 
-        ## CENTRING / SCALING
-        if(center & scale){
-            dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
-                a <- as.integer(a) / ploid.a
-                a[is.na(a)] <- vecMeans[is.na(a)]
-                b <- as.integer(b) / ploid.b
-                a[is.na(a)] <- vecMeans[is.na(a)]
-                return( sum( ((a-vecMeans)*(b-vecMeans))/vecVar, na.rm=TRUE ) )
+            ## CENTRING / SCALING
+            if(center & scale){
+                dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+                    a <- as.integer(a) / ploid.a
+                    a[is.na(a)] <- vecMeans[is.na(a)]
+                    b <- as.integer(b) / ploid.b
+                    a[is.na(a)] <- vecMeans[is.na(a)]
+                    return( sum( ((a-vecMeans)*(b-vecMeans))/vecVar, na.rm=TRUE ) )
+                }
             }
-        }
 
 
-        ## COMPUTE ALL POSSIBLE DOT PRODUCTS (XX^T / n) ##
-        allComb <- combn(1:nInd(x), 2)
-        if(multicore){
-            allProd <- unlist(mclapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]),
-                                       mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))
-        } else {
-            allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
-        }
-        allProd <- allProd / nInd(x) # assume uniform weights
+            ## COMPUTE ALL POSSIBLE DOT PRODUCTS (XX^T / n) ##
+            allComb <- combn(1:nInd(x), 2)
+            if(multicore){
+                allProd <- unlist(mclapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]),
+                                           mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))
+            } else {
+                allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
+            }
+            allProd <- allProd / nInd(x) # assume uniform weights
 
-        ## shape result as a matrix
-        attr(allProd,"Size") <- nInd(x)
-        attr(allProd,"Diag") <- FALSE
-        attr(allProd,"Upper") <- FALSE
-        class(allProd) <- "dist"
-        allProd <- as.matrix(allProd)
+            ## shape result as a matrix
+            attr(allProd,"Size") <- nInd(x)
+            attr(allProd,"Diag") <- FALSE
+            attr(allProd,"Upper") <- FALSE
+            class(allProd) <- "dist"
+            allProd <- as.matrix(allProd)
 
-        ## compute the diagonal
-        if(multicore){
-            temp <- unlist(mclapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]),
-                                    mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))/nInd(x)
-        } else {
-            temp <- unlist(lapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]) ))/nInd(x)
+            ## compute the diagonal
+            if(multicore){
+                temp <- unlist(mclapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]),
+                                        mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE))/nInd(x)
+            } else {
+                temp <- unlist(lapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]) ))/nInd(x)
+            }
+            diag(allProd) <- temp
+        } else { # === use C computations ====
+            allProd <- glDotProd(x, center=center, scale=scale, alleleAsUnit=alleleAsUnit)/nInd(x)
         }
-        diag(allProd) <- temp
-    } else { # === use C computations ====
-        allProd <- glDotProd(x, center=center, scale=scale, alleleAsUnit=alleleAsUnit)/nInd(x)
+    } else { # END NEED TO COMPUTE DOTPROD
+        if(!all(dim(matDotProd)==nInd(x))) stop("matDotProd has wrong dimensions.")
+        allProd <- matDotProd
     }
 
-
     ## PERFORM THE ANALYSIS ##
     ## eigenanalysis
     eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
@@ -403,6 +409,11 @@
         }
     }
 
+    if(returnDotProd){
+        res$dotProd <- allProd
+        rownames(res$dotProd) <- colnames(res$dotProd) <- indNames(x)
+    }
+
     res$call <- match.call()
 
     class(res) <- "glPca"
@@ -429,7 +440,10 @@
     if(!is.null(x$loadings)){
         cat("\nPrincipal axes ($loadings):\n matrix with", nrow(x$loadings), "rows (SNPs) and", ncol(x$loadings), "columns (axes)", "\n")
     }
-    cat("\n")
+    if(!is.null(x$dotProd)){
+        cat("\nDot products between individuals ($dotProd):\n matrix with", nrow(x$dotProd), "rows and", ncol(x$dotProd), "columns", "\n")
+    }
+  cat("\n")
 }
 
 
@@ -502,6 +516,23 @@
 
 
 
+
+###############
+## .glPca2dudi
+###############
+.glPca2dudi <- function(x){
+    if(!inherits(x,"glPca")) stop("x is not a glPca object")
+    old.names <- names(x)
+    new.names <- sub("scores","li", old.names)
+    new.names <- sub("loadings","c1", new.names)
+    names(x) <- new.names
+    class(x) <- c("pca","dudi")
+    return(x)
+} # end glPca2dudi
+
+
+
+
 ## TESTING ##
 ## x <- new("genlight", list(c(0,0,1,1,0), c(1,1,1,0,0,1), c(2,1,1,1,1,NA)))
 ## as.matrix(x)

Modified: pkg/man/glPca.Rd
===================================================================
--- pkg/man/glPca.Rd	2011-03-04 11:30:49 UTC (rev 837)
+++ pkg/man/glPca.Rd	2011-03-04 12:35:04 UTC (rev 838)
@@ -27,7 +27,8 @@
 }
 \usage{
 glPca(x, center = TRUE, scale = FALSE, nf = NULL, loadings = TRUE, 
-    alleleAsUnit = FALSE, useC = TRUE, multicore = require("multicore"), n.cores = NULL)
+    alleleAsUnit = FALSE, useC = TRUE, multicore = require("multicore"),
+  n.cores = NULL, returnDotProd=FALSE, matDotProd=NULL)
 
 \method{print}{glPca}(x, \dots)
 
@@ -72,6 +73,14 @@
   \item{n.cores}{if \code{multicore} is TRUE, the number of cores to be
     used in the computations; if NULL, then the maximum number of cores
     available on the computer is used.}
+  \item{returnDotProd}{a logical indicating whether the matrix of dot
+    products between individuals should be returned (TRUE) or not
+    (FALSE, default).}
+  \item{matDotProd}{an optional matrix of dot products between
+    individuals, NULL by default. This option is used internally to
+    speed up computation time when re-running the same PCA several
+    times. Leave this argument as NULL unless you really know what you
+    are doing.}
   \item{\dots}{further arguments to be passed to other functions.}
 
   \item{xax,yax}{\code{integers} specifying which principal components
@@ -153,4 +162,23 @@
 
 }
 \author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
+\examples{
+## simulate a toy dataset
+x <- glSim(50,4e3, 50, ploidy=2)
+x
+plot(x)
+
+## perform PCA
+pca1 <- glPca(x, nf=2)
+
+## plot eigenvalues
+barplot(pca1$eig, main="eigenvalues", col=heat.colors(length(pca1$eig)))
+
+## basic plot
+scatter(pca1, ratio=.2)
+
+## plot showing groups
+s.class(pca1$scores, pop(x), col=colors()[c(131,134)])
+add.scatter.eig(pca1$eig,2,1,2)
+}
 \keyword{multivariate}
\ No newline at end of file