[adegenet-commits] r848 - in pkg: R man

Thu Mar 10 17:37:30 CET 2011

Author: jombart
Date: 2011-03-10 17:37:30 +0100 (Thu, 10 Mar 2011)
New Revision: 848

Modified:
   pkg/R/find.clust.R
   pkg/man/dapc.Rd
   pkg/man/find.clusters.Rd
Log:
find.clusters implemented for genlight objects


Modified: pkg/R/find.clust.R
===================================================================

--- pkg/R/find.clust.R	2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/R/find.clust.R	2011-03-10 16:37:30 UTC (rev 848)
@@ -3,12 +3,13 @@
 #############
 find.clusters <- function (x, ...) UseMethod("find.clusters")
 
-######################
+############################
 ## find.clusters.data.frame
-######################
+############################
 find.clusters.data.frame <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
                                      criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
-                                     max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10, center=TRUE, scale=TRUE, ..., dudi=NULL){
+                                     max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10, center=TRUE, scale=TRUE,
+                                     pca.select=c("nbEig","percVar"), perc.pca=NULL,..., dudi=NULL){
 
     ## CHECKS ##
     if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
@@ -16,7 +17,10 @@
     if(!require(stats)) stop("package stats is required")
 
     stat <- match.arg(stat)
+    pca.select <- match.arg(pca.select)
     criterion <- match.arg(criterion)
+    min.n.clust <- 2
+    max.n.clust <- max(max.n.clust, 2)
 
     ## KEEP TRACK OF SOME ORIGINAL PARAMETERS
     ## n.pca.ori <- n.pca
@@ -26,44 +30,52 @@
     ## ESCAPE IF SUB-CLUST ARE SEEKED ##
     if(!is.null(clust)){
         res <- .find.sub.clusters(x=x, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
-                         choose.n.clust=choose.n.clust, criterion=criterion, center=center, scale=scale)
+                                  choose.n.clust=choose.n.clust, criterion=criterion, center=center, scale=scale)
         return(res)
     }
     ## END SUB-CLUST
 
 
-    ## SOME GENERAL VARIABLES ##
+    ## PERFORM PCA ##
     N <- nrow(x)
     REDUCEDIM <- is.null(dudi)
-    min.n.clust <- 2
-    max.n.clust <- max(max.n.clust, 2)
 
-    ## PERFORM PCA ##
-    maxRank <- min(dim(x))
-
-    if(REDUCEDIM){
+    if(REDUCEDIM){ # if no dudi provided
+        ## PERFORM PCA ##
+        maxRank <- min(dim(x))
         pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
-    } else {
-        if(!inherits(dudi,"dudi")) stop("dudi provided but is not a dudi object")
+    } else { # else use the provided dudi
         pcaX <- dudi
     }
+    cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
 
+    if(!REDUCEDIM){
+        myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$li),length(pcaX$eig)))
+    } else {
+        myCol <- "black"
+    }
+
     ## select the number of retained PC for PCA
-    if(is.null(n.pca)){
-        if(!REDUCEDIM){
-            myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$li),length(pcaX$eig)))
-        } else {
-            myCol <- "black"
-        }
-        cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+    if(is.null(n.pca) & pca.select=="nbEig"){
         plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
         cat("Choose the number PCs to retain (>=1): ")
-        n.pca <- NA
-        while(is.na(n.pca)){
-            n.pca <- as.integer(readLines(n = 1))
-        }
+        n.pca <- as.integer(readLines(n = 1))
     }
 
+    if(is.null(perc.pca) & pca.select=="percVar"){
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+        cat("Choose the percentage of variance to retain (0-100): ")
+        nperc.pca <- as.numeric(readLines(n = 1))
+    }
+
+    ## get n.pca from the % of variance to conserve
+    if(!is.null(perc.pca)){
+        n.pca <- min(which(cumVar >= perc.pca))
+        if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+        if(n.pca<1) n.pca <- 1
+    }
+
+
      ## keep relevant PCs - stored in XU
     X.rank <- length(pcaX$eig)
     n.pca <- min(X.rank, n.pca)
@@ -175,9 +187,9 @@
 
 
 
-###################
+########################
 ## find.clusters.genind
-###################
+########################
 find.clusters.genind <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
                                  criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
                           max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
@@ -203,7 +215,7 @@
 
     ## CALL DATA.FRAME METHOD
     res <- find.clusters(X, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
-                         choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE)
+                         choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE,...)
     return(res)
 } # end find.clusters.genind
 
@@ -225,6 +237,95 @@
 
 
 
+##########################
+## find.clusters.genlight
+##########################
+find.clusters.genlight <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
+                                   criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+                                   max.n.clust=round(nInd(x)/10), n.iter=1e5, n.start=10,
+                                   scale=FALSE, pca.select=c("nbEig","percVar"), perc.pca=NULL, glPca=NULL, ...){
+
+    ## CHECKS ##
+    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+    if(!require(stats)) stop("package stats is required")
+    if(!inherits(x, "genlight")) stop("x is not a genlight object.")
+    stat <- match.arg(stat)
+    pca.select <- match.arg(pca.select)
+
+
+    ## SOME GENERAL VARIABLES ##
+    N <- nInd(x)
+    min.n.clust <- 2
+
+
+    ## PERFORM PCA ##
+    REDUCEDIM <- is.null(glPca)
+
+    if(REDUCEDIM){ # if no glPca provided
+        maxRank <- min(c(nInd(x), nLoc(x)))
+        pcaX <- glPca(x, center = TRUE, scale = scale, nf=maxRank, loadings=FALSE, returnDotProd = FALSE, ...)
+    } else {
+        pcaX <- glPca
+    }
+
+    if(is.null(n.pca)){
+        cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+    }
+
+
+    ## select the number of retained PC for PCA
+    if(!REDUCEDIM){
+        myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$scores),length(pcaX$eig)))
+    } else {
+        myCol <- "black"
+    }
+
+    if(is.null(n.pca) & pca.select=="nbEig"){
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+        cat("Choose the number PCs to retain (>=1): ")
+        n.pca <- as.integer(readLines(n = 1))
+    }
+
+    if(is.null(perc.pca) & pca.select=="percVar"){
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+        cat("Choose the percentage of variance to retain (0-100): ")
+        nperc.pca <- as.numeric(readLines(n = 1))
+    }
+
+    ## get n.pca from the % of variance to conserve
+    if(!is.null(perc.pca)){
+        n.pca <- min(which(cumVar >= perc.pca))
+        if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+        if(n.pca<1) n.pca <- 1
+    }
+
+    if(!REDUCEDIM){
+        if(n.pca > ncol(pcaX$scores)) {
+            n.pca <- ncol(pcaX$scores)
+        }
+    }
+
+
+    ## convert PCA
+    pcaX <- .glPca2dudi(pcaX)
+
+
+    ## CALL DATA.FRAME METHOD
+    res <- find.clusters(pcaX$li, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
+                         choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE, dudi=pcaX)
+    return(res)
+} # end find.clusters.genlight
+
+
+
+
+
+
+
+
+
+
 ###################
 ## .find.sub.clusters
 ###################

Modified: pkg/man/dapc.Rd
===================================================================
--- pkg/man/dapc.Rd	2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/man/dapc.Rd	2011-03-10 16:37:30 UTC (rev 848)
@@ -12,8 +12,9 @@
 \alias{assignplot}
 \title{Discriminant Analysis of Principal Components (DAPC)}
 \description{
-  These functions implement the Discriminant Analysis of Principal Components
-  (DAPC). See 'details' section for a succint description of the method. 
+  These functions implement the Discriminant Analysis of Principal
+  Components (DAPC, Jombart et al. 2010). See 'details' section for a
+  succint description of the method.
 
  \code{dapc} is a generic function performing the DAPC on the following
  types of objects:\cr
@@ -42,7 +43,7 @@
 \usage{
 \method{dapc}{data.frame}(x, grp, n.pca=NULL, n.da=NULL, center=TRUE,
      scale=FALSE,var.contrib=TRUE, pca.select=c("nbEig","percVar"),
-    perc.pca=NULL, \ldots, dudi=NULL)
+     perc.pca=NULL, \ldots, dudi=NULL)
 
 \method{dapc}{matrix}(x, \ldots)
 
@@ -61,15 +62,13 @@
 \method{summary}{dapc}(object, \dots)
 
 \method{scatter}{dapc}(x, xax=1, yax=2, grp=NULL,
-                         col=rainbow(length(levels(x$grp))), pch=20,
-                         posi="bottomleft", bg="grey", ratio=0.3, cstar
-                         = 1, cellipse = 1.5, axesell = TRUE, label =
-                         levels(x$grp), clabel = 1, xlim = NULL, ylim =
-                         NULL, grid = TRUE, addaxes = TRUE, origin =
-                         c(0,0), include.origin = TRUE, sub = "", csub =
-                         1, possub = "bottomleft", posleg = "topright",
-                         cleg = 1, cgrid = 1, pixmap = NULL, contour =
-                         NULL, area = NULL, \ldots)
+        col=rainbow(length(levels(x$grp))), pch=20, posi="bottomleft",
+        bg="grey", ratio=0.3, cstar = 1, cellipse = 1.5, axesell = TRUE,
+        label = levels(x$grp), clabel = 1, xlim = NULL, ylim = NULL, grid
+        = TRUE, addaxes = TRUE, origin = c(0,0), include.origin = TRUE,
+        sub = "", csub = 1, possub = "bottomleft", posleg = "topright",
+        cleg = 1, cgrid = 1, pixmap = NULL, contour = NULL, area = NULL,
+        \ldots)
 
 assignplot(x, only.grp=NULL, subset=NULL, cex.lab=.75, pch=3)
 }

Modified: pkg/man/find.clusters.Rd
===================================================================
--- pkg/man/find.clusters.Rd	2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/man/find.clusters.Rd	2011-03-10 16:37:30 UTC (rev 848)
@@ -4,16 +4,18 @@
 \alias{find.clusters.data.frame}
 \alias{find.clusters.matrix}
 \alias{find.clusters.genind}
+\alias{find.clusters.genlight}
 \alias{.find.sub.clusters}
 \title{find.cluster: cluster identification using successive K-means}
 \description{
-   These functions implement the clustering procedure used in Discriminant
-  Analysis of Principal Components (DAPC, Jombart et al. submitted). This
-  procedure consists in running successive K-means with an increasing number of
-  clusters (\code{k}), after transforming data using a principal component
-  analysis (PCA). For each model, a statistical measure of goodness of fit (by
-  default, BIC) is computed, which allows to choose the optimal \code{k}. See
-  \code{details} for a description of how to select the optimal \code{k}.
+   These functions implement the clustering procedure used in
+  Discriminant Analysis of Principal Components (DAPC, Jombart et
+  al. 2010). This procedure consists in running successive K-means with
+  an increasing number of clusters (\code{k}), after transforming data
+  using a principal component analysis (PCA). For each model, a
+  statistical measure of goodness of fit (by default, BIC) is computed,
+  which allows to choose the optimal \code{k}. See \code{details} for a
+  description of how to select the optimal \code{k}.
 
   Optionally, hierarchical clustering can be sought by providing a prior
   clustering of individuals (argument \code{clust}). In such case, clusters will
@@ -21,24 +23,42 @@
 
   The K-means procedure used in \code{find.clusters} is \code{\link[stats]{kmeans}} function
   from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the
-  \code{ade4} package.
+  \code{ade4} package, except for \linkS4class{genlight} objects which
+  use the \code{\link{glPca}} procedure from adegenet.
+
+  \code{find.clusters} is a generic function with methods for the
+ following types of objects:\cr
+ - \code{data.frame} (only numeric data)\cr
+ - \code{matrix} (only numeric data)\cr
+ - \code{\linkS4class{genind}} objects (genetic markers)\cr
+ - \code{\linkS4class{genlight}} objects (genome-wide SNPs)
+ 
 }
 \usage{
 \method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL,
               n.clust=NULL, stat=c("BIC","AIC", "WSS"),
-              choose.n.clust=TRUE,criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+              choose.n.clust=TRUE,criterion=c("diffNgroup",
+              "min","goesup", "smoothNgoesup", "goodfit"),
               max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10,
-              center=TRUE, scale=TRUE, \ldots, dudi=NULL)
+              center=TRUE, scale=TRUE, pca.select=c("nbEig","percVar"),
+              perc.pca=NULL, \ldots, dudi=NULL)
 
 \method{find.clusters}{matrix}(x, \ldots)
 
 \method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL,
               stat=c("BIC","AIC", "WSS"), choose.n.clust=TRUE,
-              criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
-              max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
-              scale=FALSE, scale.method=c("sigma", "binom"),
+              criterion=c("diffNgroup", "min","goesup", "smoothNgoesup",
+              "goodfit"), max.n.clust=round(nrow(x at tab)/10), n.iter=1e5,
+              n.start=10, scale=FALSE, scale.method=c("sigma", "binom"),
               truenames=TRUE, \ldots)
 
+\method{find.clusters}{genlight}(x, clust=NULL, n.pca=NULL,
+              n.clust=NULL, stat=c("BIC", "AIC",
+              "WSS"),choose.n.clust=TRUE, criterion=c("diffNgroup",
+              "min","goesup","smoothNgoesup", "goodfit"),
+              max.n.clust=round(nInd(x)/10), n.iter=1e5,n.start=10,
+              scale=FALSE, pca.select=c("nbEig","percVar"),
+              perc.pca=NULL,glPca=NULL, \ldots)
 }
 \arguments{
   \item{x}{\code{a data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
@@ -89,6 +109,15 @@
     downweighting informative alleles. Further scaling options are
     available for \linkS4class{genind} objects (see argument
     \code{scale.method}).}
+  \item{pca.select}{a \code{character} indicating the mode of selection of PCA
+    axes, matching either "nbEig" or "percVar". For "nbEig", the user
+    has to specify the number of axes retained (interactively, or via
+    \code{n.pca}). For "percVar", the user has to specify the minimum amount of
+    the total variance to be preserved by the retained axes, expressed as a
+    percentage (interactively, or via \code{perc.pca}).  }
+  \item{perc.pca}{a \code{numeric} value between 0 and 100 indicating the
+    minimal percentage of the total variance of the data to be expressed by the
+    retained axes of PCA.}  
   \item{scale.method}{a \code{character} specifying the scaling method to be used
     for allele frequencies, which must match "sigma" (usual estimate of standard
     deviation) or "binom" (based on binomial distribution). See
@@ -103,6 +132,9 @@
     \code{dudi} (from the ade4 package). If provided, prior PCA will be
     ignored, and this object will be used as a prior step for variable
     orthogonalisation.}
+  \item{glPca}{an optional \code{\link{glPca}} object; if provided,
+    dimension reduction is not performed (saving computational time) but
+    taken directly from this object.}
 }
 \details{
   === ON THE SELECTION OF K ===\cr
@@ -235,5 +267,13 @@
 points(2, foo.WSS$Kstat[2], pch="x", cex=3)
 mtext(3, tex="'X' indicates the actual number of clusters")
 
+
+## TOY EXAMPLE FOR GENLIGHT OBJECTS ##
+x <- glSim(100,500,500)
+x
+plot(x)
+grp <- find.clusters(x, n.pca=100, choose=FALSE, stat="BIC")
+plot(grp$Kstat, type="o", xlab="number of clusters (K)",ylab="BIC",main="find.clusters on a genlight object\n(two groups)")
+
 }
 \keyword{multivariate}