[adegenet-commits] r788 - in pkg: R man

Tue Feb 1 17:08:18 CET 2011

Author: jombart
Date: 2011-02-01 17:08:18 +0100 (Tue, 01 Feb 2011)
New Revision: 788

Modified:
   pkg/R/dapc.R
   pkg/R/find.clust.R
   pkg/man/find.clusters.Rd
   pkg/man/read.snp.Rd
Log:
New procedures for automated selection of K in find.clusters.


Modified: pkg/R/dapc.R
===================================================================

--- pkg/R/dapc.R	2011-01-26 17:18:58 UTC (rev 787)
+++ pkg/R/dapc.R	2011-02-01 16:08:18 UTC (rev 788)
@@ -266,7 +266,7 @@
 ##############
 ## scatter.dapc
 ##############
-scatter.dapc <- function(x, xax=1, yax=2, col=rainbow(length(levels(x$grp))), pch=20, posi="bottomleft", bg="grey", ratio=0.3,
+scatter.dapc <- function(x, xax=1, yax=2, grp=NULL, col=rainbow(length(levels(x$grp))), pch=20, posi="bottomleft", bg="grey", ratio=0.3,
                          cstar = 1, cellipse = 1.5, axesell = TRUE, label = levels(x$grp), clabel = 1, xlim = NULL, ylim = NULL,
                          grid = TRUE, addaxes = TRUE, origin = c(0,0), include.origin = TRUE, sub = "", csub = 1, possub = "bottomleft", cgrid = 1,
                          pixmap = NULL, contour = NULL, area = NULL, ...){
@@ -277,6 +277,10 @@
     col <- rep(col, length(levels(x$grp)))
     pch <- rep(pch, length(levels(x$grp)))
 
+    ## handle grp
+    if(is.null(grp)){
+        grp <- x$grp
+    }
 
     if(!ONEDIM){
         ## set par
@@ -286,12 +290,12 @@
                 axes <- c(xax,yax)
         ## basic empty plot
         ## s.label(x$ind.coord[,axes], clab=0, cpoint=0, grid=FALSE, addaxes = FALSE, cgrid = 1, include.origin = FALSE, ...)
-        s.class(x$ind.coord[,axes], fac=x$grp, col=col, cpoint=0, cstar = cstar, cellipse = cellipse, axesell = axesell, label = label,
+        s.class(x$ind.coord[,axes], fac=grp, col=col, cpoint=0, cstar = cstar, cellipse = cellipse, axesell = axesell, label = label,
                 clabel = clabel, xlim = xlim, ylim = ylim, grid = grid, addaxes = addaxes, origin = origin, include.origin = include.origin,
                 sub = sub, csub = csub, possub = possub, cgrid = cgrid, pixmap = pixmap, contour = contour, area = area)
 
         ## add points
-        colfac <- pchfac <- x$grp
+        colfac <- pchfac <- grp
         levels(colfac) <- col
         levels(pchfac) <- pch
         colfac <- as.character(colfac)
@@ -300,7 +304,7 @@
         if(is.numeric(pch)) pchfac <- as.numeric(pchfac)
 
         points(x$ind.coord[,xax], x$ind.coord[,yax], col=colfac, pch=pchfac, ...)
-        s.class(x$ind.coord[,axes], fac=x$grp, col=col, cpoint=0, add.plot=TRUE, cstar = cstar, cellipse = cellipse, axesell = axesell, label = label,
+        s.class(x$ind.coord[,axes], fac=grp, col=col, cpoint=0, add.plot=TRUE, cstar = cstar, cellipse = cellipse, axesell = axesell, label = label,
                 clabel = clabel, xlim = xlim, ylim = ylim, grid = grid, addaxes = addaxes, origin = origin, include.origin = include.origin,
                 sub = sub, csub = csub, possub = possub, cgrid = cgrid, pixmap = pixmap, contour = contour, area = area)
 
@@ -315,7 +319,7 @@
             pcLab <- xax
         }
         ## get densities
-        ldens <- tapply(x$ind.coord[,pcLab], x$grp, density)
+        ldens <- tapply(x$ind.coord[,pcLab], grp, density)
         allx <- unlist(lapply(ldens, function(e) e$x))
         ally <- unlist(lapply(ldens, function(e) e$y))
          par(bg=bg)

Modified: pkg/R/find.clust.R
===================================================================
--- pkg/R/find.clust.R	2011-01-26 17:18:58 UTC (rev 787)
+++ pkg/R/find.clust.R	2011-02-01 16:08:18 UTC (rev 788)
@@ -6,8 +6,9 @@
 ######################
 ## find.clusters.data.frame
 ######################
-find.clusters.data.frame <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
-                                     max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10, center=TRUE, scale=TRUE, ...){
+find.clusters.data.frame <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
+                                     criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+                                     max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10, center=TRUE, scale=TRUE, ...){
 
     ## CHECKS ##
     if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
@@ -66,7 +67,8 @@
         WSS <- numeric(0)
 
         for(i in 1:length(nbClust)){
-            temp <- kmeans(XU, centers=nbClust[i], iter.max=min(n.iter, 100), nstart=min(n.start, 1e3))
+            ## temp <- kmeans(XU, centers=nbClust[i], iter.max=min(n.iter, 100), nstart=min(n.start, 1e3))
+            temp <- kmeans(XU, centers=nbClust[i], iter.max=n.iter, nstart=n.start)
             WSS[i] <- sum(temp$withinss)
         }
 
@@ -114,14 +116,25 @@
             if(criterion=="min") {
                 n.clust <- which.min(myStat)
             }
-            if(criterion=="diff") {
-                temp <- diff(myStat)
-                n.clust <- which.max( which( (temp-min(temp))<max(temp)/1e4))
+            if(criterion=="goesup") {
+                ## temp <- diff(myStat)
+                ## n.clust <- which.max( which( (temp-min(temp))<max(temp)/1e4))
+                n.clust <- min(which(diff(myStat)>0))
             }
-            if(criterion=="conserv") {
-                temp <- min(myStat) + 0.15*(max(myStat) - min(myStat))
-                n.clust <- min( which(myStat < temp))
+            if(criterion=="goodfit") {
+                temp <- min(myStat) + 0.1*(max(myStat) - min(myStat))
+                n.clust <- min( which(myStat < temp))-1
             }
+            if(criterion=="diffNgroup") {
+                temp <- cutree(hclust(dist(diff(myStat)), method="ward"), k=2)
+                goodgrp <- which.min(tapply(diff(myStat), temp, mean))
+                n.clust <- max(which(temp==goodgrp))+1
+            }
+            if(criterion=="smoothNgoesup") {
+                temp <- myStat
+                temp[2:(length(myStat)-1)] <- sapply(1:(length(myStat)-2), function(i) mean(myStat[c(i,i+1,i+2)]))
+                n.clust <- min(which(diff(temp)>0))
+            }
 
         }
     } else { # if n.clust provided
@@ -154,8 +167,9 @@
 ###################
 ## find.clusters.genind
 ###################
-find.clusters.genind <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
-                          max.n.clust=round(nrow(x at tab)/10), n.iter=1e3, n.start=10,
+find.clusters.genind <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
+                                 criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+                          max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
                           scale=FALSE, scale.method=c("sigma", "binom"), truenames=TRUE, ...){
 
     ## CHECKS ##

Modified: pkg/man/find.clusters.Rd
===================================================================
--- pkg/man/find.clusters.Rd	2011-01-26 17:18:58 UTC (rev 787)
+++ pkg/man/find.clusters.Rd	2011-02-01 16:08:18 UTC (rev 788)
@@ -26,16 +26,16 @@
 \usage{
 \method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL,
               n.clust=NULL, stat=c("BIC","AIC", "WSS"),
-              choose.n.clust=TRUE,criterion=c("min","diff", "conserv"),
-              max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10,
+              choose.n.clust=TRUE,criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+              max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10,
               center=TRUE, scale=TRUE, \ldots)
 
 \method{find.clusters}{matrix}(x, \ldots)
 
 \method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL,
               stat=c("BIC","AIC", "WSS"), choose.n.clust=TRUE,
-              criterion=c("min","diff", "conserv"),
-              max.n.clust=round(nrow(x at tab)/10), n.iter=1e3, n.start=10,
+              criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+              max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
               scale=FALSE, scale.method=c("sigma", "binom"),
               truenames=TRUE, \ldots)
 
@@ -60,14 +60,15 @@
     Criterion. WSS: within-groups sum of squares, that is, residual variance.}
   \item{choose.n.clust}{ a \code{logical} indicating whether the number of
     clusters should be chosen by the user (TRUE, default), or automatically,
-    based on a given criterion (argument \code{criterion}). IT IS HIGHLY
-    RECOMMENDED to choose the number of clusters interactively, as automatic
-    procedures have not been fully evaluated.}
-  \item{criterion}{ a \code{character} string matching "min", "diff", or
-    "conserv", indicating the criterion for automatic selection of the optimal
-    number of clusters. Honestly, you should go for interactive
-    selection of the number of clusters. Do as you wish. No warranty. If
-    you still want to give it a try, see \code{details}.}
+    based on a given criterion (argument \code{criterion}). It is HIGHLY
+    RECOMMENDED to choose the number of clusters INTERACTIVELY, since
+    i) the decrease of the summary statistics (BIC by default) is
+    informative, and ii) no criteria for automatic selection is
+    appropriate to all cases (see details).}
+  \item{criterion}{ a \code{character} string matching "diffNgroup",
+    "min","goesup", "smoothNgoesup", or "conserv", indicating the criterion for automatic
+    selection of the optimal number of clusters. See \code{details} for
+    an explanation of these procedures.}
   \item{max.n.clust}{ an \code{integer} indicating the maximum number of
     clusters to be tried. Values of 'k' will be picked up between 1 and \code{max.n.clust}}
   \item{n.iter}{ an \code{integer} indicating the number of iterations to be used
@@ -103,37 +104,59 @@
   === ON THE SELECTION OF K ===\cr
   (where K is the 'optimal' number of clusters)
 
-  So far, the analysis of data simulated under various population genetics
-  models (see reference) suggested an ad hoc rule for the selection of the
-  optimal number of clusters. First important result is that BIC seems for
-  efficient than AIC and WSS to select the appropriate number of clusters (see
-  example). The rule of thumb consists in increasing K until it no longer leads
-  to an appreciable improvement of fit (i.e., to a decrease of BIC).  In the
-  most simple models (island models), BIC decreases until it reaches the optimal
-  K, and then increases. In these cases, our rule amounts to choosing the lowest
-  K. In other models such as stepping stones, the decrease of BIC often
-  continues after the optimal K, but is much less steep.
+  So far, the analysis of data simulated under various population
+  genetics models (see reference) suggested an ad hoc rule for the
+  selection of the optimal number of clusters. First important result is
+  that BIC seems for efficient than AIC and WSS to select the
+  appropriate number of clusters (see example). The rule of thumb
+  consists in increasing K until it no longer leads to an appreciable
+  improvement of fit (i.e., to a decrease of BIC).  In the most simple
+  models (island models), BIC decreases until it reaches the optimal K,
+  and then increases. In these cases, our rule amounts to choosing the
+  lowest K. In other models such as stepping stones, the decrease of BIC
+  often continues after the optimal K, but is much less steep.
 
-  
-  An alternative approach, that we do not recommend for now, is automatic
-  selection based on a fixed criterion. For this, set \code{choose.n.clust} to
-  FALSE and specify the \code{criterion} you want to use, from the following
-  values:
 
-  - "min": the model with the minimum summary statistics (as specified by
-    \code{stat} argument, BIC by default) is retained. Is likely to work for
-    simple island model, using BIC.
+  An alternative approach is the automatic selection based on a fixed
+  criterion. Note that, in any case, it is highly recommended to look at
+  the graph of the BIC for different numbers of clusters as displayed
+  during the interactive cluster selection.
+  To use automated selection, set \code{choose.n.clust} to FALSE and specify
+  the \code{criterion} you want to use, from the following values:
 
-  - "diff": model selection based on successive improvement of the test
-  statistic. This procedure attempts to increase K until the model improvement
-  (difference in successive BIC, AIC, or WSS) is no longer important. May be
-  more appropriate to models relating to stepping stones.
+  - "diffNgroup": differences between successive values of the summary
+  statistics (by default, BIC) are splitted into two groups using a
+  Ward's clustering method (see \code{?hclust}), to differentiate sharp
+  decrease from mild decreases or increases. The retained K is the one
+  before the first group switch. Appears to work well for
+  island/hierarchical models, and decently for isolation by distance
+  models, albeit with some unstability. Can be impacted by an initial,
+  very sharp decrease of the test statistics. IF UNSURE ABOUT THE
+  CRITERION TO USE, USE THIS ONE.
 
-  - "conserv": another criterion meant to be conservative, in that it seeks a
-  good fit with a minimum number of clusters. Unlike "diff", it does not rely on
-  differences between successive statistics, but rather on absolute fit. It
-  selects the model with the smallest K so that the overall fit is above a given
-  threshold.
+  - "min": the model with the minimum summary statistics (as specified
+  by \code{stat} argument, BIC by default) is retained. Is likely to
+  work for simple island model, using BIC. It is likely to fail in
+  models relating to stepping stones, where the BIC always decreases
+  (albeit by a small amount) as K increases. In general, this approach
+  tends to over-estimate the number of clusters.
+
+  - "goesup": the selected model is the K after which increasing the
+  number of clusters leads to increasing the summary statistics. Suffers
+  from inaccuracy, since i) a steep decrease might follow a small 'bump'
+  of increase of the statistics, and ii) increase might never happen, or
+  happen after negligible decreases. Is likely to work only for
+  clear-cut island models.
+
+  - "smoothNgoesup": a variant of "goesup", in which the summary
+  statistics is first smoothed using a lowess approach. Is meant to be
+  more accurate than "goesup" as it is less prone to stopping to small
+  'bumps' in the decrease of the statistics.
+
+  - "goodfit": another criterion seeking a good fit with a minimum
+  number of clusters. This approach does not rely on differences between
+  successive statistics, but on absolute fit. It selects the model with
+  the smallest K so that the overall fit is above a given threshold.
 }
 \value{
   The class \code{find.clusters} is a list with the following
@@ -146,10 +169,9 @@
   \item{size}{an \code{integer} vector giving the size of the different clusters.}
 }
 \references{
-  Jombart T, Devillard S and Balloux F  (2010) Discriminant analysis of
+  Jombart T, Devillard S and Balloux F (2010) Discriminant analysis of
   principal components: a new method for the analysis of genetically
-  structured populations. BMC Genetics
-  11:94. doi:10.1186/1471-2156-11-94
+  structured populations. BMC Genetics 11:94. doi:10.1186/1471-2156-11-94
 }
 \seealso{
   - \code{\link{dapc}}: implements the DAPC.

Modified: pkg/man/read.snp.Rd
===================================================================
--- pkg/man/read.snp.Rd	2011-01-26 17:18:58 UTC (rev 787)
+++ pkg/man/read.snp.Rd	2011-02-01 16:08:18 UTC (rev 788)
@@ -21,7 +21,7 @@
 }
 \usage{
 read.snp(file, quiet=FALSE, chunkSize = 10, multicore = require("multicore"),
-    n.cores = NULL\dots)
+    n.cores = NULL, \dots)
 }
 \arguments{
    \item{file}{ a character string giving the path to the file to