[adegenet-commits] r564 - pkg/man

Fri Feb 12 13:03:52 CET 2010

Author: jombart
Date: 2010-02-12 13:03:52 +0100 (Fri, 12 Feb 2010)
New Revision: 564

Modified:
   pkg/man/dapc.Rd
   pkg/man/find.clusters.Rd
Log:
several documentation fixes


Modified: pkg/man/dapc.Rd
===================================================================

--- pkg/man/dapc.Rd	2010-02-11 15:09:03 UTC (rev 563)
+++ pkg/man/dapc.Rd	2010-02-12 12:03:52 UTC (rev 564)
@@ -36,20 +36,22 @@
 }
 \usage{
 \method{dapc}{data.frame}(x, grp, n.pca=NULL, n.da=NULL, center=TRUE,
-scale=FALSE, var.contrib=FALSE, pca.select=c("nbEig","percVar"), perc.pca=NULL)
+    scale=FALSE,var.contrib=FALSE,
+    pca.select=c("nbEig","percVar"), perc.pca=NULL)
 
 \method{dapc}{matrix}(x, \ldots)
 
 \method{dapc}{genind}(x, pop=NULL, n.pca=NULL, n.da=NULL, scale=FALSE,
-scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE,
-pca.select=c("nbEig","percVar"), perc.pca=NULL)
+     scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE,
+     pca.select=c("nbEig","percVar"), perc.pca=NULL)
 
 \method{print}{dapc}(x, \dots)
 
 \method{summary}{dapc}(object, \dots)
 
-\method{scatter}{dapc}(x, xax=1, yax=2, col=rainbow(length(levels(x$grp))),
-posi="bottomleft", bg="grey", ratio=0.3, csub=1.2, \ldots)
+\method{scatter}{dapc}(x, xax=1, yax=2,
+        col=rainbow(length(levels(x$grp))), posi="bottomleft", bg="grey",
+        ratio=0.3, csub=1.2, \ldots)
 
 \method{assignplot}{dapc}(x, only.grp=NULL, subset=NULL, cex.lab=.75, pch=3)
 }
@@ -148,8 +150,8 @@
     variables (alleles in the case of genetic data) to the principal components
     of DAPC.}
 
-  
-  === other outputs ===\cr
+
+  \cr=== other outputs ===\cr
   Other functions have different outputs:\cr
   - \code{summary.dapc} returns a list with 6 components: \code{n.dim} (number
   of retained DAPC axes), \code{n.pop} (number of groups/populations),
@@ -165,10 +167,14 @@
 Discriminant analysis of principal components: a new method for the analysis of
 genetically structured populations. Submitted to \emph{PLoS genetics}.
 }
-\seealso{\code{\link{find.clusters}}, to identify clusters without
-  prior. \code{\link{dapcIllus}}, a set of simulated data illustrating the DAPC,
-  and \code{\link{eHGDP}} and \code{\link{H3N2}}, empirical datasets also
-  illustrating DAPC.  }
+\seealso{
+  - \code{\link{find.clusters}}: to identify clusters without prior.
+
+  - \code{\link{dapcIllus}}: a set of simulated data illustrating the DAPC
+
+  - \code{\link{eHGDP}}, \code{\link{H3N2}}: empirical datasets illustrating
+  DAPC
+}
 \author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
 \examples{
 ## data(dapcIllus), data(eHGDP), and data(H3N2) illustrate the dapc

Modified: pkg/man/find.clusters.Rd
===================================================================
--- pkg/man/find.clusters.Rd	2010-02-11 15:09:03 UTC (rev 563)
+++ pkg/man/find.clusters.Rd	2010-02-12 12:03:52 UTC (rev 564)
@@ -20,79 +20,87 @@
   be sought within each prior group.
 
   \code{.find.sub.clusters} is a hidden function called in some instances of
-  \code{find.clusters}, and should not be called directely by the user.
+  \code{find.clusters}, and should not be called directly by the user.
 
-  The K-means procedure used in \code{find.clusters} is \code{kmeans} function
-  from the \code{stat} package. The PCA function is \code{dudi.pca} from the
+  The K-means procedure used in \code{find.clusters} is the \code{\link[stats]{kmeans}} function
+  from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the
   \code{ade4} package.
 }
 \usage{
-\method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC",
-                                     "AIC", "WSS"), choose.n.clust=TRUE,
-                                     criterion=c("min","diff", "conserv"),
-                                     max.n.clust=round(nrow(x)/10), n.iter=1e3,
-                                     n.start=10, center=TRUE, scale=TRUE)
+\method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL,
+              n.clust=NULL, stat=c("BIC","AIC", "WSS"),
+              choose.n.clust=TRUE,criterion=c("min","diff", "conserv"),
+              max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10,
+              center=TRUE, scale=TRUE)
 
 \method{find.clusters}{matrix}(x, \ldots)
 
-\method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC",
-                          "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff",
-                          "conserv"), max.n.clust=round(nrow(x@tab)/10), n.iter=1e3,
-                          n.start=10, scale=FALSE, scale.method=c("sigma", "binom"),
-                          truenames=TRUE, \ldots)
+\method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL,
+              stat=c("BIC","AIC", "WSS"), choose.n.clust=TRUE,
+              criterion=c("min","diff", "conserv"),
+              max.n.clust=round(nrow(x@tab)/10), n.iter=1e3, n.start=10,
+              scale=FALSE, scale.method=c("sigma", "binom"),
+              truenames=TRUE, \ldots)
 
 }
 \arguments{
-\item{x}{\code{a data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
-  object. For the \code{data.frame} and \code{matrix} arguments, only
-  quantitative variables should be provided.}
-\item{clust}{an optional \code{factor} indicating a prior group membership of
-  individuals. If provided, sub-clusters will be sought within each prior
-  group.}
-\item{n.pca}{an \code{integer} indicating the number of axes retained in the
-  Principal Component Analysis (PCA) step. If \code{NULL}, interactive selection
-  is triggered.}
-\item{n.clust}{ an optinal \code{integer} indicating the number of clusters to
-  be sought. If provided, the function will only run K-means once, for this
-  number of clusters. If left as \code{NULL}, several K-means are run for a
-  range of k (number of clusters) values.}
-\item{stat}{ a \code{character} string matching 'BIC', 'AIC', or 'WSS', which
-  indicates the statistic to be computed for each model (i.e., for each value of
-  \code{k}). BIC: Bayesian Information Criterion. AIC: Aikaike's Information
-  Criterion. WSS: within-groups sum of squares, that is, residual variance.}
-\item{choose.n.clust}{ a \code{logical} indicating whether the number of
-clusters should be chosen by the user (TRUE, default), or automatically, based
-on a given criterion (argument \code{criterion}). IT IS HIGHLY RECOMMENDED to
-choose the number of clusters interactively, as automatic procedures are being
-evaluated.}
-\item{criterion}{ a \code{character} string matching "min", "diff", or
-  "conserv", indicating the criterion for automatic selection of the optimal
-  number of clusters. See \code{details}.}
-\item{max.n.clust}{ an \code{integer} indicating the maximum number of clusters
-  to be tried. Values of 'k' will be picked up between 1 and \code{max.n.clust}}
-\item{n.iter}{ an \code{integer} indicating the number of iterations to be used
-  in each run of K-means algorithm. Corresponds to \code{iter.max} of
-  \code{kmeans} function.}
-\item{n.start}{ an \code{integer} indicating the number of randomly chosen
-  starting points to be used in each run of K-means algorithm. Using more
-  starting points ensures convergence of the algorithm. Corresponds to
-  \code{nstart} of \code{kmeans} function.}
-\item{center}{a \code{logical} indicating whether variables should be centred to
-mean 0 (TRUE, default) or not (FALSE). Always TRUE for \linkS4class{genind}
-objects.}
-\item{scale}{a \code{logical} indicating whether variables should be scaled
-  (TRUE) or not (FALSE, default). Scaling consists in dividing variables by
-  their (estimated) standard deviation to account for trivial differences in
-  variances. Further scaling options are available for \linkS4class{genind}
-  objects (see argument \code{scale.method}).}
-\item{scale.method}{a \code{character} specifying the scaling method to be used
-  for allele frequencies, which must match "sigma" (usual estimate of standard
-  deviation) or "binom" (based on binomial distribution). See
-  \code{\link{scaleGen}} for further details.}
-\item{truenames}{a \code{logical} indicating whether true (i.e., user-specified)
-  labels should be used in object outputs (TRUE, default) or not (FALSE).}
-\item{\ldots}{further arguments to be passed to other functions. For
-  \code{find.clusters.matrix}, arguments are to match those of \code{find.clusters.data.frame}.}
+  \item{x}{a \code{data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
+    object. For the \code{data.frame} and \code{matrix} arguments, only
+    quantitative variables should be provided.}
+  \item{clust}{an optional \code{factor} indicating a prior group membership of
+    individuals. If provided, sub-clusters will be sought within each prior
+    group.}
+  \item{n.pca}{an \code{integer} indicating the number of axes retained in the
+    Principal Component Analysis (PCA) step. If \code{NULL}, interactive selection
+    is triggered.}
+  \item{n.clust}{ an optional \code{integer} indicating the number of clusters to
+    be sought. If provided, the function will only run K-means once, for this
+    number of clusters. If left as \code{NULL}, several K-means are run for a
+    range of k (number of clusters) values.}
+  \item{stat}{ a \code{character} string matching 'BIC', 'AIC', or 'WSS', which
+    indicates the statistic to be computed for each model (i.e., for each value of
+    \code{k}). BIC: Bayesian Information Criterion. AIC: Akaike's Information
+    Criterion. WSS: within-groups sum of squares, that is, residual variance.}
+  \item{choose.n.clust}{ a \code{logical} indicating whether the number of
+    clusters should be chosen by the user (TRUE, default), or automatically,
+    based on a given criterion (argument \code{criterion}). IT IS HIGHLY
+    RECOMMENDED to choose the number of clusters interactively, as automatic
+    procedures have not been fully evaluated.}
+  \item{criterion}{ a \code{character} string matching "min", "diff", or
+    "conserv", indicating the criterion for automatic selection of the optimal
+    number of clusters. Honestly, you should go for interactive
+    selection of the number of clusters. Do as you wish. No warranty. If
+    you still want to give it a try, see \code{details}.}
+  \item{max.n.clust}{ an \code{integer} indicating the maximum number of
+    clusters to be tried. Values of 'k' will be picked up between 1 and \code{max.n.clust}.}
+  \item{n.iter}{ an \code{integer} indicating the number of iterations to be used
+    in each run of K-means algorithm. Corresponds to \code{iter.max} of
+    \code{kmeans} function.}
+  \item{n.start}{ an \code{integer} indicating the number of randomly
+    chosen starting centroids to be used in each run of the K-means
+    algorithm. Using more starting points ensures convergence of the
+    algorithm. Corresponds to \code{nstart} of \code{kmeans} function.}
+  \item{center}{a \code{logical} indicating whether variables should be centred to
+    mean 0 (TRUE, default) or not (FALSE). Always TRUE for \linkS4class{genind}
+    objects.}
+  \item{scale}{a \code{logical} indicating whether variables should be
+    scaled (TRUE) or not (FALSE, default). Scaling consists in dividing
+    variables by their (estimated) standard deviation to account for
+    trivial differences in variances. In allele frequencies, it comes with
+    the risk of giving uninformative alleles more importance while
+    downweighting informative alleles. Further scaling options are
+    available for \linkS4class{genind} objects (see argument
+    \code{scale.method}).}
+  \item{scale.method}{a \code{character} specifying the scaling method to be used
+    for allele frequencies, which must match "sigma" (usual estimate of standard
+    deviation) or "binom" (based on binomial distribution). See
+    \code{\link{scaleGen}} for further details.}
+  \item{truenames}{a \code{logical} indicating whether true (i.e., user-specified)
+    labels should be used in object outputs (TRUE, default) or not
+    (FALSE), in which case generic labels are used.}
+  \item{\ldots}{further arguments to be passed to other functions. For
+    \code{find.clusters.matrix}, arguments are to match those of the
+    \code{data.frame} method.}
 }
 \details{
   === ON THE SELECTION OF K ===
@@ -100,39 +108,43 @@
 
   So far, the analysis of data simulated under various population genetics
   models (see reference) suggested an ad hoc rule for the selection of the
-  optimal number of clusters. First, BIC seems for efficient than AIC and WSS
-  to select the appropriate number of clusters. The rule of thumb consists in
-  increasing K until it no longer leads to an appreciable improve of fit (i.e.,
-  decrease of BIC).  In the most simple models (island models), BIC decreases
-  until it reaches the optimal K, and then increases. In these cases, our rule
-  amounts to choosing the lowest K. In other models such as stepping stones, the
-  decrease of BIC often continues after the optimal K, but is much less steep.
+  optimal number of clusters. A first important result is that BIC seems more
+  efficient than AIC and WSS to select the appropriate number of clusters (see
+  example). The rule of thumb consists in increasing K until it no longer leads
+  to an appreciable improvement of fit (i.e., to a decrease of BIC).  In the
+  most simple models (island models), BIC decreases until it reaches the optimal
+  K, and then increases. In these cases, our rule amounts to choosing the lowest
+  K. In other models such as stepping stones, the decrease of BIC often
+  continues after the optimal K, but is much less steep.
 
   
-  An alternative approach that we do not recommend is automatic selection based
-  on a fixed criterion. For this, set \code{choose.n.clust} to FALSE and specify
-  the \code{criterion} you want to use, from the following values:
+  An alternative approach, that we do not recommend for now, is automatic
+  selection based on a fixed criterion. For this, set \code{choose.n.clust} to
+  FALSE and specify the \code{criterion} you want to use, from the following
+  values:
 
-  - "min": the model with the minimum statistics (as specified by \code{stat}
-    argument) is retained. Is likely to work for simple island model with BIC.
+  - "min": the model with the minimum summary statistics (as specified by
+    \code{stat} argument, BIC by default) is retained. Is likely to work for
+    simple island model, using BIC.
 
   - "diff": model selection based on successive improvement of the test
   statistic. This procedure attempts to increase K until the model improvement
   (difference in successive BIC, AIC, or WSS) is no longer important. May be
   more appropriate to models relating to stepping stones.
 
-  "conserv": another criterion meant to be conservative, in that it seeks a good
-  fit with a minimum number of clusters. Unlike "diff", it does not rely on
+  - "conserv": another criterion meant to be conservative, in that it seeks a
+  good fit with a minimum number of clusters. Unlike "diff", it does not rely on
   differences between successive statistics, but rather on absolute fit. It
   selects the model with the smallest K so that the overall fit is above a given
-  threshold.  }
+  threshold.
+}
 \value{
   The class \code{find.clusters} is a list with the following
   components:\cr
-  \item{Kstat}{a \code{numeric} vector giving the values of the statistics for the
-  different values of K. Is NULL if \code{n.clust} was specified.}
-  \item{stat}{a \code{numeric} value giving the value of the statistics for the
-  retained model}
+  \item{Kstat}{a \code{numeric} vector giving the values of the summary
+  statistics for the different values of K. Is NULL if \code{n.clust} was specified.}
+  \item{stat}{a \code{numeric} value giving the value of the summary statistics
+  for the retained model}
   \item{grp}{a \code{factor} giving group membership for each individual.}
   \item{size}{an \code{integer} vector giving the size of the different clusters.}
 }
@@ -143,11 +155,15 @@
 }
 \seealso{
   - \code{\link{dapc}}: implements the DAPC.
-  
-  - \code{\link[stats]{kmeans}}: implementation of K-means in the stat
-  package.
-  
+
+  - \code{\link{dapcIllus}}: dataset illustrating the DAPC and \code{find.clusters}.
+
   - \code{\link{eHGDP}}: dataset illustrating the DAPC and \code{find.clusters}.
+
+  - \code{\link[stats]{kmeans}}: implementation of K-means in the stats package.
+
+  - \code{\link[ade4]{dudi.pca}}: implementation of PCA in the ade4 package.
+
 }
 \author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
 \examples{