[adegenet-commits] r564 - pkg/man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Feb 12 13:03:52 CET 2010
Author: jombart
Date: 2010-02-12 13:03:52 +0100 (Fri, 12 Feb 2010)
New Revision: 564
Modified:
pkg/man/dapc.Rd
pkg/man/find.clusters.Rd
Log:
several documentation fixes
Modified: pkg/man/dapc.Rd
===================================================================
--- pkg/man/dapc.Rd 2010-02-11 15:09:03 UTC (rev 563)
+++ pkg/man/dapc.Rd 2010-02-12 12:03:52 UTC (rev 564)
@@ -36,20 +36,22 @@
}
\usage{
\method{dapc}{data.frame}(x, grp, n.pca=NULL, n.da=NULL, center=TRUE,
-scale=FALSE, var.contrib=FALSE, pca.select=c("nbEig","percVar"), perc.pca=NULL)
+ scale=FALSE,var.contrib=FALSE,
+ pca.select=c("nbEig","percVar"), perc.pca=NULL)
\method{dapc}{matrix}(x, \ldots)
\method{dapc}{genind}(x, pop=NULL, n.pca=NULL, n.da=NULL, scale=FALSE,
-scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE,
-pca.select=c("nbEig","percVar"), perc.pca=NULL)
+ scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE,
+ pca.select=c("nbEig","percVar"), perc.pca=NULL)
\method{print}{dapc}(x, \dots)
\method{summary}{dapc}(object, \dots)
-\method{scatter}{dapc}(x, xax=1, yax=2, col=rainbow(length(levels(x$grp))),
-posi="bottomleft", bg="grey", ratio=0.3, csub=1.2, \ldots)
+\method{scatter}{dapc}(x, xax=1, yax=2,
+ col=rainbow(length(levels(x$grp))), posi="bottomleft", bg="grey",
+ ratio=0.3, csub=1.2, \ldots)
\method{assignplot}{dapc}(x, only.grp=NULL, subset=NULL, cex.lab=.75, pch=3)
}
@@ -148,8 +150,8 @@
variables (alleles in the case of genetic data) to the principal components
of DAPC.}
-
- === other outputs ===\cr
+
+ \cr=== other outputs ===\cr
Other functions have different outputs:\cr
- \code{summary.dapc} returns a list with 6 components: \code{n.dim} (number
of retained DAPC axes), \code{n.pop} (number of groups/populations),
@@ -165,10 +167,14 @@
Discriminant analysis of principal components: a new method for the analysis of
genetically structured populations. Submitted to \emph{PLoS genetics}.
}
-\seealso{\code{\link{find.clusters}}, to identify clusters without
- prior. \code{\link{dapcIllus}}, a set of simulated data illustrating the DAPC,
- and \code{\link{eHGDP}} and \code{\link{H3N2}}, empirical datasets also
- illustrating DAPC. }
+\seealso{
+ - \code{\link{find.clusters}}: to identify clusters without prior.
+
+ - \code{\link{dapcIllus}}: a set of simulated data illustrating the DAPC
+
+ - \code{\link{eHGDP}}, \code{\link{H3N2}}: empirical datasets illustrating
+ DAPC
+}
\author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
\examples{
## data(dapcIllus), data(eHGDP), and data(H3N2) illustrate the dapc
Modified: pkg/man/find.clusters.Rd
===================================================================
--- pkg/man/find.clusters.Rd 2010-02-11 15:09:03 UTC (rev 563)
+++ pkg/man/find.clusters.Rd 2010-02-12 12:03:52 UTC (rev 564)
@@ -20,79 +20,87 @@
be sought within each prior group.
\code{.find.sub.clusters} is a hidden function called in some instances of
- \code{find.clusters}, and should not be called directely by the user.
+ \code{find.clusters}, and should not be called directly by the user.
- The K-means procedure used in \code{find.clusters} is \code{kmeans} function
- from the \code{stat} package. The PCA function is \code{dudi.pca} from the
+ The K-means procedure used in \code{find.clusters} is the \code{\link[stats]{kmeans}} function
+ from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the
\code{ade4} package.
}
\usage{
-\method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC",
- "AIC", "WSS"), choose.n.clust=TRUE,
- criterion=c("min","diff", "conserv"),
- max.n.clust=round(nrow(x)/10), n.iter=1e3,
- n.start=10, center=TRUE, scale=TRUE)
+\method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL,
+ n.clust=NULL, stat=c("BIC","AIC", "WSS"),
+ choose.n.clust=TRUE,criterion=c("min","diff", "conserv"),
+ max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10,
+ center=TRUE, scale=TRUE)
\method{find.clusters}{matrix}(x, \ldots)
-\method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC",
- "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff",
- "conserv"), max.n.clust=round(nrow(x@tab)/10), n.iter=1e3,
- n.start=10, scale=FALSE, scale.method=c("sigma", "binom"),
- truenames=TRUE, \ldots)
+\method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL,
+ stat=c("BIC","AIC", "WSS"), choose.n.clust=TRUE,
+ criterion=c("min","diff", "conserv"),
+ max.n.clust=round(nrow(x@tab)/10), n.iter=1e3, n.start=10,
+ scale=FALSE, scale.method=c("sigma", "binom"),
+ truenames=TRUE, \ldots)
}
\arguments{
-\item{x}{\code{a data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
- object. For the \code{data.frame} and \code{matrix} arguments, only
- quantitative variables should be provided.}
-\item{clust}{an optional \code{factor} indicating a prior group membership of
- individuals. If provided, sub-clusters will be sought within each prior
- group.}
-\item{n.pca}{an \code{integer} indicating the number of axes retained in the
- Principal Component Analysis (PCA) step. If \code{NULL}, interactive selection
- is triggered.}
-\item{n.clust}{ an optinal \code{integer} indicating the number of clusters to
- be sought. If provided, the function will only run K-means once, for this
- number of clusters. If left as \code{NULL}, several K-means are run for a
- range of k (number of clusters) values.}
-\item{stat}{ a \code{character} string matching 'BIC', 'AIC', or 'WSS', which
- indicates the statistic to be computed for each model (i.e., for each value of
- \code{k}). BIC: Bayesian Information Criterion. AIC: Aikaike's Information
- Criterion. WSS: within-groups sum of squares, that is, residual variance.}
-\item{choose.n.clust}{ a \code{logical} indicating whether the number of
-clusters should be chosen by the user (TRUE, default), or automatically, based
-on a given criterion (argument \code{criterion}). IT IS HIGHLY RECOMMENDED to
-choose the number of clusters interactively, as automatic procedures are being
-evaluated.}
-\item{criterion}{ a \code{character} string matching "min", "diff", or
- "conserv", indicating the criterion for automatic selection of the optimal
- number of clusters. See \code{details}.}
-\item{max.n.clust}{ an \code{integer} indicating the maximum number of clusters
- to be tried. Values of 'k' will be picked up between 1 and \code{max.n.clust}}
-\item{n.iter}{ an \code{integer} indicating the number of iterations to be used
- in each run of K-means algorithm. Corresponds to \code{iter.max} of
- \code{kmeans} function.}
-\item{n.start}{ an \code{integer} indicating the number of randomly chosen
- starting points to be used in each run of K-means algorithm. Using more
- starting points ensures convergence of the algorithm. Corresponds to
- \code{nstart} of \code{kmeans} function.}
-\item{center}{a \code{logical} indicating whether variables should be centred to
-mean 0 (TRUE, default) or not (FALSE). Always TRUE for \linkS4class{genind}
-objects.}
-\item{scale}{a \code{logical} indicating whether variables should be scaled
- (TRUE) or not (FALSE, default). Scaling consists in dividing variables by
- their (estimated) standard deviation to account for trivial differences in
- variances. Further scaling options are available for \linkS4class{genind}
- objects (see argument \code{scale.method}).}
-\item{scale.method}{a \code{character} specifying the scaling method to be used
- for allele frequencies, which must match "sigma" (usual estimate of standard
- deviation) or "binom" (based on binomial distribution). See
- \code{\link{scaleGen}} for further details.}
-\item{truenames}{a \code{logical} indicating whether true (i.e., user-specified)
- labels should be used in object outputs (TRUE, default) or not (FALSE).}
-\item{\ldots}{further arguments to be passed to other functions. For
- \code{find.clusters.matrix}, arguments are to match those of \code{find.clusters.data.frame}.}
+ \item{x}{\code{a data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
+ object. For the \code{data.frame} and \code{matrix} arguments, only
+ quantitative variables should be provided.}
+ \item{clust}{an optional \code{factor} indicating a prior group membership of
+ individuals. If provided, sub-clusters will be sought within each prior
+ group.}
+ \item{n.pca}{an \code{integer} indicating the number of axes retained in the
+ Principal Component Analysis (PCA) step. If \code{NULL}, interactive selection
+ is triggered.}
+ \item{n.clust}{ an optional \code{integer} indicating the number of clusters to
+ be sought. If provided, the function will only run K-means once, for this
+ number of clusters. If left as \code{NULL}, several K-means are run for a
+ range of k (number of clusters) values.}
+ \item{stat}{ a \code{character} string matching 'BIC', 'AIC', or 'WSS', which
+ indicates the statistic to be computed for each model (i.e., for each value of
+ \code{k}). BIC: Bayesian Information Criterion. AIC: Akaike's Information
+ Criterion. WSS: within-groups sum of squares, that is, residual variance.}
+ \item{choose.n.clust}{ a \code{logical} indicating whether the number of
+ clusters should be chosen by the user (TRUE, default), or automatically,
+ based on a given criterion (argument \code{criterion}). IT IS HIGHLY
+ RECOMMENDED to choose the number of clusters interactively, as automatic
+ procedures have not been fully evaluated.}
+ \item{criterion}{ a \code{character} string matching "min", "diff", or
+ "conserv", indicating the criterion for automatic selection of the optimal
+ number of clusters. Honestly, you should go for interactive
+ selection of the number of clusters. Do as you wish. No warranty. If
+ you still want to give it a try, see \code{details}.}
+ \item{max.n.clust}{ an \code{integer} indicating the maximum number of
+ clusters to be tried. Values of 'k' will be picked up between 1 and \code{max.n.clust}}
+ \item{n.iter}{ an \code{integer} indicating the number of iterations to be used
+ in each run of K-means algorithm. Corresponds to \code{iter.max} of
+ \code{kmeans} function.}
+ \item{n.start}{ an \code{integer} indicating the number of randomly
+ chosen starting centroids to be used in each run of the K-means
+ algorithm. Using more starting points ensures convergence of the
+ algorithm. Corresponds to \code{nstart} of \code{kmeans} function.}
+ \item{center}{a \code{logical} indicating whether variables should be centred to
+ mean 0 (TRUE, default) or not (FALSE). Always TRUE for \linkS4class{genind}
+ objects.}
+ \item{scale}{a \code{logical} indicating whether variables should be
+ scaled (TRUE) or not (FALSE, default). Scaling consists in dividing
+ variables by their (estimated) standard deviation to account for
+ trivial differences in variances. In allele frequencies, it comes with
+ the risk of giving uninformative alleles more importance while
+ downweighting informative alleles. Further scaling options are
+ available for \linkS4class{genind} objects (see argument
+ \code{scale.method}).}
+ \item{scale.method}{a \code{character} specifying the scaling method to be used
+ for allele frequencies, which must match "sigma" (usual estimate of standard
+ deviation) or "binom" (based on binomial distribution). See
+ \code{\link{scaleGen}} for further details.}
+ \item{truenames}{a \code{logical} indicating whether true (i.e., user-specified)
+ labels should be used in object outputs (TRUE, default) or not
+ (FALSE), in which case generic labels are used.}
+ \item{\ldots}{further arguments to be passed to other functions. For
+ \code{find.clusters.matrix}, arguments are to match those of the
+ \code{data.frame} method.}
}
\details{
=== ON THE SELECTION OF K ===
@@ -100,39 +108,43 @@
So far, the analysis of data simulated under various population genetics
models (see reference) suggested an ad hoc rule for the selection of the
- optimal number of clusters. First, BIC seems for efficient than AIC and WSS
- to select the appropriate number of clusters. The rule of thumb consists in
- increasing K until it no longer leads to an appreciable improve of fit (i.e.,
- decrease of BIC). In the most simple models (island models), BIC decreases
- until it reaches the optimal K, and then increases. In these cases, our rule
- amounts to choosing the lowest K. In other models such as stepping stones, the
- decrease of BIC often continues after the optimal K, but is much less steep.
+ optimal number of clusters. A first important result is that BIC seems more
+ efficient than AIC and WSS to select the appropriate number of clusters (see
+ example). The rule of thumb consists in increasing K until it no longer leads
+ to an appreciable improvement of fit (i.e., to a decrease of BIC). In the
+ most simple models (island models), BIC decreases until it reaches the optimal
+ K, and then increases. In these cases, our rule amounts to choosing the lowest
+ K. In other models such as stepping stones, the decrease of BIC often
+ continues after the optimal K, but is much less steep.
- An alternative approach that we do not recommend is automatic selection based
- on a fixed criterion. For this, set \code{choose.n.clust} to FALSE and specify
- the \code{criterion} you want to use, from the following values:
+ An alternative approach, that we do not recommend for now, is automatic
+ selection based on a fixed criterion. For this, set \code{choose.n.clust} to
+ FALSE and specify the \code{criterion} you want to use, from the following
+ values:
- - "min": the model with the minimum statistics (as specified by \code{stat}
- argument) is retained. Is likely to work for simple island model with BIC.
+ - "min": the model with the minimum summary statistics (as specified by
+ \code{stat} argument, BIC by default) is retained. Is likely to work for
+ simple island model, using BIC.
- "diff": model selection based on successive improvement of the test
statistic. This procedure attempts to increase K until the model improvement
(difference in successive BIC, AIC, or WSS) is no longer important. May be
more appropriate to models relating to stepping stones.
- "conserv": another criterion meant to be conservative, in that it seeks a good
- fit with a minimum number of clusters. Unlike "diff", it does not rely on
+ - "conserv": another criterion meant to be conservative, in that it seeks a
+ good fit with a minimum number of clusters. Unlike "diff", it does not rely on
differences between successive statistics, but rather on absolute fit. It
selects the model with the smallest K so that the overall fit is above a given
- threshold. }
+ threshold.
+}
\value{
The class \code{find.clusters} is a list with the following
components:\cr
- \item{Kstat}{a \code{numeric} vector giving the values of the statistics for the
- different values of K. Is NULL if \code{n.clust} was specified.}
- \item{stat}{a \code{numeric} value giving the value of the statistics for the
- retained model}
+ \item{Kstat}{a \code{numeric} vector giving the values of the summary
+ statistics for the different values of K. Is NULL if \code{n.clust} was specified.}
+ \item{stat}{a \code{numeric} value giving the value of the summary statistics
+ for the retained model}
\item{grp}{a \code{factor} giving group membership for each individual.}
\item{size}{an \code{integer} vector giving the size of the different clusters.}
}
@@ -143,11 +155,15 @@
}
\seealso{
- \code{\link{dapc}}: implements the DAPC.
-
- - \code{\link[stats]{kmeans}}: implementation of K-means in the stat
- package.
-
+
+ - \code{\link{dapcIllus}}: dataset illustrating the DAPC and \code{find.clusters}.
+
- \code{\link{eHGDP}}: dataset illustrating the DAPC and \code{find.clusters}.
+
+ - \code{\link[stats]{kmeans}}: implementation of K-means in the stats package.
+
+ - \code{\link[ade4]{dudi.pca}}: implementation of PCA in the ade4 package.
+
}
\author{ Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
\examples{
More information about the adegenet-commits
mailing list