[adegenet-commits] r848 - in pkg: R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Mar 10 17:37:30 CET 2011
Author: jombart
Date: 2011-03-10 17:37:30 +0100 (Thu, 10 Mar 2011)
New Revision: 848
Modified:
pkg/R/find.clust.R
pkg/man/dapc.Rd
pkg/man/find.clusters.Rd
Log:
find.clusters implemented for genlight objects
Modified: pkg/R/find.clust.R
===================================================================
--- pkg/R/find.clust.R 2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/R/find.clust.R 2011-03-10 16:37:30 UTC (rev 848)
@@ -3,12 +3,13 @@
#############
find.clusters <- function (x, ...) UseMethod("find.clusters")
-######################
+############################
## find.clusters.data.frame
-######################
+############################
find.clusters.data.frame <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
- max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10, center=TRUE, scale=TRUE, ..., dudi=NULL){
+ max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10, center=TRUE, scale=TRUE,
+ pca.select=c("nbEig","percVar"), perc.pca=NULL,..., dudi=NULL){
## CHECKS ##
if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
@@ -16,7 +17,10 @@
if(!require(stats)) stop("package stats is required")
stat <- match.arg(stat)
+ pca.select <- match.arg(pca.select)
criterion <- match.arg(criterion)
+ min.n.clust <- 2
+ max.n.clust <- max(max.n.clust, 2)
## KEEP TRACK OF SOME ORIGINAL PARAMETERS
## n.pca.ori <- n.pca
@@ -26,44 +30,52 @@
## ESCAPE IF SUB-CLUST ARE SEEKED ##
if(!is.null(clust)){
res <- .find.sub.clusters(x=x, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
- choose.n.clust=choose.n.clust, criterion=criterion, center=center, scale=scale)
+ choose.n.clust=choose.n.clust, criterion=criterion, center=center, scale=scale)
return(res)
}
## END SUB-CLUST
- ## SOME GENERAL VARIABLES ##
+ ## PERFORM PCA ##
N <- nrow(x)
REDUCEDIM <- is.null(dudi)
- min.n.clust <- 2
- max.n.clust <- max(max.n.clust, 2)
- ## PERFORM PCA ##
- maxRank <- min(dim(x))
-
- if(REDUCEDIM){
+ if(REDUCEDIM){ # if no dudi provided
+ ## PERFORM PCA ##
+ maxRank <- min(dim(x))
pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
- } else {
- if(!inherits(dudi,"dudi")) stop("dudi provided but is not a dudi object")
+ } else { # else use the provided dudi
pcaX <- dudi
}
+ cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+ if(!REDUCEDIM){
+ myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$li),length(pcaX$eig)))
+ } else {
+ myCol <- "black"
+ }
+
## select the number of retained PC for PCA
- if(is.null(n.pca)){
- if(!REDUCEDIM){
- myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$li),length(pcaX$eig)))
- } else {
- myCol <- "black"
- }
- cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+ if(is.null(n.pca) & pca.select=="nbEig"){
plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
cat("Choose the number PCs to retain (>=1): ")
- n.pca <- NA
- while(is.na(n.pca)){
- n.pca <- as.integer(readLines(n = 1))
- }
+ n.pca <- as.integer(readLines(n = 1))
}
+ if(is.null(perc.pca) & pca.select=="percVar"){
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+ cat("Choose the percentage of variance to retain (0-100): ")
+ nperc.pca <- as.numeric(readLines(n = 1))
+ }
+
+ ## get n.pca from the % of variance to conserve
+ if(!is.null(perc.pca)){
+ n.pca <- min(which(cumVar >= perc.pca))
+ if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+ if(n.pca<1) n.pca <- 1
+ }
+
+
## keep relevant PCs - stored in XU
X.rank <- length(pcaX$eig)
n.pca <- min(X.rank, n.pca)
@@ -175,9 +187,9 @@
-###################
+########################
## find.clusters.genind
-###################
+########################
find.clusters.genind <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
@@ -203,7 +215,7 @@
## CALL DATA.FRAME METHOD
res <- find.clusters(X, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
- choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE)
+ choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE,...)
return(res)
} # end find.clusters.genind
@@ -225,6 +237,95 @@
+##########################
+## find.clusters.genlight
+##########################
+find.clusters.genlight <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE,
+ criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+ max.n.clust=round(nInd(x)/10), n.iter=1e5, n.start=10,
+ scale=FALSE, pca.select=c("nbEig","percVar"), perc.pca=NULL, glPca=NULL, ...){
+
+ ## CHECKS ##
+ if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+ if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+ if(!require(stats)) stop("package stats is required")
+ if(!inherits(x, "genlight")) stop("x is not a genlight object.")
+ stat <- match.arg(stat)
+ pca.select <- match.arg(pca.select)
+
+
+ ## SOME GENERAL VARIABLES ##
+ N <- nInd(x)
+ min.n.clust <- 2
+
+
+ ## PERFORM PCA ##
+ REDUCEDIM <- is.null(glPca)
+
+ if(REDUCEDIM){ # if no glPca provided
+ maxRank <- min(c(nInd(x), nLoc(x)))
+ pcaX <- glPca(x, center = TRUE, scale = scale, nf=maxRank, loadings=FALSE, returnDotProd = FALSE, ...)
+ } else {
+ pcaX <- glPca
+ }
+
+ if(is.null(n.pca)){
+ cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+ }
+
+
+ ## select the number of retained PC for PCA
+ if(!REDUCEDIM){
+ myCol <- rep(c("black", "lightgrey"), c(ncol(pcaX$scores),length(pcaX$eig)))
+ } else {
+ myCol <- "black"
+ }
+
+ if(is.null(n.pca) & pca.select=="nbEig"){
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+ cat("Choose the number PCs to retain (>=1): ")
+ n.pca <- as.integer(readLines(n = 1))
+ }
+
+ if(is.null(perc.pca) & pca.select=="percVar"){
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA", col=myCol)
+ cat("Choose the percentage of variance to retain (0-100): ")
+ nperc.pca <- as.numeric(readLines(n = 1))
+ }
+
+ ## get n.pca from the % of variance to conserve
+ if(!is.null(perc.pca)){
+ n.pca <- min(which(cumVar >= perc.pca))
+ if(perc.pca > 99.999) n.pca <- length(pcaX$eig)
+ if(n.pca<1) n.pca <- 1
+ }
+
+ if(!REDUCEDIM){
+ if(n.pca > ncol(pcaX$scores)) {
+ n.pca <- ncol(pcaX$scores)
+ }
+ }
+
+
+ ## convert PCA
+ pcaX <- .glPca2dudi(pcaX)
+
+
+ ## CALL DATA.FRAME METHOD
+ res <- find.clusters(pcaX$li, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
+ choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE, dudi=pcaX)
+ return(res)
+} # end find.clusters.genlight
+
+
+
+
+
+
+
+
+
+
###################
## .find.sub.clusters
###################
Modified: pkg/man/dapc.Rd
===================================================================
--- pkg/man/dapc.Rd 2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/man/dapc.Rd 2011-03-10 16:37:30 UTC (rev 848)
@@ -12,8 +12,9 @@
\alias{assignplot}
\title{Discriminant Analysis of Principal Components (DAPC)}
\description{
- These functions implement the Discriminant Analysis of Principal Components
- (DAPC). See 'details' section for a succint description of the method.
+ These functions implement the Discriminant Analysis of Principal
+ Components (DAPC, Jombart et al. 2010). See 'details' section for a
+ succint description of the method.
\code{dapc} is a generic function performing the DAPC on the following
types of objects:\cr
@@ -42,7 +43,7 @@
\usage{
\method{dapc}{data.frame}(x, grp, n.pca=NULL, n.da=NULL, center=TRUE,
scale=FALSE,var.contrib=TRUE, pca.select=c("nbEig","percVar"),
- perc.pca=NULL, \ldots, dudi=NULL)
+ perc.pca=NULL, \ldots, dudi=NULL)
\method{dapc}{matrix}(x, \ldots)
@@ -61,15 +62,13 @@
\method{summary}{dapc}(object, \dots)
\method{scatter}{dapc}(x, xax=1, yax=2, grp=NULL,
- col=rainbow(length(levels(x$grp))), pch=20,
- posi="bottomleft", bg="grey", ratio=0.3, cstar
- = 1, cellipse = 1.5, axesell = TRUE, label =
- levels(x$grp), clabel = 1, xlim = NULL, ylim =
- NULL, grid = TRUE, addaxes = TRUE, origin =
- c(0,0), include.origin = TRUE, sub = "", csub =
- 1, possub = "bottomleft", posleg = "topright",
- cleg = 1, cgrid = 1, pixmap = NULL, contour =
- NULL, area = NULL, \ldots)
+ col=rainbow(length(levels(x$grp))), pch=20, posi="bottomleft",
+ bg="grey", ratio=0.3, cstar = 1, cellipse = 1.5, axesell = TRUE,
+ label = levels(x$grp), clabel = 1, xlim = NULL, ylim = NULL, grid
+ = TRUE, addaxes = TRUE, origin = c(0,0), include.origin = TRUE,
+ sub = "", csub = 1, possub = "bottomleft", posleg = "topright",
+ cleg = 1, cgrid = 1, pixmap = NULL, contour = NULL, area = NULL,
+ \ldots)
assignplot(x, only.grp=NULL, subset=NULL, cex.lab=.75, pch=3)
}
Modified: pkg/man/find.clusters.Rd
===================================================================
--- pkg/man/find.clusters.Rd 2011-03-10 15:50:01 UTC (rev 847)
+++ pkg/man/find.clusters.Rd 2011-03-10 16:37:30 UTC (rev 848)
@@ -4,16 +4,18 @@
\alias{find.clusters.data.frame}
\alias{find.clusters.matrix}
\alias{find.clusters.genind}
+\alias{find.clusters.genlight}
\alias{.find.sub.clusters}
\title{find.cluster: cluster identification using successive K-means}
\description{
- These functions implement the clustering procedure used in Discriminant
- Analysis of Principal Components (DAPC, Jombart et al. submitted). This
- procedure consists in running successive K-means with an increasing number of
- clusters (\code{k}), after transforming data using a principal component
- analysis (PCA). For each model, a statistical measure of goodness of fit (by
- default, BIC) is computed, which allows to choose the optimal \code{k}. See
- \code{details} for a description of how to select the optimal \code{k}.
+ These functions implement the clustering procedure used in
+ Discriminant Analysis of Principal Components (DAPC, Jombart et
+ al. 2010). This procedure consists in running successive K-means with
+ an increasing number of clusters (\code{k}), after transforming data
+ using a principal component analysis (PCA). For each model, a
+ statistical measure of goodness of fit (by default, BIC) is computed,
+ which allows to choose the optimal \code{k}. See \code{details} for a
+ description of how to select the optimal \code{k}.
Optionally, hierarchical clustering can be sought by providing a prior
clustering of individuals (argument \code{clust}). In such case, clusters will
@@ -21,24 +23,42 @@
The K-means procedure used in \code{find.clusters} is \code{\link[stats]{kmeans}} function
from the \code{stats} package. The PCA function is \code{\link[ade4]{dudi.pca}} from the
- \code{ade4} package.
+ \code{ade4} package, except for \linkS4class{genlight} objects which
+ use the \code{\link{glPca}} procedure from adegenet.
+
+ \code{find.clusters} is a generic function with methods for the
+ following types of objects:\cr
+ - \code{data.frame} (only numeric data)\cr
+ - \code{matrix} (only numeric data)\cr
+ - \code{\linkS4class{genind}} objects (genetic markers)\cr
+ - \code{\linkS4class{genlight}} objects (genome-wide SNPs)
+
}
\usage{
\method{find.clusters}{data.frame}(x, clust=NULL, n.pca=NULL,
n.clust=NULL, stat=c("BIC","AIC", "WSS"),
- choose.n.clust=TRUE,criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
+ choose.n.clust=TRUE,criterion=c("diffNgroup",
+ "min","goesup", "smoothNgoesup", "goodfit"),
max.n.clust=round(nrow(x)/10), n.iter=1e5, n.start=10,
- center=TRUE, scale=TRUE, \ldots, dudi=NULL)
+ center=TRUE, scale=TRUE, pca.select=c("nbEig","percVar"),
+ perc.pca=NULL, \ldots, dudi=NULL)
\method{find.clusters}{matrix}(x, \ldots)
\method{find.clusters}{genind}(x, clust=NULL, n.pca=NULL, n.clust=NULL,
stat=c("BIC","AIC", "WSS"), choose.n.clust=TRUE,
- criterion=c("diffNgroup", "min","goesup", "smoothNgoesup", "goodfit"),
- max.n.clust=round(nrow(x at tab)/10), n.iter=1e5, n.start=10,
- scale=FALSE, scale.method=c("sigma", "binom"),
+ criterion=c("diffNgroup", "min","goesup", "smoothNgoesup",
+ "goodfit"), max.n.clust=round(nrow(x at tab)/10), n.iter=1e5,
+ n.start=10, scale=FALSE, scale.method=c("sigma", "binom"),
truenames=TRUE, \ldots)
+\method{find.clusters}{genlight}(x, clust=NULL, n.pca=NULL,
+ n.clust=NULL, stat=c("BIC", "AIC",
+ "WSS"),choose.n.clust=TRUE, criterion=c("diffNgroup",
+ "min","goesup","smoothNgoesup", "goodfit"),
+ max.n.clust=round(nInd(x)/10), n.iter=1e5,n.start=10,
+ scale=FALSE, pca.select=c("nbEig","percVar"),
+ perc.pca=NULL,glPca=NULL, \ldots)
}
\arguments{
\item{x}{\code{a data.frame}, \code{matrix}, or \code{\linkS4class{genind}}
@@ -89,6 +109,15 @@
downweighting informative alleles. Further scaling options are
available for \linkS4class{genind} objects (see argument
\code{scale.method}).}
+ \item{pca.select}{a \code{character} indicating the mode of selection of PCA
+ axes, matching either "nbEig" or "percVar". For "nbEig", the user
+ has to specify the number of axes retained (interactively, or via
+ \code{n.pca}). For "percVar", the user has to specify the minimum amount of
+ the total variance to be preserved by the retained axes, expressed as a
+ percentage (interactively, or via \code{perc.pca}). }
+ \item{perc.pca}{a \code{numeric} value between 0 and 100 indicating the
+ minimal percentage of the total variance of the data to be expressed by the
+ retained axes of PCA.}
\item{scale.method}{a \code{character} specifying the scaling method to be used
for allele frequencies, which must match "sigma" (usual estimate of standard
deviation) or "binom" (based on binomial distribution). See
@@ -103,6 +132,9 @@
\code{dudi} (from the ade4 package). If provided, prior PCA will be
ignored, and this object will be used as a prior step for variable
orthogonalisation.}
+ \item{glPca}{an optional \code{\link{glPca}} object; if provided,
+ dimension reduction is not performed (saving computational time) but
+ taken directly from this object.}
}
\details{
=== ON THE SELECTION OF K ===\cr
@@ -235,5 +267,13 @@
points(2, foo.WSS$Kstat[2], pch="x", cex=3)
mtext(3, tex="'X' indicates the actual number of clusters")
+
+## TOY EXAMPLE FOR GENLIGHT OBJECTS ##
+x <- glSim(100,500,500)
+x
+plot(x)
+grp <- find.clusters(x, n.pca=100, choose=FALSE, stat="BIC")
+plot(grp$Kstat, type="o", xlab="number of clusters (K)",ylab="BIC",main="find.clusters on a genlight object\n(two groups)")
+
}
\keyword{multivariate}
More information about the adegenet-commits
mailing list