[adegenet-commits] r502 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Dec 1 18:16:31 CET 2009
Author: jombart
Date: 2009-12-01 18:16:31 +0100 (Tue, 01 Dec 2009)
New Revision: 502
Modified:
pkg/R/dapc.R
Log:
Tuning find.clusters.
Modified: pkg/R/dapc.R
===================================================================
--- pkg/R/dapc.R 2009-11-30 19:26:35 UTC (rev 501)
+++ pkg/R/dapc.R 2009-12-01 17:16:31 UTC (rev 502)
@@ -1,8 +1,92 @@
+#############
+## find.clusters
+#############
+find.clusters <- function(x, n.pca=NULL, choose.n.grp=TRUE,
+ min.n.clust=2, max.n.clust=round(nrow(x at tab)/10), n.iter=1e6, n.start=1e3,
+ scale=TRUE, scale.method=c("sigma", "binom"), truenames=TRUE){
+
+ ## CHECKS ##
+ if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+ if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+ if(!require(stats)) stop("package stats is required")
+ if(!is.genind(x)) stop("x must be a genind object.")
+
+
+ ## SOME GENERAL VARIABLES ##
+ N <- nrow(x at tab)
+
+ ## PERFORM PCA ##
+ maxRank <- min(dim(x at tab))
+
+ X <- scaleGen(x, center = TRUE, scale = scale, method = scale.method,
+ missing = "mean", truenames = truenames)
+
+ pcaX <- dudi.pca(X, center = FALSE, scale = FALSE, scannf = FALSE, nf=maxRank)
+
+ ## select the number of retained PC for PCA
+ if(is.null(n.pca)){
+ cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+ plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative genetic variance (%)", main="Genetic variance explained by PCA")
+ cat("Choose the number PCs to retain (>=1): ")
+ n.pca <- as.integer(readLines(n = 1))
+ }
+
+ ## keep relevant PCs - stored in XU
+ X.rank <- length(pcaX$eig)
+ n.pca <- min(X.rank, n.pca)
+ if(n.pca >= N) warning("number of retained PCs of PCA is greater than N")
+ if(n.pca > N/3) warning("number of retained PCs of PCA may be too large (> N /3)")
+
+ XU <- pcaX$li[, 1:n.pca, drop=FALSE] # principal components
+
+ ## PERFORM K-MEANS
+ nbClust <- min.n.clust:max.n.clust
+ best <- NULL
+ stat <- numeric(0)
+
+ for(i in 1:length(nbClust)){
+ if(is.null(prior)) {ini <- nbClust[i]} else {ini <- prior}
+ temp <- kmeans(XU, centers=ini, iter.max=min(n.iter, 100), nstart=min(n.start, 1e3))
+ stat[i] <- mean(temp$withinss)
+ if(which.min(stat)==i) {best <- temp}
+ }
+
+
+ ## CHOOSE NUMBER OF GROUPS
+ if(choose.n.grp){
+ ##TSS <- sum(pcaX$eig) * N
+ ##betweenVar <- (1 - ((stat/(N-nbClust-1))/(TSS/(N-1)) )) *100
+ WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
+ reducWSS <- -diff(c(WSS.ori, stat))
+ reducWSS <- reducWSS/max(reducWSS)
+ plot(reducWSS, xlab="Number of clusters", ylab="Reduction in within SS", main="Reduction of within SS \nwith number of clusters", type="b", col="blue")
+ abline(h=0, lty=2, col="red")
+ cat("Choose the number of clusters (>=2: ")
+ n.clust <- as.integer(readLines(n = 1))
+ } else {
+ n.clust <- length(best$size)
+ }
+
+ best <- kmeans(XU, centers=n.clust, iter.max=n.iter, nstart=n.start)
+
+
+ ## MAKE RESULT ##
+ names(stat) <- paste("K",nbClust, sep="=")
+ res <- list(stat=stat, grp=factor(best$cluster), size=best$size)
+
+ return(res)
+} # end find.clusters
+
+
+
+
+
+
########
## dapc
########
-dapc <- function(x, pop=NULL, n.pca=NULL, n.da=NULL, scale=TRUE,
- scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE){
+dapc <- function(x, pop=NULL, n.pca=NULL, n.da=NULL,
+ scale=TRUE, scale.method=c("sigma", "binom"), truenames=TRUE, all.contrib=FALSE){
## FIRST CHECKS
if(!is.genind(x)) stop("x must be a genind object.")
@@ -50,7 +134,7 @@
names(XU) <- paste("PCA-pc", 1:ncol(XU), sep=".")
- ## PERFORM DA ##
+ ## PERFORM DA ##
ldaX <- lda(XU, pop.fac)
if(is.null(n.da)){
barplot(ldaX$svd^2, xlab="Linear Discriminants", ylab="F-statistic", main="Discriminant analysis eigenvalues", col=heat.colors(length(levels(pop.fac))) )
@@ -63,6 +147,8 @@
## BUILD RESULT
res <- list()
+ res$n.pca <- n.pca
+ res$n.da <- n.da
res$tab <- XU
res$fac <- pop.fac
res$var.gen <- XU.lambda
@@ -95,7 +181,7 @@
######################
print.dapc <- function(x, ...){
cat("\t#########################################\n")
- cat("\t# Discriminant analysis of Principal Components #\n")
+ cat("\t# Discriminant Analysis of Principal Components #\n")
cat("\t#########################################\n")
cat("class: ")
cat(class(x))
@@ -103,13 +189,13 @@
print(x$call)
cat("\n$n.pca:", x$n.pca, "first PCs of PCA used")
cat("\n$n.da:", x$n.da, "discriminant functions saved")
- cat("\nProportion of conserved genetic variance: ", x$var.gen)
+ cat("\n$var.gen (proportion of conserved genetic variance):", round(x$var.gen,3))
- cat("\nEigenvalues: ")
+ cat("\n\n$eig (eigenvalues): ")
l0 <- sum(x$eig >= 0)
cat(signif(x$eig, 4)[1:(min(5, l0))])
if (l0 > 5)
- cat(" ...\n")
+ cat(" ...\n\n")
## vectors
sumry <- array("", c(4, 3), list(1:4, c("vector", "length", "content")))
@@ -123,7 +209,7 @@
## data.frames
cat("\n")
sumry <- array("", c(5, 4), list(1:5, c("data.frame", "nrow", "ncol", "content")))
- sumry[1, ] <- c("$tab", nrow(x$tab), ncol(x$tab), "retained principal components of PCA")
+ sumry[1, ] <- c("$tab", nrow(x$tab), ncol(x$tab), "retained PCs of PCA")
sumry[2, ] <- c("$disc.func", nrow(x$disc.func), ncol(x$disc.func), "discriminant functions")
sumry[3, ] <- c("$ind.coord", nrow(x$ind.coord), ncol(x$ind.coord), "coordinates of individuals")
sumry[4, ] <- c("$pop.coord", nrow(x$pop.coord), ncol(x$pop.coord), "coordinates of populations")
@@ -132,8 +218,8 @@
print(sumry)
cat("\nother elements: ")
- if (length(names(x)) > 11)
- cat(names(x)[12:(length(names(x)))], "\n")
+ if (length(names(x)) > 13)
+ cat(names(x)[14:(length(names(x)))], "\n")
else cat("NULL\n")
}
@@ -141,6 +227,7 @@
+
##############
## summary.dapc
##############
@@ -170,11 +257,12 @@
##############
## scatter.dapc
##############
-scatter.dapc <- function(x, axes=1:2, col=rainbow(length(levels(x$fac))), posi="bottomleft", bg="lightgrey", ...){
+scatter.dapc <- function(x, xax=1, yax=2, col=rainbow(length(levels(x$fac))), posi="bottomleft", bg="grey", ratio=0.3, csub=1.2, ...){
if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+ axes <- c(xax,yax)
par(bg=bg)
s.class(x$ind.coord[,axes], fac=x$fac, col=col, ...)
- add.scatter.eig(x$eig, ncol(x$disc.func), axes[1], axes[2], posi=posi)
+ add.scatter.eig(x$eig, ncol(x$disc.func), axes[1], axes[2], posi=posi, ratio=ratio, csub=csub)
return(invisible())
} # end scatter.dapc
@@ -182,213 +270,9 @@
+###############
+## randtest.dapc
+###############
+randtest.dapc <- function(x, nperm = 999, ...){
-
-
-
-
-
-
-
-
-########################
-# Function summary.dapc
-########################
-summary.dapc <- function (object, ..., printres=TRUE) {
- if (!inherits(object, "dapc"))stop("to be used with 'dapc' object")
- if(!require(ade4,quietly=TRUE)) stop("the library ade4 is required; please install this package")
- if(!require(spdep,quietly=TRUE)) stop("the library spdep is required; please install this package")
-
- #util <- function(n) { ## no longer used
- # x <- "1"
- # for (i in 2:n) x[i] <- paste(x[i - 1], i, sep = "+")
- # return(x)
- #}
- norm.w <- function(X, w) {
- f2 <- function(v) sum(v * v * w)/sum(w)
- norm <- apply(X, 2, f2)
- return(norm)
- }
-
- resfin <- list()
-
- if(printres) {
- cat("\nSpatial principal component analysis\n")
- cat("\nCall: ")
- print(object$call)
- }
-
- appel <- as.list(object$call)
- ## compute original pca
- # prepare data
- obj <- eval(appel$obj)
- if(is.null(appel$truenames)) truenames <- FALSE
-
- f1 <- function(vec){
- m <- mean(vec,na.rm=TRUE)
- vec[is.na(vec)] <- m
- return(vec)
- }
-
- if(is.genind(obj)) { X <- obj at tab }
- if(is.genpop(obj)) { X <- makefreq(obj, quiet=TRUE)$tab }
-
- X <- apply(X,2,f1)
-
- if(truenames){
- rownames(X) <- rownames(truenames(obj))
- colnames(X) <- colnames(truenames(obj))
- }
-
- nfposi <- object$nfposi
- nfnega <- object$nfnega
-
- dudi <- dudi.pca(X, center=TRUE, scale=FALSE, scannf=FALSE, nf=nfposi+nfnega)
- ## end of pca
-
- lw <- object$lw
-
- # I0, Imin, Imax
- n <- nrow(X)
- I0 <- -1/(n-1)
- L <- listw2mat(lw)
- eigL <- eigen(0.5*(L+t(L)))$values
- Imin <- min(eigL)
- Imax <- max(eigL)
- Ival <- data.frame(I0=I0,Imin=Imin,Imax=Imax)
- row.names(Ival) <- ""
- if(printres) {
- cat("\nConnection network statistics:\n")
- print(Ival)
- }
-
- Istat <- c(I0,Imin,Imax)
- names(Istat) <- c("I0","Imin","Imax")
- resfin$Istat <- Istat
-
-
- # les scores de l'analyse de base
- nf <- dudi$nf
- eig <- dudi$eig[1:nf]
- cum <- cumsum(dudi$eig)[1:nf]
- ratio <- cum/sum(dudi$eig)
- w <- apply(dudi$l1,2,lag.listw,x=lw)
- moran <- apply(w*as.matrix(dudi$l1)*dudi$lw,2,sum)
- res <- data.frame(var=eig,cum=cum,ratio=ratio, moran=moran)
- row.names(res) <- paste("Axis",1:nf)
- if(printres) {
- cat("\nScores from the centred PCA\n")
- print(res)
- }
-
- resfin$pca <- res
-
-
- # les scores de l'analyse spatiale
- # on recalcule l'objet en gardant tous les axes
- eig <- object$eig
- nfposimax <- sum(eig > 0)
- nfnegamax <- sum(eig < 0)
-
- ms <- multispati(dudi=dudi, listw=lw, scannf=FALSE,
- nfposi=nfposimax, nfnega=nfnegamax)
-
- ndim <- dudi$rank
- nf <- nfposi + nfnega
- agarder <- c(1:nfposi,if (nfnega>0) (ndim-nfnega+1):ndim)
- varspa <- norm.w(ms$li,dudi$lw)
- moran <- apply(as.matrix(ms$li)*as.matrix(ms$ls)*dudi$lw,2,sum)
- res <- data.frame(eig=eig,var=varspa,moran=moran/varspa)
- row.names(res) <- paste("Axis",1:length(eig))
-
- if(printres) {
- cat("\ndapc eigenvalues decomposition:\n")
- print(res[agarder,])
- }
-
- resfin$dapc <- res
-
- return(invisible(resfin))
-}
-
-
-
-#####################
-# Function plot.dapc
-#####################
-plot.dapc <- function (x, axis = 1, useLag=FALSE, ...){
- if (!inherits(x, "dapc")) stop("Use only with 'dapc' objects.")
-
- if(!require(ade4)) stop("ade4 package is required.")
- if(!require(spdep)) stop("spdep package is required.")
- if(axis>ncol(x$li)) stop("wrong axis required.")
-
- opar <- par(no.readonly = TRUE)
- on.exit(par(opar))
- par(mar = rep(.1,4), mfrow=c(3,2))
-
- n <- nrow(x$li)
- xy <- x$xy
-
- ## handle useLag argument
- if(useLag){
- z <- x$ls[,axis]
- } else {
- z <- x$li[,axis]
- } # end if useLag
- nfposi <- x$nfposi
- nfnega <- x$nfnega
- ## handle neig parameter - hide cn if nore than 100 links
- nLinks <- sum(card(x$lw$neighbours))
- if(nLinks < 500) {
- neig <- nb2neig(x$lw$neighbours)
- } else {
- neig <- NULL
- }
-
- sub <- paste("Score",axis)
- csub <- 2
-
- # 1
- if(n<30) clab <- 1 else clab <- 0
- s.label(xy, clab=clab, include.ori=FALSE, addaxes=FALSE, neig=neig,
- cneig=1, sub="Connection network", csub=2)
-
- # 2
- s.image(xy,z, include.ori=FALSE, grid=TRUE, kgrid=10, cgrid=1,
- sub=sub, csub=csub, possub="bottomleft")
- box()
-
- # 3
- if(n<30) {neig <- nb2neig(x$lw$neighbours)} else {neig <- NULL}
- s.value(xy,z, include.ori=FALSE, addaxes=FALSE, clegend=0, csize=.6,
- neig=neig, sub=sub, csub=csub, possub="bottomleft")
-
- # 4
- s.value(xy,z, include.ori=FALSE, addaxes=FALSE, clegend=0, csize=.6,
- method="greylevel", neig=neig, sub=sub, csub=csub, possub="bottomleft")
-
- # 5
- omar <- par("mar")
- par(mar = c(0.8, 2.8, 0.8, 0.8))
- m <- length(x$eig)
- col.w <- rep("white", m) # elles sont toutes blanches
- col.w[1:nfposi] <- "grey"
- if (nfnega>0) {col.w[m:(m-nfnega+1)] <- "grey"}
- j <- axis
- if (j>nfposi) {j <- j-nfposi +m -nfnega}
- col.w[j] <- "black"
- barplot(x$eig, col = col.w)
- scatterutil.sub(cha ="Eigenvalues", csub = 2.5, possub = "topright")
- par(mar=rep(.1,4))
- box()
- par(mar=omar)
-
- # 6
- par(mar=c(4,4,2,1))
- screeplot(x,main="Eigenvalues decomposition")
- par(mar=rep(.1,4))
- box()
- return(invisible(match.call()))
-}
-
+} # end randtest.dapc
More information about the adegenet-commits
mailing list