[adegenet-commits] r517 - pkg/R

Thu Dec 17 17:09:20 CET 2009

Author: jombart
Date: 2009-12-17 17:09:20 +0100 (Thu, 17 Dec 2009)
New Revision: 517

Added:
   pkg/R/find.clust.R
Modified:
   pkg/R/dapc.R
Log:
Split dapc and find.cluster.
Find.cluster can now check hierarchical structuring. 


Modified: pkg/R/dapc.R
===================================================================

--- pkg/R/dapc.R	2009-12-10 20:51:20 UTC (rev 516)
+++ pkg/R/dapc.R	2009-12-17 16:09:20 UTC (rev 517)
@@ -1,180 +1,4 @@
-#############
-## find.clusters
-#############
-find.clusters <- function (x, ...) UseMethod("find.clusters")
-
-######################
-## find.clusters.data.frame
-######################
-find.clusters.data.frame <- function(x, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
-                                     max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10, center=TRUE, scale=TRUE){
-
-    ## CHECKS ##
-    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
-    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
-    if(!require(stats)) stop("package stats is required")
-
-    stat <- match.arg(stat)
-    criterion <- match.arg(criterion)
-
-    ## SOME GENERAL VARIABLES ##
-    N <- nrow(x)
-    min.n.clust <- 2
-    max.n.clust <- max(max.n.clust, 2)
-
-    ## PERFORM PCA ##
-    maxRank <- min(dim(x))
-
-    pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
-
-    ## select the number of retained PC for PCA
-    if(is.null(n.pca)){
-        cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
-        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
-        cat("Choose the number PCs to retain (>=1): ")
-        n.pca <- as.integer(readLines(n = 1))
-    }
-
-     ## keep relevant PCs - stored in XU
-    X.rank <- length(pcaX$eig)
-    n.pca <- min(X.rank, n.pca)
-    if(n.pca >= N) warning("number of retained PCs of PCA is greater than N")
-    if(n.pca > N/3) warning("number of retained PCs of PCA may be too large (> N /3)")
-
-    XU <- pcaX$li[, 1:n.pca, drop=FALSE] # principal components
-
-    ## PERFORM K-MEANS
-    if(is.null(n.clust)){
-        nbClust <- min.n.clust:max.n.clust
-        WSS <- numeric(0)
-
-        for(i in 1:length(nbClust)){
-            temp <- kmeans(XU, centers=nbClust[i], iter.max=min(n.iter, 100), nstart=min(n.start, 1e3))
-            WSS[i] <- sum(temp$withinss)
-        }
-
-
-        ## DETERMINE THE NUMBER OF GROUPS
-        ##TSS <- sum(pcaX$eig) * N
-        ##betweenVar <- (1 - ((stat/(N-nbClust-1))/(TSS/(N-1)) )) *100
-        ##WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
-        ##reducWSS <- -diff(c(WSS.ori, stat))
-        ##reducWSS <- reducWSS/max(reducWSS)
-
-        if(stat=="AIC"){
-            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
-            k <- nbClust
-            myStat <- N*log(c(WSS.ori,WSS)/N) + 2*c(1,nbClust)
-            myLab <- "AIC"
-            myTitle <- "Value of AIC \nversus number of clusters"
-
-        }
-        if(stat=="BIC"){
-            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
-            k <- nbClust
-            myStat <- N*log(c(WSS.ori,WSS)/N) + log(N) *c(1,nbClust)
-            myLab <- "BIC"
-            myTitle <- "Value of BIC \nversus number of clusters"
-        }
-        if(stat=="WSS"){
-            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
-            myStat <- c(WSS.ori, WSS)
-            ##            reducWSS <- -diff(c(WSS.ori, stat))
-            ##            myStat <- reducWSS/max(reducWSS)
-            myLab <- "Within sum of squares"
-            myTitle <- "Value of within SS\nversus number of clusters"
-        }
-
-        if(choose.n.clust){
-            plot(c(1,nbClust), myStat, xlab="Number of clusters", ylab=myLab, main=myTitle, type="b", col="blue")
-            abline(h=0, lty=2, col="red")
-            cat("Choose the number of clusters (>=2: ")
-            n.clust <- as.integer(readLines(n = 1))
-        } else {
-            if(criterion=="min") {
-                n.clust <- which.min(myStat)
-            }
-            if(criterion=="diff") {
-                temp <- diff(myStat)
-                n.clust <- which.max( which( (temp-min(temp))<max(temp)/1e4))
-            }
-            if(criterion=="conserv") {
-                temp <- min(myStat) + 0.15*(max(myStat) - min(myStat))
-                n.clust <- min( which(myStat < temp))
-            }
-
-        }
-    }
-
-    ## get final groups
-    best <-  kmeans(XU, centers=n.clust, iter.max=n.iter, nstart=n.start)
-
-
-    ## MAKE RESULT ##
-    if(is.null(n.clust)){
-        names(myStat) <- paste("K",c(1,nbClust), sep="=")
-    } else {
-        myStat <- sum(best$withinss)
-    }
-
-    res <- list(stat=myStat, grp=factor(best$cluster), size=best$size)
-
-    return(res)
-} # end find.clusters.data.frame
-
-
-
-
-
-
-###################
-## find.clusters.genind
-###################
-find.clusters.genind <- function(x, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
-                          max.n.clust=round(nrow(x at tab)/10), n.iter=1e3, n.start=10,
-                          scale=TRUE, scale.method=c("sigma", "binom"), truenames=TRUE){
-
-    ## CHECKS ##
-    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
-    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
-    if(!require(stats)) stop("package stats is required")
-    if(!is.genind(x)) stop("x must be a genind object.")
-    stat <- match.arg(stat)
-
-
-    ## SOME GENERAL VARIABLES ##
-    N <- nrow(x at tab)
-    min.n.clust <- 2
-
-    ## PERFORM PCA ##
-    maxRank <- min(dim(x at tab))
-
-    X <- scaleGen(x, center = TRUE, scale = scale, method = scale.method,
-                  missing = "mean", truenames = truenames)
-
-    ## CALL DATA.FRAME METHOD
-    res <- find.clusters(X, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
-                         choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE)
-    return(res)
-} # end find.clusters.genind
-
-
-
-
-
-###################
-## find.clusters.matrix
-###################
-find.clusters.matrix <- function(x, ...){
-    return(find.clusters(as.data.frame(x), ...))
-}
-
-
-
-
-
-
-########
+#######
 ## dapc
 ########
 dapc <- function (x, ...) UseMethod("dapc")

Added: pkg/R/find.clust.R
===================================================================
--- pkg/R/find.clust.R	                        (rev 0)
+++ pkg/R/find.clust.R	2009-12-17 16:09:20 UTC (rev 517)
@@ -0,0 +1,233 @@
+#############
+## find.clusters
+#############
+find.clusters <- function (x, ...) UseMethod("find.clusters")
+
+######################
+## find.clusters.data.frame
+######################
+find.clusters.data.frame <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
+                                     max.n.clust=round(nrow(x)/10), n.iter=1e3, n.start=10, center=TRUE, scale=TRUE){
+
+    ## CHECKS ##
+    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+    if(!require(stats)) stop("package stats is required")
+
+    stat <- match.arg(stat)
+    criterion <- match.arg(criterion)
+
+    ## KEEP TRACK OF SOME ORIGINAL PARAMETERS
+    ## n.pca.ori <- n.pca
+    ##n.clust.ori <- n.clust
+
+
+    ## ESCAPE IF SUB-CLUST ARE SEEKED ##
+    if(!is.null(clust)){
+        res <- .find.sub.clusters(x=x, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
+                         choose.n.clust=choose.n.clust, criterion=criterion, center=center, scale=scale)
+        return(res)
+    }
+    ## END SUB-CLUST
+
+
+    ## SOME GENERAL VARIABLES ##
+    N <- nrow(x)
+    min.n.clust <- 2
+    max.n.clust <- max(max.n.clust, 2)
+
+    ## PERFORM PCA ##
+    maxRank <- min(dim(x))
+
+    pcaX <- dudi.pca(x, center = center, scale = scale, scannf = FALSE, nf=maxRank)
+
+    ## select the number of retained PC for PCA
+    if(is.null(n.pca)){
+        cumVar <- 100 * cumsum(pcaX$eig)/sum(pcaX$eig)
+        plot(cumVar, xlab="Number of retained PCs", ylab="Cumulative variance (%)", main="Variance explained by PCA")
+        cat("Choose the number PCs to retain (>=1): ")
+        n.pca <- as.integer(readLines(n = 1))
+    }
+
+     ## keep relevant PCs - stored in XU
+    X.rank <- length(pcaX$eig)
+    n.pca <- min(X.rank, n.pca)
+    if(n.pca >= N) warning("number of retained PCs of PCA is greater than N")
+    if(n.pca > N/3) warning("number of retained PCs of PCA may be too large (> N /3)")
+
+    XU <- pcaX$li[, 1:n.pca, drop=FALSE] # principal components
+
+    ## PERFORM K-MEANS
+    if(is.null(n.clust)){
+        nbClust <- min.n.clust:max.n.clust
+        WSS <- numeric(0)
+
+        for(i in 1:length(nbClust)){
+            temp <- kmeans(XU, centers=nbClust[i], iter.max=min(n.iter, 100), nstart=min(n.start, 1e3))
+            WSS[i] <- sum(temp$withinss)
+        }
+
+
+        ## DETERMINE THE NUMBER OF GROUPS
+        ##TSS <- sum(pcaX$eig) * N
+        ##betweenVar <- (1 - ((stat/(N-nbClust-1))/(TSS/(N-1)) )) *100
+        ##WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
+        ##reducWSS <- -diff(c(WSS.ori, stat))
+        ##reducWSS <- reducWSS/max(reducWSS)
+
+        if(stat=="AIC"){
+            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
+            k <- nbClust
+            myStat <- N*log(c(WSS.ori,WSS)/N) + 2*c(1,nbClust)
+            myLab <- "AIC"
+            myTitle <- "Value of AIC \nversus number of clusters"
+
+        }
+        if(stat=="BIC"){
+            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
+            k <- nbClust
+            myStat <- N*log(c(WSS.ori,WSS)/N) + log(N) *c(1,nbClust)
+            myLab <- "BIC"
+            myTitle <- "Value of BIC \nversus number of clusters"
+        }
+        if(stat=="WSS"){
+            WSS.ori <- sum(apply(XU, 2, function(v) sum((v-mean(v))^2) ))
+            myStat <- c(WSS.ori, WSS)
+            ##            reducWSS <- -diff(c(WSS.ori, stat))
+            ##            myStat <- reducWSS/max(reducWSS)
+            myLab <- "Within sum of squares"
+            myTitle <- "Value of within SS\nversus number of clusters"
+        }
+
+        if(choose.n.clust){
+            plot(c(1,nbClust), myStat, xlab="Number of clusters", ylab=myLab, main=myTitle, type="b", col="blue")
+            abline(h=0, lty=2, col="red")
+            cat("Choose the number of clusters (>=2: ")
+            n.clust <- max(1, as.integer(readLines(n = 1)))
+        } else {
+            if(criterion=="min") {
+                n.clust <- which.min(myStat)
+            }
+            if(criterion=="diff") {
+                temp <- diff(myStat)
+                n.clust <- which.max( which( (temp-min(temp))<max(temp)/1e4))
+            }
+            if(criterion=="conserv") {
+                temp <- min(myStat) + 0.15*(max(myStat) - min(myStat))
+                n.clust <- min( which(myStat < temp))
+            }
+
+        }
+    }
+
+    ## get final groups
+    if(n.clust >1){
+        best <-  kmeans(XU, centers=n.clust, iter.max=n.iter, nstart=n.start)
+    } else {
+        best <- list(cluster=factor(rep(1,N)), size=N)
+    }
+
+
+    ## MAKE RESULT ##
+    if(!is.null(myStat)){
+        names(myStat) <- paste("K",c(1,nbClust), sep="=")
+    }
+
+    res <- list(Kstat=myStat, stat=myStat[n.clust], grp=factor(best$cluster), size=best$size)
+
+    return(res)
+} # end find.clusters.data.frame
+
+
+
+
+
+
+###################
+## find.clusters.genind
+###################
+find.clusters.genind <- function(x, clust=NULL, n.pca=NULL, n.clust=NULL, stat=c("BIC", "AIC", "WSS"), choose.n.clust=TRUE, criterion=c("min","diff", "conserv"),
+                          max.n.clust=round(nrow(x at tab)/10), n.iter=1e3, n.start=10,
+                          scale=FALSE, scale.method=c("sigma", "binom"), truenames=TRUE, ...){
+
+    ## CHECKS ##
+    if(!require(ade4, quiet=TRUE)) stop("ade4 library is required.")
+    if(!require(MASS, quiet=TRUE)) stop("MASS library is required.")
+    if(!require(stats)) stop("package stats is required")
+    if(!is.genind(x)) stop("x must be a genind object.")
+    stat <- match.arg(stat)
+
+
+    ## SOME GENERAL VARIABLES ##
+    N <- nrow(x at tab)
+    min.n.clust <- 2
+
+    ## PERFORM PCA ##
+    maxRank <- min(dim(x at tab))
+
+    X <- scaleGen(x, center = TRUE, scale = scale, method = scale.method,
+                  missing = "mean", truenames = truenames)
+
+    ## CALL DATA.FRAME METHOD
+    res <- find.clusters(X, clust=clust, n.pca=n.pca, n.clust=n.clust, stat=stat, max.n.clust=max.n.clust, n.iter=n.iter, n.start=n.start,
+                         choose.n.clust=choose.n.clust, criterion=criterion, center=FALSE, scale=FALSE)
+    return(res)
+} # end find.clusters.genind
+
+
+
+
+
+###################
+## find.clusters.matrix
+###################
+find.clusters.matrix <- function(x, ...){
+    return(find.clusters(as.data.frame(x), ...))
+}
+
+
+
+
+
+
+
+
+###################
+## .find.sub.clusters
+###################
+.find.sub.clusters <- function(x, ...){
+
+    ## GET ... ##
+    myArgs <- list(...)
+    if(!is.null(myArgs$quiet)){
+        quiet <- myArgs$quiet
+        myArgs$quiet <- NULL
+    } else {
+        quiet <- FALSE
+    }
+
+    clust <- myArgs$clust
+    myArgs$clust <- NULL
+
+    if(is.null(clust)) stop("clust is not provided")
+    clust <- as.factor(clust)
+
+    ## temp will store temporary resuts
+    newFac <- character(length(clust))
+
+    ## find sub clusters
+    for(i in levels(clust)){
+        if(!quiet) cat("\nLooking for sub-clusters in cluster",i,"\n")
+        myArgs$x <- x[clust==i, ]
+        temp <- do.call(find.clusters, myArgs)$grp
+        levels(temp) <- paste(i, levels(temp), sep=".")
+        newFac[clust==i] <- as.character(temp)
+    }
+
+    res <- list(stat=NA, grp=factor(newFac), size=as.integer(table(newFac)))
+
+    return(res)
+}
+
+
+