[adegenet-commits] r761 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Jan 14 16:09:12 CET 2011
Author: jombart
Date: 2011-01-14 16:09:12 +0100 (Fri, 14 Jan 2011)
New Revision: 761
Modified:
pkg/R/glFunctions.R
Log:
Yeeeeeepeeee !
glPca is working. Results are exactly the same as dudi.pca.
Works with missing data as well.
Some doc remains to be added.
Modified: pkg/R/glFunctions.R
===================================================================
--- pkg/R/glFunctions.R 2011-01-10 14:20:03 UTC (rev 760)
+++ pkg/R/glFunctions.R 2011-01-14 15:09:12 UTC (rev 761)
@@ -153,7 +153,7 @@
##
## PCA for genlight objects
##
-glPca <- function(x, center=TRUE, scale=FALSE, scannf=TRUE, nf=2, loadings=TRUE){
+glPca <- function(x, center=TRUE, scale=FALSE, nf=NULL, loadings=TRUE){
if(!inherits(x, "genlight")) stop("x is not a genlight object")
## COMPUTE MEANS AND VARIANCES ##
@@ -163,67 +163,64 @@
}
if(scale){
- vecSd <- sqrt(glVar(x, alleleAsUnit=FALSE))
- if(any(is.na(vecSd))) stop("NAs detected in the vector of variances")
+ vecVar <- glVar(x, alleleAsUnit=FALSE)
+ if(any(is.na(vecVar))) stop("NAs detected in the vector of variances")
}
+
## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
## to be fast, a particular function is defined for each case of centring/scaling
myPloidy <- ploidy(x)
## NO CENTRING / NO SCALING
- if(!centre & !scale){
+ if(!center & !scale){
dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
a <- as.integer(a) / ploid.a
a[is.na(a)] <- 0
b <- as.integer(b) / ploidy.b
b[is.na(b)] <- 0
- res <- sum( a*b, na.rm=TRUE )
- return(res)
+ return(sum( a*b, na.rm=TRUE))
}
}
## CENTRING / NO SCALING
- if(centre & !scale){
+ if(center & !scale){
dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
a <- as.integer(a) / ploid.a
a[is.na(a)] <- vecMeans[is.na(a)]
b <- as.integer(b) / ploid.b
b[is.na(b)] <- vecMeans[is.na(b)]
- res <- sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE )
- return(res)
+ return(sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE) )
}
}
## NO CENTRING / SCALING (odd option...)
- if(!centre & scale){
+ if(!center & scale){
dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
a <- as.integer(a) / ploid.a
a[is.na(a)] <- 0
b <- as.integer(b) / ploid.b
b[is.na(b)] <- 0
- res <- sum( a/vecSd * b/vecSd, na.rm=TRUE )
- return(res)
+ return(sum( (a*b)/vecVar, na.rm=TRUE))
}
}
- ## CENTRING/ SCALING
- if(centre & scale){
+ ## CENTRING / SCALING
+ if(center & scale){
dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
a <- as.integer(a) / ploid.a
a[is.na(a)] <- vecMeans[is.na(a)]
b <- as.integer(b) / ploid.b
a[is.na(a)] <- vecMeans[is.na(a)]
- res <- sum( (a-vecMeans)/vecSd * (b-vecMeans)/vecSd, na.rm=TRUE )
- return(res)
+ return( sum( ((a-vecMeans)*(b-vecMeans))/vecVar, na.rm=TRUE ) )
}
}
- ## COMPUTE ALL POSSIBLE DOT PRODUCTS
+ ## COMPUTE ALL POSSIBLE DOT PRODUCTS (XX^T / n) ##
allComb <- combn(1:nInd(x), 2)
allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
allProd <- allProd / nInd(x) # assume uniform weights
@@ -235,16 +232,20 @@
class(allProd) <- "dist"
allProd <- as.matrix(allProd)
+ ## compute the diagonal
+ temp <- unlist(lapply(1:nInd(x), function(i) dotProd(x at gen[[i]], x at gen[[i]], myPloidy[i], myPloidy[i]) ))/nInd(x)
+ diag(allProd) <- temp
- ## PERFORM EIGENANALYSIS ##
+
+ ## PERFORM THE ANALYSIS ##
## eigenanalysis
eigRes <- eigen(allProd, symmetric=TRUE, only.values=FALSE)
- rank <- sum(eigRes > 1e-10)
- eigRes$values <- eigRes$value[1:rank]
- eigRes$vectors <- eigRes$vectors[1:rank]
+ rank <- sum(eigRes$values > 1e-12)
+ eigRes$values <- eigRes$values[1:rank]
+ eigRes$vectors <- eigRes$vectors[, 1:rank, drop=FALSE]
## scan nb of axes retained
- if(scannf){
+ if(is.null(nf)){
barplot(eigRes$values, main="Eigenvalues", col=heat.colors(rank))
cat("Select the number of axes: ")
nf <- as.integer(readLines(n = 1))
@@ -252,32 +253,55 @@
## rescale PCs
res <- list()
+ res$eig <- eigRes$values
+ ##res$matprod <- allProd # for debugging
+
## use: li = XQU = V\Lambda^(1/2)
- res$scores <- sweep(eigRes$vectors[, 1:nf],2, sqrt(eigRes$values[1:nf]), FUN="*")
+ eigRes$vectors <- eigRes$vectors * sqrt(nInd(x)) # D-normalize vectors
+ res$scores <- sweep(eigRes$vectors[, 1:nf, drop=FALSE],2, sqrt(eigRes$values[1:nf]), FUN="*")
- ## get loadings
+
+ ## GET LOADINGS ##
+ ## need to decompose X^TDV into a sum of n matrices of dim p*r
+ ## but only two such matrices are represented at a time
if(loadings){
- res$loadings <- matrix(0
+ res$loadings <- matrix(0, nrow=nLoc(x), ncol=nf) # create empty matrix
## use: c1 = X^TDV
- ## ##### TO FINISH !!!
- ## for(i in 1:nInd(x)){
- ## temp <- as.integer(x at gen[[i]]) / myPloidy[i]
- ## if(centre) {
- ## temp[is.na(temp)] <- vecMeans[is.na(temp)]
- ## temp <- temp - vecMeans
- ## } else {
- ## temp[is.na(temp)] <- 0
- ## }
- ## if(scale){
- ## temp <- temp/vecSd
- ## }
+ ## and X^TV = A_1 + ... + A_n
+ ## with A_k = X_[k-]^T v[k-]
+ for(k in 1:nInd(x)){
+ temp <- as.integer(x at gen[[k]]) / myPloidy[k]
+ if(center) {
+ temp[is.na(temp)] <- vecMeans[is.na(temp)]
+ temp <- temp - vecMeans
+ } else {
+ temp[is.na(temp)] <- 0
+ }
+ if(scale){
+ temp <- temp/vecSd
+ }
- ## res$loadings <- t(temp) %*% res$scores / N
- ## }
- ## }
- ## ## compute loadings
+ res$loadings <- res$loadings + matrix(temp) %*% eigRes$vectors[k, 1:nf, drop=FALSE]
+ }
- names(res) <- locNames(x)
+ res$loadings <- res$loadings / nInd(x) # don't forget the /n of X_tDV
+ res$loadings <- sweep(res$loadings, 2, sqrt(eigRes$values[1:nf]), FUN="/")
+ }
+
+
+ ## FORMAT OUTPUT ##
+ colnames(res$scores) <- paste("PC", 1:nf, sep="")
+ rownames(res$scores) <- indNames(x)
+
+ if(!is.null(res$loadings)){
+ colnames(res$loadings) <- paste("Axis", 1:nf, sep="")
+ rownames(res$loadings) <- locNames(x)
+ }
+
+ res$call <- match.call()
+
+ class(res) <- "glPca"
+
return(res)
} # glPca
@@ -312,3 +336,36 @@
## all.equal(glMean(x,FALSE), apply(temp,2,mean,na.rm=TRUE)) # MUST BE TRUE
## all.equal(glVar(x,FALSE), apply(temp,2,f1)) # MUST BE TRUE
+
+
+
+
+#### TESTING PCA ####
+## M <- matrix(sample(c(0,1), 1e5, replace=TRUE), nrow=100)
+## rownames(M) <- paste("ind", 1:100)
+
+## ## perform glPca
+## x <- new("genlight",M)
+## toto <- glPca(x, nf=4)
+
+
+## ## perform ordinary PCA
+## titi <- dudi.pca(M, center=TRUE, scale=FALSE, scannf=FALSE, nf=4)
+
+
+## ## check results
+## all(round(abs(toto$scores), 10) == round(abs(titi$li), 10)) # MUST BE TRUE
+## all.equal(toto$eig, titi$eig) # MUST BE TRUE
+## all(round(abs(toto$loadings), 10)==round(abs(titi$c1), 10)) # MUST BE TRUE
+
+
+## test with NAs
+## M <- matrix(sample(c(0,1, NA), 1e5, replace=TRUE, prob=c(.495,.495,.01)), nrow=100)
+## rownames(M) <- paste("ind", 1:100)
+
+## ## perform glPca
+## x <- new("genlight",M)
+## toto <- glPca(x, nf=4)
+
+## round(cor(toto$scores),10) # must be diag(1,4)
+## round(t(toto$loadings) %*% toto$loadings,10) # must be diag(1,4)
More information about the adegenet-commits
mailing list