[adegenet-commits] r760 - pkg/R

Mon Jan 10 15:20:03 CET 2011

Author: jombart
Date: 2011-01-10 15:20:03 +0100 (Mon, 10 Jan 2011)
New Revision: 760

Modified:
   pkg/R/SNPbin.R
   pkg/R/glFunctions.R
Log:
2 modes for sums, variances, means.


Modified: pkg/R/SNPbin.R
===================================================================

--- pkg/R/SNPbin.R	2011-01-10 14:19:32 UTC (rev 759)
+++ pkg/R/SNPbin.R	2011-01-10 14:20:03 UTC (rev 760)
@@ -323,7 +323,7 @@
 
 
 ###############
-## show SNPbin
+## show genlight
 ###############
 setMethod ("show", "genlight", function(object){
     cat(" === S4 class genlight ===")
@@ -336,7 +336,6 @@
         cat("\n Ploidy statistics (min/median/max):", temp[1], "/", temp[3], "/", temp[6])
     }
     temp <- sapply(object at gen, function(e) length(e at NA.posi))
-    ## temp <- round(length(object at NA.posi)/nLoc(object) *100,2)
     cat("\n ", sum(temp), " (", round(sum(temp)/(nInd(object)*nLoc(object)),2)," %) missing data\n", sep="")
 }) # end show method
 

Modified: pkg/R/glFunctions.R
===================================================================
--- pkg/R/glFunctions.R	2011-01-10 14:19:32 UTC (rev 759)
+++ pkg/R/glFunctions.R	2011-01-10 14:20:03 UTC (rev 760)
@@ -5,17 +5,28 @@
 ## compute col sums
 ## removing NAs
 ##
-glSum <- function(x){
+glSum <- function(x, alleleAsUnit=TRUE){
     if(!inherits(x, "genlight")) stop("x is not a genlight object")
 
     ## DEFAULT, VECTOR-WISE PROCEDURE ##
+    ## use ploidy (sum absolute frequencies)
+    if(alleleAsUnit){
     res <- integer(nLoc(x))
-    for(e in x at gen){
-        temp <- as.integer(e)
-        temp[is.na(temp)] <- 0L
-        res <- res + temp
+        for(e in x at gen){
+            temp <- as.integer(e)
+            temp[is.na(temp)] <- 0L
+            res <- res + temp
+        }
+    } else {
+        ## sum relative frequencies
+        res <- numeric(nLoc(x))
+        myPloidy <- ploidy(x)
+        for(i in 1:nInd(x)){
+            temp <- as.integer(x at gen[[i]]) / myPloidy[i]
+            temp[is.na(temp)] <- 0
+            res <- res + temp
+        }
     }
-
     names(res) <- locNames(x)
     return(res)
 
@@ -30,16 +41,28 @@
 ##########
 ## counts NB of NAs per column
 ##
-glNA <- function(x){
+## if alleleAsUnit, then effective is the number of alleles sampled (sum(ploidy(x)))
+## otherwise, effective is simply the number of individuals (
+glNA <- function(x, alleleAsUnit=TRUE){
     if(!inherits(x, "genlight")) stop("x is not a genlight object")
 
     ## DEFAULT, VECTOR-WISE PROCEDURE ##
     res <- integer(nLoc(x))
     temp <- NA.posi(x)
-    for(e in temp){
-        if(length(e)>0){
-            res[e] <- res[e] + 1
+
+    ## NAs in allele sampling
+    if(alleleAsUnit){
+        for(i in 1:length(temp)){
+            if(length(temp[[i]])>0){
+                res[temp[[i]]] <- res[temp[[i]]] + ploidy(x)[i]
+            }
         }
+    } else { ## NAs amongst individuals
+        for(e in temp){
+            if(length(e)>0){
+                res[e] <- res[e] + 1
+            }
+        }
     }
 
     names(res) <- locNames(x)
@@ -57,12 +80,18 @@
 ## computes SNPs means
 ## takes NAs into account
 ##
-glMean <- function(x){
+glMean <- function(x, alleleAsUnit=TRUE){
     if(!inherits(x, "genlight")) stop("x is not a genlight object")
 
     ## DEFAULT, VECTOR-WISE PROCEDURE ##
-    N <- nInd(x) - glNA(x)
-    res <- glSum(x)/N
+    if(alleleAsUnit){ # use alleles
+        N <- sum(ploidy(x)) - glNA(x, alleleAsUnit=TRUE)
+        res <- glSum(x, alleleAsUnit=TRUE)/N
+    } else { # use relative frequencies of individuals
+        N <- nInd(x) - glNA(x, alleleAsUnit=FALSE)
+        res <- glSum(x, alleleAsUnit=FALSE)/N
+    }
+
     names(res) <- locNames(x)
     return(res)
 
@@ -78,21 +107,34 @@
 ## computes SNPs variances
 ## takes NAs into account
 ##
-glVar <- function(x){
+glVar <- function(x, alleleAsUnit=TRUE){
     if(!inherits(x, "genlight")) stop("x is not a genlight object")
 
     ## DEFAULT, VECTOR-WISE PROCEDURE ##
-    N <- nInd(x) - glNA(x)
-    xbar <- glMean(x)
+    res <- numeric(nLoc(x))
+    myPloidy <- ploidy(x)
 
-    res <- numeric(nLoc(x))
-    for(e in x at gen){
-        temp <- (as.integer(e) - xbar)^2
-        temp[is.na(temp)] <- 0L
-        res <- res + temp
+    if(alleleAsUnit){ # use alleles
+        N <- sum(ploidy(x)) - glNA(x, alleleAsUnit=TRUE)
+        xbar <- glMean(x, alleleAsUnit=TRUE)
+        for(i in 1:nInd(x)){
+            temp <- (as.integer(x at gen[[i]])/myPloidy[i] - xbar)^2
+            temp[is.na(temp)] <- 0
+            res <- res + temp*myPloidy[i]
+        }
+        res <- res/N
+    } else { # use relative frequencies of individuals
+        N <- nInd(x) - glNA(x, alleleAsUnit=FALSE)
+        xbar <- glMean(x, alleleAsUnit=FALSE)
+
+        for(i in 1:nInd(x)){
+            temp <- (as.integer(x at gen[[i]])/myPloidy[i] - xbar)^2
+            temp[is.na(temp)] <- 0L
+            res <- res + temp
+        }
+        res <- res/N
     }
 
-    res <- res/N
     names(res) <- locNames(x)
     return(res)
 
@@ -116,27 +158,74 @@
 
     ## COMPUTE MEANS AND VARIANCES ##
     if(center) {
-        vecMeans <- glMean(x)
+       vecMeans <- glMean(x, alleleAsUnit=FALSE)
         if(any(is.na(vecMeans))) stop("NAs detected in the vector of means")
-    } else {
-        vecMeans <- 0
     }
+
     if(scale){
-        if(any(is.na(vecVar))) stop("NAs detected in the vector of variances")
-        vecSd <- sqrt(glVar(x))
-    } else {
-        vecSd <- 1
+        vecSd <- sqrt(glVar(x, alleleAsUnit=FALSE))
+        if(any(is.na(vecSd))) stop("NAs detected in the vector of variances")
     }
 
+    ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
+    ## to be fast, a particular function is defined for each case of centring/scaling
 
-    ## COMPUTE DOT PRODUCTS BETWEEN GENOTYPES ##
-    dotProd <- function(a,b){ # a and b are two SNPbin objects
-        res <- sum( ((as.integer(a)-vecMeans)/vecSd) * ((as.integer(b)-vecMeans)/vecSd), na.rm=TRUE )
-        return(res)
+    myPloidy <- ploidy(x)
+
+    ## NO CENTRING / NO SCALING
+      if(!centre & !scale){
+        dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+            a <- as.integer(a) / ploid.a
+            a[is.na(a)] <- 0
+            b <- as.integer(b) / ploidy.b
+            b[is.na(b)] <- 0
+            res <- sum( a*b, na.rm=TRUE )
+            return(res)
+        }
     }
 
+    ## CENTRING / NO SCALING
+    if(centre & !scale){
+        dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+            a <- as.integer(a) / ploid.a
+            a[is.na(a)] <- vecMeans[is.na(a)]
+            b <- as.integer(b) / ploid.b
+            b[is.na(b)] <- vecMeans[is.na(b)]
+            res <- sum( (a-vecMeans) * (b-vecMeans), na.rm=TRUE )
+            return(res)
+        }
+    }
+
+
+    ## NO CENTRING / SCALING (odd option...)
+    if(!centre & scale){
+        dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+            a <- as.integer(a) / ploid.a
+            a[is.na(a)] <- 0
+            b <- as.integer(b) / ploid.b
+            b[is.na(b)] <- 0
+            res <- sum( a/vecSd * b/vecSd, na.rm=TRUE )
+            return(res)
+        }
+    }
+
+
+    ## CENTRING/ SCALING
+    if(centre & scale){
+        dotProd <- function(a,b, ploid.a, ploid.b){ # a and b are two SNPbin objects
+            a <- as.integer(a) / ploid.a
+            a[is.na(a)] <- vecMeans[is.na(a)]
+            b <- as.integer(b) / ploid.b
+            a[is.na(a)] <- vecMeans[is.na(a)]
+            res <- sum( (a-vecMeans)/vecSd * (b-vecMeans)/vecSd, na.rm=TRUE )
+            return(res)
+        }
+    }
+
+
+    ## COMPUTE ALL POSSIBLE DOT PRODUCTS
     allComb <- combn(1:nInd(x), 2)
-    allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]]), x at gen[[allComb[2,i]]]))
+    allProd <- unlist(lapply(1:ncol(allComb), function(i) dotProd(x at gen[[allComb[1,i]]], x at gen[[allComb[2,i]]], myPloidy[allComb[1,i]], myPloidy[allComb[2,i]]) ))
     allProd <- allProd / nInd(x) # assume uniform weights
 
     ## shape result as a matrix
@@ -154,7 +243,7 @@
     eigRes$values <- eigRes$value[1:rank]
     eigRes$vectors <- eigRes$vectors[1:rank]
 
-    ## scan  nb of axes retained
+    ## scan nb of axes retained
     if(scannf){
         barplot(eigRes$values, main="Eigenvalues", col=heat.colors(rank))
         cat("Select the number of axes: ")
@@ -168,12 +257,26 @@
 
     ## get loadings
     if(loadings){
+        res$loadings <- matrix(0
         ## use: c1 = X^TDV
-        ##### TO FINISH !!!
-        res$loadings <- matLoad
-    }
-    ## compute loadings
+    ##     ##### TO FINISH !!!
+    ##     for(i in 1:nInd(x)){
+    ##         temp <- as.integer(x at gen[[i]]) / myPloidy[i]
+    ##         if(centre) {
+    ##             temp[is.na(temp)] <- vecMeans[is.na(temp)]
+    ##             temp <- temp - vecMeans
+    ##         } else {
+    ##             temp[is.na(temp)] <- 0
+    ##         }
+    ##         if(scale){
+    ##             temp <- temp/vecSd
+    ##         }
 
+    ##         res$loadings <- t(temp) %*% res$scores / N
+    ##     }
+    ## }
+    ## ## compute loadings
+
     names(res) <- locNames(x)
     return(res)
 
@@ -183,4 +286,29 @@
 
 
 ## TESTING ##
-## all.equal(glVar(x), apply(as.matrix(x), 2, function(e) mean((e-mean(e, na.rm=TRUE))^2, na.rm=TRUE)))
+## x <- new("genlight", list(c(0,0,1,1,0), c(1,1,1,0,0,1), c(2,1,1,1,1,NA)))
+## as.matrix(x)
+## glNA(x)
+## glSum(x)
+## glMean(x)
+##
+## same ploidy everywhere
+## x <- new("genlight", list(c(0,0,1,1,0), c(1,1,1,0,0,1), c(0,0,0,1,1,1)))
+## f1 <- function(e) {return(mean((e-mean(e, na.rm=TRUE))^2, na.rm=TRUE))}
+## all.equal(glVar(x), apply(as.matrix(x), 2, f1 )) # MUST BE TRUE
+## all.equal(glVar(x,FALSE), apply(as.matrix(x), 2, f1 )) # MUST BE TRUE
+
+## ## differences in ploidy
+## x <- new("genlight", list(c(0,0,1,1,0), c(1,1,1,0,0,1), c(2,1,1,1,1,NA)))
+## temp <- sweep(as.matrix(x), 1, c(1,1,2), "/")
+## f2 <- function(e,w) {
+##     mu <- weighted.mean(e, w, na.rm=TRUE)
+##     res <- weighted.mean((e-mu)^2, w, na.rm=TRUE)
+##     return(res)
+## }
+
+## all.equal(glMean(x), apply(temp,2,weighted.mean, w=c(1,1,2), na.rm=TRUE)) # MUST BE TRUE
+## all.equal(glVar(x), apply(temp, 2, f2,w=c(1,1,2) )) # MUST BE TRUE
+
+## all.equal(glMean(x,FALSE), apply(temp,2,mean,na.rm=TRUE)) # MUST BE TRUE
+## all.equal(glVar(x,FALSE), apply(temp,2,f1)) # MUST BE TRUE