[adegenet-commits] r1104 - in pkg: . R man

Fri Apr 5 14:48:02 CEST 2013

Author: jombart
Date: 2013-04-05 14:48:01 +0200 (Fri, 05 Apr 2013)
New Revision: 1104

Modified:
   pkg/DESCRIPTION
   pkg/R/sequences.R
   pkg/man/sequences.Rd
Log:
Brand new version of DNAbin2genind, orders of magnitude faster, and much more memory-efficient

Modified: pkg/DESCRIPTION
===================================================================

--- pkg/DESCRIPTION	2013-04-05 10:25:24 UTC (rev 1103)
+++ pkg/DESCRIPTION	2013-04-05 12:48:01 UTC (rev 1104)
@@ -7,6 +7,6 @@
 Suggests: genetics, spdep, tripack, pegas, seqinr, adehabitat, multicore, akima, maps, splancs, hierfstat
 Depends: R (>= 2.10), methods, MASS, ade4, igraph, ape
 Description: Classes and functions for genetic data analysis within the multivariate framework.
-Collate: classes.R basicMethods.R handling.R auxil.R setAs.R SNPbin.R glHandle.R glFunctions.R glSim.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R glPlot.R gengraph.R simOutbreak.R mutations.R zzz.R
+Collate: classes.R basicMethods.R handling.R auxil.R setAs.R SNPbin.R glHandle.R glFunctions.R glSim.R find.clust.R hybridize.R scale.R fstat.R import.R seqTrack.R chooseCN.R genind2genpop.R loadingplot.R sequences.R gstat.randtest.R makefreq.R colorplot.R monmonier.R spca.R coords.monmonier.R haploGen.R old2new.R spca.rtests.R dapc.R dapcXval.R haploPop.R PCtest.R dist.genpop.R Hs.R propShared.R export.R HWE.R propTyped.R inbreeding.R glPlot.R gengraph.R simOutbreak.R mutations.R zzz.R
 License: GPL (>=2)
 LazyLoad: yes

Modified: pkg/R/sequences.R
===================================================================
--- pkg/R/sequences.R	2013-04-05 10:25:24 UTC (rev 1103)
+++ pkg/R/sequences.R	2013-04-05 12:48:01 UTC (rev 1104)
@@ -10,61 +10,61 @@
 ################
 # DNAbin2genind
 ################
-DNAbin2genind <- function(x, pop=NULL, exp.char=c("a","t","g","c"), na.char=NULL, polyThres=1/100){
+DNAbin2genind <- function(x, pop=NULL, exp.char=c("a","t","g","c"), polyThres=1/100){
 
-    ## misc checks
+    ## MISC CHECKS ##
     if(!inherits(x,"DNAbin")) stop("x is not a DNAbin object")
     if(!require(ape)) stop("The package ape is required.")
-
-    ## DNA bin to matrix of characters
-    x <- as.character(x) # should output a matrix
-
-    if(is.list(x)) { # if this is a list
-        temp <- unique(sapply(x,length)) # check lengths of sequences
-        if(length(temp)>1) stop("Sequences have different length - please use alignements only.")
-        else{ # if sequences have same length, build the matrix
-            temp <- names(x)
-            x <- t(as.data.frame(x))
-            rownames(x) <- temp
-        }
+    if(is.list(x)) {
+        x <- as.matrix(x)
     }
 
     if(is.null(colnames(x))) {
         colnames(x) <- 1:ncol(x)
     }
 
-    ## replace NAs
-    if(is.null(na.char)){
-        if(is.null(exp.char)) stop("both exp.char and na.char are NULL")
-        temp <- paste(exp.char, collapse="", sep="")
-        if(any(exp.char=="-")) {
-            temp <- paste("-",temp, sep="") # string '-' must begin the regexp
-        }
-        temp <- paste("[^", temp, "]", sep="") # anything but the expected is NA
-        x <- gsub(temp,NA,x)
-    } else {
-        temp <- paste(na.char, collapse="", sep="")
-        if(any(na.char=="-")) {
-            temp <- paste("-",temp, sep="") # string '-' must start the regexp
-        }
-        temp <- paste("[", temp, "]", sep="")
-        x <- gsub(temp,NA,x)
-    }
 
-    ## keep only columns with polymorphism (i.e., SNPs)
-    isPoly <- function(vec){
+    ## FUNCTION TO PROCESS ONE LOCUS ##
+    ## INPUTS:
+    ## locus is a column of a DNAbin matrix
+    ## posi is the index of this column
+    ## OUTPUTS:
+    ## returns NULL if no polymorphism
+    ## returns a disjonctive table with named columns otherwise
+    ## column names are given as [position.allele]
+    processLocus <- function(locus, posi){
+        vec <- as.character(locus)
+        vec[!vec %in% exp.char] <- NA
         N <- sum(!is.na(vec)) # N: number of sequences
-        temp <- table(vec)/N
-        if(sum(temp > polyThres) >= 2) return(TRUE)
-        return(FALSE)
+        if(N==0) return(NULL) # escape if untyped locus
+        alleles <- names(which(table(vec)/N >= polyThres ))
+        if(length(alleles)<2) return(NULL) # escape if no polymorphism
+        vec[!vec %in% alleles] <- NA
+        out <- sapply(alleles, function(e) 1*(vec==e))
+        colnames(out) <- paste(posi, alleles, sep=".")
+        return(out)
     }
 
-    toKeep <- apply(x, 2, isPoly)
-    if(sum(toKeep)==0) stop("No polymorphic site detected")
-    x <- x[,toKeep]
 
-    ## build output
-    res <- df2genind(x, pop=pop, ploidy=1, ncode=1, type="codom")
+    ## PROCESS ALL LOCI ##
+    ## get disjonctive matrix ##
+    ## system.time(res at tab <- Reduce(cbind, lapply(1:ncol(x), function(i) processLocus(x[,i], i)))) # works, but Reduce is real slow
+    temp <- lapply(1:ncol(x), function(i) processLocus(x[,i], i)) # process all loci, return a list
+    col.names <- unlist(sapply(temp, colnames))
+    temp <-  as.matrix(data.frame(temp[!sapply(temp, is.null)])) # remove NULL slots, list -> matrix
+    if(is.null(temp) || ncol(temp)==0){
+        cat("\nNo polymorphism detected - returning NULL.\n")
+        return(NULL)
+    }
+
+    ## sort out col/row names ##
+    colnames(temp) <- col.names # restore correct names
+    rownames(temp) <- rownames(x)
+
+    ## create genind output ##
+    res <- genind(temp, ploidy=1, type="codom")
+    rm(temp) # remove temp
+    gc() # collect garbage
     res$call <- match.call()
 
     return(res)

Modified: pkg/man/sequences.Rd
===================================================================
--- pkg/man/sequences.Rd	2013-04-05 10:25:24 UTC (rev 1103)
+++ pkg/man/sequences.Rd	2013-04-05 12:48:01 UTC (rev 1104)
@@ -13,8 +13,7 @@
   - alignment (seqinr package): function alignment2genind\cr
 }
 \usage{
-DNAbin2genind(x, pop=NULL, exp.char=c("a","t","g","c"), na.char=NULL,
-                         polyThres=1/100)
+DNAbin2genind(x, pop=NULL, exp.char=c("a","t","g","c"), polyThres=1/100)
 
 alignment2genind(x, pop=NULL, exp.char=c("a","t","g","c"), na.char="-", polyThres=1/100)