[adegenet-commits] r1037 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Oct 19 16:24:30 CEST 2012
Author: jombart
Date: 2012-10-19 16:24:29 +0200 (Fri, 19 Oct 2012)
New Revision: 1037
Modified:
pkg/R/import.R
Log:
Added a new function to read fasta into DNAbin without using too much memory
Modified: pkg/R/import.R
===================================================================
--- pkg/R/import.R 2012-10-16 10:14:10 UTC (rev 1036)
+++ pkg/R/import.R 2012-10-19 14:24:29 UTC (rev 1037)
@@ -1173,3 +1173,115 @@
return(res)
} # end fasta2genlight
+
+
+
+
+
+
+
+
+###########################
+## Function fasta2DNAbin
+###########################
+fasta2DNAbin <- function(file, quiet=FALSE, chunkSize=10, snpOnly=FALSE, ...){
+ if(!require(ape)) stop("ape package is needed")
+
+ ## HANDLE ARGUMENTS ##
+ ext <- .readExt(file)
+ ext <- toupper(ext)
+ if(!ext %in% c("FASTA", "FA", "FAS")) warning("wrong file extension - '.fasta', '.fa' or '.fas' expected")
+ if(!quiet) cat("\n Converting FASTA alignment into a DNAbin object... \n\n")
+
+
+ ## PRIOR CHECKS ##
+ ## find nb of lines per genome ##
+
+ ## find length of a single line of sequence
+ if(!quiet) cat("\n Finding the size of a single genome... \n\n")
+ lines.to.skip <- 0
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=1)
+
+ while(length(grep("^>.+", txt))==1){
+ lines.to.skip <- lines.to.skip + 1
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=1)
+ }
+
+ char.per.line <- nchar(txt)
+ nbOfLinesToRead <- max(round(1e6/char.per.line),1)#
+
+ ## read file to find genome size, 1e6 characters at a time
+ nblocks <- 1
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, n=nbOfLinesToRead*nblocks)
+
+ while(length(grep("^>.+", txt))<2){
+ nblocks <- nblocks+1
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, n=nbOfLinesToRead*nblocks)
+ }
+
+ ## this is the nb of lines for one genome
+ ## including the first line of annotation
+ LINES.PER.IND <- diff(grep("^>.+", txt))
+
+ ## this is the length of a genome single
+ GENOMESIZE <- sum(nchar(txt[2:LINES.PER.IND]))
+ if(!quiet) cat("\n genome size is:", format(GENOMESIZE, big.mark=","), "nucleotides \n")
+ if(!quiet) cat("\n(",format(LINES.PER.IND, big.mark=","), " lines per genome )\n")
+
+
+ ## START READING / CONVERTING GENOMES ##
+ if(!quiet) cat("\n Importing sequences... \n")
+
+ ## read all genomes by chunks
+ ## initialize
+ lines.to.skip <- 0
+ IND.LAB <- NULL
+ COUNT <- 0 # used to count the nb reads
+
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=LINES.PER.IND*chunkSize)
+ res <- raw()
+
+ ## read and process chunks
+ while(length(txt)>0){
+ COUNT <- COUNT + 1
+ if(!quiet) {
+ for(i in 1:(COUNT*chunkSize)) cat(".")
+ }
+
+ nb.ind <- length(grep("^>", txt))
+ IND.LAB <- c(IND.LAB, sub(">","",txt[grep("^>", txt)])) # find individuals' labels
+ txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
+ txt <- lapply(txt, function(e) unlist(strsplit(tolower(paste(e[-1], collapse="")), split=""))) # each genome -> one vector
+
+ ## convert character vectors to DNAbin output
+ res <- c(res, unlist(lapply(txt, as.DNAbin)))
+
+ ## ## POOL contains all alleles of each position
+ ## temp <- as.list(apply(matrix(unlist(txt), byrow=TRUE, nrow=length(txt)),2,unique)) # alleles current genomes
+ ## POOL <- mapply(function(x,y) unique(c(x,y)), POOL, temp, SIMPLIFY=FALSE) # update global pool
+
+ lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, n=LINES.PER.IND*chunkSize)
+ }
+
+ ## GET FINAL OBJECT ##
+ if(!quiet) cat("\n Forming final object... \n")
+
+ ## form matrix ##
+ res <- matrix(res, nrow=length(IND.LAB), byrow=TRUE)
+ class(res) <- "DNAbin"
+ rownames(res) <- IND.LAB
+
+ ## extract snps if needed ##
+ if(snpOnly){
+ if(!quiet) cat("\n Extracting SNPs... \n")
+ snp.posi <- seg.sites(res)
+ if(length(snp.posi)==0) warning("no polymorphic site in the sequences")
+ res <- res[,seg.sites(res),drop=FALSE]
+ }
+
+ ## RETURN OUTPUT ##
+ if(!quiet) cat("\n...done.\n\n")
+
+ return(res)
+} # end fasta2DNAbin
More information about the adegenet-commits
mailing list