[adegenet-commits] r902 - pkg/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Jun 10 19:47:13 CEST 2011
Author: jombart
Date: 2011-06-10 19:47:12 +0200 (Fri, 10 Jun 2011)
New Revision: 902
Modified:
pkg/R/import.R
Log:
First complete draft of fasta2genlight
Modified: pkg/R/import.R
===================================================================
--- pkg/R/import.R 2011-06-10 15:42:23 UTC (rev 901)
+++ pkg/R/import.R 2011-06-10 17:47:12 UTC (rev 902)
@@ -1024,3 +1024,132 @@
return(res)
} # end read.PLINK
+
+
+
+
+
+
+
+###########################
+## Function fasta2genlight
+###########################
+fasta2genlight <- function(file, quiet=FALSE, chunkSize=1000, saveNbAlleles=TRUE,
+ multicore=require("multicore"), n.cores=NULL, ...){
+ ## HANDLE ARGUMENTS ##
+ ext <- .readExt(file)
+ ext <- toupper(ext)
+ if(!ext %in% c("FASTA", "FA", "FAS")) warning("wrong file extension - '.fasta', '.fa' or '.fas' expected")
+ if(!quiet) cat("\n Converting FASTA alignment into a genlight object... \n\n")
+ if(multicore && !require(multicore)) stop("multicore package requested but not installed")
+ if(multicore && is.null(n.cores)){
+ n.cores <- multicore:::detectCores()
+ }
+
+
+ ## PRIOR CHECKS ##
+ ## find nb of lines per genome
+ lines.to.skip <- 0
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=1)
+
+ while(length(grep("^>.+", txt))<2){
+ lines.to.skip <- lines.to.skip + 1
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, nmax=lines.to.skip)
+ }
+
+ LINES.PER.IND <- lines.to.skip-1
+
+
+ ## find length of a genome
+ NLOC <- sum(nchar(txt[2:LINES.PER.IND]))
+
+
+ ## SCAN ALL POSITIONS AND IDENTIFY SNPs ##
+ if(!quiet) cat("\n Looking for polymorphic positions... \n")
+
+ ## read all genomes by chunks
+ ## initialize
+ lines.to.skip <- 0
+ IND.LAB <- NULL
+ POOL <- as.list(rep("-", NLOC))
+ COUNT <- 0 # used to count the nb reads
+
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
+
+ ## read and process chunks
+ while(length(txt)>0){
+ COUNT <- COUNT + 1
+ if(!quiet) {
+ for(i in 1:(COUNT*chunkSize)) cat(".")
+ }
+
+
+ nb.ind <- length(grep("^>", txt))
+ IND.LAB <- c(IND.LAB, sub(">","",txt[grep("^>", txt)])) # find individuals' labels
+ txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
+ txt <- lapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split="")) # each genome -> one vector
+
+
+ ## POOL contains all alleles of each position
+ temp <- as.list(apply(matrix(unlist(txt), byrow=TRUE, nrow=length(txt)),2,unique)) # alleles current genomes
+ POOL <- mapply(function(x,y) unique(c(x,y)), POOL, temp, SIMPLIFY=FALSE) # update global pool
+
+ lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
+ }
+
+
+ ## analyse pool of alleles
+ POOL <- lapply(POOL, setdiff, "-")
+ nb.alleles <- sapply(POOL, length)
+ snp.posi <- nb.alleles==2
+ sec.all <- unlist(lapply(POOL[snp.posi], function(e) e[2]))
+
+
+
+ ## RE-READ DATA, CONVERT SNPs TO GENLIGHT ##
+ ## initialize
+ lines.to.skip <- 0
+ COUNT <- 0 # used to count the nb reads
+ res <- list()
+
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
+
+ ## read and process chunks
+ while(length(txt)>0){
+ COUNT <- COUNT + 1
+ if(!quiet) {
+ for(i in 1:(COUNT*chunkSize)) cat(".")
+ }
+
+
+ ## read SNPs
+ nb.ind <- length(grep("^>", txt))
+ txt <- split(txt, rep(1:nb.ind, each=LINES.PER.IND)) # split per individuals
+ txt <- lapply(txt, function(e) strsplit(paste(e[-1], collapse=""), split="")[[1]][snp.posi]) # each genome -> one SNP vector
+
+
+ ## convert to genlight
+ res <- c(res, lapply(txt, function(e) new("SNPbin", as.integer(e==sec.all))))
+
+ lines.to.skip <- lines.to.skip + nb.ind*LINES.PER.IND
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=LINES.PER.IND*chunkSize)
+ }
+
+
+
+ ## BUILD FINAL OBJECT ##
+ if(!quiet) cat("\n Building final object... \n")
+
+ res <- new("genlight",res, ploidy=1)
+ indNames(res) <- IND.LAB
+ alleles(res) <- sapply(POOL[snp.posi], paste, collapse="/")
+ position(res) <- which(snp.posi)
+ if(saveNbAlleles) other(res) <- list(nb.all.per.loc=nb.alleles)
+
+
+ ## RETURN OUTPUT ##
+ if(!quiet) cat("\n...done.\n\n")
+
+ return(res)
+} # end fasta2genlight
More information about the adegenet-commits
mailing list