[adegenet-commits] r835 - in pkg: R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Mar 3 17:21:56 CET 2011
Author: jombart
Date: 2011-03-03 17:21:55 +0100 (Thu, 03 Mar 2011)
New Revision: 835
Added:
pkg/man/read.PLINK.Rd
Modified:
pkg/R/SNPbin.R
pkg/R/import.R
pkg/man/read.snp.Rd
Log:
read.PLINK is there. Fuck yeah.
Modified: pkg/R/SNPbin.R
===================================================================
--- pkg/R/SNPbin.R 2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/R/SNPbin.R 2011-03-03 16:21:55 UTC (rev 835)
@@ -416,7 +416,7 @@
if(!is.null(other(object))){
cat(" @other: ")
- cat("a list containing ")
+ cat("a list containing: ")
cat(ifelse(is.null(names(other(object))), paste(length(other(object)),"unnamed elements"),
paste(names(other(object)), collapse= " ")), "\n")
}
Modified: pkg/R/import.R
===================================================================
--- pkg/R/import.R 2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/R/import.R 2011-03-03 16:21:55 UTC (rev 835)
@@ -719,10 +719,11 @@
#######################
# Function read.snp
#######################
-read.snp <- function(file, quiet=FALSE, chunkSize=10,
+read.snp <- function(file, quiet=FALSE, chunkSize=1000,
multicore=require("multicore"), n.cores=NULL, ...){
- ##ext <- .readExt(file)
- ##ext <- toupper(ext)
+ ext <- .readExt(file)
+ ext <- toupper(ext)
+ if(ext != "SNP") warning("wrong file extension - '.snp' expected")
if(!quiet) cat("\n Reading biallelic SNP data file into a genlight object... \n\n")
if(multicore && !require(multicore)) stop("multicore package requested but not installed")
if(multicore && is.null(n.cores)){
@@ -775,32 +776,7 @@
## one genotype is read/converted at a time to spare RAM
if(!quiet) cat("\n Reading",ifelse(is.null(misc.info$population),"",length(misc.info$population)), "genotypes... \n")
- ## res <- list() # this will be a list of SNPbin objects
- ## txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)
-
- ## COUNT <- 0 # used to count the nb of genotypes read
-
- ## while(length(grep(">", txt))>0){
- ## COUNT <- COUNT + 1
- ## if(!quiet) {
- ## if(COUNT %% 10 == 0){
- ## cat(COUNT)
- ## } else {
- ## cat(".")
- ## }
- ## }
-
- ## indName <- gsub(">","", txt)
- ## indName <- gsub("(^[[:space:]]+)|([[:space:]]+$)", "", indName)
- ## temp <- strsplit(scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip + 1, nmax=1), "")[[1]]
- ## temp <- suppressWarnings(as.integer(temp))
- ## res[[length(res)+1]] <- new("SNPbin", temp)
- ## names(res)[length(res)] <- indName
- ## lines.to.skip <-lines.to.skip + 2
- ## txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)
- ## }
-
res <- list() # this will be a list of SNPbin objects
txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize*2)
@@ -890,3 +866,110 @@
return(res)
} # end read.snp
+
+
+
+
+
+
+
+#######################
+# Function read.PLINK
+#######################
+read.PLINK <- function(file, quiet=FALSE, chunkSize=1000, ploidy=2,
+ multicore=require("multicore"), n.cores=NULL, ...){
+ ## HANDLE ARGUMENTS ##
+ ext <- .readExt(file)
+ ext <- toupper(ext)
+ if(ext != "RAW") warning("wrong file extension - '.raw' expected")
+ if(!quiet) cat("\n Reading PLINK raw format into a genlight object... \n\n")
+ if(multicore && !require(multicore)) stop("multicore package requested but not installed")
+ if(multicore && is.null(n.cores)){
+ n.cores <- multicore:::detectCores()
+ }
+
+
+ ## READ NAMES OF LOCI ##
+ if(!quiet) cat("\n Reading loci information... \n")
+
+ loc.names <- scan(file,what="character",sep=" ",quiet=TRUE, nlines=1, blank.lines.skip=FALSE)
+ n.loc <- length(loc.names) - 6
+ misc.info <- lapply(1:6,function(i) NULL)
+ names(misc.info) <- loc.names[1:6]
+ loc.names <- loc.names[7:length(loc.names)]
+ loc.names <- gsub("_[1-9]$","",loc.names)
+
+ ## READ GENOTYPES ##
+ if(!quiet) cat("\n Reading and converting genotypes... \n")
+
+ res <- list() # this will be a list of SNPbin objects
+
+ ## initialize reading
+ lines.to.skip <- 1
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
+ txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))
+
+ COUNT <- 0 # used to count the nb reads
+
+ while(length(txt)>0){
+ COUNT <- COUNT + 1
+ if(!quiet) {
+ if(COUNT %% 5 == 0){
+ cat(length(res)+length(txt))
+ } else {
+ cat(".")
+ }
+ }
+
+
+ ## handle misc info
+ temp <- lapply(txt, function(e) e[1:6])
+ for(i in 1:6){
+ misc.info[[i]] <- c(misc.info[[i]], unlist(lapply(temp, function(e) e[[i]])) )
+ }
+
+
+ ## build SNPbin objects
+ txt <- lapply(txt, function(e) suppressWarnings(as.integer(e[-(1:6)])))
+
+ if(multicore){
+ res <- c(res, mclapply(txt, function(e) new("SNPbin", snp=e, ploidy=ploidy),
+ mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) )
+ } else {
+ res <- c(res, lapply(txt, function(e) new("SNPbin", snp=e, ploidy=ploidy)) )
+ }
+
+ lines.to.skip <-lines.to.skip + length(txt)
+
+ ## read lines
+ txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
+ txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))
+ }
+
+
+ ## MAKE A FEW CHECKS ##
+ if(!all(sapply(res, nLoc)==n.loc)) stop(paste("some individuals do not have",n.loc,"SNPs."))
+
+
+ ## BUILD FINAL OBJECT ##
+ if(!quiet) cat("\n Building final object... \n")
+
+ res <- new("genlight",res, ploidy=ploidy)
+ indNames(res) <- misc.info$IID
+ pop(res) <- misc.info$FID
+ locNames(res) <- loc.names
+ misc.info <- misc.info[c("SEX", "PHENOTYPE", "PAT","MAT")]
+ names(misc.info) <- tolower(names(misc.info))
+ misc.info$sex[misc.info$sex==1] <- "m"
+ misc.info$sex[misc.info$sex==2] <- "f"
+ misc.info$sex <- factor(misc.info$sex)
+ misc.info$phenotype[misc.info$phenotype==1] <- "control"
+ misc.info$phenotype[misc.info$phenotype==2] <- "case"
+ misc.info$phenotype <- factor(misc.info$phenotype)
+ other(res) <- misc.info
+
+ ## RETURN OUTPUT ##
+ if(!quiet) cat("\n...done.\n\n")
+
+ return(res)
+} # end read.PLINK
Added: pkg/man/read.PLINK.Rd
===================================================================
--- pkg/man/read.PLINK.Rd (rev 0)
+++ pkg/man/read.PLINK.Rd 2011-03-03 16:21:55 UTC (rev 835)
@@ -0,0 +1,86 @@
+\encoding{UTF-8}
+\name{read.PLINK}
+\alias{read.PLINK}
+\title{ Reading PLINK Single Nucleotide Polymorphism data}
+\description{
+ The function \code{read.PLINK} reads a data file exported by the PLINK
+ software with extension '.raw' and converts it into a
+ \linkS4class{genlight} object.
+
+ The function reads data by chunks of several genomes (minimum 1, no
+ maximum) at a time, which allows one to read massive datasets with
+ negligible RAM requirements (albeit at a cost of computational
+ time). The argument \code{chunkSize} indicates the number of genomes
+ read at a time. Increasing this value decreases the computational time
+ required to read data in, while increasing memory requirements.
+
+ See details for the documentation about how to export data using PLINK
+ to the '.raw' format.
+}
+\usage{
+read.PLINK(file, quiet=FALSE, chunkSize=1000, ploidy=2,
+ multicore=require("multicore"), n.cores=NULL, \dots)
+}
+\arguments{
+ \item{file}{ a character string giving the path to the file to
+ convert, with the extension ".PLINK".}
+ \item{quiet}{ logical stating whether a conversion messages should be
+ printed (TRUE,default) or not (FALSE).}
+ \item{chunkSize}{an integer indicating the number of genomes to be
+ read at a time; larger values require more RAM but decrease the time
+ needed to read the data.}
+ \item{ploidy}{an integer indicating the ploidy of the data; defaults to
+ 2.}
+ \item{multicore}{a logical indicating whether multiple cores -if
+ available- should be used for the computations (TRUE, default), or
+ not (FALSE); requires the package \code{multicore} to be installed
+ (see details).}
+ \item{n.cores}{if \code{multicore} is TRUE, the number of cores to be
+ used in the computations; if NULL, then the maximum number of cores
+ available on the computer is used.}
+ \item{\dots}{other arguments to be passed to other functions -
+ currently not used.}
+}
+\details{
+ === Exporting data from PLINK ===
+
+ Data need to be exported from PLINK using the option "--recodeA" (and
+ NOT "--recodeAD"). The PLINK command should therefore look like:
+ \code{plink --file data --recodeA}. For more information on this
+ topic, please look at this webpage:
+ \url{http://pngu.mgh.harvard.edu/~purcell/plink/dataman.shtml}
+
+
+ === Using multiple cores ===
+
+ Most recent machines have one or several processors with multiple
+ cores. R processes usually use one single core. The package
+ \code{multicore} allows for parallelizing some computations on
+ multiple cores, which decreases drastically computational time.
+
+ To use this functionality, you need to have the last version of the
+ \code{multicore} package installed. To install it, type:
+ install.packages("multicore",,"http://rforge.net/",type="source")
+
+ DO NOT use the version on CRAN, which is slightly outdated.
+}
+\value{an object of the class \linkS4class{genlight}}
+\seealso{
+ - a description of the class \linkS4class{genlight}.
+
+ - \code{\link{read.snp}}: read from adegenet's dedicated format to
+ - store biallelic SNP data.
+
+ - other import function in adegenet: \code{\link{import2genind}},
+ \code{\link{df2genind}}, \code{\link{read.genetix}}
+ \code{\link{read.fstat}}, \code{\link{read.structure}},
+ \code{\link{read.genepop}}.
+
+ - another function \code{read.plink} is available in the package
+ \code{snpMatrix}.
+}
+\author{Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
+\examples{
+
+}
+\keyword{manip}
Modified: pkg/man/read.snp.Rd
===================================================================
--- pkg/man/read.snp.Rd 2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/man/read.snp.Rd 2011-03-03 16:21:55 UTC (rev 835)
@@ -20,7 +20,7 @@
distributed with adegenet (see example below).
}
\usage{
-read.snp(file, quiet=FALSE, chunkSize = 10, multicore = require("multicore"),
+read.snp(file, quiet=FALSE, chunkSize = 1000, multicore = require("multicore"),
n.cores = NULL, \dots)
}
\arguments{
@@ -43,11 +43,14 @@
}
\details{
=== The .snp format ===
+
Details of the .snp format can be found in the example file
distributed with adegenet (see below), or on the adegenet website
(type \code{adegenetWeb()} in R).
- === Using multiple cores ===
+
+ === Using multiple cores ===
+
Most recent machines have one or several processors with multiple
cores. R processes usually use one single core. The package
\code{multicore} allows for parallelizing some computations on
More information about the adegenet-commits
mailing list