[adegenet-commits] r835 - in pkg: R man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Mar 3 17:21:56 CET 2011


Author: jombart
Date: 2011-03-03 17:21:55 +0100 (Thu, 03 Mar 2011)
New Revision: 835

Added:
   pkg/man/read.PLINK.Rd
Modified:
   pkg/R/SNPbin.R
   pkg/R/import.R
   pkg/man/read.snp.Rd
Log:
read.PLINK is there. Fuck yeah.


Modified: pkg/R/SNPbin.R
===================================================================
--- pkg/R/SNPbin.R	2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/R/SNPbin.R	2011-03-03 16:21:55 UTC (rev 835)
@@ -416,7 +416,7 @@
 
     if(!is.null(other(object))){
         cat(" @other: ")
-        cat("a list containing ")
+        cat("a list containing: ")
         cat(ifelse(is.null(names(other(object))), paste(length(other(object)),"unnamed elements"),
                    paste(names(other(object)), collapse= "  ")), "\n")
     }

Modified: pkg/R/import.R
===================================================================
--- pkg/R/import.R	2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/R/import.R	2011-03-03 16:21:55 UTC (rev 835)
@@ -719,10 +719,11 @@
 #######################
 # Function read.snp
 #######################
-read.snp <- function(file, quiet=FALSE, chunkSize=10,
+read.snp <- function(file, quiet=FALSE, chunkSize=1000,
                   multicore=require("multicore"), n.cores=NULL, ...){
-    ##ext <- .readExt(file)
-    ##ext <- toupper(ext)
+    ext <- .readExt(file)
+    ext <- toupper(ext)
+    if(ext != "SNP") warning("wrong file extension - '.snp' expected")
     if(!quiet) cat("\n Reading biallelic SNP data file into a genlight object... \n\n")
     if(multicore && !require(multicore)) stop("multicore package requested but not installed")
     if(multicore && is.null(n.cores)){
@@ -775,32 +776,7 @@
     ## one genotype is read/converted at a time to spare RAM
     if(!quiet) cat("\n Reading",ifelse(is.null(misc.info$population),"",length(misc.info$population)), "genotypes... \n")
 
-    ## res <- list() # this will be a list of SNPbin objects
 
-    ## txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)
-
-    ## COUNT <- 0 # used to count the nb of genotypes read
-
-    ## while(length(grep(">", txt))>0){
-    ##     COUNT <- COUNT + 1
-    ##     if(!quiet) {
-    ##         if(COUNT %% 10 == 0){
-    ##             cat(COUNT)
-    ##         } else {
-    ##             cat(".")
-    ##         }
-    ##     }
-
-    ##     indName <- gsub(">","", txt)
-    ##     indName <- gsub("(^[[:space:]]+)|([[:space:]]+$)", "", indName)
-    ##     temp <- strsplit(scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip + 1, nmax=1), "")[[1]]
-    ##     temp <- suppressWarnings(as.integer(temp))
-    ##     res[[length(res)+1]] <- new("SNPbin", temp)
-    ##     names(res)[length(res)] <- indName
-    ##     lines.to.skip <-lines.to.skip + 2
-    ##     txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=1)
-    ## }
-
     res <- list() # this will be a list of SNPbin objects
 
     txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize*2)
@@ -890,3 +866,110 @@
     return(res)
 
 } # end read.snp
+
+
+
+
+
+
+
+#######################
+# Function read.PLINK
+#######################
+read.PLINK <- function(file, quiet=FALSE, chunkSize=1000, ploidy=2,
+                       multicore=require("multicore"), n.cores=NULL, ...){
+    ## HANDLE ARGUMENTS ##
+    ext <- .readExt(file)
+    ext <- toupper(ext)
+    if(ext != "RAW") warning("wrong file extension - '.raw' expected")
+    if(!quiet) cat("\n Reading PLINK raw format into a genlight object... \n\n")
+    if(multicore && !require(multicore)) stop("multicore package requested but not installed")
+    if(multicore && is.null(n.cores)){
+        n.cores <- multicore:::detectCores()
+    }
+
+
+    ## READ NAMES OF LOCI ##
+    if(!quiet) cat("\n Reading loci information... \n")
+
+    loc.names <- scan(file,what="character",sep=" ",quiet=TRUE,  nlines=1, blank.lines.skip=FALSE)
+    n.loc <- length(loc.names) - 6
+    misc.info <- lapply(1:6,function(i) NULL)
+    names(misc.info) <- loc.names[1:6]
+    loc.names <- loc.names[7:length(loc.names)]
+    loc.names <- gsub("_[1-9]$","",loc.names)
+
+    ## READ GENOTYPES ##
+    if(!quiet) cat("\n Reading and converting genotypes... \n")
+
+    res <- list() # this will be a list of SNPbin objects
+
+    ## initialize reading
+    lines.to.skip <- 1
+    txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
+    txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))
+
+    COUNT <- 0 # used to count the nb reads
+
+    while(length(txt)>0){
+        COUNT <- COUNT + 1
+        if(!quiet) {
+            if(COUNT %% 5 == 0){
+                cat(length(res)+length(txt))
+            } else {
+                cat(".")
+            }
+        }
+
+
+        ## handle misc info
+        temp <- lapply(txt, function(e) e[1:6])
+        for(i in 1:6){
+            misc.info[[i]] <- c(misc.info[[i]], unlist(lapply(temp, function(e) e[[i]])) )
+        }
+
+
+        ## build SNPbin objects
+        txt <- lapply(txt, function(e) suppressWarnings(as.integer(e[-(1:6)])))
+
+        if(multicore){
+            res <- c(res, mclapply(txt, function(e) new("SNPbin", snp=e, ploidy=ploidy),
+                                   mc.cores=n.cores, mc.silent=TRUE, mc.cleanup=TRUE, mc.preschedule=FALSE) )
+        } else {
+            res <- c(res, lapply(txt, function(e) new("SNPbin", snp=e, ploidy=ploidy)) )
+        }
+
+        lines.to.skip <-lines.to.skip + length(txt)
+
+        ## read lines
+        txt <- scan(file,what="character",sep="\n",quiet=TRUE, skip=lines.to.skip, nmax=chunkSize)
+        txt <- lapply(txt, function(e) unlist(strsplit(e,"[[:blank:]]+") ))
+    }
+
+
+    ## MAKE A FEW CHECKS ##
+    if(!all(sapply(res, nLoc)==n.loc)) stop(paste("some individuals do not have",n.loc,"SNPs."))
+
+
+    ## BUILD FINAL OBJECT ##
+    if(!quiet) cat("\n Building final object... \n")
+
+    res <- new("genlight",res, ploidy=ploidy)
+    indNames(res) <- misc.info$IID
+    pop(res) <- misc.info$FID
+    locNames(res) <- loc.names
+    misc.info <- misc.info[c("SEX", "PHENOTYPE", "PAT","MAT")]
+    names(misc.info) <- tolower(names(misc.info))
+    misc.info$sex[misc.info$sex==1] <- "m"
+    misc.info$sex[misc.info$sex==2] <- "f"
+    misc.info$sex <- factor(misc.info$sex)
+    misc.info$phenotype[misc.info$phenotype==1] <- "control"
+    misc.info$phenotype[misc.info$phenotype==2] <- "case"
+    misc.info$phenotype <- factor(misc.info$phenotype)
+    other(res) <- misc.info
+
+    ## RETURN OUTPUT ##
+    if(!quiet) cat("\n...done.\n\n")
+
+    return(res)
+} # end read.PLINK

Added: pkg/man/read.PLINK.Rd
===================================================================
--- pkg/man/read.PLINK.Rd	                        (rev 0)
+++ pkg/man/read.PLINK.Rd	2011-03-03 16:21:55 UTC (rev 835)
@@ -0,0 +1,86 @@
+\encoding{UTF-8}
+\name{read.PLINK}
+\alias{read.PLINK}
+\title{ Reading PLINK Single Nucleotide Polymorphism data}
+\description{
+  The function \code{read.PLINK} reads a data file exported by the PLINK
+  software with extension '.raw' and converts it into a
+  \linkS4class{genlight} object.
+
+  The function reads data by chunks of several genomes (minimum 1, no
+  maximum) at a time, which allows one to read massive datasets with
+  negligible RAM requirements (albeit at a cost of computational
+  time). The argument \code{chunkSize} indicates the number of genomes
+  read at a time. Increasing this value decreases the computational time
+  required to read data in, while increasing memory requirements.
+
+  See details for the documentation about how to export data using PLINK
+  to the '.raw' format.
+}
+\usage{
+read.PLINK(file, quiet=FALSE, chunkSize=1000, ploidy=2,
+           multicore=require("multicore"), n.cores=NULL, \dots)
+}
+\arguments{
+  \item{file}{ a character string giving the path to the file to
+    convert, with the extension ".PLINK".}
+  \item{quiet}{ logical stating whether a conversion messages should be
+    printed (TRUE,default) or not (FALSE).}
+  \item{chunkSize}{an integer indicating the number of genomes to be
+    read at a time; larger values require more RAM but decrease the time
+    needed to read the data.}
+  \item{ploidy}{an integer indicating the ploidy of the data; defaults to
+    2.}
+  \item{multicore}{a logical indicating whether multiple cores -if
+    available- should be used for the computations (TRUE, default), or
+    not (FALSE); requires the package \code{multicore} to be installed
+    (see details).}
+  \item{n.cores}{if \code{multicore} is TRUE, the number of cores to be
+    used in the computations; if NULL, then the maximum number of cores
+    available on the computer is used.}
+  \item{\dots}{other arguments to be passed to other functions -
+    currently not used.}
+}
+\details{
+  === Exporting data from PLINK ===
+  
+  Data need to be exported from PLINK using the option "--recodeA" (and
+  NOT "--recodeAD"). The PLINK command should therefore look like:
+  \code{plink --file data --recodeA}. For more information on this
+  topic, please look at this webpage:
+  \url{http://pngu.mgh.harvard.edu/~purcell/plink/dataman.shtml}
+
+  
+  === Using multiple cores ===
+  
+  Most recent machines have one or several processors with multiple
+  cores. R processes usually use one single core. The package
+  \code{multicore} allows for parallelizing some computations on
+  multiple cores, which decreases drastically computational time.
+
+  To use this functionality, you need to have the last version of the
+  \code{multicore} package installed. To install it, type:
+  install.packages("multicore",,"http://rforge.net/",type="source")
+
+  DO NOT use the version on CRAN, which is slightly outdated.
+}
+\value{an object of the class \linkS4class{genlight}}
+\seealso{
+  - a description of the class \linkS4class{genlight}.
+
+  - \code{\link{read.snp}}: read from adegenet's dedicated format to
+  - store biallelic SNP data.
+  
+  - other import function in adegenet: \code{\link{import2genind}},
+  \code{\link{df2genind}}, \code{\link{read.genetix}}
+  \code{\link{read.fstat}}, \code{\link{read.structure}},
+  \code{\link{read.genepop}}.
+
+  - another function \code{read.plink} is available in the package
+  \code{snpMatrix}.
+}
+\author{Thibaut Jombart \email{t.jombart at imperial.ac.uk} }
+\examples{
+
+}
+\keyword{manip}

Modified: pkg/man/read.snp.Rd
===================================================================
--- pkg/man/read.snp.Rd	2011-03-03 12:27:51 UTC (rev 834)
+++ pkg/man/read.snp.Rd	2011-03-03 16:21:55 UTC (rev 835)
@@ -20,7 +20,7 @@
   distributed with adegenet (see example below).
 }
 \usage{
-read.snp(file, quiet=FALSE, chunkSize = 10, multicore = require("multicore"),
+read.snp(file, quiet=FALSE, chunkSize = 1000, multicore = require("multicore"),
     n.cores = NULL, \dots)
 }
 \arguments{
@@ -43,11 +43,14 @@
 }
 \details{
   === The .snp format ===
+  
   Details of the .snp format can be found in the example file
   distributed with adegenet (see below), or on the adegenet website
   (type \code{adegenetWeb()} in R).
 
-   === Using multiple cores ===
+  
+  === Using multiple cores ===
+  
   Most recent machines have one or several processors with multiple
   cores. R processes usually use one single core. The package
   \code{multicore} allows for parallelizing some computations on



More information about the adegenet-commits mailing list