[Gtdb-commits] r42 - pkg/gt.db/R

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Mar 2 10:01:44 CET 2010


Author: dahinds
Date: 2010-03-02 10:01:43 +0100 (Tue, 02 Mar 2010)
New Revision: 42

Modified:
   pkg/gt.db/R/hapmap.R
Log:
- Better memory utilization and performance for reading hapmap data



Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R	2010-03-02 02:03:45 UTC (rev 41)
+++ pkg/gt.db/R/hapmap.R	2010-03-02 09:01:43 UTC (rev 42)
@@ -26,6 +26,7 @@
 {
     # fill in non-SNP allele lists based on the actual data
     bad <- grep('^[ACGT]/[ACGT]$', alleles, invert=TRUE)
+    if (!length(bad)) return(alleles)
     gt <- c('A','C','G','T')
     tbl <- (ch.table(geno[bad], chars=gt) > 0)
     a12 <- apply(tbl, 1, function(x) c(gt[x],'?','?')[1:2])
@@ -42,7 +43,8 @@
     close(fd)
 
     # parse just map information
-    tc <- textConnection(filedata)
+    tc <- textConnection(gsub("(?m) ncbi_b\\d+ .*$", "",
+                              filedata, perl=TRUE))
     map <- scan(tc, what=list('','','',1L,''),
                 flush=TRUE, skip=1, quiet=TRUE)
     close(tc)
@@ -52,13 +54,10 @@
     map <- as.data.frame(map)
 
     # now parse genotype data
-    tc <- textConnection(filedata)
-    rm(filedata)
-    geno <- readLines(tc)
-    close(tc)
-    geno <- sub('.* QC\\S+ ', '', geno, perl=TRUE)
+    filedata <- gsub("(?m)^.* QC\\S+ ", "", filedata, perl=TRUE)
+    geno <- strsplit(filedata, "\n", fixed=TRUE)[[1]]
     sample <- strsplit(geno[1], ' ')[[1]]
-    geno <- geno[1+(1:nrow(map))]
+    geno <- geno[-1]
     map$alleles <- .fixup.hapmap.alleles(map$alleles, geno)
     map$genotype <- .Call('do_encode_gt', as.character(map$alleles),
                           geno, PACKAGE='gt.db')
@@ -81,7 +80,7 @@
     map$X0 <- map$X1 <- NULL
 
     fd <- file(paste(basename,'.phase.gz',sep=''),'r')
-    tc <- textConnection(readChar(fd, nchars=5e+08, useBytes=TRUE))
+    tc <- textConnection(readChar(fd, nchars=100e+06, useBytes=TRUE))
     close(fd)
     gt1 <- scan(tc, what=integer(), quiet=TRUE, nlines=1)
     gt <- c(gt1,scan(tc, what=integer(), quiet=TRUE))



More information about the Gtdb-commits mailing list