[Gtdb-commits] r45 - in pkg/gt.db: R man

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Mar 4 02:23:43 CET 2010


Author: dahinds
Date: 2010-03-04 02:23:42 +0100 (Thu, 04 Mar 2010)
New Revision: 45

Modified:
   pkg/gt.db/R/genotype.R
   pkg/gt.db/R/hapmap.R
   pkg/gt.db/R/misc.R
   pkg/gt.db/R/sql.R
   pkg/gt.db/man/load.hapmap.data.Rd
Log:
- Fixed goofs in compression support
- Finished implementing loading of phased HapMap r22 data



Modified: pkg/gt.db/R/genotype.R
===================================================================
--- pkg/gt.db/R/genotype.R	2010-03-03 01:25:50 UTC (rev 44)
+++ pkg/gt.db/R/genotype.R	2010-03-04 01:23:42 UTC (rev 45)
@@ -42,13 +42,13 @@
     db.mode <- .gt.db.options('db.mode')
     tx.mode <- .gt.db.options('tx.mode')
     if (db.mode == tx.mode) {
-        txt.fn <- ''
+        txt.fn <- ':clob:'
         raw.fn <- ':blob:'
     } else if (db.mode == 'raw' && tx.mode == 'hex') {
-        txt.fn <- ''
+        txt.fn <- ':clob:'
         raw.fn <- ':hex.blob:'
     } else if (db.mode == 'zip' && tx.mode == 'hex') {
-        txt.fn <- 'unzip'
+        txt.fn <- ':unzip.clob:'
         raw.fn <- ':hex.unzip.blob:'
     } else {
         stop('unknown conversion!')

Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R	2010-03-03 01:25:50 UTC (rev 44)
+++ pkg/gt.db/R/hapmap.R	2010-03-04 01:23:42 UTC (rev 45)
@@ -67,29 +67,31 @@
     structure(map, sample=sample)
 }
 
-.read.hapmap.r22phased <- function(basename)
+.read.hapmap.r22phased <- function(file)
 {
+    basename <- sub('\\.phased?\\.gz', '', file)
     sample <- read.table(paste(basename,'_sample.txt.gz',sep=''),
                          header=FALSE, as.is=1)
     sample <- paste(rep(sample[,1],each=2), c('A','B'), sep='_')
 
     map <- read.table(paste(basename,'_legend.txt.gz',sep=''),
                       header=TRUE, as.is=1)
+    names(map)[1] <- 'assay.name'
+    chr <- sub('genotypes_(chr\\w\\d?).*', '\\1', file, perl=TRUE)
     map$strand <- factor('+')
+    map$scaffold <- factor(chr)
     map$alleles <- factor(paste(map$X0, map$X1, sep='/'))
     map$X0 <- map$X1 <- NULL
 
-    fd <- file(paste(basename,'.phase.gz',sep=''),'r')
-    tc <- textConnection(readChar(fd, nchars=100e+06, useBytes=TRUE))
+    fd <- file(file,'r')
+    gt <- readChar(fd, nchars=100e6, useBytes=TRUE)
     close(fd)
-    gt1 <- scan(tc, what=integer(), quiet=TRUE, nlines=1)
-    gt <- c(gt1,scan(tc, what=integer(), quiet=TRUE))
-    close(tc)
-    # convert from 0/1 codes to a/b strings
-    gt <- matrix(as.raw(gt+0x61), ncol=length(gt1))
-    map$genotype <- apply(gt, 2, rawToChar)
+    gt <- chartr('01','ab',gsub(' ', '', gt, perl=TRUE))
+    gt <- strsplit(gt, '\n', fixed=TRUE)[[1]]
+    gt <- sapply(gt, charToRaw, USE.NAMES=FALSE)
+    map$genotype <- apply(t(gt), 2, rawToChar)
 
-    structure(map, sample = sample[1:nrow(gt)])
+    structure(map, sample = sample[1:ncol(gt)])
 }
 
 .resolve.hapmap.positions <- function(data)
@@ -139,11 +141,15 @@
 
 .merge.hapmap.files <- function(files, verbose=TRUE)
 {
+    read.file <- if (all(grepl('_r22_.*phase', files)))
+        .read.hapmap.r22phased
+    else
+        .read.hapmap.file
     wrap.read.file <- function(file)
     {
         part <- sub('.*(chr..?_[^_]+)_.*', '\\1', file)
         if (verbose) message('Reading ', part, '...')
-        .read.hapmap.file(file)
+        read.file(file)
     }
     all <- lapply(files, wrap.read.file)
 
@@ -173,12 +179,14 @@
 }
 
 load.hapmap.data <-
-    function(files, project.name='HapMap', verbose=TRUE)
+    function(files, project.name='HapMap', map=TRUE, verbose=TRUE)
 {
     # split filenames at '_', and at '.' unless after a digit
     f <- strsplit(basename(files), '_|(?<!\\d)\\.', perl=TRUE)
     f <- as.data.frame(do.call('rbind', f)[,1:7])
     names(f) <- c('gt','chr','pop','rel','ver','build','strand')
+    if (grepl('\\.phase', files[1]))
+        f$ver <- paste(f$ver, '_phased', sep='')
     stopifnot(f$gt == 'genotypes', f$strand == 'fwd',
               f$rel == f$rel[1], f$ver == f$ver[1],
               f$build == f$build[1])
@@ -213,8 +221,9 @@
         if (!nrow(s)) {
             data(hapmap)
             sn <- attr(m,'sample')
-            s <- data.frame(sample.name=sn, subject.name=sn,
-                            gender=hapmap.subjects[sn,'gender.ref'],
+            su <- sub('(NA\\d{5}).*', '\\1', sn)
+            s <- data.frame(sample.name=sn, subject.name=su,
+                            gender=hapmap.subjects[su,'gender.ref'],
                             position=1:length(sn))
             mk.sample(dataset, s)
         } else {
@@ -226,8 +235,10 @@
         # FIXME: we'll need to fix up pseudo-autosomal regions
         m$ploidy <- switch(chr, 'chrX'='X', 'chrY'='Y', 'chrMT'='M', 'A')
         if (verbose) message("Loading ", nrow(m), " records...")
-        mk.assay(platform, m, progress=verbose)
-        mk.assay.position(platform, mapping, m, progress=verbose)
+        if (map) {
+            mk.assay(platform, m, progress=verbose)
+            mk.assay.position(platform, mapping, m, progress=verbose)
+        }
         mk.assay.data(dataset, m, progress=verbose)
     }
 }

Modified: pkg/gt.db/R/misc.R
===================================================================
--- pkg/gt.db/R/misc.R	2010-03-03 01:25:50 UTC (rev 44)
+++ pkg/gt.db/R/misc.R	2010-03-04 01:23:42 UTC (rev 45)
@@ -91,7 +91,7 @@
     s <- strsplit(paste(s, collapse='\n'), ';\n')[[1]]
     sapply(s, sql.exec, db=gt.db::.gt.db, USE.NAMES=FALSE)
     .gt.db.options(db.mode=db.mode)
-    sql.exec(gt.db::.gt.db, 'insert into gtdb_option values (?,?)',
+    sql.exec(gt.db::.gt.db, 'insert into gtdb_option values (:1,:2)',
              'db.mode', db.mode)
 }
 

Modified: pkg/gt.db/R/sql.R
===================================================================
--- pkg/gt.db/R/sql.R	2010-03-03 01:25:50 UTC (rev 44)
+++ pkg/gt.db/R/sql.R	2010-03-04 01:23:42 UTC (rev 45)
@@ -113,11 +113,11 @@
         sql <- gsub(':user:', 'user()', sql)
         sql <- gsub(':sysdate:', 'sysdate()', sql)
         sql <- gsub(':clob:\\((\\w+)\\)', '\\1', sql)
+        sql <- gsub(':unzip.clob:\\((\\w+)\\)', 'uncompress(\\1) \\1', sql)
         sql <- gsub(':blob:\\((\\w+)\\)', '\\1', sql)
         sql <- gsub(':hex.blob:\\((\\w+)\\)', 'hex(\\1) \\1', sql)
         sql <- gsub(':unhex:', 'unhex', sql)
         sql <- gsub(':zip:', 'compress', sql)
-        sql <- gsub(':unzip:', 'uncompress', sql)
         sql <- gsub(':zip.unhex:\\(([^)]+)\\)', 'compress(unhex(\\1))', sql)
         sql <- gsub(':hex.unzip.blob:\\((\\w+)\\)',
                     'hex(uncompress(\\1)) \\1', sql)

Modified: pkg/gt.db/man/load.hapmap.data.Rd
===================================================================
--- pkg/gt.db/man/load.hapmap.data.Rd	2010-03-03 01:25:50 UTC (rev 44)
+++ pkg/gt.db/man/load.hapmap.data.Rd	2010-03-04 01:23:42 UTC (rev 45)
@@ -18,7 +18,7 @@
 % 
 \name{load.hapmap.data}
 \alias{load.hapmap.data}
-\title{Import Unphased HapMap Genotype Data}
+\title{Import HapMap Genotype Data}
 \description{
   Import non-redundant, forward-stranded genotype data from the 
   International HapMap Project.
@@ -38,7 +38,8 @@
   those panels.  Only non-redundant forward-orientation files are
   supported.
 
-  It has been tested against Phase II r22 and r24, and Phase III r2.
+  It has been tested against Phase II unphased r22 and r24, Phase II
+  phased r22, and unphased Phase III r2.
 }
 \seealso{
   \code{\link{hapmap.subjects}}.



More information about the Gtdb-commits mailing list