[Gtdb-commits] r34 - in pkg/gt.db: R man

Sun Feb 21 05:09:31 CET 2010

Author: dahinds
Date: 2010-02-21 05:09:28 +0100 (Sun, 21 Feb 2010)
New Revision: 34

Added:
   pkg/gt.db/R/hapmap.R
   pkg/gt.db/man/load.hapmap.data.Rd
Modified:
   pkg/gt.db/R/affy.R
   pkg/gt.db/R/assay.R
   pkg/gt.db/R/misc.R
   pkg/gt.db/man/ls.mapping.Rd
   pkg/gt.db/man/mk.assay.Rd
Log:
- added loader for HapMap project data, load.hapmap.data()
- added 'mapping.name' argument to ls.mapping()
- made probe.seq optional in mk.assay()
- fixed bug in name checking function .check.names()
- removed header length limit in .read.affy.header()



Modified: pkg/gt.db/R/affy.R
===================================================================

--- pkg/gt.db/R/affy.R	2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/affy.R	2010-02-21 04:09:28 UTC (rev 34)
@@ -20,9 +20,14 @@
 # Code to handle importing Affymetrix SNP annotations and genotype
 # data into GT.DB
 
-.read.affy.header <- function(file, max.lines=10000)
+.read.affy.header <- function(file)
 {
-    txt <- grep('^\\#%.*=', readLines(file, max.lines), value=TRUE)
+    txt <- character()
+    repeat {
+        x <- grep('^\\#%.*=', readLines(file, 100), value=TRUE)
+        if (!length(x)) break;
+        txt <- c(txt,x)
+    }
     val <- sub('^..[^=]*=(.*)', '\\1', txt)
     structure(val, names=sub('^..([^=]*)=.*', '\\1', txt))
 }
@@ -60,7 +65,7 @@
     ploidy[anno$Chromosome=='Y'] <- 'Y'
     ploidy[anno$Chromosome=='MT'] <- 'M'
     ploidy[anno$ChrX.pseudo.autosomal.region.1 |
-             anno$ChrX.pseudo.autosomal.region.2] <- 'A'
+           anno$ChrX.pseudo.autosomal.region.2] <- 'A'
     dbsnp.orient <-
         factor(anno$Strand.Versus.dbSNP,
                levels=c('reverse','same'), labels=c('-','+'))

Modified: pkg/gt.db/R/assay.R
===================================================================
--- pkg/gt.db/R/assay.R	2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/assay.R	2010-02-21 04:09:28 UTC (rev 34)
@@ -21,16 +21,18 @@
 #---------------------------------------------------------------------
 
 ls.mapping <-
-function(platform.name, show.all=FALSE, show.ids=FALSE)
+function(platform.name, mapping.name='%',
+         show.all=FALSE, show.ids=FALSE)
 {
     sql <-
      'select mapping_id, name mapping_name, description,
         assembly, is_hidden, created_by, created_dt
       from mapping
       where platform_id=:1
-        and is_hidden<=:2'
+        and name like :2
+        and is_hidden<=:3'
     r <- sql.query(gt.db::.gt.db, sql, lookup.id('platform', platform.name),
-                   show.all)
+                   mapping.name, show.all)
     .filter.ids(r, show.ids)
 }
 
@@ -87,6 +89,7 @@
     if (any(r,na.rm=TRUE))
         stop("invalid probe sequence(s)", call.=FALSE)
     if (is.null(data$flags)) data$flags <- 0
+    if (is.null(data$probe.seq)) data$probe.seq <- NA
     sql <- 'insert into assay values (null,:1,:2,:3,:4,:5)'
     sql.exec(gt.db::.gt.db, sql, plat.id,
              data[c('assay.name','flags','alleles','probe.seq')],
@@ -133,6 +136,8 @@
         data$ploidy <- NA
     }
     data$ploidy <- .fixup.ploidy(data$ploidy)
+    if (is.null(data$dbsnp.rsid)) data$dbsnp.rsid <- NA
+    if (is.null(data$dbsnp.orient)) data$dbsnp.orient <- NA
     sql.exec(gt.db::.gt.db, sql, map.id,
              data[c('assay.id', 'scaffold', 'position', 'strand',
                     'ploidy', 'dbsnp.rsid', 'dbsnp.orient')],
@@ -156,7 +161,8 @@
 
     db.mode <- .gt.db.options('db.mode')
     tx.mode <- .gt.db.options('tx.mode')
-    if (db.mode == tx.mode) {
+    if ((db.mode == tx.mode) ||
+        all(is.na(data$qscore) && is.na(data$raw.data))) {
         cvt.fn <- ''
     } else if (db.mode == 'raw' && tx.mode == 'hex') {
         cvt.fn <- ':unhex:'

Added: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R	                        (rev 0)
+++ pkg/gt.db/R/hapmap.R	2010-02-21 04:09:28 UTC (rev 34)
@@ -0,0 +1,133 @@
+#
+# Copyright (C) 2010, 23andMe, Inc.
+#
+# Written by David A. Hinds <dhinds at sonic.net>
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the license, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>
+#
+
+# Functions for reading unphased HapMap genotype data files
+#
+# We only support the 'fwd' files, which do not specify the orientations
+# of the rs clusters.  The 'rs' files contain orientation errors.
+
+.read.hapmap.file <- function(file)
+{
+    # read just map information
+    map <- scan(file, what=list('','','',1L,''),
+                flush=TRUE, skip=1, quiet=TRUE)
+    names(map) <- c('assay.name','alleles','scaffold','position','strand')
+    map$assay.name <- I(gsub('-','_',map$assay.name))
+    map$scaffold <- sub('MT','M',map$scaffold)
+    map <- as.data.frame(map)
+
+    # now parse genotype data
+    geno <- readLines(file)
+    # note that we keep the leading space
+    geno <- sub('.* QC[^ ]+', '', geno, perl=TRUE)
+    sample <- strsplit(geno[1], ' ')[[1]][-1]
+    geno <- geno[-1]
+    for (a in levels(map$alleles)) {
+        w <- (map$alleles == a)
+        ab <- strsplit(a,'/')[[1]]
+        geno[w] <- chartr(sprintf('%s%sN',ab[1],ab[2]),'abn',geno[w])
+    }
+    geno <- gsub(' ab| ba', 'h', geno, perl=TRUE)
+    map$genotype <- gsub(' .', '', geno, perl=TRUE)
+    stopifnot(all(nchar(map$genotype) == length(sample)))
+    structure(map, sample=sample)
+}
+
+.merge.hapmap.files <- function(files, verbose=TRUE)
+{
+    wrap.read.file <- function(file)
+    {
+        part <- sub('.*(chr..?_[^_]+)_.*', '\\1', file)
+        if (verbose) message('Reading ', part, '...')
+        .read.hapmap.file(file)
+    }
+    all <- lapply(files, wrap.read.file)
+
+    # construct unified map
+    map <- do.call('rbind', lapply(all, '[', -6))
+    map <- unique(map)
+    map <- map[order(map$position),]
+
+    # verify that all files are consistent
+    stopifnot(!anyDuplicated(map$assay.name))
+
+    map$genotype <- ''
+    sample <- character(0)
+    for (d in all) {
+        xx <- paste(rep('x', nchar(d$genotype[1])), collapse='')
+        gt <- d$genotype[match(map$assay.name,d$assay.name)]
+        map$genotype <- paste(map$genotype, if.na(gt,xx), sep='')
+        sample <- c(sample, attr(d,'sample'))
+    }
+    structure(map, sample=sample)
+}
+
+load.hapmap.data <-
+    function(files, project.name='HapMap', verbose=TRUE)
+{
+    # split filenames at '_', and at '.' unless after a digit
+    f <- strsplit(basename(files), '_|(?<!\\d)\\.', perl=TRUE)
+    f <- as.data.frame(do.call('rbind', f)[,1:7])
+    names(f) <- c('gt','chr','pop','rel','ver','build','strand')
+    stopifnot(f$gt == 'genotypes', f$strand == 'fwd',
+              f$rel == f$rel[1], f$ver == f$ver[1],
+              f$build == f$build[1])
+    nchr <- length(unique(f$chr))
+    npop <- length(unique(f$pop))
+    stopifnot(nrow(f) == nchr * npop)
+
+    phase <- (if (grepl('phase3',f$rel[1])) 3 else 2)
+    platform <- paste('HapMap', phase, sep='')
+    mapping <- sub('phase3\\.','r',f$rel[1])
+    dataset <- paste(platform, mapping, f$ver[1], sep='_')
+    if (npop == 1) dataset <- paste(dataset, f$pop[1], sep='_')
+    desc <- sprintf('HapMap Phase %d', phase)
+    if (!nrow(ls.platform(platform)))
+        mk.platform(platform, desc)
+    desc <- sprintf('%s Release %s', desc, f$rel[1])
+    if (!nrow(ls.mapping(platform, mapping)))
+        mk.mapping(platform, mapping, desc, sprintf('ncbi_%s', f$build[1]))
+    desc <- sprintf('%s (%s)', desc, f$ver[1])
+    if (!nrow(ls.dataset(project.name, dataset)))
+        mk.dataset(dataset, project.name, platform, desc)
+
+    for (chr in levels(.sort.levels(f$chr))) {
+        m <- .merge.hapmap.files(sort(files[f$chr == chr]), verbose)
+        s <- ls.sample(dataset)
+        if (!nrow(s)) {
+            data(hapmap)
+            sn <- attr(m,'sample')
+            s <- data.frame(sample.name=sn, subject.name=sn,
+                            gender=hapmap.subjects[sn,'gender.ref'],
+                            position=1:length(sn))
+            mk.sample(dataset, s)
+        } else {
+            stopifnot(s$sample.name == attr(m,'sample'))
+        }
+        rs <- ifelse(!grepl('rs\\d+',m$assay.name), NA,
+                     sub('.*rs(\\d+).*','\\1',m$assay.name))
+        m$dbsnp.rsid <- as.integer(rs)
+        # FIXME: we'll need to fix up pseudo-autosomal regions
+        m$ploidy <- switch(chr, 'chrX'='X', 'chrY'='Y', 'chrMT'='M', 'A')
+        if (verbose) message("Loading ", nrow(m), " records...")
+        mk.assay(platform, m)
+        mk.assay.position(platform, mapping, m)
+        mk.assay.data(dataset, m)
+    }
+}

Modified: pkg/gt.db/R/misc.R
===================================================================
--- pkg/gt.db/R/misc.R	2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/misc.R	2010-02-21 04:09:28 UTC (rev 34)
@@ -46,12 +46,12 @@
 
 .check.name <- function(name)
 {
-    r <- (regexpr('^[A-Za-z0-9_.]+$', name) < 0)
-    if (sum(r) == 1) {
-        stop("invalid name '", name, "'", call.=FALSE)
-    } else if (sum(r)) {
-        stop("invalid names '", name[1], "', '", name[2],
-             ifelse(sum(r)>2, "', ...", "'"), call.=FALSE)
+    bad <- name[regexpr('^[A-Za-z0-9_.]+$', name) < 0]
+    if (length(bad) == 1) {
+        stop("invalid name '", bad, "'", call.=FALSE)
+    } else if (length(bad)) {
+        stop("invalid names '", bad[1], "', '", bad[2],
+             ifelse(length(bad)>2, "', ...", "'"), call.=FALSE)
     }
 }
 
@@ -149,7 +149,7 @@
         x <- sql.query(gt.db::.gt.db, sql, ...)
     }
     if (nrow(x)) {
-        if (length(unique(x[,1])) < nrow(x))
+        if (anyDuplicated(x[,1]))
             stop('lookup failed: please be more specific', call.=FALSE)
         r <- x[match(name,x[,1]),2]
         f <- name[is.na(r)]

Added: pkg/gt.db/man/load.hapmap.data.Rd
===================================================================
--- pkg/gt.db/man/load.hapmap.data.Rd	                        (rev 0)
+++ pkg/gt.db/man/load.hapmap.data.Rd	2010-02-21 04:09:28 UTC (rev 34)
@@ -0,0 +1,64 @@
+%
+% Copyright (C) 2010, 23andMe, Inc.
+% 
+% Written by David A. Hinds <dhinds at sonic.net>
+% 
+% This is free software; you can redistribute it and/or modify it
+% under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 3 of the license, or
+% (at your option) any later version.
+% 
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+% GNU General Public License for more details.
+% 
+% You should have received a copy of the GNU General Public License
+% along with this program.  If not, see <http://www.gnu.org/licenses/>
+% 
+\name{load.hapmap.data}
+\alias{load.hapmap.data}
+\title{Import Unphased HapMap Genotype Data}
+\description{
+  Import non-redundant, forward-stranded genotype data from the 
+  International HapMap Project.
+}
+\usage{
+load.hapmap.data(files, project='HapMap', verbose=TRUE)
+}
+\arguments{
+  \item{files}{a vector of HapMap genotype data files.}
+  \item{project.name}{the project to be associated with this data.}
+  \item{verbose}{logical: indicates whether to report progress.}
+}
+\details{
+  This function supports loading recent Phase 2 and Phase 3 genotype
+  files.  If files include data from several population panels, then the
+  genotype data is merged into a single dataset spanning all those
+  panels.  Only non-redundant forward-orientation files are supported.
+}
+\seealso{
+  \code{\link{hapmap.subjects}}.
+}
+\examples{
+\dontrun{
+# create HapMap project
+demo('setup.hapmap')
+
+# get file list for latest Phase II HapMap release
+base <- 'http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes'
+release  <- '/latest_phaseII_ncbi_b36/fwd_strand/non-redundant'
+path <- paste(base, release, sep='')
+re <- '.*(genotypes_chr[^_]+_..._[0-9A-Za-z_.]+gz).*'
+files <- sub(re, '\\1', grep(re, readLines(path), value=TRUE))
+
+# download to current directory
+for (f in files) {
+    download.file(paste(path,f,sep='/'), f)
+}
+
+# load into current database
+load.hapmap.data(files)
+}
+}
+\keyword{IO}

Modified: pkg/gt.db/man/ls.mapping.Rd
===================================================================
--- pkg/gt.db/man/ls.mapping.Rd	2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/man/ls.mapping.Rd	2010-02-21 04:09:28 UTC (rev 34)
@@ -1,5 +1,6 @@
 %
 % Copyright (C) 2009, Perlegen Sciences, Inc.
+% Copyright (C) 2010, 23andMe, Inc.
 % 
 % Written by David A. Hinds <dhinds at sonic.net>
 % 
@@ -24,10 +25,13 @@
   platform.
 }
 \usage{
-ls.mapping(platform.name, show.all=FALSE, show.ids=FALSE)
+ls.mapping(platform.name, mapping.name='\%',
+           show.all=FALSE, show.ids=FALSE)
 }
 \arguments{
   \item{platform.name}{a genotyping platform name.}
+  \item{mapping.name}{an SQL \code{LIKE} expression for matching
+    mapping names.}
   \item{show.all}{logical: indicates if hidden mapping sets should be
     included in the output.}
   \item{show.ids}{logical: indicates whether to include values of

Modified: pkg/gt.db/man/mk.assay.Rd
===================================================================
--- pkg/gt.db/man/mk.assay.Rd	2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/man/mk.assay.Rd	2010-02-21 04:09:28 UTC (rev 34)
@@ -39,8 +39,8 @@
     \item{assay.name}{a unique name (within this platform) for the assay.}
     \item{flags}{an optional integer value composed of single-bit flags.}
     \item{alleles}{a slash separated list of valid alleles.}
-    \item{probe.seq}{the genomic flanking sequence for the assay, with
-      the variant position denoted by an underscore (\code{'_'}).}
+    \item{probe.seq}{an optional genomic flanking sequence for the assay,
+      with the variant position denoted by an underscore (\code{'_'}).}
   }
 }
 \value{