[Gtdb-commits] r31 - in pkg/gt.db: . R man

Wed Feb 17 03:12:37 CET 2010

Author: dahinds
Date: 2010-02-17 03:12:37 +0100 (Wed, 17 Feb 2010)
New Revision: 31

Added:
   pkg/gt.db/R/affy.R
   pkg/gt.db/man/pack.raw.data.Rd
Modified:
   pkg/gt.db/INDEX
   pkg/gt.db/R/misc.R
   pkg/gt.db/R/rawdata.R
   pkg/gt.db/man/mk.assay.data.Rd
   pkg/gt.db/man/reshape.gt.data.Rd
Log:
- new code for loading Affymetrix data into GT.DB from flat files,
  still needs documentation
- speedup for lookup.id()
- help page for pack.raw.data()/unpack.raw.data()



Modified: pkg/gt.db/INDEX
===================================================================

--- pkg/gt.db/INDEX	2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/INDEX	2010-02-17 02:12:37 UTC (rev 31)
@@ -48,7 +48,6 @@
 ld.plot                 Pairwise Linkage Disequilibrium Plot
 ld.prune                Prune SNP List to Limit Linkage Disequilibrium
 ls.assay                List Assay Definitions
-ls.assay.group          List Assay Groups
 ls.assay.position       List Assay Positions
 ls.dataset              List Genotype Datasets
 ls.mapping              List Assay Mapping Sets
@@ -63,7 +62,6 @@
 match.gt.data           Identify Equivalent Genotyping Assays
 mk.assay                Create Genotyping Assay Definitions
 mk.assay.data           Create Genotyping Assay Data
-mk.assay.group          Create or Remove a Genotype Assay Group
 mk.assay.position       Create Genotyping Assay Positions
 mk.attr                 Create, Remove, and List Attribute Definitions
 mk.dataset              Create or Remove Genotype Datasets
@@ -82,6 +80,7 @@
 nw.align                Needleman and Wunsch Sequence Alignment
 nw.orient.assay         Orient Assay Sequences
 orient.gt.data          Flip Assay Strands and/or Swap Alleles
+pack.raw.data           Pack and Unpack Raw Genotype Data
 panel.cluster           Panel Function for Drawing Elliptical Cluster
                         Boundaries
 panel.qqpval            Quantile-Quantile Plots for P Values: Panel

Added: pkg/gt.db/R/affy.R
===================================================================
--- pkg/gt.db/R/affy.R	                        (rev 0)
+++ pkg/gt.db/R/affy.R	2010-02-17 02:12:37 UTC (rev 31)
@@ -0,0 +1,212 @@
+#
+# Copyright (C) 2010, 23andMe, Inc.
+#
+# Written by David A. Hinds <dhinds at sonic.net>
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the license, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>
+#
+
+# Code to handle importing Affymetrix SNP annotations and genotype
+# data into GT.DB
+
+.read.affy.header <- function(file, max.lines=10000)
+{
+    txt <- grep('^\\#%.*=', readLines(file, max.lines), value=TRUE)
+    val <- sub('^..[^=]*=(.*)', '\\1', txt)
+    structure(val, names=sub('^..([^=]*)=.*', '\\1', txt))
+}
+
+read.affy.anno <- function(file)
+{
+    cc <- c(Minor.Allele='NULL',Minor.Allele.Frequency='NULL',
+            Associated.Gene='NULL',Genetic.Map='NULL',
+            Microsatellite='NULL',Allele.Frequencies='NULL',
+            Heterozygous.Allele.Frequencies='NULL',OMIM='NULL',
+            Number.of.individuals.Number.of.chromosomes='NULL',
+            dbSNP.RS.ID='character',Flank='character')
+    d <- read.table(file=file, header=TRUE, quote='"', comment.char='#',
+                    sep=',', na.strings='---', row.names=1, colClasses=cc)
+    d$Chromosome <- .sort.levels(d$Chromosome)
+    d <- d[order(d$Chromosome, d$Physical.Position),]
+    tags <- .read.affy.header(file)
+    do.call('structure', c(list(d), as.list(tags)))
+}
+
+.parse.affy.assays <- function(anno)
+{
+    alleles <- paste(anno$Allele.A, anno$Allele.B, sep='/')
+    data.frame(assay.name=sub('-','_',rownames(anno)),
+               alleles=alleles,
+               probe.seq=sub('\\[.*\\]','_',anno$Flank),
+               stringsAsFactors=FALSE)
+}
+
+.parse.affy.mapping <- function(anno)
+{
+    ploidy <- factor('A',levels=c('A','M','X','Y'))
+    ploidy[is.na(anno$Chromosome)] <- NA
+    ploidy[anno$Chromosome=='X'] <- 'X'
+    ploidy[anno$Chromosome=='Y'] <- 'Y'
+    ploidy[anno$Chromosome=='MT'] <- 'M'
+    ploidy[anno$ChrX.pseudo.autosomal.region.1 |
+             anno$ChrX.pseudo.autosomal.region.2] <- 'A'
+    dbsnp.orient <-
+        factor(anno$Strand.Versus.dbSNP,
+               levels=c('reverse','same'), labels=c('-','+'))
+    scaffold <- na.if(paste('chr', anno$Chromosome, sep=''), 'ChrNA')
+    scaffold <- sub('chrMT','chrM',scaffold)
+    data.frame(assay.name=sub('-','_',rownames(anno)),
+               scaffold=scaffold,
+               position=anno$Physical.Position,
+               strand=anno$Strand,
+               ploidy=ploidy,
+               dbsnp.rsid=as.integer(sub('^rs','',anno$dbSNP.RS.ID)),
+               dbsnp.orient=dbsnp.orient,
+               stringsAsFactors=FALSE)
+}
+
+load.affy.platform <- function(anno, description, progress=TRUE)
+{
+    platform <- attr(anno,'chip_type')
+    cat(sprintf("Creating platform '%s'...\n", platform))
+    mk.platform(platform, description)
+    mk.assay(platform, .parse.affy.assays(anno), progress=progress)
+}
+
+load.affy.mapping <- function(anno, progress=TRUE)
+{
+    platform <- attr(anno,'chip_type')
+    version <- attr(anno, 'netaffx-annotation-netaffx-build')
+    mapping <- sprintf('na%s', version)
+    cat(sprintf("Creating mapping '%s' for '%s'...\n",
+                mapping, platform))
+    ncbi.build <- attr(anno, 'genome-version-ncbi')
+    dbsnp.build <- attr(anno, 'dbsnp-version')
+    desc <- sprintf("%s on NCBI b%s, dbSNP b%s",
+                    ls.platform(platform)$description,
+                    ncbi.build, dbsnp.build)
+    mk.mapping(platform, mapping, desc,
+               sub('\\(d+).*','\\1',ncbi.build))
+    mk.assay.position(platform, mapping,
+                      .parse.affy.mapping(anno),
+                      progress=progress)
+}
+
+#---------------------------------------------------------------------
+
+# read a single Affy CHP file (in text form) and convert individual
+# genotypes to a GT.DB representation
+
+.read.affy.chp <- function(file, names, row.names=TRUE)
+{
+    recode.gt <- function(x)
+    {
+        factor(x, levels=c('AA','AB','BB','NC'),
+               labels=c('a','h','b','n'))
+    }
+
+    cc <- c('character','factor',rep('numeric',3),'factor')
+    d <- read.table(file, sep='\t', header=TRUE,
+                    row.names=1, colClasses=cc)
+    if (!missing(names)) {
+        if (!all(names %in% rownames(d)))
+            stop('Probe set name(s) are not matched?')
+        d <- d[names,]
+    }
+    if (!row.names) row.names(d) <- NULL
+
+    colnames(d) <- tolower(colnames(d))
+    d$forced.call <- recode.gt(d$forced.call)
+    qscore <- round(-10*log10(d$confidence))
+    qscore <- ifelse(is.finite(qscore), qscore, 0)
+    raw.data <- rawToHex(matrix(.pack.chpdata(d), nrow=5))
+    data.frame(genotype=recode.gt(d$call),
+               qscore=as.integer(qscore),
+               raw.data=raw.data,
+               row.names=rownames(d),
+               stringsAsFactors=FALSE)
+}
+
+# read a list of CHP files, and construct a set of temporary files
+# made up of subsets of SNPs across all samples, to facilitate
+# transposing the data to the form that will go into GT.DB
+
+.reorg.chp.data <-
+    function(files, names, slice=8192, progress=TRUE)
+{
+    nr <- nrow(.read.affy.chp(files[1], names))
+    lo <- seq(1, nr, slice)
+    hi <- c(lo[-1]-1, nr)
+    nb <- length(lo)
+    parts <- sprintf("part%03d.dat", 1:nb)
+    if (length(parts) > 120)
+        stop("too many slices")
+    fd <- lapply(parts, gzfile, 'wb')
+    sapply(fd, function(x) save(files, file=x))
+    nf <- length(files)
+    if (progress) progress.bar(0, nf)
+    for (n in 1:nf) {
+        d <- .read.affy.chp(files[n], names, row.names=(n==1))
+        sapply(1:nb, function(x)
+           { slice <- d[lo[x]:hi[x],]
+             save(slice, file=fd[[x]]) })
+        if (progress) progress.bar(n, nf)
+    }
+    sapply(fd, close)
+    parts
+}
+
+.reshape.affy.block <- function(file)
+{
+    pack.by.assay <- function(data, col, ...)
+    {
+        # peel out selected column into a matrix, and transpose
+        m <- t(sapply(data, function(x) x[,col]))
+        # pack columns into strings
+        if (is.character(m))
+            apply(m, 2, paste, collapse='')
+        else
+            apply(m, 2, function(x) rawToHex(writeBin(x,raw(0),...)))
+    }
+    load.one <- function(fd)
+    {
+        tmpenv <- new.env(parent=emptyenv())
+        name <- load(fd, tmpenv)
+        get(name, envir=tmpenv)
+    }
+
+    fd <- gzfile(file, 'rb')
+    files <- load.one(fd)
+    d <- replicate(length(files), load.one(fd), simplify=FALSE)
+    close(fd)
+    data.frame(assay.name=sub('-','_',row.names(d[[1]])),
+               genotype=pack.by.assay(d,'genotype'),
+               qscore=pack.by.assay(d,'qscore',size=1),
+               raw.data=pack.by.assay(d,'raw.data'),
+               stringsAsFactors=FALSE)
+}
+
+load.affy.chp.data <- function(dataset, anno, files, progress=TRUE)
+{
+    cat("Reorganizing CHP data...\n")
+    parts <- .reorg.chp.data(files, rownames(anno), progress=progress)
+    cat("Loading data into database...\n")
+    if (progress) progress.bar(0, length(parts))
+    for (n in 1:length(parts)) {
+        d <- .reshape.affy.block(parts[n])
+        mk.assay.data(dataset, d)
+        if (progress) progress.bar(n, length(parts))
+    }
+    unlink(parts)
+}

Modified: pkg/gt.db/R/misc.R
===================================================================
--- pkg/gt.db/R/misc.R	2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/R/misc.R	2010-02-17 02:12:37 UTC (rev 31)
@@ -149,7 +149,7 @@
         x <- sql.query(gt.db::.gt.db, sql, ...)
     }
     if (nrow(x)) {
-        if (any(table(x[,1]) > 1))
+        if (length(unique(x[,1])) < nrow(x))
             stop('lookup failed: please be more specific', call.=FALSE)
         r <- x[match(name,x[,1]),2]
         f <- name[is.na(r)]

Modified: pkg/gt.db/R/rawdata.R
===================================================================
--- pkg/gt.db/R/rawdata.R	2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/R/rawdata.R	2010-02-17 02:12:37 UTC (rev 31)
@@ -19,10 +19,10 @@
 
 #---------------------------------------------------------------------
 
-.unpack.signal <- function(data)
+.unpack.signal <- function(raw)
 {
-    nr <- length(data)/4
-    i <- readBin(data, what='int', n=2*nr, size=2,
+    nr <- length(raw)/4
+    i <- readBin(raw, what='int', n=2*nr, size=2,
                  signed=FALSE, endian='little')
     i <- na.if(i, 65535)
     dim(i) <- c(2,nr)
@@ -31,9 +31,9 @@
 
 .pack.signal <- function(data)
 {
-    a <- writeBin(if.na(data$signal.a,65535),
+    a <- writeBin(as.integer(if.na(data$signal.a,65535)),
                   raw(), size=2, endian='little')
-    b <- writeBin(if.na(data$signal.b,65535),
+    b <- writeBin(as.integer(if.na(data$signal.b,65535)),
                   raw(), size=2, endian='little')
     dim(a) <- dim(b) <- c(2,nrow(data))
     as.vector(rbind(a,b))
@@ -41,9 +41,9 @@
 
 #---------------------------------------------------------------------
 
-.unpack.seqread <- function(data)
+.unpack.seqread <- function(raw)
 {
-    i <- na.if(as.integer(data), 255)
+    i <- na.if(as.integer(raw), 255)
     dim(i) <- c(4,length(i)/4)
     data.frame(fwd.a=i[1,], rev.a=i[2,], fwd.b=i[3,], rev.b=i[4,])
 }
@@ -58,15 +58,15 @@
 
 #---------------------------------------------------------------------
 
-.unpack.chpdata <- function(data)
+.unpack.chpdata <- function(raw)
 {
-    nr <- length(data)/5
-    dim(data) <- c(5,nr)
-    i <- readBin(data[1:2,], what='int', n=nr, size=2,
+    nr <- length(raw)/5
+    dim(raw) <- c(5,nr)
+    i <- readBin(raw[1:2,], what='int', n=nr, size=2,
                  signed=TRUE, endian='little')
-    j <- readBin(data[3:4,], what='int', n=nr, size=2,
+    j <- readBin(raw[3:4,], what='int', n=nr, size=2,
                  signed=FALSE, endian='little')
-    k <- factor(as.integer(data[5,]), levels=1:4,
+    k <- factor(as.integer(raw[5,]), levels=1:4,
                 labels=c('a','h','b','n'))
     data.frame(log.ratio=i/256, strength=j/256, forced.call=k)
 }
@@ -77,21 +77,21 @@
                   raw(), size=2, endian='little')
     y <- writeBin(as.integer(round(256*data$strength)),
                   raw(), size=2, endian='little')
-    z <- as.raw(data$forced.call)
+    z <- as.raw(factor(data$forced.call, levels=c('a','h','b','n')))
     dim(x) <- dim(y) <- c(2,nrow(data))
     as.vector(rbind(x,y,z))
 }
 
 #---------------------------------------------------------------------
 
-unpack.raw.data <- function(data, raw.layout)
+unpack.raw.data <- function(raw, raw.layout)
 {
     if (raw.layout == 'signal') {
-        .unpack.signal(data)
+        .unpack.signal(raw)
     } else if (raw.layout == 'seqread') {
-        .unpack.seqread(data)
+        .unpack.seqread(raw)
     } else if (raw.layout == 'chpdata') {
-        .unpack.chpdata(data)
+        .unpack.chpdata(raw)
     } else {
         stop('unknown raw data layout')
     }

Modified: pkg/gt.db/man/mk.assay.data.Rd
===================================================================
--- pkg/gt.db/man/mk.assay.data.Rd	2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/man/mk.assay.data.Rd	2010-02-17 02:12:37 UTC (rev 31)
@@ -41,8 +41,8 @@
     \item{assay.id}{the unique integer ID for this assay.}
     \item{flags}{an integer value composed of single-bit flags.}
     \item{genotype}{a string of a/h/b/n/x genotypes.}
-    \item{qscore}{a string of packed quality scores.}
-    \item{raw.data}{a string containing packed raw data (such as
+    \item{qscore}{a hex string of packed quality scores.}
+    \item{raw.data}{a hex string containing packed raw data (such as
       signal intensities or read counts).}
   }
   An \code{assay.name} column can be supplied in place of
@@ -63,7 +63,9 @@
 \seealso{
   \code{\link{mk.assay.position}},
   \code{\link{mk.assay}},
-  \code{\link{fetch.gt.data}}.
+  \code{\link{fetch.gt.data}},
+  \code{\link{pack.raw.data}},
+  \code{\link{unpack.raw.data}}.
 }
 \examples{\dontrun{
 data(demo_01)

Added: pkg/gt.db/man/pack.raw.data.Rd
===================================================================
--- pkg/gt.db/man/pack.raw.data.Rd	                        (rev 0)
+++ pkg/gt.db/man/pack.raw.data.Rd	2010-02-17 02:12:37 UTC (rev 31)
@@ -0,0 +1,91 @@
+%
+% Copyright (C) 2010, 23andMe, Inc.
+% 
+% Written by David A. Hinds <dhinds at sonic.net>
+% 
+% This is free software; you can redistribute it and/or modify it
+% under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 3 of the license, or
+% (at your option) any later version.
+% 
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+% GNU General Public License for more details.
+% 
+% You should have received a copy of the GNU General Public License
+% along with this program.  If not, see <http://www.gnu.org/licenses/>
+%
+\name{pack.raw.data}
+\alias{pack.raw.data}
+\alias{unpack.raw.data}
+\title{Pack and Unpack Raw Genotype Data}
+\description{
+  Convert raw genotype data between multi-column data frames and packed
+  raw vectors.
+}
+\usage{
+pack.raw.data(data, raw.layout)
+unpack.raw.data(raw, raw.layout)
+}
+\arguments{
+  \item{data}{a data frame.}
+  \item{raw}{a raw vector.}
+  \item{raw.layout}{one of \code{'signal'}, \code{'seqread'}, or
+    \code{'chpdata'}.}
+}
+\details{
+  In a packed raw vector, all the data elements for an individual sample
+  are grouped together, to simplify subsetting by sample.  For
+  \code{pack.raw.data}, columns will be coerced to the appropriate data
+  types before packing.
+
+  The raw data layouts have the following structures:
+
+  \describe{
+    
+    \item{\code{'signal'}}{two columns: \code{signal.a} and
+      \code{signal.b}.  These represent transformed intensities for the
+      A and B alleles.  These should fit into 16-bit unsigned integers
+      (0 to 65535, with 65535 reserved for representing \code{NA}).}
+    
+    \item{\code{'seqread'}}{four columns: \code{fwd.a}, \code{rev.a},
+      \code{fwd.b}, and \code{rev.b}.  These represent forward and
+      reverse sequencing read counts for the A and B alleles.  Each
+      should fit into a single unsigned byte, with the value of 255
+      reserved for \code{NA}.}
+    
+    \item{\code{'chpdata'}}{three columns: \code{log.ratio},
+      \code{strength}, and \code{forced.call}.  These represent the
+      base-2 log of the intensity ratio for A and B alleles; the base-2
+      log of the overall signal strength; and a forced genotype call
+      with no quality thresholding.  The first two are stored as
+      fixed-point signed 8.8-bit numbers in the raw vector form.}
+
+  }
+}
+\value{
+  For \code{pack.raw.data}, a raw vector encoding the source data
+  according to \code{raw.layout}.
+
+  For \code{unpack.raw.data}, a data frame representing the contents of
+  the raw vector interpreted according to \code{raw.layout}.
+}
+\seealso{
+  \code{\link{mk.assay.data}}, \code{\link{reshape.gt.data}}.
+}
+\examples{
+x <- data.frame(signal.a=c(100,10), signal.b=c(10,100))
+y <- pack.raw.data(x, 'signal')
+unpack.raw.data(y, 'signal')
+
+x <- data.frame(fwd.a=c(2,5), rev.a=c(4,4), fwd.b=c(3,0), rev.b=c(4,0))
+y <- pack.raw.data(x, 'seqread')
+unpack.raw.data(y, 'seqread')
+
+x <- data.frame(log.ratio=c(0.1,3.5), strength=c(5.5,7.5),
+                forced.call=c('h','a'))
+y <- pack.raw.data(x, 'chpdata')
+unpack.raw.data(y, 'chpdata')
+}
+\keyword{manip}

Modified: pkg/gt.db/man/reshape.gt.data.Rd
===================================================================
--- pkg/gt.db/man/reshape.gt.data.Rd	2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/man/reshape.gt.data.Rd	2010-02-17 02:12:37 UTC (rev 31)
@@ -51,7 +51,7 @@
 }
 \seealso{
   \code{\link{fetch.gt.data}}, \code{\link{unpack.gt.matrix}},
-  \code{\link{gt.split}}.
+  \code{\link{gt.split}}, \code{\link{unpack.raw.data}}.
 }
 \examples{
 gt.demo.check()