[Gtdb-commits] r31 - in pkg/gt.db: . R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Wed Feb 17 03:12:37 CET 2010
Author: dahinds
Date: 2010-02-17 03:12:37 +0100 (Wed, 17 Feb 2010)
New Revision: 31
Added:
pkg/gt.db/R/affy.R
pkg/gt.db/man/pack.raw.data.Rd
Modified:
pkg/gt.db/INDEX
pkg/gt.db/R/misc.R
pkg/gt.db/R/rawdata.R
pkg/gt.db/man/mk.assay.data.Rd
pkg/gt.db/man/reshape.gt.data.Rd
Log:
- new code for loading Affymetrix data into GT.DB from flat files,
still needs documentation
- speedup for lookup.id()
- help page for pack.raw.data()/unpack.raw.data()
Modified: pkg/gt.db/INDEX
===================================================================
--- pkg/gt.db/INDEX 2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/INDEX 2010-02-17 02:12:37 UTC (rev 31)
@@ -48,7 +48,6 @@
ld.plot Pairwise Linkage Disequilibrium Plot
ld.prune Prune SNP List to Limit Linkage Disequilibrium
ls.assay List Assay Definitions
-ls.assay.group List Assay Groups
ls.assay.position List Assay Positions
ls.dataset List Genotype Datasets
ls.mapping List Assay Mapping Sets
@@ -63,7 +62,6 @@
match.gt.data Identify Equivalent Genotyping Assays
mk.assay Create Genotyping Assay Definitions
mk.assay.data Create Genotyping Assay Data
-mk.assay.group Create or Remove a Genotype Assay Group
mk.assay.position Create Genotyping Assay Positions
mk.attr Create, Remove, and List Attribute Definitions
mk.dataset Create or Remove Genotype Datasets
@@ -82,6 +80,7 @@
nw.align Needleman and Wunsch Sequence Alignment
nw.orient.assay Orient Assay Sequences
orient.gt.data Flip Assay Strands and/or Swap Alleles
+pack.raw.data Pack and Unpack Raw Genotype Data
panel.cluster Panel Function for Drawing Elliptical Cluster
Boundaries
panel.qqpval Quantile-Quantile Plots for P Values: Panel
Added: pkg/gt.db/R/affy.R
===================================================================
--- pkg/gt.db/R/affy.R (rev 0)
+++ pkg/gt.db/R/affy.R 2010-02-17 02:12:37 UTC (rev 31)
@@ -0,0 +1,212 @@
+#
+# Copyright (C) 2010, 23andMe, Inc.
+#
+# Written by David A. Hinds <dhinds at sonic.net>
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the license, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>
+#
+
+# Code to handle importing Affymetrix SNP annotations and genotype
+# data into GT.DB
+
+.read.affy.header <- function(file, max.lines=10000)
+{
+ txt <- grep('^\\#%.*=', readLines(file, max.lines), value=TRUE)
+ val <- sub('^..[^=]*=(.*)', '\\1', txt)
+ structure(val, names=sub('^..([^=]*)=.*', '\\1', txt))
+}
+
+read.affy.anno <- function(file)
+{
+ cc <- c(Minor.Allele='NULL',Minor.Allele.Frequency='NULL',
+ Associated.Gene='NULL',Genetic.Map='NULL',
+ Microsatellite='NULL',Allele.Frequencies='NULL',
+ Heterozygous.Allele.Frequencies='NULL',OMIM='NULL',
+ Number.of.individuals.Number.of.chromosomes='NULL',
+ dbSNP.RS.ID='character',Flank='character')
+ d <- read.table(file=file, header=TRUE, quote='"', comment.char='#',
+ sep=',', na.strings='---', row.names=1, colClasses=cc)
+ d$Chromosome <- .sort.levels(d$Chromosome)
+ d <- d[order(d$Chromosome, d$Physical.Position),]
+ tags <- .read.affy.header(file)
+ do.call('structure', c(list(d), as.list(tags)))
+}
+
+.parse.affy.assays <- function(anno)
+{
+ alleles <- paste(anno$Allele.A, anno$Allele.B, sep='/')
+ data.frame(assay.name=sub('-','_',rownames(anno)),
+ alleles=alleles,
+ probe.seq=sub('\\[.*\\]','_',anno$Flank),
+ stringsAsFactors=FALSE)
+}
+
+.parse.affy.mapping <- function(anno)
+{
+ ploidy <- factor('A',levels=c('A','M','X','Y'))
+ ploidy[is.na(anno$Chromosome)] <- NA
+ ploidy[anno$Chromosome=='X'] <- 'X'
+ ploidy[anno$Chromosome=='Y'] <- 'Y'
+ ploidy[anno$Chromosome=='MT'] <- 'M'
+ ploidy[anno$ChrX.pseudo.autosomal.region.1 |
+ anno$ChrX.pseudo.autosomal.region.2] <- 'A'
+ dbsnp.orient <-
+ factor(anno$Strand.Versus.dbSNP,
+ levels=c('reverse','same'), labels=c('-','+'))
+ scaffold <- na.if(paste('chr', anno$Chromosome, sep=''), 'ChrNA')
+ scaffold <- sub('chrMT','chrM',scaffold)
+ data.frame(assay.name=sub('-','_',rownames(anno)),
+ scaffold=scaffold,
+ position=anno$Physical.Position,
+ strand=anno$Strand,
+ ploidy=ploidy,
+ dbsnp.rsid=as.integer(sub('^rs','',anno$dbSNP.RS.ID)),
+ dbsnp.orient=dbsnp.orient,
+ stringsAsFactors=FALSE)
+}
+
+load.affy.platform <- function(anno, description, progress=TRUE)
+{
+ platform <- attr(anno,'chip_type')
+ cat(sprintf("Creating platform '%s'...\n", platform))
+ mk.platform(platform, description)
+ mk.assay(platform, .parse.affy.assays(anno), progress=progress)
+}
+
+load.affy.mapping <- function(anno, progress=TRUE)
+{
+ platform <- attr(anno,'chip_type')
+ version <- attr(anno, 'netaffx-annotation-netaffx-build')
+ mapping <- sprintf('na%s', version)
+ cat(sprintf("Creating mapping '%s' for '%s'...\n",
+ mapping, platform))
+ ncbi.build <- attr(anno, 'genome-version-ncbi')
+ dbsnp.build <- attr(anno, 'dbsnp-version')
+ desc <- sprintf("%s on NCBI b%s, dbSNP b%s",
+ ls.platform(platform)$description,
+ ncbi.build, dbsnp.build)
+ mk.mapping(platform, mapping, desc,
+ sub('\\(d+).*','\\1',ncbi.build))
+ mk.assay.position(platform, mapping,
+ .parse.affy.mapping(anno),
+ progress=progress)
+}
+
+#---------------------------------------------------------------------
+
+# read a single Affy CHP file (in text form) and convert individual
+# genotypes to a GT.DB representation
+
+.read.affy.chp <- function(file, names, row.names=TRUE)
+{
+ recode.gt <- function(x)
+ {
+ factor(x, levels=c('AA','AB','BB','NC'),
+ labels=c('a','h','b','n'))
+ }
+
+ cc <- c('character','factor',rep('numeric',3),'factor')
+ d <- read.table(file, sep='\t', header=TRUE,
+ row.names=1, colClasses=cc)
+ if (!missing(names)) {
+ if (!all(names %in% rownames(d)))
+ stop('Probe set name(s) are not matched?')
+ d <- d[names,]
+ }
+ if (!row.names) row.names(d) <- NULL
+
+ colnames(d) <- tolower(colnames(d))
+ d$forced.call <- recode.gt(d$forced.call)
+ qscore <- round(-10*log10(d$confidence))
+ qscore <- ifelse(is.finite(qscore), qscore, 0)
+ raw.data <- rawToHex(matrix(.pack.chpdata(d), nrow=5))
+ data.frame(genotype=recode.gt(d$call),
+ qscore=as.integer(qscore),
+ raw.data=raw.data,
+ row.names=rownames(d),
+ stringsAsFactors=FALSE)
+}
+
+# read a list of CHP files, and construct a set of temporary files
+# made up of subsets of SNPs across all samples, to facilitate
+# transposing the data to the form that will go into GT.DB
+
+.reorg.chp.data <-
+ function(files, names, slice=8192, progress=TRUE)
+{
+ nr <- nrow(.read.affy.chp(files[1], names))
+ lo <- seq(1, nr, slice)
+ hi <- c(lo[-1]-1, nr)
+ nb <- length(lo)
+ parts <- sprintf("part%03d.dat", 1:nb)
+ if (length(parts) > 120)
+ stop("too many slices")
+ fd <- lapply(parts, gzfile, 'wb')
+ sapply(fd, function(x) save(files, file=x))
+ nf <- length(files)
+ if (progress) progress.bar(0, nf)
+ for (n in 1:nf) {
+ d <- .read.affy.chp(files[n], names, row.names=(n==1))
+ sapply(1:nb, function(x)
+ { slice <- d[lo[x]:hi[x],]
+ save(slice, file=fd[[x]]) })
+ if (progress) progress.bar(n, nf)
+ }
+ sapply(fd, close)
+ parts
+}
+
+.reshape.affy.block <- function(file)
+{
+ pack.by.assay <- function(data, col, ...)
+ {
+ # peel out selected column into a matrix, and transpose
+ m <- t(sapply(data, function(x) x[,col]))
+ # pack columns into strings
+ if (is.character(m))
+ apply(m, 2, paste, collapse='')
+ else
+ apply(m, 2, function(x) rawToHex(writeBin(x,raw(0),...)))
+ }
+ load.one <- function(fd)
+ {
+ tmpenv <- new.env(parent=emptyenv())
+ name <- load(fd, tmpenv)
+ get(name, envir=tmpenv)
+ }
+
+ fd <- gzfile(file, 'rb')
+ files <- load.one(fd)
+ d <- replicate(length(files), load.one(fd), simplify=FALSE)
+ close(fd)
+ data.frame(assay.name=sub('-','_',row.names(d[[1]])),
+ genotype=pack.by.assay(d,'genotype'),
+ qscore=pack.by.assay(d,'qscore',size=1),
+ raw.data=pack.by.assay(d,'raw.data'),
+ stringsAsFactors=FALSE)
+}
+
+load.affy.chp.data <- function(dataset, anno, files, progress=TRUE)
+{
+ cat("Reorganizing CHP data...\n")
+ parts <- .reorg.chp.data(files, rownames(anno), progress=progress)
+ cat("Loading data into database...\n")
+ if (progress) progress.bar(0, length(parts))
+ for (n in 1:length(parts)) {
+ d <- .reshape.affy.block(parts[n])
+ mk.assay.data(dataset, d)
+ if (progress) progress.bar(n, length(parts))
+ }
+ unlink(parts)
+}
Modified: pkg/gt.db/R/misc.R
===================================================================
--- pkg/gt.db/R/misc.R 2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/R/misc.R 2010-02-17 02:12:37 UTC (rev 31)
@@ -149,7 +149,7 @@
x <- sql.query(gt.db::.gt.db, sql, ...)
}
if (nrow(x)) {
- if (any(table(x[,1]) > 1))
+ if (length(unique(x[,1])) < nrow(x))
stop('lookup failed: please be more specific', call.=FALSE)
r <- x[match(name,x[,1]),2]
f <- name[is.na(r)]
Modified: pkg/gt.db/R/rawdata.R
===================================================================
--- pkg/gt.db/R/rawdata.R 2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/R/rawdata.R 2010-02-17 02:12:37 UTC (rev 31)
@@ -19,10 +19,10 @@
#---------------------------------------------------------------------
-.unpack.signal <- function(data)
+.unpack.signal <- function(raw)
{
- nr <- length(data)/4
- i <- readBin(data, what='int', n=2*nr, size=2,
+ nr <- length(raw)/4
+ i <- readBin(raw, what='int', n=2*nr, size=2,
signed=FALSE, endian='little')
i <- na.if(i, 65535)
dim(i) <- c(2,nr)
@@ -31,9 +31,9 @@
.pack.signal <- function(data)
{
- a <- writeBin(if.na(data$signal.a,65535),
+ a <- writeBin(as.integer(if.na(data$signal.a,65535)),
raw(), size=2, endian='little')
- b <- writeBin(if.na(data$signal.b,65535),
+ b <- writeBin(as.integer(if.na(data$signal.b,65535)),
raw(), size=2, endian='little')
dim(a) <- dim(b) <- c(2,nrow(data))
as.vector(rbind(a,b))
@@ -41,9 +41,9 @@
#---------------------------------------------------------------------
-.unpack.seqread <- function(data)
+.unpack.seqread <- function(raw)
{
- i <- na.if(as.integer(data), 255)
+ i <- na.if(as.integer(raw), 255)
dim(i) <- c(4,length(i)/4)
data.frame(fwd.a=i[1,], rev.a=i[2,], fwd.b=i[3,], rev.b=i[4,])
}
@@ -58,15 +58,15 @@
#---------------------------------------------------------------------
-.unpack.chpdata <- function(data)
+.unpack.chpdata <- function(raw)
{
- nr <- length(data)/5
- dim(data) <- c(5,nr)
- i <- readBin(data[1:2,], what='int', n=nr, size=2,
+ nr <- length(raw)/5
+ dim(raw) <- c(5,nr)
+ i <- readBin(raw[1:2,], what='int', n=nr, size=2,
signed=TRUE, endian='little')
- j <- readBin(data[3:4,], what='int', n=nr, size=2,
+ j <- readBin(raw[3:4,], what='int', n=nr, size=2,
signed=FALSE, endian='little')
- k <- factor(as.integer(data[5,]), levels=1:4,
+ k <- factor(as.integer(raw[5,]), levels=1:4,
labels=c('a','h','b','n'))
data.frame(log.ratio=i/256, strength=j/256, forced.call=k)
}
@@ -77,21 +77,21 @@
raw(), size=2, endian='little')
y <- writeBin(as.integer(round(256*data$strength)),
raw(), size=2, endian='little')
- z <- as.raw(data$forced.call)
+ z <- as.raw(factor(data$forced.call, levels=c('a','h','b','n')))
dim(x) <- dim(y) <- c(2,nrow(data))
as.vector(rbind(x,y,z))
}
#---------------------------------------------------------------------
-unpack.raw.data <- function(data, raw.layout)
+unpack.raw.data <- function(raw, raw.layout)
{
if (raw.layout == 'signal') {
- .unpack.signal(data)
+ .unpack.signal(raw)
} else if (raw.layout == 'seqread') {
- .unpack.seqread(data)
+ .unpack.seqread(raw)
} else if (raw.layout == 'chpdata') {
- .unpack.chpdata(data)
+ .unpack.chpdata(raw)
} else {
stop('unknown raw data layout')
}
Modified: pkg/gt.db/man/mk.assay.data.Rd
===================================================================
--- pkg/gt.db/man/mk.assay.data.Rd 2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/man/mk.assay.data.Rd 2010-02-17 02:12:37 UTC (rev 31)
@@ -41,8 +41,8 @@
\item{assay.id}{the unique integer ID for this assay.}
\item{flags}{an integer value composed of single-bit flags.}
\item{genotype}{a string of a/h/b/n/x genotypes.}
- \item{qscore}{a string of packed quality scores.}
- \item{raw.data}{a string containing packed raw data (such as
+ \item{qscore}{a hex string of packed quality scores.}
+ \item{raw.data}{a hex string containing packed raw data (such as
signal intensities or read counts).}
}
An \code{assay.name} column can be supplied in place of
@@ -63,7 +63,9 @@
\seealso{
\code{\link{mk.assay.position}},
\code{\link{mk.assay}},
- \code{\link{fetch.gt.data}}.
+ \code{\link{fetch.gt.data}},
+ \code{\link{pack.raw.data}},
+ \code{\link{unpack.raw.data}}.
}
\examples{\dontrun{
data(demo_01)
Added: pkg/gt.db/man/pack.raw.data.Rd
===================================================================
--- pkg/gt.db/man/pack.raw.data.Rd (rev 0)
+++ pkg/gt.db/man/pack.raw.data.Rd 2010-02-17 02:12:37 UTC (rev 31)
@@ -0,0 +1,91 @@
+%
+% Copyright (C) 2010, 23andMe, Inc.
+%
+% Written by David A. Hinds <dhinds at sonic.net>
+%
+% This is free software; you can redistribute it and/or modify it
+% under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 3 of the license, or
+% (at your option) any later version.
+%
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program. If not, see <http://www.gnu.org/licenses/>
+%
+\name{pack.raw.data}
+\alias{pack.raw.data}
+\alias{unpack.raw.data}
+\title{Pack and Unpack Raw Genotype Data}
+\description{
+ Convert raw genotype data between multi-column data frames and packed
+ raw vectors.
+}
+\usage{
+pack.raw.data(data, raw.layout)
+unpack.raw.data(raw, raw.layout)
+}
+\arguments{
+ \item{data}{a data frame.}
+ \item{raw}{a raw vector.}
+ \item{raw.layout}{one of \code{'signal'}, \code{'seqread'}, or
+ \code{'chpdata'}.}
+}
+\details{
+ In a packed raw vector, all the data elements for an individual sample
+ are grouped together, to simplify subsetting by sample. For
+ \code{pack.raw.data}, columns will be coerced to the appropriate data
+ types before packing.
+
+ The raw data layouts have the following structures:
+
+ \describe{
+
+ \item{\code{'signal'}}{two columns: \code{signal.a} and
+ \code{signal.b}. These represent transformed intensities for the
+ A and B alleles. These should fit into 16-bit unsigned integers
+ (0 to 65535, with 65535 reserved for representing \code{NA}).}
+
+ \item{\code{'seqread'}}{four columns: \code{fwd.a}, \code{rev.a},
+ \code{fwd.b}, and \code{rev.b}. These represent forward and
+ reverse sequencing read counts for the A and B alleles. Each
+ should fit into a single unsigned byte, with the value of 255
+ reserved for \code{NA}.}
+
+ \item{\code{'chpdata'}}{three columns: \code{log.ratio},
+ \code{strength}, and \code{forced.call}. These represent the
+ base-2 log of the intensity ratio for A and B alleles; the base-2
+ log of the overall signal strength; and a forced genotype call
+ with no quality thresholding. The first two are stored as
+ fixed-point signed 8.8-bit numbers in the raw vector form.}
+
+ }
+}
+\value{
+ For \code{pack.raw.data}, a raw vector encoding the source data
+ according to \code{raw.layout}.
+
+ For \code{unpack.raw.data}, a data frame representing the contents of
+ the raw vector interpreted according to \code{raw.layout}.
+}
+\seealso{
+ \code{\link{mk.assay.data}}, \code{\link{reshape.gt.data}}.
+}
+\examples{
+x <- data.frame(signal.a=c(100,10), signal.b=c(10,100))
+y <- pack.raw.data(x, 'signal')
+unpack.raw.data(y, 'signal')
+
+x <- data.frame(fwd.a=c(2,5), rev.a=c(4,4), fwd.b=c(3,0), rev.b=c(4,0))
+y <- pack.raw.data(x, 'seqread')
+unpack.raw.data(y, 'seqread')
+
+x <- data.frame(log.ratio=c(0.1,3.5), strength=c(5.5,7.5),
+ forced.call=c('h','a'))
+y <- pack.raw.data(x, 'chpdata')
+unpack.raw.data(y, 'chpdata')
+}
+\keyword{manip}
Modified: pkg/gt.db/man/reshape.gt.data.Rd
===================================================================
--- pkg/gt.db/man/reshape.gt.data.Rd 2010-02-16 19:35:51 UTC (rev 30)
+++ pkg/gt.db/man/reshape.gt.data.Rd 2010-02-17 02:12:37 UTC (rev 31)
@@ -51,7 +51,7 @@
}
\seealso{
\code{\link{fetch.gt.data}}, \code{\link{unpack.gt.matrix}},
- \code{\link{gt.split}}.
+ \code{\link{gt.split}}, \code{\link{unpack.raw.data}}.
}
\examples{
gt.demo.check()
More information about the Gtdb-commits
mailing list