[Gtdb-commits] r34 - in pkg/gt.db: R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Feb 21 05:09:31 CET 2010
Author: dahinds
Date: 2010-02-21 05:09:28 +0100 (Sun, 21 Feb 2010)
New Revision: 34
Added:
pkg/gt.db/R/hapmap.R
pkg/gt.db/man/load.hapmap.data.Rd
Modified:
pkg/gt.db/R/affy.R
pkg/gt.db/R/assay.R
pkg/gt.db/R/misc.R
pkg/gt.db/man/ls.mapping.Rd
pkg/gt.db/man/mk.assay.Rd
Log:
- added loader for HapMap project data, load.hapmap.data()
- added 'mapping.name' argument to ls.mapping()
- made probe.seq optional in mk.assay()
- fixed bug in name checking function .check.names()
- removed header length limit in .read.affy.header()
Modified: pkg/gt.db/R/affy.R
===================================================================
--- pkg/gt.db/R/affy.R 2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/affy.R 2010-02-21 04:09:28 UTC (rev 34)
@@ -20,9 +20,14 @@
# Code to handle importing Affymetrix SNP annotations and genotype
# data into GT.DB
-.read.affy.header <- function(file, max.lines=10000)
+.read.affy.header <- function(file)
{
- txt <- grep('^\\#%.*=', readLines(file, max.lines), value=TRUE)
+ txt <- character()
+ repeat {
+ x <- grep('^\\#%.*=', readLines(file, 100), value=TRUE)
+ if (!length(x)) break;
+ txt <- c(txt,x)
+ }
val <- sub('^..[^=]*=(.*)', '\\1', txt)
structure(val, names=sub('^..([^=]*)=.*', '\\1', txt))
}
@@ -60,7 +65,7 @@
ploidy[anno$Chromosome=='Y'] <- 'Y'
ploidy[anno$Chromosome=='MT'] <- 'M'
ploidy[anno$ChrX.pseudo.autosomal.region.1 |
- anno$ChrX.pseudo.autosomal.region.2] <- 'A'
+ anno$ChrX.pseudo.autosomal.region.2] <- 'A'
dbsnp.orient <-
factor(anno$Strand.Versus.dbSNP,
levels=c('reverse','same'), labels=c('-','+'))
Modified: pkg/gt.db/R/assay.R
===================================================================
--- pkg/gt.db/R/assay.R 2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/assay.R 2010-02-21 04:09:28 UTC (rev 34)
@@ -21,16 +21,18 @@
#---------------------------------------------------------------------
ls.mapping <-
-function(platform.name, show.all=FALSE, show.ids=FALSE)
+function(platform.name, mapping.name='%',
+ show.all=FALSE, show.ids=FALSE)
{
sql <-
'select mapping_id, name mapping_name, description,
assembly, is_hidden, created_by, created_dt
from mapping
where platform_id=:1
- and is_hidden<=:2'
+ and name like :2
+ and is_hidden<=:3'
r <- sql.query(gt.db::.gt.db, sql, lookup.id('platform', platform.name),
- show.all)
+ mapping.name, show.all)
.filter.ids(r, show.ids)
}
@@ -87,6 +89,7 @@
if (any(r,na.rm=TRUE))
stop("invalid probe sequence(s)", call.=FALSE)
if (is.null(data$flags)) data$flags <- 0
+ if (is.null(data$probe.seq)) data$probe.seq <- NA
sql <- 'insert into assay values (null,:1,:2,:3,:4,:5)'
sql.exec(gt.db::.gt.db, sql, plat.id,
data[c('assay.name','flags','alleles','probe.seq')],
@@ -133,6 +136,8 @@
data$ploidy <- NA
}
data$ploidy <- .fixup.ploidy(data$ploidy)
+ if (is.null(data$dbsnp.rsid)) data$dbsnp.rsid <- NA
+ if (is.null(data$dbsnp.orient)) data$dbsnp.orient <- NA
sql.exec(gt.db::.gt.db, sql, map.id,
data[c('assay.id', 'scaffold', 'position', 'strand',
'ploidy', 'dbsnp.rsid', 'dbsnp.orient')],
@@ -156,7 +161,8 @@
db.mode <- .gt.db.options('db.mode')
tx.mode <- .gt.db.options('tx.mode')
- if (db.mode == tx.mode) {
+ if ((db.mode == tx.mode) ||
+ all(is.na(data$qscore) && is.na(data$raw.data))) {
cvt.fn <- ''
} else if (db.mode == 'raw' && tx.mode == 'hex') {
cvt.fn <- ':unhex:'
Added: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R (rev 0)
+++ pkg/gt.db/R/hapmap.R 2010-02-21 04:09:28 UTC (rev 34)
@@ -0,0 +1,133 @@
+#
+# Copyright (C) 2010, 23andMe, Inc.
+#
+# Written by David A. Hinds <dhinds at sonic.net>
+#
+# This is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3 of the license, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>
+#
+
+# Functions for reading unphased HapMap genotype data files
+#
+# We only support the 'fwd' files, which do not specify the orientations
+# of the rs clusters. The 'rs' files contain orientation errors.
+
+.read.hapmap.file <- function(file)
+{
+ # read just map information
+ map <- scan(file, what=list('','','',1L,''),
+ flush=TRUE, skip=1, quiet=TRUE)
+ names(map) <- c('assay.name','alleles','scaffold','position','strand')
+ map$assay.name <- I(gsub('-','_',map$assay.name))
+ map$scaffold <- sub('MT','M',map$scaffold)
+ map <- as.data.frame(map)
+
+ # now parse genotype data
+ geno <- readLines(file)
+ # note that we keep the leading space
+ geno <- sub('.* QC[^ ]+', '', geno, perl=TRUE)
+ sample <- strsplit(geno[1], ' ')[[1]][-1]
+ geno <- geno[-1]
+ for (a in levels(map$alleles)) {
+ w <- (map$alleles == a)
+ ab <- strsplit(a,'/')[[1]]
+ geno[w] <- chartr(sprintf('%s%sN',ab[1],ab[2]),'abn',geno[w])
+ }
+ geno <- gsub(' ab| ba', 'h', geno, perl=TRUE)
+ map$genotype <- gsub(' .', '', geno, perl=TRUE)
+ stopifnot(all(nchar(map$genotype) == length(sample)))
+ structure(map, sample=sample)
+}
+
+.merge.hapmap.files <- function(files, verbose=TRUE)
+{
+ wrap.read.file <- function(file)
+ {
+ part <- sub('.*(chr..?_[^_]+)_.*', '\\1', file)
+ if (verbose) message('Reading ', part, '...')
+ .read.hapmap.file(file)
+ }
+ all <- lapply(files, wrap.read.file)
+
+ # construct unified map
+ map <- do.call('rbind', lapply(all, '[', -6))
+ map <- unique(map)
+ map <- map[order(map$position),]
+
+ # verify that all files are consistent
+ stopifnot(!anyDuplicated(map$assay.name))
+
+ map$genotype <- ''
+ sample <- character(0)
+ for (d in all) {
+ xx <- paste(rep('x', nchar(d$genotype[1])), collapse='')
+ gt <- d$genotype[match(map$assay.name,d$assay.name)]
+ map$genotype <- paste(map$genotype, if.na(gt,xx), sep='')
+ sample <- c(sample, attr(d,'sample'))
+ }
+ structure(map, sample=sample)
+}
+
+load.hapmap.data <-
+ function(files, project.name='HapMap', verbose=TRUE)
+{
+ # split filenames at '_', and at '.' unless after a digit
+ f <- strsplit(basename(files), '_|(?<!\\d)\\.', perl=TRUE)
+ f <- as.data.frame(do.call('rbind', f)[,1:7])
+ names(f) <- c('gt','chr','pop','rel','ver','build','strand')
+ stopifnot(f$gt == 'genotypes', f$strand == 'fwd',
+ f$rel == f$rel[1], f$ver == f$ver[1],
+ f$build == f$build[1])
+ nchr <- length(unique(f$chr))
+ npop <- length(unique(f$pop))
+ stopifnot(nrow(f) == nchr * npop)
+
+ phase <- (if (grepl('phase3',f$rel[1])) 3 else 2)
+ platform <- paste('HapMap', phase, sep='')
+ mapping <- sub('phase3\\.','r',f$rel[1])
+ dataset <- paste(platform, mapping, f$ver[1], sep='_')
+ if (npop == 1) dataset <- paste(dataset, f$pop[1], sep='_')
+ desc <- sprintf('HapMap Phase %d', phase)
+ if (!nrow(ls.platform(platform)))
+ mk.platform(platform, desc)
+ desc <- sprintf('%s Release %s', desc, f$rel[1])
+ if (!nrow(ls.mapping(platform, mapping)))
+ mk.mapping(platform, mapping, desc, sprintf('ncbi_%s', f$build[1]))
+ desc <- sprintf('%s (%s)', desc, f$ver[1])
+ if (!nrow(ls.dataset(project.name, dataset)))
+ mk.dataset(dataset, project.name, platform, desc)
+
+ for (chr in levels(.sort.levels(f$chr))) {
+ m <- .merge.hapmap.files(sort(files[f$chr == chr]), verbose)
+ s <- ls.sample(dataset)
+ if (!nrow(s)) {
+ data(hapmap)
+ sn <- attr(m,'sample')
+ s <- data.frame(sample.name=sn, subject.name=sn,
+ gender=hapmap.subjects[sn,'gender.ref'],
+ position=1:length(sn))
+ mk.sample(dataset, s)
+ } else {
+ stopifnot(s$sample.name == attr(m,'sample'))
+ }
+ rs <- ifelse(!grepl('rs\\d+',m$assay.name), NA,
+ sub('.*rs(\\d+).*','\\1',m$assay.name))
+ m$dbsnp.rsid <- as.integer(rs)
+ # FIXME: we'll need to fix up pseudo-autosomal regions
+ m$ploidy <- switch(chr, 'chrX'='X', 'chrY'='Y', 'chrMT'='M', 'A')
+ if (verbose) message("Loading ", nrow(m), " records...")
+ mk.assay(platform, m)
+ mk.assay.position(platform, mapping, m)
+ mk.assay.data(dataset, m)
+ }
+}
Modified: pkg/gt.db/R/misc.R
===================================================================
--- pkg/gt.db/R/misc.R 2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/R/misc.R 2010-02-21 04:09:28 UTC (rev 34)
@@ -46,12 +46,12 @@
.check.name <- function(name)
{
- r <- (regexpr('^[A-Za-z0-9_.]+$', name) < 0)
- if (sum(r) == 1) {
- stop("invalid name '", name, "'", call.=FALSE)
- } else if (sum(r)) {
- stop("invalid names '", name[1], "', '", name[2],
- ifelse(sum(r)>2, "', ...", "'"), call.=FALSE)
+ bad <- name[regexpr('^[A-Za-z0-9_.]+$', name) < 0]
+ if (length(bad) == 1) {
+ stop("invalid name '", bad, "'", call.=FALSE)
+ } else if (length(bad)) {
+ stop("invalid names '", bad[1], "', '", bad[2],
+ ifelse(length(bad)>2, "', ...", "'"), call.=FALSE)
}
}
@@ -149,7 +149,7 @@
x <- sql.query(gt.db::.gt.db, sql, ...)
}
if (nrow(x)) {
- if (length(unique(x[,1])) < nrow(x))
+ if (anyDuplicated(x[,1]))
stop('lookup failed: please be more specific', call.=FALSE)
r <- x[match(name,x[,1]),2]
f <- name[is.na(r)]
Added: pkg/gt.db/man/load.hapmap.data.Rd
===================================================================
--- pkg/gt.db/man/load.hapmap.data.Rd (rev 0)
+++ pkg/gt.db/man/load.hapmap.data.Rd 2010-02-21 04:09:28 UTC (rev 34)
@@ -0,0 +1,64 @@
+%
+% Copyright (C) 2010, 23andMe, Inc.
+%
+% Written by David A. Hinds <dhinds at sonic.net>
+%
+% This is free software; you can redistribute it and/or modify it
+% under the terms of the GNU General Public License as published by
+% the Free Software Foundation; either version 3 of the license, or
+% (at your option) any later version.
+%
+% This program is distributed in the hope that it will be useful,
+% but WITHOUT ANY WARRANTY; without even the implied warranty of
+% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+% GNU General Public License for more details.
+%
+% You should have received a copy of the GNU General Public License
+% along with this program. If not, see <http://www.gnu.org/licenses/>
+%
+\name{load.hapmap.data}
+\alias{load.hapmap.data}
+\title{Import Unphased HapMap Genotype Data}
+\description{
+ Import non-redundant, forward-stranded genotype data from the
+ International HapMap Project.
+}
+\usage{
+load.hapmap.data(files, project='HapMap', verbose=TRUE)
+}
+\arguments{
+ \item{files}{a vector of HapMap genotype data files.}
+ \item{project.name}{the project to be associated with this data.}
+ \item{verbose}{logical: indicates whether to report progress.}
+}
+\details{
+ This function supports loading recent Phase 2 and Phase 3 genotype
+ files. If files include data from several population panels, then the
+ genotype data is merged into a single dataset spanning all those
+ panels. Only non-redundant forward-orientation files are supported.
+}
+\seealso{
+ \code{\link{hapmap.subjects}}.
+}
+\examples{
+\dontrun{
+# create HapMap project
+demo('setup.hapmap')
+
+# get file list for latest Phase II HapMap release
+base <- 'http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes'
+release <- '/latest_phaseII_ncbi_b36/fwd_strand/non-redundant'
+path <- paste(base, release, sep='')
+re <- '.*(genotypes_chr[^_]+_..._[0-9A-Za-z_.]+gz).*'
+files <- sub(re, '\\1', grep(re, readLines(path), value=TRUE))
+
+# download to current directory
+for (f in files) {
+ download.file(paste(path,f,sep='/'), f)
+}
+
+# load into current database
+load.hapmap.data(files)
+}
+}
+\keyword{IO}
Modified: pkg/gt.db/man/ls.mapping.Rd
===================================================================
--- pkg/gt.db/man/ls.mapping.Rd 2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/man/ls.mapping.Rd 2010-02-21 04:09:28 UTC (rev 34)
@@ -1,5 +1,6 @@
%
% Copyright (C) 2009, Perlegen Sciences, Inc.
+% Copyright (C) 2010, 23andMe, Inc.
%
% Written by David A. Hinds <dhinds at sonic.net>
%
@@ -24,10 +25,13 @@
platform.
}
\usage{
-ls.mapping(platform.name, show.all=FALSE, show.ids=FALSE)
+ls.mapping(platform.name, mapping.name='\%',
+ show.all=FALSE, show.ids=FALSE)
}
\arguments{
\item{platform.name}{a genotyping platform name.}
+ \item{mapping.name}{an SQL \code{LIKE} expression for matching
+ mapping names.}
\item{show.all}{logical: indicates if hidden mapping sets should be
included in the output.}
\item{show.ids}{logical: indicates whether to include values of
Modified: pkg/gt.db/man/mk.assay.Rd
===================================================================
--- pkg/gt.db/man/mk.assay.Rd 2010-02-17 20:19:15 UTC (rev 33)
+++ pkg/gt.db/man/mk.assay.Rd 2010-02-21 04:09:28 UTC (rev 34)
@@ -39,8 +39,8 @@
\item{assay.name}{a unique name (within this platform) for the assay.}
\item{flags}{an optional integer value composed of single-bit flags.}
\item{alleles}{a slash separated list of valid alleles.}
- \item{probe.seq}{the genomic flanking sequence for the assay, with
- the variant position denoted by an underscore (\code{'_'}).}
+ \item{probe.seq}{an optional genomic flanking sequence for the assay,
+ with the variant position denoted by an underscore (\code{'_'}).}
}
}
\value{
More information about the Gtdb-commits
mailing list