[Gtdb-commits] r36 - in pkg/gt.db: R src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Feb 21 20:50:33 CET 2010
Author: dahinds
Date: 2010-02-21 20:50:32 +0100 (Sun, 21 Feb 2010)
New Revision: 36
Added:
pkg/gt.db/src/encode.c
Modified:
pkg/gt.db/R/hapmap.R
pkg/gt.db/R/progress.R
Log:
- autoscale progress.bar() to fit console width
- speed-ups for reading HapMap format files
Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R 2010-02-21 04:11:32 UTC (rev 35)
+++ pkg/gt.db/R/hapmap.R 2010-02-21 19:50:32 UTC (rev 36)
@@ -1,3 +1,4 @@
+
#
# Copyright (C) 2010, 23andMe, Inc.
#
@@ -24,27 +25,29 @@
.read.hapmap.file <- function(file)
{
- # read just map information
- map <- scan(file, what=list('','','',1L,''),
+ # schlep the whole file into memory
+ fd <- file(file)
+ filedata <- readChar(fd,100000000,TRUE)
+ close(fd)
+
+ # parse just map information
+ tc <- textConnection(filedata)
+ map <- scan(tc, what=list('','','',1L,''),
flush=TRUE, skip=1, quiet=TRUE)
+ close(tc)
names(map) <- c('assay.name','alleles','scaffold','position','strand')
map$assay.name <- I(gsub('-','_',map$assay.name))
map$scaffold <- sub('MT','M',map$scaffold)
map <- as.data.frame(map)
# now parse genotype data
- geno <- readLines(file)
- # note that we keep the leading space
- geno <- sub('.* QC[^ ]+', '', geno, perl=TRUE)
- sample <- strsplit(geno[1], ' ')[[1]][-1]
- geno <- geno[-1]
- for (a in levels(map$alleles)) {
- w <- (map$alleles == a)
- ab <- strsplit(a,'/')[[1]]
- geno[w] <- chartr(sprintf('%s%sN',ab[1],ab[2]),'abn',geno[w])
- }
- geno <- gsub(' ab| ba', 'h', geno, perl=TRUE)
- map$genotype <- gsub(' .', '', geno, perl=TRUE)
+ tc <- textConnection(filedata)
+ geno <- readLines(tc)
+ close(tc)
+ geno <- sub('.* QC\\S+ ', '', geno, perl=TRUE)
+ sample <- strsplit(geno[1], ' ')[[1]]
+ map$genotype <- .Call('do_encode_gt', as.character(map$alleles),
+ geno[1+(1:nrow(map))])
stopifnot(all(nchar(map$genotype) == length(sample)))
structure(map, sample=sample)
}
@@ -60,6 +63,7 @@
all <- lapply(files, wrap.read.file)
# construct unified map
+ if (verbose) message('Merging...')
map <- do.call('rbind', lapply(all, '[', -6))
map <- unique(map)
map <- map[order(map$position),]
@@ -126,8 +130,8 @@
# FIXME: we'll need to fix up pseudo-autosomal regions
m$ploidy <- switch(chr, 'chrX'='X', 'chrY'='Y', 'chrMT'='M', 'A')
if (verbose) message("Loading ", nrow(m), " records...")
- mk.assay(platform, m)
- mk.assay.position(platform, mapping, m)
- mk.assay.data(dataset, m)
+ mk.assay(platform, m, progress=verbose)
+ mk.assay.position(platform, mapping, m, progress=verbose)
+ mk.assay.data(dataset, m, progress=verbose)
}
}
Modified: pkg/gt.db/R/progress.R
===================================================================
--- pkg/gt.db/R/progress.R 2010-02-21 04:11:32 UTC (rev 35)
+++ pkg/gt.db/R/progress.R 2010-02-21 19:50:32 UTC (rev 36)
@@ -1,23 +1,24 @@
#
# Copyright (C) 2009, Perlegen Sciences, Inc.
-#
+# Copyright (C) 2010, 23andMe, Inc.
+#
# Written by David A. Hinds <dhinds at sonic.net>
-#
+#
# This is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the license, or
# (at your option) any later version.
-#
+#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
-#
+#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
-#
+#
-progress.bar <- function (done, total, len=40)
+progress.bar <- function (done, total, len=options('width')-40)
{
now <- Sys.time()
p <- try(get(".Progress", pos='package:gt.db'), silent=TRUE)
Added: pkg/gt.db/src/encode.c
===================================================================
--- pkg/gt.db/src/encode.c (rev 0)
+++ pkg/gt.db/src/encode.c 2010-02-21 19:50:32 UTC (rev 36)
@@ -0,0 +1,90 @@
+/*
+
+ Copyright (C) 2009, 23andMe, Inc.
+
+ Written by David A. Hinds <dhinds at sonic.net>
+
+ This is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3 of the license, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>
+
+*/
+
+#include <Rversion.h>
+#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Rdynload.h>
+#include <R_ext/Arith.h>
+#include <ctype.h>
+#include <string.h>
+
+#ifdef ENABLE_NLS
+#include <libintl.h>
+#define _(String) dgettext ("gt.db", String)
+#else
+#define _(String) (String)
+#endif
+
+/*---------------------------------------------------------------------*/
+
+/*
+ Parse a vector of space-delimited genotype strings, i.e. 'AA AC CC'
+ as in HapMap files, into GT.DB notation, i.e. 'ahb', based on a
+ vector of 'A/B' allele strings.
+*/
+
+SEXP do_encode_gt(SEXP sa, SEXP sg)
+{
+ int i, j, len;
+ const char *a, *g;
+ char *buf, a1, a2;
+ SEXP ans;
+
+ if (!isString(sa) || !isString(sg))
+ error(_("arguments should be character vectors"));
+ if (LENGTH(sa) != LENGTH(sg))
+ error(_("argument lengths should be equal"));
+ PROTECT(ans = allocVector(STRSXP, LENGTH(sa)));
+ len = (LENGTH(STRING_ELT(sg,0))+1)/3;
+ buf = (char *)R_alloc(len+1, sizeof(char));
+ for (i = 0; i < LENGTH(sa); i++) {
+ a = CHAR(STRING_ELT(sa,i));
+ a1 = a[0]; a2 = a[2];
+ g = CHAR(STRING_ELT(sg,i));
+ for (j = 0; j < len && *g; j++) {
+ buf[j] = '?';
+ if (*g == a1) {
+ if (g[1] == a1)
+ buf[j] = 'a';
+ else if (g[1] == a2)
+ buf[j] = 'h';
+ } else if (g[0] == a2) {
+ if (g[1] == a2)
+ buf[j] = 'b';
+ else if (g[1] == a1)
+ buf[j] = 'h';
+ } else if (g[0] == 'N' && g[1] == 'N')
+ buf[j] = 'n';
+ if (buf[j] == '?')
+ break;
+ g += 2;
+ while (*g == ' ') g++;
+ }
+ buf[j] = '\0';
+ if (j < len)
+ SET_STRING_ELT(ans, i, NA_STRING);
+ else
+ SET_STRING_ELT(ans, i, mkChar(buf));
+ }
+ UNPROTECT(1);
+ return ans;
+}
More information about the Gtdb-commits
mailing list