[Gtdb-commits] r36 - in pkg/gt.db: R src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Feb 21 20:50:33 CET 2010


Author: dahinds
Date: 2010-02-21 20:50:32 +0100 (Sun, 21 Feb 2010)
New Revision: 36

Added:
   pkg/gt.db/src/encode.c
Modified:
   pkg/gt.db/R/hapmap.R
   pkg/gt.db/R/progress.R
Log:
- autoscale progress.bar() to fit console width
- speed-ups for reading HapMap format files



Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R	2010-02-21 04:11:32 UTC (rev 35)
+++ pkg/gt.db/R/hapmap.R	2010-02-21 19:50:32 UTC (rev 36)
@@ -1,3 +1,4 @@
+
 #
 # Copyright (C) 2010, 23andMe, Inc.
 #
@@ -24,27 +25,29 @@
 
 .read.hapmap.file <- function(file)
 {
-    # read just map information
-    map <- scan(file, what=list('','','',1L,''),
+    # schlep the whole file into memory
+    fd <- file(file)
+    filedata <- readChar(fd,100000000,TRUE)
+    close(fd)
+
+    # parse just map information
+    tc <- textConnection(filedata)
+    map <- scan(tc, what=list('','','',1L,''),
                 flush=TRUE, skip=1, quiet=TRUE)
+    close(tc)
     names(map) <- c('assay.name','alleles','scaffold','position','strand')
     map$assay.name <- I(gsub('-','_',map$assay.name))
     map$scaffold <- sub('MT','M',map$scaffold)
     map <- as.data.frame(map)
 
     # now parse genotype data
-    geno <- readLines(file)
-    # note that we keep the leading space
-    geno <- sub('.* QC[^ ]+', '', geno, perl=TRUE)
-    sample <- strsplit(geno[1], ' ')[[1]][-1]
-    geno <- geno[-1]
-    for (a in levels(map$alleles)) {
-        w <- (map$alleles == a)
-        ab <- strsplit(a,'/')[[1]]
-        geno[w] <- chartr(sprintf('%s%sN',ab[1],ab[2]),'abn',geno[w])
-    }
-    geno <- gsub(' ab| ba', 'h', geno, perl=TRUE)
-    map$genotype <- gsub(' .', '', geno, perl=TRUE)
+    tc <- textConnection(filedata)
+    geno <- readLines(tc)
+    close(tc)
+    geno <- sub('.* QC\\S+ ', '', geno, perl=TRUE)
+    sample <- strsplit(geno[1], ' ')[[1]]
+    map$genotype <- .Call('do_encode_gt', as.character(map$alleles),
+                          geno[1+(1:nrow(map))])
     stopifnot(all(nchar(map$genotype) == length(sample)))
     structure(map, sample=sample)
 }
@@ -60,6 +63,7 @@
     all <- lapply(files, wrap.read.file)
 
     # construct unified map
+    if (verbose) message('Merging...')
     map <- do.call('rbind', lapply(all, '[', -6))
     map <- unique(map)
     map <- map[order(map$position),]
@@ -126,8 +130,8 @@
         # FIXME: we'll need to fix up pseudo-autosomal regions
         m$ploidy <- switch(chr, 'chrX'='X', 'chrY'='Y', 'chrMT'='M', 'A')
         if (verbose) message("Loading ", nrow(m), " records...")
-        mk.assay(platform, m)
-        mk.assay.position(platform, mapping, m)
-        mk.assay.data(dataset, m)
+        mk.assay(platform, m, progress=verbose)
+        mk.assay.position(platform, mapping, m, progress=verbose)
+        mk.assay.data(dataset, m, progress=verbose)
     }
 }

Modified: pkg/gt.db/R/progress.R
===================================================================
--- pkg/gt.db/R/progress.R	2010-02-21 04:11:32 UTC (rev 35)
+++ pkg/gt.db/R/progress.R	2010-02-21 19:50:32 UTC (rev 36)
@@ -1,23 +1,24 @@
 #
 # Copyright (C) 2009, Perlegen Sciences, Inc.
-# 
+# Copyright (C) 2010, 23andMe, Inc.
+#
 # Written by David A. Hinds <dhinds at sonic.net>
-# 
+#
 # This is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 3 of the license, or
 # (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>
-# 
+#
 
-progress.bar <- function (done, total, len=40) 
+progress.bar <- function (done, total, len=options('width')-40)
 {
     now <- Sys.time()
     p <- try(get(".Progress", pos='package:gt.db'), silent=TRUE)

Added: pkg/gt.db/src/encode.c
===================================================================
--- pkg/gt.db/src/encode.c	                        (rev 0)
+++ pkg/gt.db/src/encode.c	2010-02-21 19:50:32 UTC (rev 36)
@@ -0,0 +1,90 @@
+/*
+
+  Copyright (C) 2009, 23andMe, Inc.
+  
+  Written by David A. Hinds <dhinds at sonic.net>
+  
+  This is free software; you can redistribute it and/or modify it
+  under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 3 of the license, or
+  (at your option) any later version.
+  
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+  
+  You should have received a copy of the GNU General Public License
+  along with this program.  If not, see <http://www.gnu.org/licenses/>
+  
+*/
+
+#include <Rversion.h>
+#include <R.h>
+#include <Rinternals.h>
+#include <R_ext/Rdynload.h>
+#include <R_ext/Arith.h>
+#include <ctype.h>
+#include <string.h>
+
+#ifdef ENABLE_NLS
+#include <libintl.h>
+#define _(String) dgettext ("gt.db", String)
+#else
+#define _(String) (String)
+#endif
+
+/*---------------------------------------------------------------------*/
+
+/*
+  Parse a vector of space-delimited genotype strings, i.e. 'AA AC CC'
+  as in HapMap files, into GT.DB notation, i.e. 'ahb', based on a
+  vector of 'A/B' allele strings.
+*/
+
+SEXP do_encode_gt(SEXP sa, SEXP sg)
+{
+	int i, j, len;
+	const char *a, *g;
+	char *buf, a1, a2;
+	SEXP ans;
+
+	if (!isString(sa) || !isString(sg))
+		error(_("arguments should be character vectors"));
+	if (LENGTH(sa) != LENGTH(sg))
+		error(_("argument lengths should be equal"));
+	PROTECT(ans = allocVector(STRSXP, LENGTH(sa)));
+	len = (LENGTH(STRING_ELT(sg,0))+1)/3;
+	buf = (char *)R_alloc(len+1, sizeof(char));
+	for (i = 0; i < LENGTH(sa); i++) {
+		a = CHAR(STRING_ELT(sa,i));
+		a1 = a[0]; a2 = a[2];
+		g = CHAR(STRING_ELT(sg,i));
+		for (j = 0; j < len && *g; j++) {
+			buf[j] = '?';
+			if (*g == a1) {
+				if (g[1] == a1)
+					buf[j] = 'a';
+				else if (g[1] == a2)
+					buf[j] = 'h';
+			} else if (g[0] == a2) {
+				if (g[1] == a2)
+					buf[j] = 'b';
+				else if (g[1] == a1)
+					buf[j] = 'h';
+			} else if (g[0] == 'N' && g[1] == 'N')
+			       	buf[j] = 'n';
+			if (buf[j] == '?')
+				break;
+			g += 2;
+			while (*g == ' ') g++;
+		}
+		buf[j] = '\0';
+		if (j < len)
+			SET_STRING_ELT(ans, i, NA_STRING);
+		else
+			SET_STRING_ELT(ans, i, mkChar(buf));
+	}
+	UNPROTECT(1);
+	return ans;
+}



More information about the Gtdb-commits mailing list