[Gtdb-commits] r41 - in pkg/gt.db: R src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Mar 2 03:03:45 CET 2010


Author: dahinds
Date: 2010-03-02 03:03:45 +0100 (Tue, 02 Mar 2010)
New Revision: 41

Modified:
   pkg/gt.db/R/hapmap.R
   pkg/gt.db/src/str.c
Log:
- fixed bug in ch.table() with NA arguments
- added code to handle various HapMap file inconsistencies



Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R	2010-02-27 08:02:18 UTC (rev 40)
+++ pkg/gt.db/R/hapmap.R	2010-03-02 02:03:45 UTC (rev 41)
@@ -22,6 +22,18 @@
 # We only support the 'fwd' files, which do not specify the orientations
 # of the rs clusters.  The 'rs' files contain orientation errors.
 
+.fixup.hapmap.alleles <- function(alleles, geno)
+{
+    # fill in non-SNP allele lists based on the actual data
+    bad <- grep('^[ACGT]/[ACGT]$', alleles, invert=TRUE)
+    gt <- c('A','C','G','T')
+    tbl <- (ch.table(geno[bad], chars=gt) > 0)
+    a12 <- apply(tbl, 1, function(x) c(gt[x],'?','?')[1:2])
+    alleles <- as.character(alleles)
+    alleles[bad] <- paste(a12[1,], a12[2,], sep='/')
+    factor(alleles)
+}
+
 .read.hapmap.file <- function(file)
 {
     # schlep the whole file into memory: 250 MB should be enough
@@ -41,16 +53,91 @@
 
     # now parse genotype data
     tc <- textConnection(filedata)
+    rm(filedata)
     geno <- readLines(tc)
     close(tc)
     geno <- sub('.* QC\\S+ ', '', geno, perl=TRUE)
     sample <- strsplit(geno[1], ' ')[[1]]
+    geno <- geno[1+(1:nrow(map))]
+    map$alleles <- .fixup.hapmap.alleles(map$alleles, geno)
     map$genotype <- .Call('do_encode_gt', as.character(map$alleles),
-                          geno[1+(1:nrow(map))], PACKAGE='gt.db')
-    stopifnot(all(nchar(map$genotype) == length(sample)))
+                          geno, PACKAGE='gt.db')
+
+    stopifnot(!is.na(map$genotype))
+    stopifnot(nchar(map$genotype) == length(sample))
     structure(map, sample=sample)
 }
 
+.read.hapmap.r22phased <- function(basename)
+{
+    sample <- read.table(paste(basename,'_sample.txt.gz',sep=''),
+                         header=FALSE, as.is=1)
+    sample <- paste(rep(sample[,1],each=2), c('A','B'), sep='_')
+
+    map <- read.table(paste(basename,'_legend.txt.gz',sep=''),
+                      header=TRUE, as.is=1)
+    map$strand <- factor('+')
+    map$alleles <- factor(paste(map$X0, map$X1, sep='/'))
+    map$X0 <- map$X1 <- NULL
+
+    fd <- file(paste(basename,'.phase.gz',sep=''),'r')
+    tc <- textConnection(readChar(fd, nchars=5e+08, useBytes=TRUE))
+    close(fd)
+    gt1 <- scan(tc, what=integer(), quiet=TRUE, nlines=1)
+    gt <- c(gt1,scan(tc, what=integer(), quiet=TRUE))
+    close(tc)
+    # convert from 0/1 codes to a/b strings
+    gt <- matrix(as.raw(gt+0x61), ncol=length(gt1))
+    map$genotype <- apply(gt, 2, rawToChar)
+
+    structure(map, sample = sample[1:nrow(gt)])
+}
+
+.resolve.hapmap.positions <- function(data)
+{
+    # if the same rsid appears at two positions, assign new names
+    map <- unique(do.call("rbind", lapply(data, "[", -6)))
+    bad <- map$assay.name[which(duplicated(map$assay.name))]
+    for (rsid in bad) {
+        pos <- unique(map[map$assay.name == rsid,]$position)
+        if (length(pos) > 1) {
+            warning(rsid, ": found at multiple map positions",
+                    call.=FALSE, immediate=TRUE)
+            for (i in 1:length(data)) {
+                rn <- which(data[[i]]$assay.name == rsid)
+                data[[i]]$assay.name[rn] <-
+                    paste(rsid, match(data[[i]]$position[rn], pos), sep='_')
+            }
+        }
+    }
+    data
+}
+
+.resolve.hapmap.alleles <- function(data)
+{
+    map <- unique(do.call("rbind", lapply(data, "[", -6)))
+    bad <- map$assay.name[which(duplicated(map$assay.name))]
+    for (rsid in bad) {
+        m <- map[map$assay.name == rsid,]
+        tbl <- ch.table(paste(m$alleles,collapse='/'),
+                        chars=c('A','C','G','T'))[1,]
+        a12 <- c(names(tbl[tbl>0]),'?')
+        if (length(a12) > 3)
+            stop(rsid,': more than two alleles')
+        new <- paste(a12[1], a12[2], sep='/')
+        for (i in 1:length(data)) {
+            rn <- match(rsid, data[[i]]$assay.name)
+            if (is.na(rn)) next
+            # convert B/? to A/B genotypes?
+            if (data[[i]]$alleles[rn] == sprintf('%s/?',a12[2]))
+                data[[i]]$genotype[rn] <-
+                    chartr('ab','ba',data[[i]]$genotype[rn])
+            data[[i]]$alleles[rn] <- new
+        }
+    }
+    data
+}
+
 .merge.hapmap.files <- function(files, verbose=TRUE)
 {
     wrap.read.file <- function(file)
@@ -61,15 +148,20 @@
     }
     all <- lapply(files, wrap.read.file)
 
+    if (verbose) message('Merging...')
+
+    # resolve inconsistencies in positions, alleles
+    all <- .resolve.hapmap.positions(all)
+    all <- .resolve.hapmap.alleles(all)
+
     # construct unified map
-    if (verbose) message('Merging...')
-    map <- do.call('rbind', lapply(all, '[', -6))
-    map <- unique(map)
+    map <- unique(do.call('rbind', lapply(all, '[', -6)))
     map <- map[order(map$position),]
 
     # verify that all files are consistent
     stopifnot(!anyDuplicated(map$assay.name))
 
+    # concatenate genotype strings
     map$genotype <- ''
     sample <- character(0)
     for (d in all) {

Modified: pkg/gt.db/src/str.c
===================================================================
--- pkg/gt.db/src/str.c	2010-02-27 08:02:18 UTC (rev 40)
+++ pkg/gt.db/src/str.c	2010-03-02 02:03:45 UTC (rev 41)
@@ -145,7 +145,7 @@
 	for (i = 0; i < ns; i++) {
 		if (STRING_ELT(ss,i) == NA_STRING) {
 			for (j = 0; j < nc; j++)
-				ians[nc*i+j] = NA_INTEGER;
+				ians[i+ns*j] = NA_INTEGER;
 		} else {
 			s = CHAR(STRING_ELT(ss,i));
 			len = strlen(s);
@@ -199,7 +199,7 @@
 		if (STRING_ELT(ss1, i1) == NA_STRING ||
 			STRING_ELT(ss2, i2) == NA_STRING) {
 			for (j = 0; j < nc*nc; j++)
-				ians[nc*nc*i+j] = NA_INTEGER;
+				ians[i+ns*j] = NA_INTEGER;
 		} else {
 			s1 = CHAR(STRING_ELT(ss1,i1));
 			s2 = CHAR(STRING_ELT(ss2,i2));



More information about the Gtdb-commits mailing list