[Gtdb-commits] r38 - pkg/gt.db/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Mon Feb 22 00:10:41 CET 2010
Author: dahinds
Date: 2010-02-22 00:10:40 +0100 (Mon, 22 Feb 2010)
New Revision: 38
Modified:
pkg/gt.db/R/affy.R
pkg/gt.db/R/hapmap.R
Log:
- minor code cleanups
- speed up for load.affy.chp.data()
- fixed bug in .read.affy.header()
Modified: pkg/gt.db/R/affy.R
===================================================================
--- pkg/gt.db/R/affy.R 2010-02-21 21:00:03 UTC (rev 37)
+++ pkg/gt.db/R/affy.R 2010-02-21 23:10:40 UTC (rev 38)
@@ -23,11 +23,13 @@
.read.affy.header <- function(file)
{
txt <- character()
+ fd <- file(file, 'rt')
repeat {
- x <- grep('^\\#%.*=', readLines(file, 100), value=TRUE)
- if (!length(x)) break;
+ x <- grep('^\\#%.*=', readLines(fd, 100), value=TRUE)
+ if (!length(x)) break
txt <- c(txt,x)
}
+ close(fd)
val <- sub('^..[^=]*=(.*)', '\\1', txt)
structure(val, names=sub('^..([^=]*)=.*', '\\1', txt))
}
@@ -84,7 +86,7 @@
load.affy.platform <- function(anno, description, progress=TRUE)
{
platform <- attr(anno,'chip_type')
- cat(sprintf("Creating platform '%s'...\n", platform))
+ message("Creating platform '", platform, "'...")
mk.platform(platform, description)
mk.assay(platform, .parse.affy.assays(anno), progress=progress)
}
@@ -94,8 +96,7 @@
platform <- attr(anno,'chip_type')
version <- attr(anno, 'netaffx-annotation-netaffx-build')
mapping <- sprintf('na%s', version)
- cat(sprintf("Creating mapping '%s' for '%s'...\n",
- mapping, platform))
+ message("Creating mapping '", mapping, "' for '", platform, "'...")
ncbi.build <- attr(anno, 'genome-version-ncbi')
dbsnp.build <- attr(anno, 'dbsnp-version')
desc <- sprintf("%s on NCBI b%s, dbSNP b%s",
@@ -113,7 +114,7 @@
# read a single Affy CHP file (in text form) and convert individual
# genotypes to a GT.DB representation
-.read.affy.chp <- function(file, names, row.names=TRUE)
+.read.affy.chp <- function(file, probes, row.names=TRUE)
{
recode.gt <- function(x)
{
@@ -122,12 +123,14 @@
}
cc <- c('character','factor',rep('numeric',3),'factor')
- d <- read.table(file, sep='\t', header=TRUE,
+ tc <- textConnection(readChar(file, nchars=1e8, useBytes=TRUE))
+ d <- read.table(tc, sep='\t', header=TRUE,
row.names=1, colClasses=cc)
- if (!missing(names)) {
- if (!all(names %in% rownames(d)))
+ close(tc)
+ if (!missing(probes)) {
+ if (!all(probes %in% rownames(d)))
stop('Probe set name(s) are not matched?')
- d <- d[names,]
+ d <- d[probes,]
}
if (!row.names) row.names(d) <- NULL
@@ -135,7 +138,7 @@
d$forced.call <- recode.gt(d$forced.call)
qscore <- round(-10*log10(d$confidence))
qscore <- ifelse(is.finite(qscore), qscore, 0)
- raw.data <- rawToHex(matrix(.pack.chpdata(d), nrow=5))
+ raw.data <- rawToHex(matrix(.pack.chpdata(d), ncol=nrow(d)))
data.frame(genotype=recode.gt(d$call),
qscore=as.integer(qscore),
raw.data=raw.data,
@@ -148,9 +151,9 @@
# transposing the data to the form that will go into GT.DB
.reorg.chp.data <-
- function(files, names, slice=8192, progress=TRUE)
+ function(files, probes, slice=8192, progress=TRUE)
{
- nr <- nrow(.read.affy.chp(files[1], names))
+ nr <- nrow(.read.affy.chp(files[1], probes))
lo <- seq(1, nr, slice)
hi <- c(lo[-1]-1, nr)
nb <- length(lo)
@@ -162,7 +165,7 @@
nf <- length(files)
if (progress) progress.bar(0, nf)
for (n in 1:nf) {
- d <- .read.affy.chp(files[n], names, row.names=(n==1))
+ d <- .read.affy.chp(files[n], probes, row.names=(n==1))
sapply(1:nb, function(x)
{ slice <- d[lo[x]:hi[x],]
save(slice, file=fd[[x]]) })
@@ -184,7 +187,7 @@
else
apply(m, 2, function(x) rawToHex(writeBin(x,raw(0),...)))
}
- load.one <- function(fd)
+ load.obj <- function(fd)
{
tmpenv <- new.env(parent=emptyenv())
name <- load(fd, tmpenv)
@@ -192,8 +195,8 @@
}
fd <- gzfile(file, 'rb')
- files <- load.one(fd)
- d <- replicate(length(files), load.one(fd), simplify=FALSE)
+ files <- load.obj(fd)
+ d <- replicate(length(files), load.obj(fd), simplify=FALSE)
close(fd)
data.frame(assay.name=sub('-','_',row.names(d[[1]])),
genotype=pack.by.assay(d,'genotype'),
@@ -204,9 +207,9 @@
load.affy.chp.data <- function(dataset, anno, files, progress=TRUE)
{
- cat("Reorganizing CHP data...\n")
+ message("Reorganizing CHP data...")
parts <- .reorg.chp.data(files, rownames(anno), progress=progress)
- cat("Loading data into database...\n")
+ message("Loading data into database...")
if (progress) progress.bar(0, length(parts))
for (n in 1:length(parts)) {
d <- .reshape.affy.block(parts[n])
Modified: pkg/gt.db/R/hapmap.R
===================================================================
--- pkg/gt.db/R/hapmap.R 2010-02-21 21:00:03 UTC (rev 37)
+++ pkg/gt.db/R/hapmap.R 2010-02-21 23:10:40 UTC (rev 38)
@@ -25,9 +25,9 @@
.read.hapmap.file <- function(file)
{
- # schlep the whole file into memory
+ # schlep the whole file into memory: 100 MB should be enough
fd <- file(file)
- filedata <- readChar(fd,100000000,TRUE)
+ filedata <- readChar(file, nchars=1e8, useBytes=TRUE)
close(fd)
# parse just map information
@@ -102,14 +102,20 @@
dataset <- paste(platform, mapping, f$ver[1], sep='_')
if (npop == 1) dataset <- paste(dataset, f$pop[1], sep='_')
desc <- sprintf('HapMap Phase %d', phase)
- if (!nrow(ls.platform(platform)))
+ if (!nrow(ls.platform(platform))) {
+ message("Creating platform '", platform, "'...")
mk.platform(platform, desc)
+ }
desc <- sprintf('%s Release %s', desc, f$rel[1])
- if (!nrow(ls.mapping(platform, mapping)))
+ if (!nrow(ls.mapping(platform, mapping))) {
+ message("Creating mapping '", mapping, "'...")
mk.mapping(platform, mapping, desc, sprintf('ncbi_%s', f$build[1]))
+ }
desc <- sprintf('%s (%s)', desc, f$ver[1])
- if (!nrow(ls.dataset(project.name, dataset)))
+ if (!nrow(ls.dataset(project.name, dataset))) {
+ message("Creating dataset '", dataset, "'...")
mk.dataset(dataset, project.name, platform, desc)
+ }
for (chr in levels(.sort.levels(f$chr))) {
m <- .merge.hapmap.files(sort(files[f$chr == chr]), verbose)
More information about the Gtdb-commits
mailing list