[Genabel-commits] r2063 - pkg/MultiABEL/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Jan 24 12:25:18 CET 2017
Author: nd_0001
Date: 2017-01-24 12:25:17 +0100 (Tue, 24 Jan 2017)
New Revision: 2063
Modified:
pkg/MultiABEL/R/load.summary.R
Log:
load.summary version 2
Modified: pkg/MultiABEL/R/load.summary.R
===================================================================
--- pkg/MultiABEL/R/load.summary.R 2017-01-24 11:07:03 UTC (rev 2062)
+++ pkg/MultiABEL/R/load.summary.R 2017-01-24 11:25:17 UTC (rev 2063)
@@ -5,7 +5,8 @@
#'
#' @param files A vector of file names as strings. Each file name should contain summary statistics of
#' one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
-#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele), \code{'freq'}
+#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele),
+#' \code{'a2'} (the second allele), \code{'freq'},
#' (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and
#' \code{'n'} (sample size).
#' @param cor.pheno A #traits x #traits matrix of correlation matrix of the phenotypes, to be used to
@@ -21,7 +22,7 @@
#' If \code{TRUE}, each phenotypic variance will be estimated to adjust the summary statistics, so that
#' the corresponding phenoypic variance is 1.
#' @param columnNames A vector with names of columns containing necessary information in the input file;
-#' default values are c('snp','a1','freq','beta','se','n'). The values are case-insensitive. Note: check
+#' default values are c('snp','a1','a2','freq','beta','se','n'). The values are case-insensitive. Note: check
#' your allele definitions for different traits are based on the same strand!
#' @param fixedN sample size to assume across all analyses, when provided, this number will be used
#' (instead of the ones specified in the input files)
@@ -67,52 +68,75 @@
#' @keywords multivariate, meta-analysis
#'
`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, est.var = FALSE,
- columnNames = c('snp', 'a1', 'freq', 'beta', 'se', 'n'), fixedN = NULL) {
+ columnNames = c('snp', 'a1', 'a2', 'freq', 'beta', 'se', 'n'),
+ fixedN = NULL) {
+
+ ### I. Sanity checks ###
+
+ require(data.table)
+
if (!all(is.character(files))) {
stop('files should be given as strings!')
}
+
if (sum(file.exists(files)) < 2) {
stop('number of traits has to be more than 2!')
}
+
if (is.null(cor.pheno) & is.null(indep.snps)) {
stop('indep.snps required for estimating cor.pheno!')
}
+
m <- length(files)
if (!is.null(cor.pheno)) {
if (nrow(cor.pheno) != m | ncol(cor.pheno) != m) {
stop('wrong dimensions of cor.pheno!')
}
}
+
columnNames <- tolower(columnNames)
+
if (!is.null(fixedN)) if (fixedN <= 0) {
- stop('fixedN should be a positive number')
- }
- if (is.null(fixedN)) { colNamLen = 6 } else { colNamLen = 5 }
+ stop('fixedN should be a positive number')
+ }
+
+ if (is.null(fixedN)) { colNamLen = 7 } else { colNamLen = 6 }
+
if (!is.character(columnNames)) {
stop('columnNames should be character')
}
+
if (length(columnNames) != colNamLen) {
cat('columnNames should be a vector with',colNamLen,'elements')
stop('... exiting')
}
+
if (length(unique(columnNames)) != colNamLen) {
stop('elements of columnNames must be unique')
}
- # column Names Translation
+
cNT = list(
'snp' = columnNames[1],
'a1' = columnNames[2],
- 'freq'= columnNames[3],
- 'beta'= columnNames[4],
- 'se' = columnNames[5],
- 'n' = columnNames[6]
+ 'a2' = columnNames[3],
+ 'freq'= columnNames[4],
+ 'beta'= columnNames[5],
+ 'se' = columnNames[6],
+ 'n' = columnNames[7]
)
- cat('loading data ...\n')
+
+ ### II. Data load ###
+
+ cat("loading data ...\n")
+
data <- c()
fn <- files # rev(files)
vys <- rep(1, m)
for (i in m:1) {
- dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
+
+ #dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
+ dd <- data.frame(fread(fn[i], header = TRUE, stringsAsFactors = FALSE,verbose=FALSE,showProgress=FALSE))
+
colnames(dd) <- tolower(colnames(dd))
currentColNames <- colnames(dd)
if (any(!(columnNames %in% currentColNames))) {
@@ -141,44 +165,80 @@
}
cat('\n')
if (est.var) cat('phenotypic variances are:', vys, '\n')
- cat('checking markers ...\n')
- snps <- data[[1]][, cNT$snp]
- for (i in 2:m) {
- snps <- data[[i]][ data[[i]][, cNT$snp] %in% snps, cNT$snp]
- progress(i/m*100)
- }
- snps <- unique(snps)
- cat('\n')
- cat('cleaning data ...\n')
+
+ ### III. SNP cheking and cleaning ###
+
+ cat("checking markers ...\n")
+
+ snps <- data[[1]][,cNT$snp]
for (i in 1:m) {
- data[[i]] <- data[[i]][snps,]
- progress(i/m*100)
+ data[[i]]=na.omit(data[[i]][,unlist(cNT)])
+ snps=intersect(snps,data[[i]][, cNT$snp])
+ progress(i/m * 100)
+ }
+
+ snps <- unique(snps)
+ cat("\n")
+
+ cat("cleaning data ...\n")
+ for (i in 1:m) {
+ data[[i]] <- data[[i]][snps, ]
+ progress(i/m * 100)
+ }
+ cat("\n")
+ cat("correcting parameters ...\n")
+
+ for (i in 1:m) {
+ data[[i]][, cNT[["a1"]]] = toupper(data[[i]][, cNT[["a1"]]])
+ data[[i]][, cNT[["a2"]]] = toupper(data[[i]][, cNT[["a2"]]])
}
- cat('\n')
- cat('correcting parameters ...\n')
+
+ d1a12=paste(data[[1]][,c(cNT$a1)],data[[1]][,c(cNT$a2)],sep="")
+ d1a21=paste(data[[1]][,c(cNT$a2)],data[[1]][,c(cNT$a1)],sep="")
+
+
+ snps=rownames(data[[1]])
+
+ i=2
for (i in 2:m) {
- if (any(data[[i]][, cNT[['a1']]] != data[[1]][, cNT[['a1']]])) {
- adj <- 2*as.numeric(data[[i]][, cNT[['a1']]] == data[[1]][, cNT[['a1']]]) - 1
- data[[i]][, cNT$beta] <- data[[i]][, cNT$beta]*adj
- data[[i]][, cNT$freq] <- (adj == 1)*data[[i]][, cNT$freq] + (adj == -1)*(1 - data[[i]][,cNT$freq])
- }
- progress(i/m*100)
- }
- cat('\n')
- cat('adjusting sample size ... ')
- n0 <- matrix(NA, nrow(data[[1]]), m)
- if (is.null(fixedN)) {
- for (i in 1:m) {
- n0[,i] <- data[[i]][,cNT$n]
- }
- } else {
- for (i in 1:m) {
- n0[,i] <- fixedN
- }
- }
- n <- apply(n0, 1, "min")
- cat('done.\n')
- cat('finalizing summary statistics ...\n')
+
+ dia12=paste(data[[i]][,c(cNT$a1)],data[[i]][,c(cNT$a2)],sep="")
+
+ ind=which(((dia12==d1a12) + (dia12==d1a21))==1)
+ snps=intersect(rownames(data[[i]])[ind],snps)
+
+ if (any(data[[i]][, cNT[["a1"]]] != data[[1]][, cNT[["a1"]]])) {
+ adj <- 2 * as.numeric(data[[i]][, cNT[["a1"]]] == data[[1]][, cNT[["a1"]]]) - 1
+ data[[i]][, cNT$beta] <- data[[i]][, cNT$beta] *adj
+ data[[i]][, cNT$freq] <- (adj == 1) * data[[i]][,cNT$freq] + (adj == -1) * (1 - data[[i]][, cNT$freq])
+ }
+ progress(i/m * 100)
+ }
+
+ for (i in 1:m) {
+ data[[i]] <- data[[i]][snps, ]
+ #progress(i/m * 100)
+ }
+
+ cat("\n")
+
+ ### IV. Sample size and finalisation ###
+
+ cat("adjusting sample size ... ")
+ n0 <- matrix(NA, nrow(data[[1]]), m)
+ if (is.null(fixedN)) {
+ for (i in 1:m) {
+ n0[, i] <- data[[i]][, cNT$n]
+ }
+ }else {
+ for (i in 1:m) {
+ n0[, i] <- fixedN
+ }
+ }
+ n <- apply(n0, 1, "min")
+ cat("done.\n")
+
+ cat('finalizing summary statistics ...\n')
gwa0 <- matrix(NA, nrow(data[[1]]), 2*m + 2)
for (i in 1:m) {
gwa0[,i*2 - 1] <- data[[i]][,cNT$beta]
@@ -190,6 +250,7 @@
rownames(gwa0) <- data[[1]][,cNT$snp]
gwa0 <- na.omit(gwa0)
cat('\n')
+
if (is.null(cor.pheno)) {
n.ratio <- diag(m)
for (i in 1:(m - 1)) {
@@ -210,10 +271,16 @@
cor.pheno <- cor(z, use = 'pairwise.complete.obs')
cat('done.\n')
}
+
dimnames(cor.pheno) <- list(files, files)
gwanames <- c(paste(rep(files, each = 2), rep(c('.beta', '.se'), m), sep = ''), 'f', 'n')
colnames(gwa0) <- gwanames
- dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys)
- class(dd) <- 'multi.summary'
- return(dd)
+
+ A12=data[[1]][rownames(gwa0), c(cNT[["a1"]],cNT[["a2"]])]
+ colnames(A12)=c("A1","A2")
+
+ dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys, alleles=A12)
+ class(dd) <- "multi.summary"
+ return(dd)
+
}
More information about the Genabel-commits
mailing list