[Genabel-commits] r2062 - pkg/MultiABEL/R
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Jan 24 12:07:04 CET 2017
Author: nd_0001
Date: 2017-01-24 12:07:03 +0100 (Tue, 24 Jan 2017)
New Revision: 2062
Modified:
pkg/MultiABEL/R/load.summary.R
Log:
changes in load.summary
Modified: pkg/MultiABEL/R/load.summary.R
===================================================================
--- pkg/MultiABEL/R/load.summary.R 2016-12-24 10:54:02 UTC (rev 2061)
+++ pkg/MultiABEL/R/load.summary.R 2017-01-24 11:07:03 UTC (rev 2062)
@@ -1,219 +1,219 @@
-#' Loading multiple summary statistics from genome-wide association studies
-#'
-#' The function loads multiple meta-GWAS summary statistics, for subsequent multi-trait GWAS.
-#' Currently, the package only analyzes summary statistics from inverse-Gaussianized continuous traits.
-#'
-#' @param files A vector of file names as strings. Each file name should contain summary statistics of
-#' one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
-#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele), \code{'freq'}
-#' (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and
-#' \code{'n'} (sample size).
-#' @param cor.pheno A #traits x #traits matrix of correlation matrix of the phenotypes, to be used to
-#' construct the multi-trait test statistic. If \code{NULL},
-#' this matrix will be estimated from genome-wide summary statistics. If you have partially overlapping
-#' samples for different traits, shrinkage correlation matrix is recommended (see reference), so in that
-#' case, unless you know what you are doing, leave this argument as default, i.e. \code{NULL}.
-#' @param indep.snps A vector of strings containing the names of a set of independent SNPs. This is
-#' recommended to be generated by LD-pruning the genotype data in a certain cohort. Typically the
-#' number of SNPs should be more than 10,000 in order to obtain a good estimate of \code{cor.pheno}. If
-#' \code{cor.pheno = NULL}, this argument cannot be \code{NULL}.
-#' @param est.var A logical value. If \code{FALSE}, each phenotypic variance is assumed to be known as 1.
-#' If \code{TRUE}, each phenotypic variance will be estimated to adjust the summary statistics, so that
-#' the corresponding phenoypic variance is 1.
-#' @param columnNames A vector with names of columns containing necessary information in the input file;
-#' default values are c('snp','a1','freq','beta','se','n'). The values are case-insensitive. Note: check
-#' your allele definitions for different traits are based on the same strand!
-#' @param fixedN sample size to assume across all analyses, when provided, this number will be used
-#' (instead of the ones specified in the input files)
-#'
-#' @return The function returns a list of class \code{multi.summary}, containing two elements: \code{gwa}
-#' (the cleaned data to be processed in multi-trait GWAS), \code{cor.pheno} (user input or estimated), and
-#' \code{var.pheno} (default or estimated).
-#'
-#' @author Xia Shen, Yurii S. Aulchenko
-#'
-#' @references
-#' Xia Shen, Zheng Ning, Yakov Tsepilov, Masoud Shirali,
-#' Generation Scotland, Blair H. Smith, Lynne J. Hocking, Sandosh Padmanabhan, Caroline Hayward,
-#' David J. Porteous, Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2015).
-#' Simple multi-trait analysis identifies novel loci
-#' associated with growth and obesity measures. \emph{Submitted}.
-#'
-#' @seealso
-#' \code{MultiSummary}
-#'
-#' @examples
-#' \dontrun{
-#' ## download the six example files from:
-#' ## https://www.dropbox.com/sh/hhta45cewvvea2s/AADfj4OXlbroToZAwIii2Buha?dl=0
-#' ## the summary statistics from Randall et al. (2013) PLoS Genet
-#' ## for males only
-#' ## bmi: body mass index
-#' ## hip: hip circumference
-#' ## wc: waist circumference
-#' ## whr: waist-hip ratio
-#'
-#' ## load the prepared set of independent SNPs
-#' indep.snps <- as.character(read.table('indep.snps')$V1)
-#'
-#' ## load summary statistics of the six traits
-#' stats.male <- load.summary(files = list.files(pattern = '*.txt'), indep.snps = indep.snps)
-#'
-#' ## perform multi-trait meta-GWAS
-#' result <- MultiSummary(stats.male)
-#' head(result)
-#' }
-#' @aliases load.summary
-#' @keywords multivariate, meta-analysis
-#'
-`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, est.var = FALSE,
- columnNames = c('snp', 'a1', 'freq', 'beta', 'se', 'n'), fixedN = NULL) {
- if (!all(is.character(files))) {
- stop('files should be given as strings!')
- }
- if (sum(file.exists(files)) < 2) {
- stop('number of traits has to be more than 2!')
- }
- if (is.null(cor.pheno) & is.null(indep.snps)) {
- stop('indep.snps required for estimating cor.pheno!')
- }
- m <- length(files)
- if (!is.null(cor.pheno)) {
- if (nrow(cor.pheno) != m | ncol(cor.pheno) != m) {
- stop('wrong dimensions of cor.pheno!')
- }
- }
- columnNames <- tolower(columnNames)
- if (!is.null(fixedN)) if (fixedN <= 0) {
- stop('fixedN should be a positive number')
- }
- if (is.null(fixedN)) { colNamLen = 6 } else { colNamLen = 5 }
- if (!is.character(columnNames)) {
- stop('columnNames should be character')
- }
- if (length(columnNames) != colNamLen) {
- cat('columnNames should be a vector with',colNamLen,'elements')
- stop('... exiting')
- }
- if (length(unique(columnNames)) != colNamLen) {
- stop('elements of columnNames must be unique')
- }
- # column Names Translation
- cNT = list(
- 'snp' = columnNames[1],
- 'a1' = columnNames[2],
- 'freq'= columnNames[3],
- 'beta'= columnNames[4],
- 'se' = columnNames[5],
- 'n' = columnNames[6]
- )
- cat('loading data ...\n')
- data <- c()
- fn <- files # rev(files)
- vys <- rep(1, m)
- for (i in m:1) {
- dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
- colnames(dd) <- tolower(colnames(dd))
- currentColNames <- colnames(dd)
- if (any(!(columnNames %in% currentColNames))) {
- stop('file column names do not match columnNames in ', fn[i], '... exiting!')
- }
- idx <- which(duplicated(dd[, cNT$snp]))
- if (length(idx) > 0) {
- data[[i]] <- dd[-idx,]
- rownames(data[[i]]) <- dd[ -idx , cNT$snp]
- } else {
- data[[i]] <- dd
- rownames(data[[i]]) <- dd[, cNT$snp]
- }
- if (est.var) {
- if (is.null(fixedN)) {
- D <- dd[,cNT$n]*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
- vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(dd[, cNT$n] - 1)
- } else {
- D <- fixedN*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
- vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(fixedN - 1)
- }
- dvy <- density(na.omit(vy))
- vys[i] <- dvy$x[which.max(dvy$y)] #median(vy, na.rm = TRUE)
- }
- progress((m - i + 1)/m*100)
- }
- cat('\n')
- if (est.var) cat('phenotypic variances are:', vys, '\n')
- cat('checking markers ...\n')
- snps <- data[[1]][, cNT$snp]
- for (i in 2:m) {
- snps <- data[[i]][ data[[i]][, cNT$snp] %in% snps, cNT$snp]
- progress(i/m*100)
- }
- snps <- unique(snps)
- cat('\n')
- cat('cleaning data ...\n')
- for (i in 1:m) {
- data[[i]] <- data[[i]][snps,]
- progress(i/m*100)
- }
- cat('\n')
- cat('correcting parameters ...\n')
- for (i in 2:m) {
- if (any(data[[i]][, cNT[['a1']]] != data[[1]][, cNT[['a1']]])) {
- adj <- 2*as.numeric(data[[i]][, cNT[['a1']]] == data[[1]][, cNT[['a1']]]) - 1
- data[[i]][, cNT$beta] <- data[[i]][, cNT$beta]*adj
- data[[i]][, cNT$freq] <- (adj == 1)*data[[i]][, cNT$freq] + (adj == -1)*(1 - data[[i]][,cNT$freq])
- }
- progress(i/m*100)
- }
- cat('\n')
- cat('adjusting sample size ... ')
- n0 <- matrix(NA, nrow(data[[1]]), m)
- if (is.null(fixedN)) {
- for (i in 1:m) {
- n0[,i] <- data[[i]][,cNT$n]
- }
- } else {
- for (i in 1:m) {
- n0[,i] <- fixedN
- }
- }
- n <- apply(n0, 1, "min")
- cat('done.\n')
- cat('finalizing summary statistics ...\n')
- gwa0 <- matrix(NA, nrow(data[[1]]), 2*m + 2)
- for (i in 1:m) {
- gwa0[,i*2 - 1] <- data[[i]][,cNT$beta]
- gwa0[,i*2] <- data[[i]][,cNT$se]
- progress(i/m*100)
- }
- gwa0[,2*length(data) + 1] <- data[[1]][,cNT$freq]
- gwa0[,2*length(data) + 2] <- n
- rownames(gwa0) <- data[[1]][,cNT$snp]
- gwa0 <- na.omit(gwa0)
- cat('\n')
- if (is.null(cor.pheno)) {
- n.ratio <- diag(m)
- for (i in 1:(m - 1)) {
- for (j in (i + 1):m) {
- ratio <- mean(sqrt(n0[,j]/n0[,i]), na.rm = TRUE)
- n.ratio[i,j] <- n.ratio[j,i] <- ifelse(ratio > 1, 1/ratio, ratio)
- }
- }
- if (any(n.ratio < 1)) {
- cat('samples partially overlap!\n')
- cat('estimating shrinkage phenotypic correlations ... ')
- } else {
- cat('estimating phenotypic correlations ... ')
- }
- idx <- which(rownames(gwa0) %in% indep.snps)
- gwa1 <- gwa0[idx,]
- z <- gwa1[,seq(1, 2*m, 2)]/gwa1[,seq(2, 2*m, 2)]
- cor.pheno <- cor(z, use = 'pairwise.complete.obs')
- cat('done.\n')
- }
- dimnames(cor.pheno) <- list(files, files)
- gwanames <- c(paste(rep(files, each = 2), rep(c('.beta', '.se'), m), sep = ''), 'f', 'n')
- colnames(gwa0) <- gwanames
- dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys)
- class(dd) <- 'multi.summary'
- return(dd)
-}
+#' Loading multiple summary statistics from genome-wide association studies
+#'
+#' The function loads multiple meta-GWAS summary statistics, for subsequent multi-trait GWAS.
+#' Currently, the package only analyzes summary statistics from inverse-Gaussianized continuous traits.
+#'
+#' @param files A vector of file names as strings. Each file name should contain summary statistics of
+#' one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
+#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele), \code{'freq'}
+#' (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and
+#' \code{'n'} (sample size).
+#' @param cor.pheno A #traits x #traits matrix of correlation matrix of the phenotypes, to be used to
+#' construct the multi-trait test statistic. If \code{NULL},
+#' this matrix will be estimated from genome-wide summary statistics. If you have partially overlapping
+#' samples for different traits, shrinkage correlation matrix is recommended (see reference), so in that
+#' case, unless you know what you are doing, leave this argument as default, i.e. \code{NULL}.
+#' @param indep.snps A vector of strings containing the names of a set of independent SNPs. This is
+#' recommended to be generated by LD-pruning the genotype data in a certain cohort. Typically the
+#' number of SNPs should be more than 10,000 in order to obtain a good estimate of \code{cor.pheno}. If
+#' \code{cor.pheno = NULL}, this argument cannot be \code{NULL}.
+#' @param est.var A logical value. If \code{FALSE}, each phenotypic variance is assumed to be known as 1.
+#' If \code{TRUE}, each phenotypic variance will be estimated to adjust the summary statistics, so that
+#' the corresponding phenoypic variance is 1.
+#' @param columnNames A vector with names of columns containing necessary information in the input file;
+#' default values are c('snp','a1','freq','beta','se','n'). The values are case-insensitive. Note: check
+#' your allele definitions for different traits are based on the same strand!
+#' @param fixedN sample size to assume across all analyses, when provided, this number will be used
+#' (instead of the ones specified in the input files)
+#'
+#' @return The function returns a list of class \code{multi.summary}, containing two elements: \code{gwa}
+#' (the cleaned data to be processed in multi-trait GWAS), \code{cor.pheno} (user input or estimated), and
+#' \code{var.pheno} (default or estimated).
+#'
+#' @author Xia Shen, Yurii S. Aulchenko
+#'
+#' @references
+#' Xia Shen, Zheng Ning, Yakov Tsepilov, Masoud Shirali,
+#' Generation Scotland, Blair H. Smith, Lynne J. Hocking, Sandosh Padmanabhan, Caroline Hayward,
+#' David J. Porteous, Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2015).
+#' Simple multi-trait analysis identifies novel loci
+#' associated with growth and obesity measures. \emph{Submitted}.
+#'
+#' @seealso
+#' \code{MultiSummary}
+#'
+#' @examples
+#' \dontrun{
+#' ## download the six example files from:
+#' ## https://www.dropbox.com/sh/hhta45cewvvea2s/AADfj4OXlbroToZAwIii2Buha?dl=0
+#' ## the summary statistics from Randall et al. (2013) PLoS Genet
+#' ## for males only
+#' ## bmi: body mass index
+#' ## hip: hip circumference
+#' ## wc: waist circumference
+#' ## whr: waist-hip ratio
+#'
+#' ## load the prepared set of independent SNPs
+#' indep.snps <- as.character(read.table('indep.snps')$V1)
+#'
+#' ## load summary statistics of the six traits
+#' stats.male <- load.summary(files = list.files(pattern = '*.txt'), indep.snps = indep.snps)
+#'
+#' ## perform multi-trait meta-GWAS
+#' result <- MultiSummary(stats.male)
+#' head(result)
+#' }
+#' @aliases load.summary
+#' @keywords multivariate, meta-analysis
+#'
+`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, est.var = FALSE,
+ columnNames = c('snp', 'a1', 'freq', 'beta', 'se', 'n'), fixedN = NULL) {
+ if (!all(is.character(files))) {
+ stop('files should be given as strings!')
+ }
+ if (sum(file.exists(files)) < 2) {
+ stop('number of traits has to be more than 2!')
+ }
+ if (is.null(cor.pheno) & is.null(indep.snps)) {
+ stop('indep.snps required for estimating cor.pheno!')
+ }
+ m <- length(files)
+ if (!is.null(cor.pheno)) {
+ if (nrow(cor.pheno) != m | ncol(cor.pheno) != m) {
+ stop('wrong dimensions of cor.pheno!')
+ }
+ }
+ columnNames <- tolower(columnNames)
+ if (!is.null(fixedN)) if (fixedN <= 0) {
+ stop('fixedN should be a positive number')
+ }
+ if (is.null(fixedN)) { colNamLen = 6 } else { colNamLen = 5 }
+ if (!is.character(columnNames)) {
+ stop('columnNames should be character')
+ }
+ if (length(columnNames) != colNamLen) {
+ cat('columnNames should be a vector with',colNamLen,'elements')
+ stop('... exiting')
+ }
+ if (length(unique(columnNames)) != colNamLen) {
+ stop('elements of columnNames must be unique')
+ }
+ # column Names Translation
+ cNT = list(
+ 'snp' = columnNames[1],
+ 'a1' = columnNames[2],
+ 'freq'= columnNames[3],
+ 'beta'= columnNames[4],
+ 'se' = columnNames[5],
+ 'n' = columnNames[6]
+ )
+ cat('loading data ...\n')
+ data <- c()
+ fn <- files # rev(files)
+ vys <- rep(1, m)
+ for (i in m:1) {
+ dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
+ colnames(dd) <- tolower(colnames(dd))
+ currentColNames <- colnames(dd)
+ if (any(!(columnNames %in% currentColNames))) {
+ stop('file column names do not match columnNames in ', fn[i], '... exiting!')
+ }
+ idx <- which(duplicated(dd[, cNT$snp]))
+ if (length(idx) > 0) {
+ data[[i]] <- dd[-idx,]
+ rownames(data[[i]]) <- dd[ -idx , cNT$snp]
+ } else {
+ data[[i]] <- dd
+ rownames(data[[i]]) <- dd[, cNT$snp]
+ }
+ if (est.var) {
+ if (is.null(fixedN)) {
+ D <- dd[,cNT$n]*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
+ vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(dd[, cNT$n] - 1)
+ } else {
+ D <- fixedN*2*dd[, cNT$freq]*(1 - dd[, cNT$freq])
+ vy <- D*dd[ , cNT$se]**2 + D*dd[, cNT$beta]**2/(fixedN - 1)
+ }
+ dvy <- density(na.omit(vy))
+ vys[i] <- dvy$x[which.max(dvy$y)] #median(vy, na.rm = TRUE)
+ }
+ progress((m - i + 1)/m*100)
+ }
+ cat('\n')
+ if (est.var) cat('phenotypic variances are:', vys, '\n')
+ cat('checking markers ...\n')
+ snps <- data[[1]][, cNT$snp]
+ for (i in 2:m) {
+ snps <- data[[i]][ data[[i]][, cNT$snp] %in% snps, cNT$snp]
+ progress(i/m*100)
+ }
+ snps <- unique(snps)
+ cat('\n')
+ cat('cleaning data ...\n')
+ for (i in 1:m) {
+ data[[i]] <- data[[i]][snps,]
+ progress(i/m*100)
+ }
+ cat('\n')
+ cat('correcting parameters ...\n')
+ for (i in 2:m) {
+ if (any(data[[i]][, cNT[['a1']]] != data[[1]][, cNT[['a1']]])) {
+ adj <- 2*as.numeric(data[[i]][, cNT[['a1']]] == data[[1]][, cNT[['a1']]]) - 1
+ data[[i]][, cNT$beta] <- data[[i]][, cNT$beta]*adj
+ data[[i]][, cNT$freq] <- (adj == 1)*data[[i]][, cNT$freq] + (adj == -1)*(1 - data[[i]][,cNT$freq])
+ }
+ progress(i/m*100)
+ }
+ cat('\n')
+ cat('adjusting sample size ... ')
+ n0 <- matrix(NA, nrow(data[[1]]), m)
+ if (is.null(fixedN)) {
+ for (i in 1:m) {
+ n0[,i] <- data[[i]][,cNT$n]
+ }
+ } else {
+ for (i in 1:m) {
+ n0[,i] <- fixedN
+ }
+ }
+ n <- apply(n0, 1, "min")
+ cat('done.\n')
+ cat('finalizing summary statistics ...\n')
+ gwa0 <- matrix(NA, nrow(data[[1]]), 2*m + 2)
+ for (i in 1:m) {
+ gwa0[,i*2 - 1] <- data[[i]][,cNT$beta]
+ gwa0[,i*2] <- data[[i]][,cNT$se]
+ progress(i/m*100)
+ }
+ gwa0[,2*length(data) + 1] <- data[[1]][,cNT$freq]
+ gwa0[,2*length(data) + 2] <- n
+ rownames(gwa0) <- data[[1]][,cNT$snp]
+ gwa0 <- na.omit(gwa0)
+ cat('\n')
+ if (is.null(cor.pheno)) {
+ n.ratio <- diag(m)
+ for (i in 1:(m - 1)) {
+ for (j in (i + 1):m) {
+ ratio <- mean(sqrt(n0[,j]/n0[,i]), na.rm = TRUE)
+ n.ratio[i,j] <- n.ratio[j,i] <- ifelse(ratio > 1, 1/ratio, ratio)
+ }
+ }
+ if (any(n.ratio < 1)) {
+ cat('samples partially overlap!\n')
+ cat('estimating shrinkage phenotypic correlations ... ')
+ } else {
+ cat('estimating phenotypic correlations ... ')
+ }
+ idx <- which(rownames(gwa0) %in% indep.snps)
+ gwa1 <- gwa0[idx,]
+ z <- gwa1[,seq(1, 2*m, 2)]/gwa1[,seq(2, 2*m, 2)]
+ cor.pheno <- cor(z, use = 'pairwise.complete.obs')
+ cat('done.\n')
+ }
+ dimnames(cor.pheno) <- list(files, files)
+ gwanames <- c(paste(rep(files, each = 2), rep(c('.beta', '.se'), m), sep = ''), 'f', 'n')
+ colnames(gwa0) <- gwanames
+ dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys)
+ class(dd) <- 'multi.summary'
+ return(dd)
+}
More information about the Genabel-commits
mailing list