[Genabel-commits] r2070 - in pkg/MultiABEL: . R man

Thu Jun 15 00:56:27 CEST 2017

Author: shenxia
Date: 2017-06-15 00:56:27 +0200 (Thu, 15 Jun 2017)
New Revision: 2070

Modified:
   pkg/MultiABEL/DESCRIPTION
   pkg/MultiABEL/NAMESPACE
   pkg/MultiABEL/R/load.summary.R
   pkg/MultiABEL/man/load.summary.Rd
Log:


Modified: pkg/MultiABEL/DESCRIPTION
===================================================================

--- pkg/MultiABEL/DESCRIPTION	2017-06-14 22:35:32 UTC (rev 2069)
+++ pkg/MultiABEL/DESCRIPTION	2017-06-14 22:56:27 UTC (rev 2070)
@@ -10,7 +10,8 @@
     statistics.
 Depends:
     R (>= 2.10),
-    svMisc
+    svMisc,
+    data.table
 Suggests:
     GenABEL,
     DatABEL

Modified: pkg/MultiABEL/NAMESPACE
===================================================================
--- pkg/MultiABEL/NAMESPACE	2017-06-14 22:35:32 UTC (rev 2069)
+++ pkg/MultiABEL/NAMESPACE	2017-06-14 22:56:27 UTC (rev 2070)
@@ -2,6 +2,6 @@
 importFrom("stats", "cor", "density", "lm", "na.omit", "pchisq", "pf",
              "qchisq", "qnorm", "quantile")
 importFrom("utils", "packageDescription", "read.table", "write.table")
-import("svMisc")
+import("svMisc", "data.table")
 export("Multivariate", "MultiRep", "MultiMeta", "MultiLoad","load.summary", "MultiSummary")
 exportClasses("multi.summary", "multi.loaded")
\ No newline at end of file

Modified: pkg/MultiABEL/R/load.summary.R
===================================================================
--- pkg/MultiABEL/R/load.summary.R	2017-06-14 22:35:32 UTC (rev 2069)
+++ pkg/MultiABEL/R/load.summary.R	2017-06-14 22:56:27 UTC (rev 2070)
@@ -5,7 +5,8 @@
 #' 
 #' @param files A vector of file names as strings. Each file name should contain summary statistics of
 #' one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
-#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele), \code{'freq'} 
+#' contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele),
+#' \code{'a2'} (the second allele), \code{'freq'},
 #' (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and 
 #' \code{'n'} (sample size).
 #' @param cor.pheno A #traits x #traits matrix of correlation matrix of the phenotypes, to be used to 
@@ -21,7 +22,7 @@
 #' If \code{TRUE}, each phenotypic variance will be estimated to adjust the summary statistics, so that
 #' the corresponding phenoypic variance is 1.
 #' @param columnNames A vector with names of columns containing necessary information in the input file;
-#' default values are c('snp','a1','freq','beta','se','n'). The values are case-insensitive. Note: check
+#' default values are c('snp','a1','a2','freq','beta','se','n'). The values are case-insensitive. Note: check
 #' your allele definitions for different traits are based on the same strand!
 #' @param fixedN sample size to assume across all analyses, when provided, this number will be used
 #' (instead of the ones specified in the input files)
@@ -30,14 +31,12 @@
 #' (the cleaned data to be processed in multi-trait GWAS), \code{cor.pheno} (user input or estimated), and 
 #' \code{var.pheno} (default or estimated).
 #' 
-#' @author Xia Shen, Yurii S. Aulchenko
+#' @author Xia Shen, Yakov, Tsepilov, Yurii S. Aulchenko
 #' 
 #' @references 
-#' Xia Shen, Zheng Ning, Yakov Tsepilov, Masoud Shirali, 
-#' Generation Scotland, Blair H. Smith, Lynne J. Hocking, Sandosh Padmanabhan, Caroline Hayward, 
-#' David J. Porteous, Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2015).
-#' Simple multi-trait analysis identifies novel loci 
-#' associated with growth and obesity measures. \emph{Submitted}.
+#' Xia Shen, Yakov Tsepilov, ..., Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2017).
+#' Discovery, replication, and in silico functional investigation of 
+#' 22 new pleiotropic anthropometric loci. \emph{Submitted}.
 #' 
 #' @seealso 
 #' \code{MultiSummary}
@@ -66,66 +65,91 @@
 #' @aliases load.summary
 #' @keywords multivariate, meta-analysis
 #' 
-`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, est.var = FALSE, 
-		                   columnNames = c('snp', 'a1', 'freq', 'beta', 'se', 'n'), fixedN = NULL) {
+`load.summary` <- function(files, cor.pheno = NULL, indep.snps = NULL, 
+		est.var = FALSE, 
+		columnNames = c('snp', 'a1', 'a2', 'freq', 'beta', 'se', 'n'),
+		fixedN = NULL) {
+	
+	### I. Sanity checks ###
+	
+	# require(data.table)
+	
 	if (!all(is.character(files))) {
 		stop('files should be given as strings!')
 	}
+	
 	if (sum(file.exists(files)) < 2) {
 		stop('number of traits has to be more than 2!')
 	}
+	
 	if (is.null(cor.pheno) & is.null(indep.snps)) {
 		stop('indep.snps required for estimating cor.pheno!')
 	}
+	
 	m <- length(files)
 	if (!is.null(cor.pheno)) {
 		if (nrow(cor.pheno) != m | ncol(cor.pheno) != m) {
 			stop('wrong dimensions of cor.pheno!')
 		}
 	}
+	
 	columnNames <- tolower(columnNames)
+	
 	if (!is.null(fixedN)) if (fixedN <= 0) {
 			stop('fixedN should be a positive number')
 		}
-	if (is.null(fixedN)) { colNamLen = 6 } else { colNamLen = 5 }
+	
+	if (is.null(fixedN)) { colNamLen = 7 } else { colNamLen = 6 }
+	
 	if (!is.character(columnNames)) {
 		stop('columnNames should be character')
 	}
+	
 	if (length(columnNames) != colNamLen) {
-		cat('columnNames should be a vector with', colNamLen, 'elements')
+		cat('columnNames should be a vector with',colNamLen,'elements')
 		stop('... exiting')
 	}
+	
 	if (length(unique(columnNames)) != colNamLen) {
 		stop('elements of columnNames must be unique')
 	}
-	# column Names Translation
+	
 	cNT = list(
-		'snp' = columnNames[1],
-		'a1'  = columnNames[2],
-		'freq'= columnNames[3],
-		'beta'= columnNames[4],
-		'se'  = columnNames[5],
-		'n'   = columnNames[6]
+			'snp' = columnNames[1],
+			'a1'  = columnNames[2],
+			'a2' = columnNames[3],
+			'freq'= columnNames[4],
+			'beta'= columnNames[5],
+			'se'  = columnNames[6],
+			'n'   = columnNames[7]
 	)
-	cat('loading data ...\n')
+	
+	### II. Data load ###
+	
+	cat("loading data ...\n")
+	
 	data <- c()
 	fn <- files # rev(files)
 	vys <- rep(1, m)
 	for (i in m:1) {
-		dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
+		
+		# Reserved for possible problems with data.table package
+		# dd <- read.table(fn[i], header = TRUE, stringsAsFactors = FALSE)
+		
+		dd <- data.table::fread(fn[i], header = TRUE, stringsAsFactors = FALSE,verbose = FALSE, showProgress = FALSE, data.table = FALSE)
+		
 		colnames(dd) <- tolower(colnames(dd))
 		currentColNames <- colnames(dd)
 		if (any(!(columnNames %in% currentColNames))) {
 			stop('file column names do not match columnNames in ', fn[i], '... exiting!')
 		}
-		dd[,cNT$a1] <- toupper(dd[,cNT$a1])
-		idx <- which(duplicated(dd[,cNT$snp]))
+		idx <- which(duplicated(dd[, cNT$snp]))
 		if (length(idx) > 0) {
 			data[[i]] <- dd[-idx,]
-			rownames(data[[i]]) <- dd[-idx,cNT$snp]
+			rownames(data[[i]]) <- dd[ -idx , cNT$snp]
 		} else {
 			data[[i]] <- dd
-			rownames(data[[i]]) <- dd[,cNT$snp]
+			rownames(data[[i]]) <- dd[, cNT$snp]
 		}
 		if (est.var) {
 			if (is.null(fixedN)) {
@@ -142,43 +166,79 @@
 	}
 	cat('\n')
 	if (est.var) cat('phenotypic variances are:', vys, '\n')
-	cat('checking markers ...\n')
-	snps <- data[[1]][, cNT$snp]
-	for (i in 2:m) {
-		snps <- data[[i]][ data[[i]][, cNT$snp] %in% snps, cNT$snp]
-		progress(i/m*100)
+	
+	### III. SNP cheking and cleaning ###
+	
+	cat("checking markers ...\n")
+	
+	snps <- data[[1]][,cNT$snp]
+	for (i in 1:m) {
+		data[[i]]=na.omit(data[[i]][,unlist(cNT)])
+		snps=intersect(snps,data[[i]][, cNT$snp])
+		progress(i/m * 100)
 	}
+	
 	snps <- unique(snps)
-	cat('\n')
-	cat('cleaning data ...\n')
+	cat("\n")
+	
+	cat("cleaning data ...\n")
 	for (i in 1:m) {
-		data[[i]] <- data[[i]][snps,]
-		progress(i/m*100)
+		data[[i]] <- data[[i]][snps, ]
+		progress(i/m * 100)
 	}
-	cat('\n')
-	cat('correcting parameters ...\n')
+	cat("\n")
+	cat("correcting parameters ...\n")
+	
+	for (i in 1:m) {
+		data[[i]][, cNT[["a1"]]] = toupper(data[[i]][, cNT[["a1"]]])
+		data[[i]][, cNT[["a2"]]] = toupper(data[[i]][, cNT[["a2"]]])
+	}
+	
+	d1a12 <- paste(data[[1]][,c(cNT$a1)],data[[1]][,c(cNT$a2)], sep = "")
+	d1a21 <- paste(data[[1]][,c(cNT$a2)],data[[1]][,c(cNT$a1)], sep = "")
+	
+	
+	snps=rownames(data[[1]])
+	
+	i <- 2
 	for (i in 2:m) {
-		if (any(data[[i]][, cNT[['a1']]] != data[[1]][, cNT[['a1']]])) {
-			adj <- 2*as.numeric(data[[i]][, cNT[['a1']]] == data[[1]][, cNT[['a1']]]) - 1
-			data[[i]][, cNT$beta] <- data[[i]][, cNT$beta]*adj
-			data[[i]][, cNT$freq] <- (adj == 1)*data[[i]][, cNT$freq] + (adj == -1)*(1 - data[[i]][,cNT$freq])
+		
+		dia12 <- paste(data[[i]][,c(cNT$a1)],data[[i]][,c(cNT$a2)], sep = "")
+		
+		ind=which(((dia12==d1a12) + (dia12==d1a21))==1)
+		snps=intersect(rownames(data[[i]])[ind],snps)
+		
+		if (any(data[[i]][, cNT[["a1"]]] != data[[1]][, cNT[["a1"]]])) {
+			adj <- 2 * as.numeric(data[[i]][, cNT[["a1"]]] == data[[1]][, cNT[["a1"]]]) - 1
+			data[[i]][, cNT$beta] <- data[[i]][, cNT$beta] *adj
+			data[[i]][, cNT$freq] <- (adj == 1) * data[[i]][,cNT$freq] + (adj == -1) * (1 - data[[i]][, cNT$freq])
 		}
-		progress(i/m*100)
+		progress(i/m * 100)
 	}
-	cat('\n')
-	cat('adjusting sample size ... ')
+	
+	for (i in 1:m) {
+		data[[i]] <- data[[i]][snps, ]
+		#progress(i/m * 100)
+	}
+	
+	cat("\n")
+	
+	### IV. Sample size and finalisation ###
+	
+	cat("adjusting sample size ... ")
 	n0 <- matrix(NA, nrow(data[[1]]), m)
 	if (is.null(fixedN)) {
 		for (i in 1:m) {
-			n0[,i] <- data[[i]][,cNT$n]
+			n0[, i] <- data[[i]][, cNT$n]
 		}
-	} else {
+	}else {
 		for (i in 1:m) {
-			n0[,i] <- fixedN
-		}            
+			n0[, i] <- fixedN
+		}
 	}
 	n <- apply(n0, 1, "min")
-	cat('done.\n')
+	cat("done.\n")
+	
 	cat('finalizing summary statistics ...\n')
 	gwa0 <- matrix(NA, nrow(data[[1]]), 2*m + 2)
 	for (i in 1:m) {
@@ -191,6 +251,7 @@
 	rownames(gwa0) <- data[[1]][,cNT$snp]
 	gwa0 <- na.omit(gwa0)
 	cat('\n')
+	
 	if (is.null(cor.pheno)) {
 		n.ratio <- diag(m)
 		for (i in 1:(m - 1)) {
@@ -211,10 +272,16 @@
 		cor.pheno <- cor(z, use = 'pairwise.complete.obs')
 		cat('done.\n')
 	}
+	
 	dimnames(cor.pheno) <- list(files, files)
 	gwanames <- c(paste(rep(files, each = 2), rep(c('.beta', '.se'), m), sep = ''), 'f', 'n')
 	colnames(gwa0) <- gwanames
-	dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys)
-	class(dd) <- 'multi.summary'
+	
+	A12 <- data[[1]][rownames(gwa0), c(cNT[["a1"]],cNT[["a2"]])]
+	colnames(A12) <- c("A1","A2")
+	
+	dd <- list(gwa = gwa0, cor.pheno = cor.pheno, var.pheno = vys, alleles = A12)
+	class(dd) <- "multi.summary"
 	return(dd)
+	
 }

Modified: pkg/MultiABEL/man/load.summary.Rd
===================================================================
--- pkg/MultiABEL/man/load.summary.Rd	2017-06-14 22:35:32 UTC (rev 2069)
+++ pkg/MultiABEL/man/load.summary.Rd	2017-06-14 22:56:27 UTC (rev 2070)
@@ -5,12 +5,14 @@
 \title{Loading multiple summary statistics from genome-wide association studies}
 \usage{
 load.summary(files, cor.pheno = NULL, indep.snps = NULL, est.var = FALSE,
-  columnNames = c("snp", "a1", "freq", "beta", "se", "n"), fixedN = NULL)
+  columnNames = c("snp", "a1", "a2", "freq", "beta", "se", "n"),
+  fixedN = NULL)
 }
 \arguments{
 \item{files}{A vector of file names as strings. Each file name should contain summary statistics of
 one trait to be included in the multi-trait analysis. The columns of the summary statistics have to
-contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele), \code{'freq'} 
+contain (uppercase or lowercase does not matter) \code{'snp'} (marker ID), \code{'a1'} (the first allele),
+\code{'a2'} (the second allele), \code{'freq'},
 (frequency of the first allele), \code{'beta'} (effect size), \code{'se'} (standard error), and 
 \code{'n'} (sample size).}
 
@@ -30,7 +32,7 @@
 the corresponding phenoypic variance is 1.}
 
 \item{columnNames}{A vector with names of columns containing necessary information in the input file;
-default values are c('snp','a1','freq','beta','se','n'). The values are case-insensitive. Note: check
+default values are c('snp','a1','a2','freq','beta','se','n'). The values are case-insensitive. Note: check
 your allele definitions for different traits are based on the same strand!}
 
 \item{fixedN}{sample size to assume across all analyses, when provided, this number will be used
@@ -68,17 +70,15 @@
 }
 }
 \references{
-Xia Shen, Zheng Ning, Yakov Tsepilov, Masoud Shirali, 
-Generation Scotland, Blair H. Smith, Lynne J. Hocking, Sandosh Padmanabhan, Caroline Hayward, 
-David J. Porteous, Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2015).
-Simple multi-trait analysis identifies novel loci 
-associated with growth and obesity measures. \emph{Submitted}.
+Xia Shen, Yakov Tsepilov, ..., Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2017).
+Discovery, replication, and in silico functional investigation of 
+22 new pleiotropic anthropometric loci. \emph{Submitted}.
 }
 \seealso{
 \code{MultiSummary}
 }
 \author{
-Xia Shen, Yurii S. Aulchenko
+Xia Shen, Yakov, Tsepilov, Yurii S. Aulchenko
 }
 \keyword{meta-analysis}
 \keyword{multivariate,}