[Genabel-commits] r2075 - in pkg/MultiABEL: . R

Tue May 15 06:33:17 CEST 2018

Author: shenxia
Date: 2018-05-15 06:33:16 +0200 (Tue, 15 May 2018)
New Revision: 2075

Modified:
   pkg/MultiABEL/ChangeLog
   pkg/MultiABEL/DESCRIPTION
   pkg/MultiABEL/R/MultiSummary.R
   pkg/MultiABEL/R/Multivariate.R
Log:
1.1-8

Modified: pkg/MultiABEL/ChangeLog
===================================================================

--- pkg/MultiABEL/ChangeLog	2017-06-15 17:03:58 UTC (rev 2074)
+++ pkg/MultiABEL/ChangeLog	2018-05-15 04:33:16 UTC (rev 2075)
@@ -1,3 +1,15 @@
+2017-05-15 13:21 xia
+    
+    * Updated to version 1.1-8
+    * Fixed F-stat degrees of freedom bug
+    * Effective sample size reported
+
+2018-03-22 12:50 xia
+    
+    * Updated to version 1.1-7
+    * Added F-stat test option for high-dimensional 
+      cases (argument high.dim)
+
 2017-06-14 23:50 xia
     
     * Updated to version 1.1-6

Modified: pkg/MultiABEL/DESCRIPTION
===================================================================
--- pkg/MultiABEL/DESCRIPTION	2017-06-15 17:03:58 UTC (rev 2074)
+++ pkg/MultiABEL/DESCRIPTION	2018-05-15 04:33:16 UTC (rev 2075)
@@ -1,8 +1,8 @@
 Package: MultiABEL
 Type: Package
 Title: Multi-Trait Genome-Wide Association Analysis
-Version: 1.1-6
-Date: 2017-06-14
+Version: 1.1-8
+Date: 2018-05-15
 Author: Xia Shen
 Maintainer: Xia Shen <xia.shen at ed.ac.uk>
 Description: Multivariate genome-wide association analyses. The analysis can be
@@ -18,5 +18,5 @@
 License: GPL (>= 2)
 URL: https://github.com/xiashen/MultiABEL
 LazyLoad: yes
-Packaged: 2017-06-14 23:26 CET; xia
+Packaged: 2018-05-15 07:19 CET; xia
 RoxygenNote: 6.0.1
\ No newline at end of file

Modified: pkg/MultiABEL/R/MultiSummary.R
===================================================================
--- pkg/MultiABEL/R/MultiSummary.R	2017-06-15 17:03:58 UTC (rev 2074)
+++ pkg/MultiABEL/R/MultiSummary.R	2018-05-15 04:33:16 UTC (rev 2075)
@@ -13,19 +13,24 @@
 #' scale-invariant test most similar to the direct MANOVA, but may be less powerful in some scenarios.
 #' @param vars A numeric vector gives the variance of the genotypes at each SNP, e.g. coded as 0, 1 and 2.
 #' Only used when \code{type = "precise"}.
+#' @param high.dim Are the phenotypes high-dimensional or not? This is particularly important when the ratio
+#' of the number of individuals (n) to the number of phenotypes being analyzed (p) is not big enough, e.g when
+#' analyzing a big number of omics phenotypes in a small cohort. Default = \code{FALSE}.
 #' 
 #' @return The function returns a data frame containing the multi-trait GWAS results, where the row names are
 #' the variants names. The column names are: variant name (\code{Marker}), allele frequency (\code{Freq}),
-#' the smallest sample size of the traits (\code{N}), effect on the phenotype score (\code{Beta.S}, see reference),
+#' the effective sample size of the multiple traits (\code{N}), effect on the phenotype score (\code{Beta.S}, see reference),
 #' standard error (\code{SE}), p-value (\code{P}), and the rest the coefficients to construct the phenotype score
 #' (see reference).
 #' 
 #' @author Xia Shen
 #' 
 #' @references 
-#' Xia Shen, Zheng Ning, Yakov Tsepilov, Peter K. Joshi,
-#' James F. Wilson, Yudi Pawitan, Chris S. Haley, Yurii S. Aulchenko (2016).
-#' Fast pleiotropic meta-analysis for genetic studies. \emph{Submitted}.
+#' Zheng Ning, Yakov Tsepilov, Peter K. Joshi,
+#' James F. Wilson, Yudi Pawitan, Chris S. Haley, 
+#' Yurii S. Aulchenko, Xia Shen (2018).
+#' Pleiotropic meta-analysis for genomic studies: discovery, replication, 
+#' and interpretation. \emph{Submitted}.
 #' 
 #' @seealso 
 #' \code{load.summary}
@@ -56,7 +61,7 @@
 #' @aliases MultiSummary, multi.summary
 #' @keywords multivariate, meta-analysis
 #' 
-`MultiSummary` <- function(x, index = NULL,  type = 'direct', vars = NULL) {
+`MultiSummary` <- function(x, index = NULL,  type = 'direct', vars = NULL, high.dim = FALSE) {
 	if (class(x) != 'multi.summary') {
 		stop('Wrong data type!')
 	}
@@ -110,12 +115,26 @@
 		scan1 <- .Fortran('MultiSummaryLoopDirect', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				tmat = tmat, invR = solve(x$cor.pheno), 
 				pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		fs <- scan1$pil/m/(n - scan1$pil)*(n - m - 1)
+		mf <- mean(fs)
+		n.eff <- mf*2/(mf - 1) + m + 1 # effective N: fit a F-distribution to the observed fstat using moment estimator
+		res$n <- round(n.eff, digits = 2)
+		if (!high.dim) {
+			if (n.eff < 5*m) {
+				warning('Effective sample size < 5 times number of phenotypes. Consider argument high.dim = TRUE.')
+			}
+			pv <- pf(fstat, m, n - m - 1, lower.tail = FALSE)	
+		} else {
+			pv <- pf(fs, m, n.eff - m - 1, lower.tail = FALSE)	
+			# scan1 <- .Fortran('MultiSummaryLoopDirectFstat', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
+			#		betamat = betamat, invsemat = 1/semat, invse = diag(m), invR = solve(x$cor.pheno), invV = diag(m), 
+			#		pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		}
 		scan2 <- .Fortran('MultiSummaryLoop', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				f = as.numeric(f), betamat = betamat, R = x$cor.pheno, invR = solve(x$cor.pheno), D = matrix(0, m, m),
 				sdY = diag(sqrt(x$var.pheno)), invsdY = diag(1/sqrt(x$var.pheno)), sY = sqrt(x$var.pheno), 
 				b = betamat, s = betamat, pil = numeric(k), coef = betamat, ss = betamat, PACKAGE = "MultiABEL")
-		res2 <- data.frame(p = pf(scan1$fstat, m, n - m - 1, lower.tail = FALSE),
-		                   beta.score = scan2$pil)
+		res2 <- data.frame(p = pv, beta.score = scan2$pil)
 		res2$se.score <- res2$beta.score/sqrt(qchisq(res2$p, 1, lower.tail = FALSE))
 	    res3[,seq(1, m*3, 3)] <- scan2$b
 		res3[,seq(2, m*3, 3)] <- scan2$s
@@ -128,12 +147,26 @@
 		scan1 <- .Fortran('MultiSummaryLoopDirect', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				tmat = tmat, invR = solve(x$cor.pheno), 
 				pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		fs <- scan1$pil/m/(n - scan1$pil)*(n - m - 1)
+		mf <- mean(fs)
+		n.eff <- mf*2/(mf - 1) + m + 1 # effective N: fit a F-distribution to the observed fstat using moment estimator
+		res$n <- round(n.eff, digits = 2)
+		if (!high.dim) {
+			if (n.eff < 5*m) {
+				warning('Effective sample size < 5 times number of phenotypes. Consider argument high.dim = TRUE.')
+			}
+			pv <- pf(fstat, m, n - m - 1, lower.tail = FALSE)	
+		} else {
+			pv <- pf(fs, m, n.eff - m - 1, lower.tail = FALSE)	
+			# scan1 <- .Fortran('MultiSummaryLoopDirectFstat', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
+			#		betamat = betamat, invsemat = 1/semat, invse = diag(m), invR = solve(x$cor.pheno), invV = diag(m), 
+			#		pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		}
 		scan2 <- .Fortran('MultiSummaryLoopInbred', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				f = as.numeric(f), betamat = betamat, R = x$cor.pheno, invR = solve(x$cor.pheno), D = matrix(0, m, m),
 				sdY = diag(sqrt(x$var.pheno)), invsdY = diag(1/sqrt(x$var.pheno)), sY = sqrt(x$var.pheno), 
 				b = betamat, s = betamat, pil = numeric(k), coef = betamat, ss = betamat, PACKAGE = "MultiABEL")
-		res2 <- data.frame(p = pf(scan1$fstat, m, n - m - 1, lower.tail = FALSE),
-				beta.score = scan2$pil)
+		res2 <- data.frame(p = pv, beta.score = scan2$pil)
 		res2$se.score <- res2$beta.score/sqrt(qchisq(res2$p, 1, lower.tail = FALSE))
 		res3[,seq(1, m*3, 3)] <- scan2$b
 		res3[,seq(2, m*3, 3)] <- scan2$s
@@ -146,12 +179,26 @@
 		scan1 <- .Fortran('MultiSummaryLoopDirect', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				tmat = tmat, invR = solve(x$cor.pheno), 
 				pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		fs <- scan1$pil/m/(n - scan1$pil)*(n - m - 1)
+		mf <- mean(fs)
+		n.eff <- mf*2/(mf - 1) + m + 1 # effective N: fit a F-distribution to the observed fstat using moment estimator
+		res$n <- round(n.eff, digits = 2)
+		if (!high.dim) {
+			if (n.eff < 5*m) {
+				warning('Effective sample size < 5 times number of phenotypes. Consider argument high.dim = TRUE.')
+			}
+			pv <- pf(fstat, m, n - m - 1, lower.tail = FALSE)	
+		} else {
+			pv <- pf(fs, m, n.eff - m - 1, lower.tail = FALSE)	
+			# scan1 <- .Fortran('MultiSummaryLoopDirectFstat', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
+			#		betamat = betamat, invsemat = 1/semat, invse = diag(m), invR = solve(x$cor.pheno), invV = diag(m), 
+			#		pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		}
 		scan2 <- .Fortran('MultiSummaryLoopPrecise', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				varg = vars, betamat = betamat, R = x$cor.pheno, invR = solve(x$cor.pheno), D = matrix(0, m, m),
 				sdY = diag(sqrt(x$var.pheno)), invsdY = diag(1/sqrt(x$var.pheno)), sY = sqrt(x$var.pheno), 
 				b = betamat, s = betamat, pil = numeric(k), coef = betamat, ss = betamat, PACKAGE = "MultiABEL")
-		res2 <- data.frame(p = pf(scan1$fstat, m, n - m - 1, lower.tail = FALSE),
-				beta.score = scan2$pil)
+		res2 <- data.frame(p = pv, beta.score = scan2$pil)
 		res2$se.score <- res2$beta.score/sqrt(qchisq(res2$p, 1, lower.tail = FALSE))
 		res3[,seq(1, m*3, 3)] <- scan2$b
 		res3[,seq(2, m*3, 3)] <- scan2$s
@@ -161,10 +208,25 @@
 		resc[,seq(2, m*3, 3)] <- sqrt(scan2$ss)
 		resc[,seq(3, m*3, 3)] <- pchisq(scan2$coef**2/scan2$ss, 1, lower.tail = FALSE)
 	} else if (type == 'direct') {
-		scan <- .Fortran('MultiSummaryLoopDirect', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
+		scan1 <- .Fortran('MultiSummaryLoopDirect', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
 				tmat = tmat, invR = solve(x$cor.pheno), 
 				pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
-		res$p <- pf(scan$fstat, m, n - m - 1, lower.tail = FALSE)
+		fs <- scan1$pil/m/(n - scan1$pil)*(n - m - 1)
+		mf <- mean(fs)
+		n.eff <- mf*2/(mf - 1) + m + 1 # effective N: fit a F-distribution to the observed fstat using moment estimator
+		res$n <- round(n.eff, digits = 2)
+		if (!high.dim) {
+			if (n.eff < 5*m) {
+				warning('Effective sample size < 5 times number of phenotypes. Consider argument high.dim = TRUE.')
+			}
+			pv <- pf(fstat, m, n - m - 1, lower.tail = FALSE)	
+		} else {
+			pv <- pf(fs, m, n.eff - m - 1, lower.tail = FALSE)	
+			# scan1 <- .Fortran('MultiSummaryLoopDirectFstat', k = as.integer(k), m = as.integer(m), nn = as.numeric(n), 
+			#		betamat = betamat, invsemat = 1/semat, invse = diag(m), invR = solve(x$cor.pheno), invV = diag(m), 
+			#		pil = numeric(k), fstat = numeric(k), PACKAGE = "MultiABEL")
+		}
+		res$p <- pv
 	}else {
 		stop('Wrong type of analysis!')
 	}

Modified: pkg/MultiABEL/R/Multivariate.R
===================================================================
--- pkg/MultiABEL/R/Multivariate.R	2017-06-15 17:03:58 UTC (rev 2074)
+++ pkg/MultiABEL/R/Multivariate.R	2018-05-15 04:33:16 UTC (rev 2075)
@@ -22,9 +22,12 @@
 #' @author Xia Shen
 #' 
 #' @references 
-#' Xia Shen, ..., Jim Wilson, Gordan Lauc, Yurii Aulchenko (2015).
-#' Multi-omic-variate analysis identified novel loci associated with 
-#' compound N-Glycosylation of human Immunoglobulin G. \emph{Submitted}.
+#' Shen X, Klarić L, Sharapov S, Mangino M, Ning Z, Wu D, 
+#' Trbojević-Akmačić I, Pučić-Baković M, Rudan I, Polašek O, 
+#' Hayward C, Spector TD, Wilson JF, Lauc G, Aulchenko YS (2017): 
+#' Multivariate discovery and replication of five novel loci 
+#' associated with Immunoglobulin G N-glycosylation. 
+#' \emph{Nature Communications}, 8, 447; doi: 10.1038/s41467-017-00453-3.
 #' 
 #' @seealso 
 #' \code{\link{MultiLoad}}