[Genabel-commits] r709 - in pkg/GenABEL: . R

Mon Apr 18 11:12:32 CEST 2011

Author: yurii
Date: 2011-04-18 11:12:31 +0200 (Mon, 18 Apr 2011)
New Revision: 709

Modified:
   pkg/GenABEL/CHANGES.LOG
   pkg/GenABEL/R/check.marker.R
   pkg/GenABEL/R/polygenic_hglm.R
Log:
update Xia's polygenic_hglm; updated check.marker, added warning if no. Y-markers is < 10 


Modified: pkg/GenABEL/CHANGES.LOG
===================================================================

--- pkg/GenABEL/CHANGES.LOG	2011-04-14 22:40:26 UTC (rev 708)
+++ pkg/GenABEL/CHANGES.LOG	2011-04-18 09:12:31 UTC (rev 709)
@@ -1,5 +1,10 @@
-*** v. 1.6-6 (2011.03.31)
+*** v. 1.6-6 (2011.04.18)
 
+Added Xia Shen's procedure 'polygenic_hglm'. Features: quick 
+convergence, standard errors for fixed effects.
+
+Updated check.marker, added warning if no. Y-markers is < 10 
+
 Applied the patch of Nicola Pirastu 
 http://lists.r-forge.r-project.org/pipermail/genabel-devel/2011-March/000182.html
 to descriptives.trait. Added RUnit regression tests, updated 
@@ -23,9 +28,6 @@
 option 'mapHasHeaderLine' to convert.snp.ped and updated documentation
 (resolving feature request #1317).  
 
-With suggestion and guidance from Xia Shen, added procedure 'polygenic_hglm'.
-At the moment experimental. 
-
 ***** v. 1.6-5 (2011.02.07)
 
 Added '#include <cstdarg>' to iterator.cpp to solve 

Modified: pkg/GenABEL/R/check.marker.R
===================================================================
--- pkg/GenABEL/R/check.marker.R	2011-04-14 22:40:26 UTC (rev 708)
+++ pkg/GenABEL/R/check.marker.R	2011-04-18 09:12:31 UTC (rev 709)
@@ -1,12 +1,12 @@
 "check.marker" <-
-function(data, snpsubset, idsubset,
-			callrate=0.95,perid.call=0.95, extr.call = 0.1, extr.perid.call = 0.1, 
-			het.fdr=0.01, ibs.threshold = 0.95, ibs.mrk = 2000, ibs.exclude="lower",
-			maf, p.level=-1, 
-			fdrate = 0.2, odds = 1000, hweidsubset, redundant="no", minconcordance = 2.0, 
-			qoption="bh95",imphetasmissing=TRUE,XXY.call=0.8, 
-			intermediateXF = c(0.5,0.5)) {
-
+		function(data, snpsubset, idsubset,
+				callrate=0.95,perid.call=0.95, extr.call = 0.1, extr.perid.call = 0.1, 
+				het.fdr=0.01, ibs.threshold = 0.95, ibs.mrk = 2000, ibs.exclude="lower",
+				maf, p.level=-1, 
+				fdrate = 0.2, odds = 1000, hweidsubset, redundant="no", minconcordance = 2.0, 
+				qoption="bh95",imphetasmissing=TRUE,XXY.call=0.8, 
+				intermediateXF = c(0.5,0.5)) {
+	
 	if (is(data,"gwaa.data")) data <- data at gtdata
 	if (!is(data,"snp.data")) stop("data argument should be of type gwaa.data or snp.data");
 	if (!missing(snpsubset)) data <- data[,snpsubset]
@@ -21,18 +21,18 @@
 	snam <- data at snpnames
 	sids <- data at idnames
 	smap <- data at map
-	schr <- data at chromosome
-
+	schr <- chromosome(data)
+	
 	cat("Excluding people/markers with extremely low call rate...\n")
-	cat(data at nsnps,"markers and",data at nids,"people in total\n")
+	cat(nsnps(data),"markers and",nids(data),"people in total\n")
 	ts <- perid.summary(data)
-	out$idnocall <- rownames(ts[ts[,"CallPP"]<extr.perid.call,])
+	out$idnocall <- rownames(ts[which(ts[,"CallPP"]<extr.perid.call),])
 	out$idok <- data at idnames[!(data at idnames %in% out$idnocall)]
 	cat(length(out$idnocall),"people excluded because of call rate <",extr.perid.call,"\n")
 	ts <- summary(data)
-	if (any(as.character(data at chromosome) == "Y") && any(data at male==1)) {
-		tsY <- summary(data[which(data at male==1),(which(as.character(data at chromosome) == "Y"))])
-		ts[(which(as.character(data at chromosome) == "Y")),] <- tsY
+	if (any(chromosome(data) == "Y") && any(male(data)==1)) {
+		tsY <- summary(data[which(male(data)==1),(which(chromosome(data) == "Y"))])
+		ts[(which(chromosome(data) == "Y")),] <- tsY
 	}
 	out$nocall <- rownames(ts[ts[,"CallRate"]<extr.call,])
 	out$snpok <- data at snpnames[!(data at snpnames %in% out$nocall)]
@@ -43,7 +43,7 @@
 	if (!missing(hweidsubset)) hweidsubset <- hweidsubset[hweidsubset %in% out$idok]
 	
 	updat <- 0
-	if (any(data at chromosome=="X")) {
+	if (any(chromosome(data)=="X")) {
 		cat("\nRunning sex chromosome checks...\n")
 		
 		#maleInd <- male(data)
@@ -72,7 +72,7 @@
 		#	cat("cut off Fmale =",Fmale,"\n")
 		#	stop("Fmale not in [0,1]")
 		#}
-	    Ffemale <- intermediateXF[1]
+		Ffemale <- intermediateXF[1]
 		Fmale <- intermediateXF[2]
 		if (Ffemale>Fmale) stop("Ffemale>Fmale")
 		if (Ffemale > 1 | Ffemale < 0) {
@@ -84,7 +84,7 @@
 			stop("Fmale not in [0,1]")
 		}
 		
-		out.nxt <- Xcheck(data[,data at chromosome=="X"],Pgte=0.001,Pssw=0.01,
+		out.nxt <- Xcheck(data[,chromosome(data)=="X"],Pgte=0.001,Pssw=0.01,
 				Pmsw=0.01,odds=odds,Ffemale=Ffemale,Fmale=Fmale,tabonly=F)
 		nxerr <- dim(out.nxt$Xerrtab)[1]
 		if (!length(nxerr)) nxerr <- 0
@@ -95,18 +95,24 @@
 		cat(length(out.nxt$otherSexErr)," people have intermediate inbreeding (",Ffemale," > F > ",Fmale,")\n",sep="")
 		out <- update.check.marker(out,out.nxt)
 		updat <- 1
-		out.nxt <- Xcheck(data[out$idok,out$snpok[out$snpok %in% data at snpnames[data at chromosome=="X"]]],Pgte=0.001,Pssw=0.01,Pmsw=0.01,odds=odds,tabonly=T)
+		out.nxt <- Xcheck(data[out$idok,out$snpok[out$snpok %in% data at snpnames[chromosome(data)=="X"]]],Pgte=0.001,Pssw=0.01,Pmsw=0.01,odds=odds,tabonly=T)
 		nxerr <- dim(out.nxt$Xerrtab)[1]
 		if (!length(nxerr)) nxerr <- 0
 		cat("If these people/markers are removed,",nxerr,"heterozygous male genotypes are left\n")
 		if (nxerr & imphetasmissing) cat("... these will be considered missing in analysis.\n")
 		if (nxerr) cat("... Use Xfix() to fix these problems.\n")
 	}
-
+	
 # XXY checks
-	if (any(data at chromosome=="Y")) {
-		totest <- unique(c(out$isfemale,intersect(data at idnames[data at male==0],out$idok)))
-		Ytab <- perid.summary(data[totest,(as.character(data at chromosome) == "Y")])
+	if (any(chromosome(data)=="Y")) {
+		nYsnps <- length(which(chromosome(data)=="Y"))
+		if (nYsnps<10) {
+			message <- paste("The number of Y-chromosome SNPs is low (",nYsnps,
+					"). Consider ignoring Y-checks by setting 'XXY.call' to 2",sep="")
+			warning(message,immediate. = TRUE)
+		}
+		totest <- unique(c(out$isfemale,intersect(data at idnames[male(data)==0],out$idok)))
+		Ytab <- perid.summary(data[totest,(chromosome(data) == "Y")])
 		Ytab <- Ytab[Ytab$NoMeasured>0,]
 		cat("\n",sum(Ytab$NoMeasured),"possibly female Y genotypes identified")
 		Ytabexc <- Ytab[Ytab$CallPP>=XXY.call,]
@@ -125,7 +131,7 @@
 			cat("\nNone of these people excluded based on Y-threshold of",XXY.call,"\n")
 		}
 	}
-
+	
 	if (updat) { 
 		data <- data[out$idok,out$snpok]
 		gc(verbose=FALSE)
@@ -133,25 +139,25 @@
 	}
 #	out<-list(qc=out,data=data)
 #	return(out)
-
+	
 	ts <- summary(data)
-	if (any(data at chromosome=="Y")) {
+	if (any(chromosome(data)=="Y")) {
 		cat("\nChecking Y-chromsome heterozygous genotypes... ");
-		sumYerr <- sum(ts[data at chromosome=="Y","P.12"],na.rm=T)
-		cat(sumYerr,"(",100*sumYerr/sum(ts[data at chromosome=="Y","NoMeasured"],na.rm=T),"%) found.\n");
+		sumYerr <- sum(ts[chromosome(data)=="Y","P.12"],na.rm=T)
+		cat(sumYerr,"(",100*sumYerr/sum(ts[chromosome(data)=="Y","NoMeasured"],na.rm=T),"%) found.\n");
 		if (sumYerr & imphetasmissing) cat("... these will be considered missing in analysis.\n")
 		if (sumYerr) cat("... Use Xfix() to fix these problems.\n")
 	}
-	if (any(data at chromosome=="mt")) {
+	if (any(chromosome(data)=="mt")) {
 		cat("\nChecking mtDNA heterozygous genotypes... ");
-		sumMTerr <- sum(ts[data at chromosome=="mt","P.12"],na.rm=T)
-		cat(sumMTerr,"(",100*sumMTerr/sum(ts[data at chromosome=="mt","NoMeasured"],na.rm=T),"%) found.\n");
+		sumMTerr <- sum(ts[chromosome(data)=="mt","P.12"],na.rm=T)
+		cat(sumMTerr,"(",100*sumMTerr/sum(ts[chromosome(data)=="mt","NoMeasured"],na.rm=T),"%) found.\n");
 		if (sumMTerr & imphetasmissing) cat("... these will be considered missing in analysis.\n")
 		if (sumMTerr) cat("... Use Xfix() to fix these problems.\n")
 	}
 	rm(ts)
 	gc(verbose=FALSE)
-
+	
 	if (imphetasmissing & (data at nsnps != length(autosomal(data)))) {
 		cat("\n")
 		data <- Xfix(data)
@@ -159,18 +165,18 @@
 		cat("\n")
 	}
 	
-
+	
 # first round -- redundancy 
 	run <- 1
 	passed <- 0
 	while (!passed) {
 		cat("\nRUN",run,"\n");
 		out.nxt <- check.marker.internal(data=data, 
-			callrate=callrate,perid.call=perid.call,
-			het.fdr=het.fdr, ibs.threshold = ibs.threshold, ibs.mrk = ibs.mrk, ibs.exclude=ibs.exclude,
-			maf = maf, p.level= p.level, 
-			fdrate = fdrate, hweidsubset = hweidsubset, redundant = "no", minconcordance = 2.0, 
-			qoption = qoption, imphetasmissing = imphetasmissing)
+				callrate=callrate,perid.call=perid.call,
+				het.fdr=het.fdr, ibs.threshold = ibs.threshold, ibs.mrk = ibs.mrk, ibs.exclude=ibs.exclude,
+				maf = maf, p.level= p.level, 
+				fdrate = fdrate, hweidsubset = hweidsubset, redundant = "no", minconcordance = 2.0, 
+				qoption = qoption, imphetasmissing = imphetasmissing)
 		if (length(out$snpok) == length(out.nxt$snpok) && length(out$idok) == length(out.nxt$idok))
 			if (all(out$snpok == out.nxt$snpok) && all(out$idok == out.nxt$idok)) 
 				passed <- 1
@@ -185,11 +191,11 @@
 	if (redundant != "no") {
 		cat("\nCHECK REDUNDANCY\n");
 		out.nxt <- check.marker.internal(data=data, 
-			callrate=callrate,perid.call=perid.call,
-			het.fdr=het.fdr, ibs.threshold = ibs.threshold, ibs.mrk = ibs.mrk, ibs.exclude=ibs.exclude,
-			maf = maf, p.level= p.level, 
-			fdrate = fdrate, hweidsubset = hweidsubset, redundant = redundant, minconcordance = minconcordance, 
-			qoption = qoption, imphetasmissing = imphetasmissing)
+				callrate=callrate,perid.call=perid.call,
+				het.fdr=het.fdr, ibs.threshold = ibs.threshold, ibs.mrk = ibs.mrk, ibs.exclude=ibs.exclude,
+				maf = maf, p.level= p.level, 
+				fdrate = fdrate, hweidsubset = hweidsubset, redundant = redundant, minconcordance = minconcordance, 
+				qoption = qoption, imphetasmissing = imphetasmissing)
 		out <- update.check.marker(out,out.nxt)
 	}
 	out$call$name <- snam

Modified: pkg/GenABEL/R/polygenic_hglm.R
===================================================================
--- pkg/GenABEL/R/polygenic_hglm.R	2011-04-14 22:40:26 UTC (rev 708)
+++ pkg/GenABEL/R/polygenic_hglm.R	2011-04-18 09:12:31 UTC (rev 709)
@@ -1,9 +1,7 @@
 #' Estimation of polygenic model
 #' 
-#' Estimation of polygenic model using hierarchical
-#' GLM (hglm package)
-#' Tell when REML is used, what otherwise; 
-#' what it is doing, how to do tests, add more examples, references, etc.
+#' Estimation of polygenic model using a hierarchical generalized linear model 
+#' (HGLM; Lee and Nelder 1996. \code{hglm} package; Ronnegard et al. 2010).
 #' 
 #' @param formula Formula describing fixed effects to be used in analysis, e.g. 
 #' y ~ a + b means that outcome (y) depends on two covariates, a and b. 
@@ -12,18 +10,36 @@
 #' @param kinship.matrix Kinship matrix, as provided by e.g. ibs(,weight="freq"), 
 #' or estimated outside of GenABEL from pedigree data.
 #' @param data An (optional) object of \code{\link{gwaa.data-class}} or a data frame with 
-#' outcome and covariates
+#' outcome and covariates.
 #' @param family a description of the error distribution and link function to be 
 #' used in the mean part of the model (see \code{\link{family}} for details of 
-#' family functions)
-#' @param conv 'conv' parameter of 'hglm' (stricter then default)
-#' @param maxit 'maxit' parameter of 'hglm' (stricter then default)
-#' @param ... other parameters passed to 'hglm' call
+#' family functions).
+#' @param conv 'conv' parameter of \code{hglm} (stricter than default, for great precision, use 1e-8).
+#' @param maxit 'maxit' parameter of \code{hglm} (stricter than default).
+#' @param ... other parameters passed to \code{hglm} call
 #' 
 #' @author Xia Shen, Yurii Aulchenko
 #' 
+#' @details
+#'
+#' The algorithm gives extended quasi-likelihood (EQL) estimates
+#' for restricted maximum likelihood (REML) (Ronnegard et al. 2010).
+#' Implemented is the inter-connected GLM interpretation of HGLM (Lee and Nelder 2001).
+#' Testing on fixed and random effects estimates are directly done using inter-connected
+#' GLMs. Testing on dispersion parameters (variance components) can be done by extracting
+#' the profile likelihood function value of REML.
+#' 
 #' @references 
-#' need to put reference here
+#'
+#' Ronnegard, L, Shen, X and Alam, M (2010). hglm: A Package For Fitting 
+#' Hierarchical Generalized Linear Models. \emph{The R Journal}, \bold{2}(2), 20-28.
+#'
+#' Lee, Y and Nelder JA (2001). Hierarchical generalised linear models: 
+#' A synthesis of generalised linear models, random-effect models 
+#' and structured dispersions. \emph{Biometrika}, \bold{88}(4), 987-1006.
+#'
+#' Lee, Y and Nelder JA (1996). Hierarchical Generalized Linear Models. 
+#' \emph{Journal of the Royal Statistical Society. Series B}, \bold{58}(4), 619-678.
 #' 
 #' @seealso 
 #' \code{\link{polygenic}},
@@ -35,22 +51,56 @@
 #' df <- ge03d2ex.clean[,autosomal(ge03d2ex.clean)]
 #' gkin <- ibs(df,w="freq")
 #' 
-#' # for quantitative traits
-#' h2ht <- polygenic_hglm(height ~ sex + age,kin=gkin,df)
-#' # estimate of heritability
+#' # ----- for quantitative traits
+#' h2ht <- polygenic_hglm(height ~ sex + age, kin = gkin, df)
+#' # ----- estimate of heritability
 #' h2ht$esth2
-#' # other parameters
+#' # ----- other parameters
 #' h2ht$h2an
 #' 
-#' # for binary traits
-#' h2dm <- polygenic_hglm(dm2 ~ sex + age,kin=gkin,df,family=binomial(link = 'logit'))
-#' # estimated parameters
+#' # ----- test the significance of fixed effects 
+#' # ----- (e.g. quantitative trait)
+#' h2ht <- polygenic_hglm(height ~ sex + age, kin = gkin, df)
+#' # ----- summary with standard errors and p-values
+#' summary(h2ht$hglm)
+#' # ----- output for the fixed effects part
+#' # ...
+#' # MEAN MODEL
+#' # Summary of the fixed effects estimates 
+#' #              Estimate Std. Error t value Pr(>|t|)    
+#' # (Intercept) 169.53525    2.57624  65.807  < 2e-16 ***
+#' # sex          14.10694    1.41163   9.993  4.8e-15 ***
+#' # age          -0.15332    0.05208  -2.944  0.00441 ** 
+#' # ---
+#' # Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
+#' # Note: P-values are based on 69 degrees of freedom
+#' # ...
+#' # ----- extract fixed effects estimates and standard errors
+#' h2ht$h2an
+#' 
+#' # test the significance of polygenic effect
+#' h2ht <- polygenic_hglm(height ~ sex + age, kin = gkin, df, method = 'REML')
+#' nullht <- lm(height ~ sex + age, df)
+#' l1 <- h2ht$ProfLogLik
+#' l0 <- as.numeric(logLik(nullht))
+#' # the likelihood ratio test (LRT) statistic
+#' (S <- -2*(l0 - l1))
+#' # 5% right-tailed significance cutoff 
+#' # for 50:50 mixture distribution between point mass 0 and \chi^{2}(1)
+#' # Self, SG and Liang KY (1987) Journal of the American Statistical Association.
+#' qchisq(((1 - .05) - .50)/.50, 1)
+#' # p-value
+#' pchisq(S, 1, lower.tail = FALSE)*2
+#' 
+#' #' # ----- for binary traits
+#' h2dm <- polygenic_hglm(dm2 ~ sex + age, kin = gkin, df, family = binomial(link = 'logit'))
+#' # ----- estimated parameters
 #' h2dm$h2an
 #' 
 #' 
 #' @keywords htest
 #' 
-"polygenic_hglm" <- function(formula,kinship.matrix,data,family=gaussian(), conv=1e-8, maxit=100, ... )
+"polygenic_hglm" <- function(formula, kinship.matrix, data, family = gaussian(), conv = 1e-6, maxit = 100, ...)
 {
 	if (!require(hglm))
 		stop("this function requires 'hglm' package to be installed")
@@ -88,12 +138,16 @@
 	out$h2an$estimate <- c(res_hglm$fixef,out$esth2,tVar)
 	names(out$h2an$estimate)[(length(out$h2an$estimate)-1):(length(out$h2an$estimate))] <- 
 			c("h2","totalVar")
+	out$h2an$se <- c(res_hglm$SeFe,NA,NA)
+	names(out$h2an$se) <- 
+			c(names(res_hglm$fixef), "h2","totalVar")
 	out$pgresidualY <- rep(NA,length(mids))
 	out$pgresidualY[mids] <- y-res_hglm$fv
 	out$residualY <- rep(NA,length(mids))
 	out$residualY[mids] <- y - desmat %*% res_hglm$fixef
 	out$InvSigma <- ginv(tVar*out$esth2*relmat + 
 					diag(tVar*(1-out$esth2),ncol=length(y),nrow=length(y)))
+	out$ProfLogLik <- res_hglm$ProfLogLik
 	
 	class(out) <- "polygenic"
 	out