[Genabel-commits] r1412 - in pkg/GenABEL: . R man

noreply at r-forge.r-project.org
Thu Nov 21 21:00:02 CET 2013


Author: nd_0001
Date: 2013-11-21 21:00:01 +0100 (Thu, 21 Nov 2013)
New Revision: 1412

Added:
   pkg/GenABEL/R/VIFGC.R
   pkg/GenABEL/R/VIFGC_ovdom.R
Removed:
   pkg/GenABEL/R/GC.R
   pkg/GenABEL/R/GC_ovdom.R
   pkg/GenABEL/man/GC.Rd
   pkg/GenABEL/man/GC_ovdom.Rd
Modified:
   pkg/GenABEL/NAMESPACE
   pkg/GenABEL/R/PGC.R
   pkg/GenABEL/generate_documentation.R
   pkg/GenABEL/man/GenABEL.Rd
   pkg/GenABEL/man/PGC.Rd
   pkg/GenABEL/man/estlambda.Rd
   pkg/GenABEL/man/export.plink.Rd
   pkg/GenABEL/man/grammar.Rd
   pkg/GenABEL/man/impute2databel.Rd
   pkg/GenABEL/man/impute2mach.Rd
Log:
Changes in GC functions: GC and GC_ovdom are replaced by VIFGC and VIFGC_ovdom, and PGC gains a "group_regress" method and plotting options

Modified: pkg/GenABEL/NAMESPACE
===================================================================
--- pkg/GenABEL/NAMESPACE	2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/NAMESPACE	2013-11-21 20:00:01 UTC (rev 1412)
@@ -55,8 +55,8 @@
 	findRelatives,
 	formetascore,
 	GASurv,
-	GC,
-	GC_ovdom,
+	VIFGC,
+	VIFGC_ovdom,
 	generateOffspring,
 	getLogLikelihoodGivenRelation,
 	grammar,

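Note on the NAMESPACE change: the exported names GC and GC_ovdom are replaced by VIFGC and VIFGC_ovdom, so user scripts have to switch to the new names. A minimal sketch of the rename, with the call taken from the roxygen example in the removed GC.R below (the objects freq, chi2.1df and ge03d2 are assumed to be prepared as in that example):

    # old call, removed in this revision:
    # result <- GC(p = freq, x = 1, method = "median", CA = FALSE,
    #              data = chi2.1df, n = nids(ge03d2))
    # new call, same arguments, new exported name:
    result <- VIFGC(p = freq, x = 1, method = "median", CA = FALSE,
                    data = chi2.1df, n = nids(ge03d2))
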
Deleted: pkg/GenABEL/R/GC.R
===================================================================
--- pkg/GenABEL/R/GC.R	2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/GC.R	2013-11-21 20:00:01 UTC (rev 1412)
@@ -1,285 +0,0 @@
-#' Genomic control for various models of inheritance using VIF
-#' 
-#' This function estimates the corrected statistic using genomic control
-#' for different models (recessive, dominant, additive, etc.),
-#' using VIF. VIF coefficients are estimated
-#' by optimizing different error functions: regress,
-#' median and ks.test.
-#'  
-#' @param data Input vector of Chi square statistic
-#' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
-#' @param p Input vector of allele frequencies
-#' @param x Model of inheritance (0 for recessive,0.5 for additive, 1 for dominant)
-#' @param index.filter Indexes of variables in the data vector that will be used for analysis
-#' @param n size of the sample
-#' @param proportion The proportion of lowest P (Chi2) to be used when
-#'   estimating the inflation factor Lambda for "regress" method only
-#' @param clust For developers only
-#' @param vart0 For developers only
-#' @param tmp For developers only
-#' @param CA For developers only
-#' @param p.table For developers only
-#' 
-#' @return A list with elements
-#' \item{Zx}{output vector corrected Chi square statistic}
-#' \item{vv}{output vector of VIF}
-#' \item{exeps}{output vector of exceptions (NA)}
-#' \item{calrate}{output vector of calrate}
-#' \item{F}{F}
-#' \item{K}{K}
-#' 
-#' @author Yakov Tsepilov
-#' 
-#' @examples
-#' data(ge03d2)
-#' # truncate the data to make the example faster
-#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
-#' qts <- mlreg(dm2~sex,data=ge03d2,gtmode = "dominant")
-#' chi2.1df <- results(qts)$chi2.1df
-#' s <- summary(ge03d2)
-#' freq <- s$Q.2
-#' result <- GC(p=freq,x=1,method = "median",CA=FALSE,data=chi2.1df,n=nids(ge03d2))
-#' 
-#' @keywords htest
-#'
-
-GC = function(data,p,x,method = "regress",n,index.filter=NULL,proportion=1,clust=0,vart0=0,tmp=0,CA=FALSE,p.table=0){
-#
-	if (is.null(index.filter)){
-		ind.function=which(!is.na(data))
-	} else ind.function=index.filter
-	ind.function=ind.function[which(!is.na(data[ind.function]))]
-	#
-	if (!(method=="regress" | method=="median" | method=="ks.test")){
-		print("Error. I do not know this method");
-		break;
-	}
-	#
-	if (CA){
-		p.table[1,1,]->p00
-		p.table[1,2,]->p01
-		p.table[1,3,]->p02
-		p.table[1,4,]->n0
-
-		p.table[2,1,]->p10
-		p.table[2,2,]->p11
-		p.table[2,3,]->p12
-		p.table[2,4,]->n1
-
-		p.table[3,1,]->p0
-		p.table[3,2,]->p1
-		p.table[3,3,]->p2
-		p.table[3,4,]->n
-		Zx=0;
-		exeps=0;
-		Dx=(p12-p02)+x*(p11-p01);
-		i=1;
-		while (i<=length(n)){
-			vart=((p2[i]+p1[i]*x^2)-(p2[i]+x*p1[i])^2)*((1/n1[i])+(1/n0[i]));
-			if (vart==0) {Zx[i]=NA; exeps[i]=T; } else{
-			Zx[i]=((Dx[i])^2)/(vart); exeps[i]=F}
-			i=i+1;
-		}
-		#Zx=CASforVIF(p.table,p,x)
-		inf=which(!is.na(Zx));
-		notinf=which(is.na(Zx));
-	} else {
-		Zx=data
-		inf=which(!is.na(Zx));
-		notinf=which(is.na(Zx));
-	}
-	#
-	Clust <- function(k){
-		#l=which(data1 at phdata$dm2==k)
-		#data1 <- data1[l,inf]
-		#data1 <- Xfix(data1)
-		#detach(df at phdata)
-		#data1.gkin <- ibs(data1[,sample(which(data1 at gtdata@chromosome!="X"),1000)],weight="freq")
-		data.dist <- as.dist(0.5-tmp)
-		data.mds <- cmdscale(data.dist)
-		return(data.mds)
-	}
-	#
-	ClustN <- function(k,data1.mds){
-		km <- kmeans(data1.mds,centers=k,nstart=1000)
-		i=1;
-		  cl=0
-		while (i<=k){
-			cl[i] <- length(which(km$cluster==i))
-			i=i+1;
-		}
-		return (cl);
-	}
-	#
-
-	VIF <- function(p,N,F,K){
-		q=1-p;
-		Var = ((2*p*(1-F)*q*x^2+F*p+(1-F)*p^2)-(2*p*q*(1-F)*x+F*p+(1-F)*p^2)^2);
-		S=(1+F)*(1+2*F);
-		Cov1=((6*F^3*p+11*(1-F)*F^2*p^2+(1-F)^3*p^4+6*(1-F)^2*F*p^3)/S)-((1-F)*p^2 + F*p)^2; 
-		Cov2=x*(((4*(2*(1-F)*F^2*p*q+(1-F)^3*p^3*q+3*(1-F)^2*F*p^2*q))/S)-2*((1-F)*p^2+F*p)*(2*(1 - F)*p*q)); 
-		Cov3=x^2*((4*((1-F)^3*p^2*q^2+F*(1-F)*p*q))/S-(2*(1-F)*p*q)^2);
-		Cov=Cov1+Cov2+Cov3;
-		if (vart0==1){VarT0=N*Var-N*Cov;}
-		if (vart0==0){VarT0=N*Var;}
-		VarT=VarT0+Cov*K;
-		#VarT=VarT0+Cov*(K-N);
-		VIF=VarT/VarT0;
-		return(VIF)
-	}
-	#
-	GC_VIF_nlm = function(FK){
-	###
-		F=FK[1];
-		K=FK[2];
-		#if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
-		vector_vif=VIF(p,n,F,K);
-		Zxl=Zx/vector_vif;
-		Zxl=sort(Zxl[ind.function]);
-		Zxl_r=Zxl[1:ntp]
-		if (method=="ks.test"){
-		dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
-		}
-		if (method=="median"){
-		dMedian=abs(qchisq(.5,1)-median(Zxl));
-		}
-		if (method=="regress")
-		{dMedian=sum((Zxl_r-Chi2)^2);}
-		#}
-		return(1*dMedian)
-	}
-	###
-	GC_VIF = function(F,K){
-	###
-		#F=FK[1];
-		#K=FK[2];
-		#if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
-		vector_vif=VIF(p,n,F,K);
-		Zxl=Zx/vector_vif;
-		Zxl=sort(Zxl[ind.function]);
-		Zxl_r=Zxl[1:ntp]
-		if (method=="ks.test"){
-		dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
-		}
-		if (method=="median"){
-		dMedian=abs(qchisq(.5,1)-median(Zxl));
-		}
-		if (method=="regress")
-		{dMedian=sum((Zxl_r-Chi2)^2);}
-		#}
-		return(1*dMedian)
-	}
-
-	###
-	F=0.5; K=n[1];
-
-	#N_inf=sum(!is.na(data[ind.function]))
-	#Chi2=(0:(N_inf-1))/N_inf;
-	#Chi2=qchisq(Chi2,1)
-	data_p <- data[ind.function]
-	if (proportion>1.0 || proportion<=0) 
-		stop("proportion argument should be greater than zero and less than or equal to one")
-	ntp <- round(proportion*length(data_p))
-	if (ntp<1) stop("no valid measurements")
-	if (ntp==1) {
-		warning(paste("One measurement, Lambda = 1 returned"))
-		return(list(estimate=1.0,se=999.99))
-	}
-	if (ntp<10) warning(paste("number of points is too small:",ntp))
-	if (min(data_p)<0) stop("data argument has values <0")
-	if (max(data_p)<=1) {
-		data_p<- qchisq(data_p,1,lower.tail=FALSE)
-	}
-		
-	data_p <- sort(data_p)
-	ppoi <- ppoints(data_p)
-	Chi2 <- sort(qchisq(1-ppoi,1))
-	Chi2 <- Chi2[1:ntp]
-	#####
-	if (clust==1){
-	data.mds0 <- Clust(0)
-	#plot(data.mds0)
-	k=2;
-	cl0 <- ClustN(k,data.mds0)
-	data.mds1 <- Clust(1)
-	#plot(data.mds1)
-	k=2;
-	cl1 <- ClustN(k,data.mds1)
-	if (length(cl1)>length(cl0)){cl0[(length(cl0)+1):length(cl1)]=0}
-	if (length(cl0)>length(cl1)){cl1[(length(cl1)+1):length(cl0)]=0}
-	S=sum(cl0);
-	R=sum(cl1);
-	K=sum((cl1*S-cl0*R)^2)
-	K1=K/(R*S)
-	#####
-	#opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=0.2);
-	opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=K1);
-	#opt_3=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=opt_2$minimum);
-	F=opt_2$minimum;
-	K=K1;
-		#j=0.01;
-		#min=10000;
-		#while (j<1){
-		#	opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=j);
-		#	opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=opt_1$minimum);
-		#	if (opt_1$objective<min){	
-		#		K=opt_1$minimum;
-		#		F=j;
-		#		min=opt_1$objective;
-		#	}
-		#	if (opt_2$objective<min){	
-		#		K=opt_1$minimum;
-		#		F=opt_2$minimum;
-		#		min=opt_2$objective;
-		#	}
-		#	j=j+0.1;	
-		#}	
-		#FK=nlm(GC_VIF ,p=c(F=0.5, K=680))$estimate;
-		#F=FK[1];
-		#K=FK[2];
-	}
-	if (clust==0){
-		#diag(tmp) <- diag(tmp)-0.5;
-		#F1=abs(mean(tmp[lower.tri(tmp,diag=T)]));
-		#l1=which (tmp>=0 & lower.tri(tmp,diag=T));
-		#F1=mean(tmp[l1]);
-		if ((median(Zx,na.rm=T)/qchisq(.5,1))>=1){
-			lda=median(Zx,na.rm=T)/qchisq(.5,1)+0.05;
-			c=(lda-1)*(n[1]/2)
-			F1=1/(c)^0.5
-			K1=c*(1+F1)/F1
-		} else{
-			F1=0.5
-			K1=n[1]
-		}
-		#F1=0.1225
-		#K1=245.254
-		#K1=279.394
-		
-		#opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=F1);
-		#K1=opt_1$minimum;
-		
-		####opt_1=optimize(GC_VIF,lower=10,upper=n[1]^2,tol=0.01,F=F1,maximum=T);
-			
-		FK=nlm(GC_VIF_nlm,c(F=F1,K=K1))$estimate;
-		F=FK[1];
-		K=FK[2];
-	}
-	vector_vif=VIF(p,n,F,K);
-	Zx=Zx/vector_vif;
-	#Zx[notinf]=NA;
-	exeps <- is.na(Zx);
-	#out <- as.data.frame(Zx)
-	out=list()
-	out$Zx<-Zx
-	out$vv <-vector_vif;
-	out$exeps <-exeps
-	out$calrate <- n/max(n)
-	out$F=F
-	out$K=K
-	out
-	#Zx
-	#return(Zx)
-}
-#ZxVIF=GC(p12,p02,p11,p01,p,x,n1,n0,n);

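Editorial note on the removed GC(): when clust == 0 and the median inflation exceeds 1, the function derives starting values for F and K in closed form from the median-based lambda before refining them with nlm(). A minimal sketch of that step, restated outside the function (names follow the removed code; Zx is the vector of chi-square statistics and n the per-SNP sample size vector, and the same logic reappears in the new VIFGC.R):

    lda <- median(Zx, na.rm = TRUE) / qchisq(0.5, 1) + 0.05  # median-based lambda plus a small offset
    c   <- (lda - 1) * (n[1] / 2)                            # excess variance attributed to structure
    F1  <- 1 / sqrt(c)                                       # starting value for F
    K1  <- c * (1 + F1) / F1                                 # starting value for K
    FK  <- nlm(GC_VIF_nlm, c(F = F1, K = K1))$estimate       # refine (F, K) against the chosen error function
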
Deleted: pkg/GenABEL/R/GC_ovdom.R
===================================================================
--- pkg/GenABEL/R/GC_ovdom.R	2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/GC_ovdom.R	2013-11-21 20:00:01 UTC (rev 1412)
@@ -1,217 +0,0 @@
-#' Genomic control for the overdominant model of inheritance using VIF
-#' 
-#' This function estimates the corrected statistic using genomic control
-#' for the overdominant model,
-#' using VIF. VIF coefficients are estimated
-#' by optimizing different error functions: regress,
-#' median and ks.test.
-#'  
-#' @param data Input vector of Chi square statistic
-#' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
-#' @param p Input vector of allele frequencies
-#' @param index.filter Indexes of variables in the data vector that will be used for analysis
-#' @param n size of the sample
-#' @param proportion The proportion of lowest P (Chi2) to be used when
-#'   estimating the inflation factor Lambda for "regress" method only
-#' @param clust For developers only
-#' @param vart0 For developers only
-#' @param tmp For developers only
-#' 
-#' @return A list with elements
-#' \item{Zx}{output vector corrected Chi square statistic}
-#' \item{vv}{output vector of VIF}
-#' \item{exeps}{output vector of exceptions (NA)}
-#' \item{calrate}{output vector of calrate}
-#' \item{F}{F}
-#' \item{K}{K}
-#' 
-#' @author Yakov Tsepilov
-#' 
-#' @examples
-#' data(ge03d2)
-#' # truncate the data to make the example faster
-#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
-#' qts <- mlreg(phdata(ge03d2)$dm2~1,data=ge03d2,gtmode = "overdominant")
-#' chi2.1df <- results(qts)$chi2.1df
-#' s <- summary(ge03d2)
-#' freq <- s$Q.2
-#' result <- GC_ovdom(p=freq,method = "median",data=chi2.1df,n=nids(ge03d2))
-#' 
-#' @keywords htest
-#'
-
-GC_ovdom = function(data,p,method = "regress",n,index.filter=NULL,proportion=1,clust=0,vart0=0,tmp=0){
-	#
-	if (is.null(index.filter)){
-		ind.function=which(!is.na(data))
-	} else ind.function=index.filter
-	ind.function=ind.function[which(!is.na(data[ind.function]))]
-	#
-	if (!(method=="regress" | method=="median" | method=="ks.test")){
-	print("Error. I do not know this method");
-	break;
-	}
-	#
-	Zx=data
-	inf=which(!is.na(Zx));
-	notinf=which(is.na(Zx));
-	#
-	Clust <- function(k){
-		#l=which(data1 at phdata$dm2==k)
-		#data1 <- data1[l,inf]
-		#data1 <- Xfix(data1)
-		#detach(df at phdata)
-		#data1.gkin <- ibs(data1[,sample(which(data1 at gtdata@chromosome!="X"),1000)],weight="freq")
-		data.dist <- as.dist(0.5-tmp)
-		data.mds <- cmdscale(data.dist)
-		return(data.mds)
-	}
-	#
-	ClustN <- function(k,data1.mds){
-		km <- kmeans(data1.mds,centers=k,nstart=1000)
-		i=1;
-		  cl=0
-		while (i<=k){
-			cl[i] <- length(which(km$cluster==i))
-			i=i+1;
-		}
-		return (cl);
-	}
-	#
-
-	VIF <- function(p,N,F,K){
-		q=1-p;
-		Var=2*p*q*x^2 - 2*F*p*q*x^2 - 4*p^2*q^2*x^2 + 8*F*p^2*q^2*x^2-4*F^2*p^2*q^2*x^2;
-		#Var = ((2*p*(1-F)*q*x^2+F*p+(1-F)*p^2)-(2*p*q*(1-F)*x+F*p+(1-F)*p^2)^2);
-		S=(1+F)*(1+2*F);
-		Cov=x^2*((4*((1-F)^3*p^2*q^2+F*(1-F)*p*q))/S-(2*(1-F)*p*q)^2);
-		if (vart0==1){VarT0=N*Var-N*Cov;}
-		if (vart0==0){VarT0=N*Var;}
-		VarT=VarT0+Cov*K;
-		#VarT=VarT0+Cov*(K-N);
-		VIF=VarT/VarT0;
-		return(VIF)
-	}
-	#
-	GC_VIF_nlm = function(FK){
-	###
-		F=FK[1];
-		K=FK[2];
-		#if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
-		vector_vif=VIF(p,n,F,K);
-		Zxl=Zx/vector_vif;
-		Zxl=sort(Zxl[ind.function]);
-		Zxl_r=Zxl[1:ntp]
-		if (method=="ks.test"){
-		dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
-		}
-		if (method=="median"){
-		dMedian=abs(qchisq(.5,1)-median(Zxl));
-		}
-		if (method=="regress")
-		{dMedian=sum((Zxl_r-Chi2)^2);}
-		#}
-		return(1*dMedian)
-	}
-	###
-	GC_VIF = function(F,K){
-	###
-		#F=FK[1];
-		#K=FK[2];
-		#if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
-		vector_vif=VIF(p,n,F,K);
-		Zxl=Zx/vector_vif;
-		Zxl=sort(Zxl[ind.function]);
-		Zxl_r=Zxl[1:ntp]
-		if (method=="ks.test"){
-		dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
-		}
-		if (method=="median"){
-		dMedian=abs(qchisq(.5,1)-median(Zxl));
-		}
-		if (method=="regress")
-		{dMedian=sum((Zxl_r-Chi2)^2);}
-		#}
-		return(1*dMedian)
-	}
-
-	F=0.5; K=0.2*n[1];
-	
-	#N_inf=sum(!is.na(data[ind.function]))
-	#Chi2=(0:(N_inf-1))/N_inf;
-	#Chi2=qchisq(Chi2,1)
-	data_p <- data[ind.function]
-	if (proportion>1.0 || proportion<=0) 
-		stop("proportion argument should be greater than zero and less than or equal to one")
-	ntp <- round(proportion*length(data_p))
-	if (ntp<1) stop("no valid measurements")
-	if (ntp==1) {
-		warning(paste("One measurement, Lambda = 1 returned"))
-		return(list(estimate=1.0,se=999.99))
-	}
-	if (ntp<10) warning(paste("number of points is too small:",ntp))
-	if (min(data_p)<0) stop("data argument has values <0")
-	if (max(data_p)<=1) {
-		data_p<- qchisq(data_p,1,lower.tail=FALSE)
-	}
-		
-	data_p <- sort(data_p)
-	ppoi <- ppoints(data_p)
-	Chi2 <- sort(qchisq(1-ppoi,1))
-	Chi2 <- Chi2[1:ntp]
-	x=1;
-	
-	if (clust==1){
-		data.mds0 <- Clust(0)
-		#plot(data.mds0)
-		k=2;
-		cl0 <- ClustN(k,data.mds0)
-		data.mds1 <- Clust(1)
-		#plot(data.mds1)
-		k=2;
-		cl1 <- ClustN(k,data.mds1)
-		if (length(cl1)>length(cl0)){cl0[(length(cl0)+1):length(cl1)]=0}
-		if (length(cl0)>length(cl1)){cl1[(length(cl1)+1):length(cl0)]=0}
-		S=sum(cl0);
-		R=sum(cl1);
-		K=sum((cl1*S-cl0*R)^2)
-		K1=K/(R*S)
-		#####
-		#opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=0.2);
-		opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=K1);
-		#opt_3=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=opt_2$minimum);
-		F=opt_2$minimum;
-		K=K1;
-	}
-	if (clust==0){
-		#diag(tmp) <- diag(tmp)-0.5;
-		#F1=abs(mean(tmp[lower.tri(tmp,diag=T)]));
-		#l1=which (tmp>=0 & lower.tri(tmp,diag=T));
-		#F1=mean(tmp[l1]);
-		
-		lda=median(Zx,na.rm=T)/qchisq(.5,1)+0.05;
-		c=(lda-1)*(n[1]/2)
-		F1=1/(c)^0.5
-		K1=c*(1+F1)/F1
-		FK=nlm(GC_VIF_nlm,c(F=F1,K=K1))$estimate;
-		F=FK[1];
-		K=FK[2];
-	}
-	vector_vif=VIF(p,n,F,K);
-	Zx=Zx/vector_vif;
-	#Zx[notinf]=NA;
-	exeps <- is.na(Zx);
-	#out <- as.data.frame(Zx)
-	out=list()
-	out$Zx<-Zx
-	out$vv <-vector_vif;
-	out$exeps <-exeps
-	out$calrate <- n/max(n)
-	out$F=F
-	out$K=K
-	out
-	#Zx
-	#return(Zx)
-}
-#ZxVIF=GC(p12,p02,p11,p01,p,x,n1,n0,n);

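Editorial note on the removed GC_ovdom(): the overdominant variant fixes x = 1 internally and keeps only the heterozygote terms in the variance and covariance, so its VIF reduces to the form below. This is a sketch transcribed from the removed file; whether the new VIFGC_ovdom.R keeps exactly this form cannot be verified here, since that file does not appear in the truncated diff:

    VIF_ovdom <- function(p, N, F, K, x = 1, vart0 = 0) {
        q   <- 1 - p
        Var <- 2*p*q*x^2 - 2*F*p*q*x^2 - 4*p^2*q^2*x^2 + 8*F*p^2*q^2*x^2 - 4*F^2*p^2*q^2*x^2
        S   <- (1 + F) * (1 + 2*F)
        Cov <- x^2 * ((4 * ((1-F)^3 * p^2 * q^2 + F*(1-F)*p*q)) / S - (2*(1-F)*p*q)^2)
        VarT0 <- if (vart0 == 1) N*Var - N*Cov else N*Var    # variance without the structure term
        (VarT0 + Cov*K) / VarT0                              # variance inflation factor
    }
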
Modified: pkg/GenABEL/R/PGC.R
===================================================================
--- pkg/GenABEL/R/PGC.R	2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/PGC.R	2013-11-21 20:00:01 UTC (rev 1412)
@@ -2,169 +2,180 @@
 #' 
 #' This function estimates the genomic controls
 #' for different models and degrees of freedom,
-#' using polinom function. Polinomic coefficients are estimated
+#' using a polynomial function. Polynomial coefficients are estimated
 #' by optimizing different error functions: regress,
-#' median and ks.test.
+#' median, ks.test or group regress.
 #' 
 #' @param data Input vector of Chi square statistic
 #' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
+#'  "regress", "median", "ks.test" or "group_regress"
 #' @param p Input vector of allele frequencies
-#' @param df Number of dergees of freedom
-#' @param pol.d Polinom degree
-#' @param plot If true, functon makes plot of lambda from allele frequencies
+#' @param df Number of degrees of freedom
+#' @param pol.d The degree of the polynomial function
+#' @param plot If TRUE, a plot of lambda will be produced
 #' @param start.corr For regress method use it only when you want to make calculations faster
 #' @param index.filter Index of variables in data vector, that will be used in analysis 
-#' if zero - all variables will be used
+#'  if zero - all variables will be used
 #' @param proportion The proportion of lowest P (Chi2) to be used when
 #'   estimating the inflation factor Lambda for "regress" method only
+#' @param n_quiantile The number of groups for the "group_regress" method
+#' @param title_name The title of the plot
+#' @param type_of_plot For developers only
+#' @param lmax The threshold for lambda for plotting (optional)
+#' @param color The color of the plot
 #' 
 #' @return A list with elements
 #' \item{data}{Output vector corrected Chi square statistic}
-#' \item{b}{Polinom coefficients}
+#' \item{b}{Polynomial coefficients}
 #' 
 #' @author Yakov Tsepilov
 #' 
 #' @examples
 #' data(ge03d2)
-#' qts=qtscore(dm2, ge03d2)
-#' chi2.1df=results(qts)$chi2.1df
-#' s=summary(ge03d2)
-#' MAF=s$Q.2
-#' result=PGC(data=chi2.1df,method="regress",p=MAF,df=1, pol.d=3, plot=FALSE, start.corr=FALSE)
-#'
+#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
+#' qts <- mlreg(dm2~1,data=ge03d2,gtmode = "additive")
+#' chi2.1df <- results(qts)$chi2.1df
+#' s <- summary(ge03d2)
+#' freq <- s$Q.2
+#' result=PGC(data=chi2.1df,method="median",p=freq,df=1, pol.d=2, plot=TRUE, lmax=1.1,start.corr=FALSE)
+#' # "group_regress" is preferable when there are more than 50K SNPs
+#' #result=PGC(data=chi2.1df,method="group_regress",p=freq,df=1, pol.d=2, plot=TRUE, start.corr=FALSE,n_quiantile=3)
+#' 
 #' @keywords htest
 #' 
 
-PGC = function(data,method="regress",p,df, pol.d=3, plot=TRUE, index.filter=NULL,start.corr=FALSE,proportion=1){
-	if (is.null(index.filter)){
-		ind.function=which(!is.na(data))
-	} else ind.function=index.filter
-	ind.function=ind.function[which(!is.na(data[ind.function]))]
-    pol.m = function(p, b, pol.d){
-        out=0;
-        for (i in (1:pol.d)){
-            out=(p^i)*b[i]+out;
-        }     
-        out=out+b[pol.d+1];
-        return(out);
-    }     
+PGC=function (data, method = "group_regress", p, df, pol.d = 3, plot = TRUE, 
+    index.filter = NULL, start.corr = FALSE, proportion = 1,n_quiantile=5,title_name="Lambda",type_of_plot="plot",lmax=NULL,color="red") 
+{
+    if (is.null(index.filter)) {
+        ind.function = which(!is.na(data))
+    }
+    else ind.function = index.filter
+    ind.function = ind.function[which(!is.na(data[ind.function]))]
+	
+    pol.m = function(p, b, pol.d) {
+        out = 0
+        for (i in (1:pol.d)) {
+            out = (p^i) * b[i] + out
+        }
+        out = out + b[pol.d + 1]
+        return(out)
+    }
+	
+    if (!(method == "regress" | method == "median" | method == "ks.test" | method == "group_regress")) {
+        print("Error. I do not know this method")
+        break
+    }
+	
+    if (start.corr) {
+        data = data/(1 * median(data[ind.function], na.rm = T)/qchisq(0.5,df))
+    }
+    data_p <- data[ind.function]
+	p_p=p[ind.function]
+	lambda=median(data_p)/qchisq(0.5, df)
     
-     if (!(method=="regress" | method=="median" | method=="ks.test")){
-          print("Error. I do not know this method");
-          break;
-     }
-     if (start.corr){
-          data=data/(1*median(data[ind.function],na.rm=T)/qchisq(.5,df));
-    }
-    #N=sum(!is.na(data[ind.function]))
-	data_p <- data[ind.function]
-	if (proportion>1.0 || proportion<=0) 
-		stop("proportion argument should be greater than zero and less than or equal to one")
-	ntp <- round(proportion*length(data_p))
-	if (ntp<1) stop("no valid measurements")
-	if (ntp==1) {
-		warning(paste("One measurement, Lambda = 1 returned"))
-		return(list(estimate=1.0,se=999.99))
+			if (proportion > 1 || proportion <= 0) stop("proportion argument should be greater than zero and less than or equal to one")
+			ntp <- round(proportion * length(data_p))
+			if (ntp < 1) stop("no valid measurements")
+			if (ntp < 100) warning(paste("number of points is too small:", ntp))
+			if (min(data_p) < 0) stop("data argument has values <0")
+			
+    data_p <- sort(data_p)
+    ppoi <- ppoints(data_p)
+	if (method=="regress"){
+		Chi2 <- sort(qchisq(ppoi,df=df,lower.tail=FALSE))
+		Chi2 <- Chi2[1:ntp]
 	}
-	if (ntp<10) warning(paste("number of points is too small:",ntp))
-	if (min(data_p)<0) stop("data argument has values <0")
-	if (max(data_p)<=1) {
-		data_p<- qchisq(data_p,1,lower.tail=FALSE)
+	if (method=="group_regress"){
+		qt=quantile(p_p, probs = seq(0, 1,length=n_quiantile))
+		pp=p_p
+		i=1
+		for (i in (2:n_quiantile)){
+			pp[p_p>=qt[i-1] & p_p<=qt[i]]=i-1
+		}
+		tb=table(pp)
+		chi2_array=array(NA,c((n_quiantile-1),max(tb)))
+		for (i in (1:(n_quiantile-1))){
+			ppoi <- ppoints(1:tb[i])
+			ntp[i] <- round(proportion * tb[i])
+			chi2_array[i,1:tb[i]] <- sort(qchisq(ppoi,df=df,lower.tail=FALSE))
+		}
 	}
+    ppp_real = seq(from = min(p), to = max(p), length = 1000)
+	ppp_abstr= seq(from = 0, to = 1, length = 1000)
+	#ppp = seq(from = 0, to = 1, length = 1000)
+	
+    f2 = function(b) {
+        Zx = data
+		#if ((min(pol.m(ppp_abstr, b, pol.d))<1) & (max(pol.m(ppp_abstr, b, pol.d))>lambda*2)){
+		if (min(pol.m(ppp_abstr, b, pol.d))<1){
+			vv=1e5
+			#vv=pol.m(p, b, pol.d)
+		} else{
+			if (max(pol.m(ppp_abstr, b, pol.d))>10){
+				vv=1e5*max(pol.m(ppp_abstr, b, pol.d))
+			#vv=pol.m(p, b, pol.d)
+			}else{
+				vv=pol.m(p, b, pol.d)
+			}
+		}
 		
-	data_p <- sort(data_p)
-	ppoi <- ppoints(data_p)
-	Chi2 <- sort(qchisq(1-ppoi,df))
-	Chi2 <- Chi2[1:ntp]
-     #dta <- sort(data)
-     #ppp <- ppoints(dta)
-     #Chi2=qchisq(ppp,df);
-    
-    #delta=1/N;
-    #x1=0;
-    #i=1;
-    #Chi2=0;
-    #while (i<=N){
-    #x2=x1+(delta/2);
-    #Chi2[i]=qchisq(x2,df);
-    #x1=x1+delta;
-    #i=i+1;
-    #}
-	
-    #Chi2=(0:(N-1))/N;
-    #Chi2=qchisq(Chi2,df)
-    ppp=seq(from=min(p),to=max(p),length=1000)
-    #inf=which(!is.na(data))
-    f2 = function(b){
-        Zx=data
-              
-        #Chi2=sort(rchisq(N,df))
-        
-        #for (i in (1:N)){
-        #    j=inf[i];
-        #    if (pol.d==3) {
-        #        Zx[j]=Zx[j]/(b[1]*p[j]+b[2]*p[j]^2+b[3]*p[j]^3+b[4]);}
-        #    if (pol.d==2){
-          #            Zx[j]=Zx[j]/(b[1]*p[j]+b[2]*p[j]^2+b[3]);}
-        #}
-        #if (pol.d==4) {
-        #    Zx=Zx/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]*p^4+b[5]);}
-        #if (pol.d==3) {
-        #    Zx=Zx/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]);}
-        #if (pol.d==2){
-              #Zx=Zx/(b[1]*p+b[2]*p^2+b[3]);}
-        #}
-        Zx=Zx/(pol.m(p,b,pol.d))     
-        Zxl=sort(Zx[ind.function])
-		Zxl_r=Zxl[1:ntp]
-        if (method=="ks.test"){
-			F=-log(ks.test(Zxl,"pchisq",df=df)$p.value);
+        Zx = Zx/vv
+		if (method == "group_regress"){
+			Zxl=Zx[ind.function]
+			F=0
+			for (i in (1:(n_quiantile-1))){
+				Zxl_r=sort(Zxl[pp==i])
+				Zxl_r = Zxl_r[1:ntp[i]]
+				Chi2=chi2_array[i,1:ntp[i]]
+				F = sum((Zxl_r - Chi2)^2)+F
+			}
+		}
+        if (method == "ks.test") {
+			Zxl = sort(Zx[ind.function])
+            F = -log(ks.test(Zxl, "pchisq", df = df)$p.value)
         }
-        if (method=="median"){
-			F=abs(median(Zxl)-qchisq(.5,df))
+        if (method == "median") {
+			 Zxl = sort(Zx[ind.function])
+            F = abs(median(Zxl) - qchisq(0.5, df))
         }
-        if (method=="regress"){
-			F=sum((Zxl_r-Chi2)^2);
+        if (method == "regress") {
+			Zxl = sort(Zx[ind.function])
+			Zxl_r = Zxl[1:ntp]
+            F = sum((Zxl_r - Chi2)^2)
         }
-        return(F);
+        return(F)
     }
-    #b=c(-0.1807326,-0.2753122,-0.2110807,1.8401476);
-    #if (pol.d==4) {
-    #    b=c(0,0,0,0,1)
-         #    nlm=nlm(f2,b)
-         #    b=nlm$estimate;
-     #    if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]*ppp^3+b[4]*ppp^4+b[5]),typ="l");}
-      #   data=data/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]*p^4+b[5]);
-    #}
-    #if (pol.d==3) {
-    #    b=c(0,0,0,1)
-         #nlm=nlm(f2,b)
-         #b=nlm$estimate;
-     #    if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]*ppp^3+b[4]),typ="l");}
-      #   data=data/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]);
-    #}
-    # if (pol.d==2) {
-    #     b=c(0,0,1)
-         #nlm=nlm(f2,b)
-         #b=nlm$estimate;
-    #     if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]),typ="l");}
-    #     data=data/(b[1]*p+b[2]*p^2+b[3]);
-    #}   
-
-    b=rep(0,pol.d+1);
-    b[pol.d+1]=1;
-    nlm=nlm(f2,b)
-    b=nlm$estimate;
-    if (plot) {plot(pol.m(ppp,b,pol.d),typ="l");}
-    vv=pol.m(p,b,pol.d);
-    data_check=data/vv;
-    data_check=data_check[!is.na(data_check)];
-    if (sum(data_check<0)==0 & sum(data_check>100)==0){ data=data/vv;}
-    out=list()
-    out$data=data
-    out$b=b
+    b = rep(0, pol.d + 1)
+    b[pol.d + 1] = 1
+    nlm = nlm(f2, b)
+    b = nlm$estimate
+    if (plot) {
+		if (type_of_plot=="plot"){
+			vv=pol.m(ppp_abstr, b, pol.d)
+			if (!is.null(lmax)){
+				ylim=0
+				ylim[1]=1-(lmax*5/90)
+				ylim[2]=lmax+(lmax*5/90)
+				plot(ppp_abstr,vv,main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color,ylim=ylim)
+			} else{
+				plot(ppp_abstr,vv,main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color)
+			}
+			#ylim=0
+			#ylim[1]=1-(lmax*5/90)
+			#ylim[2]=lmax+(lmax*5/90)
+			#plot(ppp_abstr,vv,main="Lambda",xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color,ylim=ylim)
+			abline(v=min(p,na.rm=TRUE),col="green")
+			abline(v=max(p,na.rm=TRUE),col="green")
+		} else{
+			lines(ppp_abstr,pol.m(ppp_abstr, b, pol.d),main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color)
+		}
+	}
+    vv = pol.m(p, b, pol.d)
+	data = data/vv
+    out = list()
+    out$data = data
+    out$b = b
     return(out)
-	#data
 }
 

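Editorial note on the PGC() changes: the main functional addition is the "group_regress" method, which splits SNPs into allele-frequency quantile bins and sums the regress-type error within each bin against bin-specific expected chi-square order statistics. A sketch of the binning step, restated outside the function for readability (names follow the diff above; p_p is the filtered allele-frequency vector, n_quiantile the number of quantile breaks and df the degrees of freedom):

    qt <- quantile(p_p, probs = seq(0, 1, length = n_quiantile))  # frequency quantile breaks
    pp <- p_p
    for (i in 2:n_quiantile) {
        pp[p_p >= qt[i - 1] & p_p <= qt[i]] <- i - 1              # assign each SNP to a frequency bin
    }
    tb <- table(pp)                                               # number of SNPs per bin
    chi2_array <- array(NA, c(n_quiantile - 1, max(tb)))
    for (i in 1:(n_quiantile - 1)) {
        ppoi <- ppoints(1:tb[i])                                  # plotting positions for this bin
        chi2_array[i, 1:tb[i]] <- sort(qchisq(ppoi, df = df, lower.tail = FALSE))
    }
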
Added: pkg/GenABEL/R/VIFGC.R
===================================================================
--- pkg/GenABEL/R/VIFGC.R	                        (rev 0)
+++ pkg/GenABEL/R/VIFGC.R	2013-11-21 20:00:01 UTC (rev 1412)
@@ -0,0 +1,290 @@
+#' Genomic control for various models of inheritance using VIF
+#' 
+#' This function estimates the corrected statistic using genomic control
+#' for different models (recessive, dominant, additive etc.),
+#' using VIF. VIF coefficients are estimated
+#' by optimizing different error functions: regress,
+#' median and ks.test.
+#'  
+#' @param data Input vector of Chi square statistic
+#' @param method Function of error to be optimized. Can be
+#' "regress", "median" or "ks.test"
+#' @param p Input vector of allele frequencies
+#' @param x Model of inheritance (0 for recessive, 0.5 for additive, 1 for dominant; arbitrary values are also allowed)
+#' @param index.filter Indexes of variables in the data vector that will be used for analysis
+#' @param n The size of the sample
+#' @param proportion The proportion of lowest P (Chi2) to be used when
+#'   estimating the inflation factor Lambda for "regress" method only
+#' @param plot If TRUE, plot of lambda will be produced
+#' @param type_of_plot For developers only
+#' @param lmax The threshold for lambda for plotting (optional)
+#' @param color The color of the plot
+#' @param F The estimation of F (optional)
+#' @param K The estimation of K (optional)
+#' @param ladd The estimation of lambda for additive model (optional)
+#' @param clust For developers only
+#' @param vart0 For developers only
+#' @param tmp For developers only
+#' @param CA For developers only
+#' @param p.table For developers only
+#' 
+#' @return A list with elements
+#' \item{Zx}{output vector of corrected Chi square statistic}
+#' \item{vv}{output vector of VIF}
+#' \item{exeps}{output vector of exceptions (NA)}
+#' \item{calrate}{output vector of calrate}
+#' \item{F}{F}
+#' \item{K}{K}
+#' 
+#' @author Yakov Tsepilov
+#' 
+#' @examples
+#' data(ge03d2)
+#' # truncate the data to make the example faster
+#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
+#' qts <- mlreg(dm2~sex,data=ge03d2,gtmode = "dominant")
+#' chi2.1df <- results(qts)$chi2.1df
+#' s <- summary(ge03d2)
+#' freq <- s$Q.2
+#' result <- VIFGC(p=freq,x=1,method = "median",CA=FALSE,data=chi2.1df,n=nids(ge03d2))
+#' 
+#' @keywords htest
+#'
+
+VIFGC=function (data, p, x, method = "regress", n, index.filter = NULL,
+    proportion = 1, clust = 0, vart0 = 0, tmp = 0, CA = FALSE,
+    p.table = 0,plot=TRUE,lmax=NULL,color="red",F=NULL,K=NULL,type_of_plot="plot",ladd=NULL)
+{
+    if (is.null(index.filter)) {
+        ind.function = which(!is.na(data))
+    }
+    else ind.function = index.filter
+    ind.function = ind.function[which(!is.na(data[ind.function]))]
+    if (!(method == "regress" | method == "median" | method ==
+        "ks.test")) {
+        print("Error. I do not know this method")
+        break
+    }
+    if (CA) {
+        p00 <- p.table[1, 1, ]
+        p01 <- p.table[1, 2, ]
+        p02 <- p.table[1, 3, ]
+        n0 <- p.table[1, 4, ]
+        p10 <- p.table[2, 1, ]
+        p11 <- p.table[2, 2, ]
+        p12 <- p.table[2, 3, ]
+        n1 <- p.table[2, 4, ]
+        p0 <- p.table[3, 1, ]
+        p1 <- p.table[3, 2, ]
+        p2 <- p.table[3, 3, ]
+        n <- p.table[3, 4, ]
+        Zx = 0
+        exeps = 0
+        Dx = (p12 - p02) + x * (p11 - p01)
+        i = 1
+        while (i <= length(n)) {
+            vart = ((p2[i] + p1[i] * x^2) - (p2[i] + x * p1[i])^2) *
+                ((1/n1[i]) + (1/n0[i]))
+            if (vart == 0) {
+                Zx[i] = NA
+                exeps[i] = T
+            }
+            else {
+                Zx[i] = ((Dx[i])^2)/(vart)
+                exeps[i] = F
+            }
+            i = i + 1
+        }
+        inf = which(!is.na(Zx))
+        notinf = which(is.na(Zx))
+    }
+    else {
+        Zx = data
+        inf = which(!is.na(Zx))
+        notinf = which(is.na(Zx))
+    }
+    Clust <- function(k) {
+        data.dist <- as.dist(0.5 - tmp)
+        data.mds <- cmdscale(data.dist)
+        return(data.mds)
+    }
+    ClustN <- function(k, data1.mds) {
+        km <- kmeans(data1.mds, centers = k, nstart = 1000)
+        i = 1
+        cl = 0
+        while (i <= k) {
+            cl[i] <- length(which(km$cluster == i))
+            i = i + 1
+        }
+        return(cl)
+    }
+    VIF <- function(p, N, F, K) {
+        q = 1 - p
+        Var = ((2 * p * (1 - F) * q * x^2 + F * p + (1 - F) *
+            p^2) - (2 * p * q * (1 - F) * x + F * p + (1 - F) *
+            p^2)^2)
+        S = (1 + F) * (1 + 2 * F)
+        Cov1 = ((6 * F^3 * p + 11 * (1 - F) * F^2 * p^2 + (1 -
+            F)^3 * p^4 + 6 * (1 - F)^2 * F * p^3)/S) - ((1 -
+            F) * p^2 + F * p)^2
+        Cov2 = x * (((4 * (2 * (1 - F) * F^2 * p * q + (1 - F)^3 *
+            p^3 * q + 3 * (1 - F)^2 * F * p^2 * q))/S) - 2 *
+            ((1 - F) * p^2 + F * p) * (2 * (1 - F) * p * q))
+        Cov3 = x^2 * ((4 * ((1 - F)^3 * p^2 * q^2 + F * (1 -
+            F) * p * q))/S - (2 * (1 - F) * p * q)^2)
+        Cov = Cov1 + Cov2 + Cov3
+        if (vart0 == 1) {
+            VarT0 = N * Var - N * Cov
+        }
+        if (vart0 == 0) {
+            VarT0 = N * Var
+        }
+        VarT = VarT0 + Cov * K
+        VIF = VarT/VarT0
+        return(VIF)
+    }
+    GC_VIF_nlm = function(FK) {
+        F = FK[1]
+        K = FK[2]
+        vector_vif = VIF(p, n, F, K)
+        Zxl = Zx/vector_vif
+        Zxl = sort(Zxl[ind.function])
+        Zxl_r = Zxl[1:ntp]
+        if (method == "ks.test") {
+            dMedian = -log(ks.test(Zxl, "pchisq", df = 1)$p.value)
+        }
+        if (method == "median") {
+            dMedian = abs(qchisq(0.5, 1) - median(Zxl))
+        }
+        if (method == "regress") {
+            dMedian = sum((Zxl_r - Chi2)^2)
+        }
+        return(1 * dMedian)
+    }
+    GC_VIF = function(F, K) {
+        vector_vif = VIF(p, n, F, K)
+        Zxl = Zx/vector_vif
+        Zxl = sort(Zxl[ind.function])
+        Zxl_r = Zxl[1:ntp]
+        if (method == "ks.test") {
+            dMedian = -log(ks.test(Zxl, "pchisq", df = 1)$p.value)
+        }
+        if (method == "median") {
+            dMedian = abs(qchisq(0.5, 1) - median(Zxl))
+        }
+        if (method == "regress") {
+            dMedian = sum((Zxl_r - Chi2)^2)
+        }
+        return(1 * dMedian)
+    }
+    data_p <- data[ind.function]
+    if (proportion > 1 || proportion <= 0)
+        stop("proportion argument should be greater than zero and less than or equal to one")
+    ntp <- round(proportion * length(data_p))
+    if (ntp < 1)
+        stop("no valid measurements")
+    if (ntp == 1) {
+        warning(paste("One measurement, Lambda = 1 returned"))
+        return(list(estimate = 1, se = 999.99))
+    }
+    if (ntp < 10)
+        warning(paste("number of points is too small:", ntp))
+    if (min(data_p) < 0)
+        stop("data argument has values <0")
+    if (max(data_p) <= 1) {
+        data_p <- qchisq(data_p, 1, lower.tail = FALSE)
+    }
+	lambda=(median(data, na.rm = T)/qchisq(0.5, 1))
+    data_p <- sort(data_p)
+    ppoi <- ppoints(data_p)
+    Chi2 <- sort(qchisq(1 - ppoi, 1))
+    Chi2 <- Chi2[1:ntp]
+    if (clust == 1) {
+			F = 0.5
+			K = n[1]
+        data.mds0 <- Clust(0)
+        k = 2
+        cl0 <- ClustN(k, data.mds0)
+        data.mds1 <- Clust(1)
+        k = 2
+        cl1 <- ClustN(k, data.mds1)
+        if (length(cl1) > length(cl0)) {
+            cl0[(length(cl0) + 1):length(cl1)] = 0
+        }
+        if (length(cl0) > length(cl1)) {
+            cl1[(length(cl1) + 1):length(cl0)] = 0
+        }
+        S = sum(cl0)
+        R = sum(cl1)
+        K = sum((cl1 * S - cl0 * R)^2)
+        K1 = K/(R * S)
+        opt_2 = optimize(GC_VIF, c(0, 1), tol = 1e-04, K = K1)
+        F = opt_2$minimum
+        K = K1
+    }
+    if (clust == 0) {
+       
+		if (!is.null(F) & !is.null(K)){
+			FK=0
+			FK[1]=F
+			FK[2]=K
+		} else{
+
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/genabel -r 1412
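
Note that svnlook requires filesystem access to the repository on the server. For readers of the archive, the client-side equivalent would presumably be an svn diff against the public R-forge URL (the URL below follows the usual R-forge layout and is an assumption, not taken from this message):

    svn diff -c 1412 svn://svn.r-forge.r-project.org/svnroot/genabel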


More information about the Genabel-commits mailing list