[Genabel-commits] r1412 - in pkg/GenABEL: . R man
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Nov 21 21:00:02 CET 2013
Author: nd_0001
Date: 2013-11-21 21:00:01 +0100 (Thu, 21 Nov 2013)
New Revision: 1412
Added:
pkg/GenABEL/R/VIFGC.R
pkg/GenABEL/R/VIFGC_ovdom.R
Removed:
pkg/GenABEL/R/GC.R
pkg/GenABEL/R/GC_ovdom.R
pkg/GenABEL/man/GC.Rd
pkg/GenABEL/man/GC_ovdom.Rd
Modified:
pkg/GenABEL/NAMESPACE
pkg/GenABEL/R/PGC.R
pkg/GenABEL/generate_documentation.R
pkg/GenABEL/man/GenABEL.Rd
pkg/GenABEL/man/PGC.Rd
pkg/GenABEL/man/estlambda.Rd
pkg/GenABEL/man/export.plink.Rd
pkg/GenABEL/man/grammar.Rd
pkg/GenABEL/man/impute2databel.Rd
pkg/GenABEL/man/impute2mach.Rd
Log:
Changes in GC functions
Modified: pkg/GenABEL/NAMESPACE
===================================================================
--- pkg/GenABEL/NAMESPACE 2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/NAMESPACE 2013-11-21 20:00:01 UTC (rev 1412)
@@ -55,8 +55,8 @@
findRelatives,
formetascore,
GASurv,
- GC,
- GC_ovdom,
+ VIFGC,
+ VIFGC_ovdom,
generateOffspring,
getLogLikelihoodGivenRelation,
grammar,
Deleted: pkg/GenABEL/R/GC.R
===================================================================
--- pkg/GenABEL/R/GC.R 2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/GC.R 2013-11-21 20:00:01 UTC (rev 1412)
@@ -1,285 +0,0 @@
-#' Genomic control for various model of inheritance using VIF
-#'
-#' This function estimates corrected statistic using genomic control
-#' for different models (recessive,dominant,additive etc.),
-#' using VIF. VIF coefficients are estimated
-#' by optimizing diffrent error functions: regress,
-#' median and ks.test.
-#'
-#' @param data Input vector of Chi square statistic
-#' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
-#' @param p Input vector of allele frequencies
-#' @param x Model of inheritance (0 for recessive,0.5 for additive, 1 for dominant)
-#' @param index.filter Indexes for variables that will be use for analisis in data vector
-#' @param n size of the sample
-#' @param proportion The proportion of lowest P (Chi2) to be used when
-#' estimating the inflation factor Lambda for "regress" method only
-#' @param clust For developers only
-#' @param vart0 For developers only
-#' @param tmp For developers only
-#' @param CA For developers only
-#' @param p.table For developers only
-#'
-#' @return A list with elements
-#' \item{Zx}{output vector corrected Chi square statistic}
-#' \item{vv}{output vector of VIF}
-#' \item{exeps}{output vector of exepsons (NA)}
-#' \item{calrate}{output vector of calrate}
-#' \item{F}{F}
-#' \item{K}{K}
-#'
-#' @author Yakov Tsepilov
-#'
-#' @examples
-#' data(ge03d2)
-#' # truncate the data to make the example faster
-#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
-#' qts <- mlreg(dm2~sex,data=ge03d2,gtmode = "dominant")
-#' chi2.1df <- results(qts)$chi2.1df
-#' s <- summary(ge03d2)
-#' freq <- s$Q.2
-#' result <- GC(p=freq,x=1,method = "median",CA=FALSE,data=chi2.1df,n=nids(ge03d2))
-#'
-#' @keywords htest
-#'
-
-GC = function(data,p,x,method = "regress",n,index.filter=NULL,proportion=1,clust=0,vart0=0,tmp=0,CA=FALSE,p.table=0){
-#
- if (is.null(index.filter)){
- ind.function=which(!is.na(data))
- } else ind.function=index.filter
- ind.function=ind.function[which(!is.na(data[ind.function]))]
- #
- if (!(method=="regress" | method=="median" | method=="ks.test")){
- print("Error. I do not know this method");
- break;
- }
- #
- if (CA){
- p.table[1,1,]->p00
- p.table[1,2,]->p01
- p.table[1,3,]->p02
- p.table[1,4,]->n0
-
- p.table[2,1,]->p10
- p.table[2,2,]->p11
- p.table[2,3,]->p12
- p.table[2,4,]->n1
-
- p.table[3,1,]->p0
- p.table[3,2,]->p1
- p.table[3,3,]->p2
- p.table[3,4,]->n
- Zx=0;
- exeps=0;
- Dx=(p12-p02)+x*(p11-p01);
- i=1;
- while (i<=length(n)){
- vart=((p2[i]+p1[i]*x^2)-(p2[i]+x*p1[i])^2)*((1/n1[i])+(1/n0[i]));
- if (vart==0) {Zx[i]=NA; exeps[i]=T; } else{
- Zx[i]=((Dx[i])^2)/(vart); exeps[i]=F}
- i=i+1;
- }
- #Zx=CASforVIF(p.table,p,x)
- inf=which(!is.na(Zx));
- notinf=which(is.na(Zx));
- } else {
- Zx=data
- inf=which(!is.na(Zx));
- notinf=which(is.na(Zx));
- }
- #
- Clust <- function(k){
- #l=which(data1 at phdata$dm2==k)
- #data1 <- data1[l,inf]
- #data1 <- Xfix(data1)
- #detach(df at phdata)
- #data1.gkin <- ibs(data1[,sample(which(data1 at gtdata@chromosome!="X"),1000)],weight="freq")
- data.dist <- as.dist(0.5-tmp)
- data.mds <- cmdscale(data.dist)
- return(data.mds)
- }
- #
- ClustN <- function(k,data1.mds){
- km <- kmeans(data1.mds,centers=k,nstart=1000)
- i=1;
- cl=0
- while (i<=k){
- cl[i] <- length(which(km$cluster==i))
- i=i+1;
- }
- return (cl);
- }
- #
-
- VIF <- function(p,N,F,K){
- q=1-p;
- Var = ((2*p*(1-F)*q*x^2+F*p+(1-F)*p^2)-(2*p*q*(1-F)*x+F*p+(1-F)*p^2)^2);
- S=(1+F)*(1+2*F);
- Cov1=((6*F^3*p+11*(1-F)*F^2*p^2+(1-F)^3*p^4+6*(1-F)^2*F*p^3)/S)-((1-F)*p^2 + F*p)^2;
- Cov2=x*(((4*(2*(1-F)*F^2*p*q+(1-F)^3*p^3*q+3*(1-F)^2*F*p^2*q))/S)-2*((1-F)*p^2+F*p)*(2*(1 - F)*p*q));
- Cov3=x^2*((4*((1-F)^3*p^2*q^2+F*(1-F)*p*q))/S-(2*(1-F)*p*q)^2);
- Cov=Cov1+Cov2+Cov3;
- if (vart0==1){VarT0=N*Var-N*Cov;}
- if (vart0==0){VarT0=N*Var;}
- VarT=VarT0+Cov*K;
- #VarT=VarT0+Cov*(K-N);
- VIF=VarT/VarT0;
- return(VIF)
- }
- #
- GC_VIF_nlm = function(FK){
- ###
- F=FK[1];
- K=FK[2];
- #if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
- vector_vif=VIF(p,n,F,K);
- Zxl=Zx/vector_vif;
- Zxl=sort(Zxl[ind.function]);
- Zxl_r=Zxl[1:ntp]
- if (method=="ks.test"){
- dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
- }
- if (method=="median"){
- dMedian=abs(qchisq(.5,1)-median(Zxl));
- }
- if (method=="regress")
- {dMedian=sum((Zxl_r-Chi2)^2);}
- #}
- return(1*dMedian)
- }
- ###
- GC_VIF = function(F,K){
- ###
- #F=FK[1];
- #K=FK[2];
- #if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
- vector_vif=VIF(p,n,F,K);
- Zxl=Zx/vector_vif;
- Zxl=sort(Zxl[ind.function]);
- Zxl_r=Zxl[1:ntp]
- if (method=="ks.test"){
- dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
- }
- if (method=="median"){
- dMedian=abs(qchisq(.5,1)-median(Zxl));
- }
- if (method=="regress")
- {dMedian=sum((Zxl_r-Chi2)^2);}
- #}
- return(1*dMedian)
- }
-
- ###
- F=0.5; K=n[1];
-
- #N_inf=sum(!is.na(data[ind.function]))
- #Chi2=(0:(N_inf-1))/N_inf;
- #Chi2=qchisq(Chi2,1)
- data_p <- data[ind.function]
- if (proportion>1.0 || proportion<=0)
- stop("proportion argument should be greater than zero and less than or equal to one")
- ntp <- round(proportion*length(data_p))
- if (ntp<1) stop("no valid measurements")
- if (ntp==1) {
- warning(paste("One measurement, Lambda = 1 returned"))
- return(list(estimate=1.0,se=999.99))
- }
- if (ntp<10) warning(paste("number of points is too small:",ntp))
- if (min(data_p)<0) stop("data argument has values <0")
- if (max(data_p)<=1) {
- data_p<- qchisq(data_p,1,lower.tail=FALSE)
- }
-
- data_p <- sort(data_p)
- ppoi <- ppoints(data_p)
- Chi2 <- sort(qchisq(1-ppoi,1))
- Chi2 <- Chi2[1:ntp]
- #####
- if (clust==1){
- data.mds0 <- Clust(0)
- #plot(data.mds0)
- k=2;
- cl0 <- ClustN(k,data.mds0)
- data.mds1 <- Clust(1)
- #plot(data.mds1)
- k=2;
- cl1 <- ClustN(k,data.mds1)
- if (length(cl1)>length(cl0)){cl0[(length(cl0)+1):length(cl1)]=0}
- if (length(cl0)>length(cl1)){cl1[(length(cl1)+1):length(cl0)]=0}
- S=sum(cl0);
- R=sum(cl1);
- K=sum((cl1*S-cl0*R)^2)
- K1=K/(R*S)
- #####
- #opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=0.2);
- opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=K1);
- #opt_3=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=opt_2$minimum);
- F=opt_2$minimum;
- K=K1;
- #j=0.01;
- #min=10000;
- #while (j<1){
- # opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=j);
- # opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=opt_1$minimum);
- # if (opt_1$objective<min){
- # K=opt_1$minimum;
- # F=j;
- # min=opt_1$objective;
- # }
- # if (opt_2$objective<min){
- # K=opt_1$minimum;
- # F=opt_2$minimum;
- # min=opt_2$objective;
- # }
- # j=j+0.1;
- #}
- #FK=nlm(GC_VIF ,p=c(F=0.5, K=680))$estimate;
- #F=FK[1];
- #K=FK[2];
- }
- if (clust==0){
- #diag(tmp) <- diag(tmp)-0.5;
- #F1=abs(mean(tmp[lower.tri(tmp,diag=T)]));
- #l1=which (tmp>=0 & lower.tri(tmp,diag=T));
- #F1=mean(tmp[l1]);
- if ((median(Zx,na.rm=T)/qchisq(.5,1))>=1){
- lda=median(Zx,na.rm=T)/qchisq(.5,1)+0.05;
- c=(lda-1)*(n[1]/2)
- F1=1/(c)^0.5
- K1=c*(1+F1)/F1
- } else{
- F1=0.5
- K1=n[1]
- }
- #F1=0.1225
- #K1=245.254
- #K1=279.394
-
- #opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=F1);
- #K1=opt_1$minimum;
-
- ####opt_1=optimize(GC_VIF,lower=10,upper=n[1]^2,tol=0.01,F=F1,maximum=T);
-
- FK=nlm(GC_VIF_nlm,c(F=F1,K=K1))$estimate;
- F=FK[1];
- K=FK[2];
- }
- vector_vif=VIF(p,n,F,K);
- Zx=Zx/vector_vif;
- #Zx[notinf]=NA;
- exeps <- is.na(Zx);
- #out <- as.data.frame(Zx)
- out=list()
- out$Zx<-Zx
- out$vv <-vector_vif;
- out$exeps <-exeps
- out$calrate <- n/max(n)
- out$F=F
- out$K=K
- out
- #Zx
- #return(Zx)
-}
-#ZxVIF=GC(p12,p02,p11,p01,p,x,n1,n0,n);
Deleted: pkg/GenABEL/R/GC_ovdom.R
===================================================================
--- pkg/GenABEL/R/GC_ovdom.R 2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/GC_ovdom.R 2013-11-21 20:00:01 UTC (rev 1412)
@@ -1,217 +0,0 @@
-#' Genomic control for overdimunant model of inheritance using VIF
-#'
-#' This function estimates the corrected statistic using genomic control
-#' for the overdominant model,
-#' using VIF. VIF coefficients are estimated
-#' by optimizing different error functions: regress,
-#' median and ks.test.
-#'
-#' @param data Input vector of Chi square statistic
-#' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
-#' @param p Input vector of allele frequencies
-#' @param index.filter Indexes for variables that will be use for analisis in data vector
-#' @param n size of the sample
-#' @param proportion The proportion of lowest P (Chi2) to be used when
-#' estimating the inflation factor Lambda for "regress" method only
-#' @param clust For developers only
-#' @param vart0 For developers only
-#' @param tmp For developers only
-#'
-#' @return A list with elements
-#' \item{Zx}{output vector corrected Chi square statistic}
-#' \item{vv}{output vector of VIF}
-#' \item{exeps}{output vector of exepsons (NA)}
-#' \item{calrate}{output vector of calrate}
-#' \item{F}{F}
-#' \item{K}{K}
-#'
-#' @author Yakov Tsepilov
-#'
-#' @examples
-#' data(ge03d2)
-#' # truncate the data to make the example faster
-#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
-#' qts <- mlreg(phdata(ge03d2)$dm2~1,data=ge03d2,gtmode = "overdominant")
-#' chi2.1df <- results(qts)$chi2.1df
-#' s <- summary(ge03d2)
-#' freq <- s$Q.2
-#' result <- GC_ovdom(p=freq,method = "median",data=chi2.1df,n=nids(ge03d2))
-#'
-#' @keywords htest
-#'
-
-GC_ovdom = function(data,p,method = "regress",n,index.filter=NULL,proportion=1,clust=0,vart0=0,tmp=0){
- #
- if (is.null(index.filter)){
- ind.function=which(!is.na(data))
- } else ind.function=index.filter
- ind.function=ind.function[which(!is.na(data[ind.function]))]
- #
- if (!(method=="regress" | method=="median" | method=="ks.test")){
- print("Error. I do not know this method");
- break;
- }
- #
- Zx=data
- inf=which(!is.na(Zx));
- notinf=which(is.na(Zx));
- #
- Clust <- function(k){
- #l=which(data1 at phdata$dm2==k)
- #data1 <- data1[l,inf]
- #data1 <- Xfix(data1)
- #detach(df at phdata)
- #data1.gkin <- ibs(data1[,sample(which(data1 at gtdata@chromosome!="X"),1000)],weight="freq")
- data.dist <- as.dist(0.5-tmp)
- data.mds <- cmdscale(data.dist)
- return(data.mds)
- }
- #
- ClustN <- function(k,data1.mds){
- km <- kmeans(data1.mds,centers=k,nstart=1000)
- i=1;
- cl=0
- while (i<=k){
- cl[i] <- length(which(km$cluster==i))
- i=i+1;
- }
- return (cl);
- }
- #
-
- VIF <- function(p,N,F,K){
- q=1-p;
- Var=2*p*q*x^2 - 2*F*p*q*x^2 - 4*p^2*q^2*x^2 + 8*F*p^2*q^2*x^2-4*F^2*p^2*q^2*x^2;
- #Var = ((2*p*(1-F)*q*x^2+F*p+(1-F)*p^2)-(2*p*q*(1-F)*x+F*p+(1-F)*p^2)^2);
- S=(1+F)*(1+2*F);
- Cov=x^2*((4*((1-F)^3*p^2*q^2+F*(1-F)*p*q))/S-(2*(1-F)*p*q)^2);
- if (vart0==1){VarT0=N*Var-N*Cov;}
- if (vart0==0){VarT0=N*Var;}
- VarT=VarT0+Cov*K;
- #VarT=VarT0+Cov*(K-N);
- VIF=VarT/VarT0;
- return(VIF)
- }
- #
- GC_VIF_nlm = function(FK){
- ###
- F=FK[1];
- K=FK[2];
- #if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
- vector_vif=VIF(p,n,F,K);
- Zxl=Zx/vector_vif;
- Zxl=sort(Zxl[ind.function]);
- Zxl_r=Zxl[1:ntp]
- if (method=="ks.test"){
- dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
- }
- if (method=="median"){
- dMedian=abs(qchisq(.5,1)-median(Zxl));
- }
- if (method=="regress")
- {dMedian=sum((Zxl_r-Chi2)^2);}
- #}
- return(1*dMedian)
- }
- ###
- GC_VIF = function(F,K){
- ###
- #F=FK[1];
- #K=FK[2];
- #if ((F>1) || (F<0) || (K<0) || (K>(n[1]^2))) {dMedian=(length(Zx)*100 + abs(K));} else{
- vector_vif=VIF(p,n,F,K);
- Zxl=Zx/vector_vif;
- Zxl=sort(Zxl[ind.function]);
- Zxl_r=Zxl[1:ntp]
- if (method=="ks.test"){
- dMedian=-log(ks.test(Zxl,"pchisq",df=1)$p.value);
- }
- if (method=="median"){
- dMedian=abs(qchisq(.5,1)-median(Zxl));
- }
- if (method=="regress")
- {dMedian=sum((Zxl_r-Chi2)^2);}
- #}
- return(1*dMedian)
- }
-
- F=0.5; K=0.2*n[1];
-
- #N_inf=sum(!is.na(data[ind.function]))
- #Chi2=(0:(N_inf-1))/N_inf;
- #Chi2=qchisq(Chi2,1)
- data_p <- data[ind.function]
- if (proportion>1.0 || proportion<=0)
- stop("proportion argument should be greater than zero and less than or equal to one")
- ntp <- round(proportion*length(data_p))
- if (ntp<1) stop("no valid measurements")
- if (ntp==1) {
- warning(paste("One measurement, Lambda = 1 returned"))
- return(list(estimate=1.0,se=999.99))
- }
- if (ntp<10) warning(paste("number of points is too small:",ntp))
- if (min(data_p)<0) stop("data argument has values <0")
- if (max(data_p)<=1) {
- data_p<- qchisq(data_p,1,lower.tail=FALSE)
- }
-
- data_p <- sort(data_p)
- ppoi <- ppoints(data_p)
- Chi2 <- sort(qchisq(1-ppoi,1))
- Chi2 <- Chi2[1:ntp]
- x=1;
-
- if (clust==1){
- data.mds0 <- Clust(0)
- #plot(data.mds0)
- k=2;
- cl0 <- ClustN(k,data.mds0)
- data.mds1 <- Clust(1)
- #plot(data.mds1)
- k=2;
- cl1 <- ClustN(k,data.mds1)
- if (length(cl1)>length(cl0)){cl0[(length(cl0)+1):length(cl1)]=0}
- if (length(cl0)>length(cl1)){cl1[(length(cl1)+1):length(cl0)]=0}
- S=sum(cl0);
- R=sum(cl1);
- K=sum((cl1*S-cl0*R)^2)
- K1=K/(R*S)
- #####
- #opt_1=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=0.2);
- opt_2=optimize(GC_VIF,c(0,1),tol=0.0001,K=K1);
- #opt_3=optimize(GC_VIF,c(0,n[1]^2),tol=0.0001,F=opt_2$minimum);
- F=opt_2$minimum;
- K=K1;
- }
- if (clust==0){
- #diag(tmp) <- diag(tmp)-0.5;
- #F1=abs(mean(tmp[lower.tri(tmp,diag=T)]));
- #l1=which (tmp>=0 & lower.tri(tmp,diag=T));
- #F1=mean(tmp[l1]);
-
- lda=median(Zx,na.rm=T)/qchisq(.5,1)+0.05;
- c=(lda-1)*(n[1]/2)
- F1=1/(c)^0.5
- K1=c*(1+F1)/F1
- FK=nlm(GC_VIF_nlm,c(F=F1,K=K1))$estimate;
- F=FK[1];
- K=FK[2];
- }
- vector_vif=VIF(p,n,F,K);
- Zx=Zx/vector_vif;
- #Zx[notinf]=NA;
- exeps <- is.na(Zx);
- #out <- as.data.frame(Zx)
- out=list()
- out$Zx<-Zx
- out$vv <-vector_vif;
- out$exeps <-exeps
- out$calrate <- n/max(n)
- out$F=F
- out$K=K
- out
- #Zx
- #return(Zx)
-}
-#ZxVIF=GC(p12,p02,p11,p01,p,x,n1,n0,n);
Modified: pkg/GenABEL/R/PGC.R
===================================================================
--- pkg/GenABEL/R/PGC.R 2013-11-21 09:29:15 UTC (rev 1411)
+++ pkg/GenABEL/R/PGC.R 2013-11-21 20:00:01 UTC (rev 1412)
@@ -2,169 +2,180 @@
#'
#' This function estimates the genomic controls
#' for different models and degrees of freedom,
-#' using polinom function. Polinomic coefficients are estimated
+#' using polinomial function. Polinomial coefficients are estimated
#' by optimizing different error functions: regress,
-#' median and ks.test.
+#' median, ks.test or group regress.
#'
#' @param data Input vector of Chi square statistic
#' @param method Function of error to be optimized. Can be
-#' "regress", "median" or "ks.test"
+#' "regress", "median", "ks.test" or "group_regress"
#' @param p Input vector of allele frequencies
-#' @param df Number of dergees of freedom
-#' @param pol.d Polinom degree
-#' @param plot If true, functon makes plot of lambda from allele frequencies
+#' @param df Number of degrees of freedom
+#' @param pol.d The degree of polinomial function
+#' @param plot If TRUE, plot of lambda will be produced
#' @param start.corr For regress method use it only when you want to make calculations faster
#' @param index.filter Index of variables in data vector, that will be used in analysis
-#' if zero - all variables will be used
+#' if zero - all variables will be used
#' @param proportion The proportion of lowest P (Chi2) to be used when
#' estimating the inflation factor Lambda for "regress" method only
+#' @param n_quiantile The number of groups for "group_regress" method
+#' @param title_name The title name for plot
+#' @param type_of_plot For developers only
+#' @param lmax The threshold for lambda for plotting (optional)
+#' @param color The color of the plot
#'
#' @return A list with elements
#' \item{data}{Output vector corrected Chi square statistic}
-#' \item{b}{Polinom coefficients}
+#' \item{b}{Polinomial coefficients}
#'
#' @author Yakov Tsepilov
#'
#' @examples
#' data(ge03d2)
-#' qts=qtscore(dm2, ge03d2)
-#' chi2.1df=results(qts)$chi2.1df
-#' s=summary(ge03d2)
-#' MAF=s$Q.2
-#' result=PGC(data=chi2.1df,method="regress",p=MAF,df=1, pol.d=3, plot=FALSE, start.corr=FALSE)
-#'
+#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
+#' qts <- mlreg(dm2~1,data=ge03d2,gtmode = "additive")
+#' chi2.1df <- results(qts)$chi2.1df
+#' s <- summary(ge03d2)
+#' freq <- s$Q.2
+#' result=PGC(data=chi2.1df,method="median",p=freq,df=1, pol.d=2, plot=TRUE, lmax=1.1,start.corr=FALSE)
+#' #"group_regress" is better to use when we have more than 50K SNPs
+#' #result=PGC(data=chi2.1df,method="group_regress",p=freq,df=1, pol.d=2, plot=TRUE, start.corr=FALSE,n_quiantile=3)
+#'
#' @keywords htest
#'
-PGC = function(data,method="regress",p,df, pol.d=3, plot=TRUE, index.filter=NULL,start.corr=FALSE,proportion=1){
- if (is.null(index.filter)){
- ind.function=which(!is.na(data))
- } else ind.function=index.filter
- ind.function=ind.function[which(!is.na(data[ind.function]))]
- pol.m = function(p, b, pol.d){
- out=0;
- for (i in (1:pol.d)){
- out=(p^i)*b[i]+out;
- }
- out=out+b[pol.d+1];
- return(out);
- }
+PGC=function (data, method = "group_regress", p, df, pol.d = 3, plot = TRUE,
+ index.filter = NULL, start.corr = FALSE, proportion = 1,n_quiantile=5,title_name="Lambda",type_of_plot="plot",lmax=NULL,color="red")
+{
+ if (is.null(index.filter)) {
+ ind.function = which(!is.na(data))
+ }
+ else ind.function = index.filter
+ ind.function = ind.function[which(!is.na(data[ind.function]))]
+
+ pol.m = function(p, b, pol.d) {
+ out = 0
+ for (i in (1:pol.d)) {
+ out = (p^i) * b[i] + out
+ }
+ out = out + b[pol.d + 1]
+ return(out)
+ }
+
+ if (!(method == "regress" | method == "median" | method == "ks.test" | method == "group_regress")) {
+ print("Error. I do not know this method")
+ break
+ }
+
+ if (start.corr) {
+ data = data/(1 * median(data[ind.function], na.rm = T)/qchisq(0.5,df))
+ }
+ data_p <- data[ind.function]
+ p_p=p[ind.function]
+ lambda=median(data_p)/qchisq(0.5, df)
- if (!(method=="regress" | method=="median" | method=="ks.test")){
- print("Error. I do not know this method");
- break;
- }
- if (start.corr){
- data=data/(1*median(data[ind.function],na.rm=T)/qchisq(.5,df));
- }
- #N=sum(!is.na(data[ind.function]))
- data_p <- data[ind.function]
- if (proportion>1.0 || proportion<=0)
- stop("proportion argument should be greater than zero and less than or equal to one")
- ntp <- round(proportion*length(data_p))
- if (ntp<1) stop("no valid measurements")
- if (ntp==1) {
- warning(paste("One measurement, Lambda = 1 returned"))
- return(list(estimate=1.0,se=999.99))
+ if (proportion > 1 || proportion <= 0) stop("proportion argument should be greater then zero and less than or equal to one")
+ ntp <- round(proportion * length(data_p))
+ if (ntp < 1) stop("no valid measurments")
+ if (ntp < 100) warning(paste("number of points is too small:", ntp))
+ if (min(data_p) < 0) stop("data argument has values <0")
+
+ data_p <- sort(data_p)
+ ppoi <- ppoints(data_p)
+ if (method=="regress"){
+ Chi2 <- sort(qchisq(ppoi,df=df,lower.tail=FALSE))
+ Chi2 <- Chi2[1:ntp]
}
- if (ntp<10) warning(paste("number of points is too small:",ntp))
- if (min(data_p)<0) stop("data argument has values <0")
- if (max(data_p)<=1) {
- data_p<- qchisq(data_p,1,lower.tail=FALSE)
+ if (method=="group_regress"){
+ qt=quantile(p_p, probs = seq(0, 1,length=n_quiantile))
+ pp=p_p
+ i=1
+ for (i in (2:n_quiantile)){
+ pp[p_p>=qt[i-1] & p_p<=qt[i]]=i-1
+ }
+ tb=table(pp)
+ chi2_array=array(NA,c((n_quiantile-1),max(tb)))
+ for (i in (1:(n_quiantile-1))){
+ ppoi <- ppoints(1:tb[i])
+ ntp[i] <- round(proportion * tb[i])
+ chi2_array[i,1:tb[i]] <- sort(qchisq(ppoi,df=df,lower.tail=FALSE))
+ }
}
+ ppp_real = seq(from = min(p), to = max(p), length = 1000)
+ ppp_abstr= seq(from = 0, to = 1, length = 1000)
+ #ppp = seq(from = 0, to = 1, length = 1000)
+
+ f2 = function(b) {
+ Zx = data
+ #if ((min(pol.m(ppp_abstr, b, pol.d))<1) & (max(pol.m(ppp_abstr, b, pol.d))>lambda*2)){
+ if (min(pol.m(ppp_abstr, b, pol.d))<1){
+ vv=1e5
+ #vv=pol.m(p, b, pol.d)
+ } else{
+ if (max(pol.m(ppp_abstr, b, pol.d))>10){
+ vv=1e5*max(pol.m(ppp_abstr, b, pol.d))
+ #vv=pol.m(p, b, pol.d)
+ }else{
+ vv=pol.m(p, b, pol.d)
+ }
+ }
- data_p <- sort(data_p)
- ppoi <- ppoints(data_p)
- Chi2 <- sort(qchisq(1-ppoi,df))
- Chi2 <- Chi2[1:ntp]
- #dta <- sort(data)
- #ppp <- ppoints(dta)
- #Chi2=qchisq(ppp,df);
-
- #delta=1/N;
- #x1=0;
- #i=1;
- #Chi2=0;
- #while (i<=N){
- #x2=x1+(delta/2);
- #Chi2[i]=qchisq(x2,df);
- #x1=x1+delta;
- #i=i+1;
- #}
-
- #Chi2=(0:(N-1))/N;
- #Chi2=qchisq(Chi2,df)
- ppp=seq(from=min(p),to=max(p),length=1000)
- #inf=which(!is.na(data))
- f2 = function(b){
- Zx=data
-
- #Chi2=sort(rchisq(N,df))
-
- #for (i in (1:N)){
- # j=inf[i];
- # if (pol.d==3) {
- # Zx[j]=Zx[j]/(b[1]*p[j]+b[2]*p[j]^2+b[3]*p[j]^3+b[4]);}
- # if (pol.d==2){
- # Zx[j]=Zx[j]/(b[1]*p[j]+b[2]*p[j]^2+b[3]);}
- #}
- #if (pol.d==4) {
- # Zx=Zx/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]*p^4+b[5]);}
- #if (pol.d==3) {
- # Zx=Zx/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]);}
- #if (pol.d==2){
- #Zx=Zx/(b[1]*p+b[2]*p^2+b[3]);}
- #}
- Zx=Zx/(pol.m(p,b,pol.d))
- Zxl=sort(Zx[ind.function])
- Zxl_r=Zxl[1:ntp]
- if (method=="ks.test"){
- F=-log(ks.test(Zxl,"pchisq",df=df)$p.value);
+ Zx = Zx/vv
+ if (method == "group_regress"){
+ Zxl=Zx[ind.function]
+ F=0
+ for (i in (1:(n_quiantile-1))){
+ Zxl_r=sort(Zxl[pp==i])
+ Zxl_r = Zxl_r[1:ntp[i]]
+ Chi2=chi2_array[i,1:ntp[i]]
+ F = sum((Zxl_r - Chi2)^2)+F
+ }
+ }
+ if (method == "ks.test") {
+ Zxl = sort(Zx[ind.function])
+ F = -log(ks.test(Zxl, "pchisq", df = df)$p.value)
}
- if (method=="median"){
- F=abs(median(Zxl)-qchisq(.5,df))
+ if (method == "median") {
+ Zxl = sort(Zx[ind.function])
+ F = abs(median(Zxl) - qchisq(0.5, df))
}
- if (method=="regress"){
- F=sum((Zxl_r-Chi2)^2);
+ if (method == "regress") {
+ Zxl = sort(Zx[ind.function])
+ Zxl_r = Zxl[1:ntp]
+ F = sum((Zxl_r - Chi2)^2)
}
- return(F);
+ return(F)
}
- #b=c(-0.1807326,-0.2753122,-0.2110807,1.8401476);
- #if (pol.d==4) {
- # b=c(0,0,0,0,1)
- # nlm=nlm(f2,b)
- # b=nlm$estimate;
- # if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]*ppp^3+b[4]*ppp^4+b[5]),typ="l");}
- # data=data/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]*p^4+b[5]);
- #}
- #if (pol.d==3) {
- # b=c(0,0,0,1)
- #nlm=nlm(f2,b)
- #b=nlm$estimate;
- # if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]*ppp^3+b[4]),typ="l");}
- # data=data/(b[1]*p+b[2]*p^2+b[3]*p^3+b[4]);
- #}
- # if (pol.d==2) {
- # b=c(0,0,1)
- #nlm=nlm(f2,b)
- #b=nlm$estimate;
- # if (plot) {plot(sort(b[1]*ppp+b[2]*ppp^2+b[3]),typ="l");}
- # data=data/(b[1]*p+b[2]*p^2+b[3]);
- #}
-
- b=rep(0,pol.d+1);
- b[pol.d+1]=1;
- nlm=nlm(f2,b)
- b=nlm$estimate;
- if (plot) {plot(pol.m(ppp,b,pol.d),typ="l");}
- vv=pol.m(p,b,pol.d);
- data_check=data/vv;
- data_check=data_check[!is.na(data_check)];
- if (sum(data_check<0)==0 & sum(data_check>100)==0){ data=data/vv;}
- out=list()
- out$data=data
- out$b=b
+ b = rep(0, pol.d + 1)
+ b[pol.d + 1] = 1
+ nlm = nlm(f2, b)
+ b = nlm$estimate
+ if (plot) {
+ if (type_of_plot=="plot"){
+ vv=pol.m(ppp_abstr, b, pol.d)
+ if (!is.null(lmax)){
+ ylim=0
+ ylim[1]=1-(lmax*5/90)
+ ylim[2]=lmax+(lmax*5/90)
+ plot(ppp_abstr,vv,main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color,ylim=ylim)
+ } else{
+ plot(ppp_abstr,vv,main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color)
+ }
+ #ylim=0
+ #ylim[1]=1-(lmax*5/90)
+ #ylim[2]=lmax+(lmax*5/90)
+ #plot(ppp_abstr,vv,main="Lambda",xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color,ylim=ylim)
+ abline(v=min(p,na.rm=TRUE),col="green")
+ abline(v=max(p,na.rm=TRUE),col="green")
+ } else{
+ lines(ppp_abstr,pol.m(ppp_abstr, b, pol.d),main=title_name,xlab="Frequency",ylab="lambda(freq)",typ = "l",col=color)
+ }
+ }
+ vv = pol.m(p, b, pol.d)
+ data = data/vv
+ out = list()
+ out$data = data
+ out$b = b
return(out)
- #data
}
Added: pkg/GenABEL/R/VIFGC.R
===================================================================
--- pkg/GenABEL/R/VIFGC.R (rev 0)
+++ pkg/GenABEL/R/VIFGC.R 2013-11-21 20:00:01 UTC (rev 1412)
@@ -0,0 +1,290 @@
+#' Genomic control for various model of inheritance using VIF
+#'
+#' This function estimates corrected statistic using genomic control
+#' for different models (recessive, dominant, additive etc.),
+#' using VIF. VIF coefficients are estimated
+#' by optimizing different error functions: regress,
+#' median and ks.test.
+#'
+#' @param data Input vector of Chi square statistic
+#' @param method Function of error to be optimized. Can be
+#' "regress", "median" or "ks.test"
+#' @param p Input vector of allele frequencies
+#' @param x Model of inheritance (0 for recessive,0.5 for additive, 1 for dominant, also it could be arbitrary)
+#' @param index.filter Indexes for variables that will be use for analysis in data vector
+#' @param n The size of the sample
+#' @param proportion The proportion of lowest P (Chi2) to be used when
+#' estimating the inflation factor Lambda for "regress" method only
+#' @param plot If TRUE, plot of lambda will be produced
+#' @param type_of_plot For developers only
+#' @param lmax The threshold for lambda for plotting (optional)
+#' @param color The color of the plot
+#' @param F The estimation of F (optional)
+#' @param K The estimation of K (optional)
+#' @param ladd The estimation of lambda for additive model (optional)
+#' @param clust For developers only
+#' @param vart0 For developers only
+#' @param tmp For developers only
+#' @param CA For developers only
+#' @param p.table For developers only
+#'
+#' @return A list with elements
+#' \item{Zx}{output vector corrected Chi square statistic}
+#' \item{vv}{output vector of VIF}
+#' \item{exeps}{output vector of exepsons (NA)}
+#' \item{calrate}{output vector of calrate}
+#' \item{F}{F}
+#' \item{K}{K}
+#'
+#' @author Yakov Tsepilov
+#'
+#' @examples
+#' data(ge03d2)
+#' # truncate the data to make the example faster
+#' ge03d2 <- ge03d2[seq(from=1,to=nids(ge03d2),by=2),seq(from=1,to=nsnps(ge03d2),by=3)]
+#' qts <- mlreg(dm2~sex,data=ge03d2,gtmode = "dominant")
+#' chi2.1df <- results(qts)$chi2.1df
+#' s <- summary(ge03d2)
+#' freq <- s$Q.2
+#' result <- VIFGC(p=freq,x=1,method = "median",CA=FALSE,data=chi2.1df,n=nids(ge03d2))
+#'
+#' @keywords htest
+#'
+
+VIFGC=function (data, p, x, method = "regress", n, index.filter = NULL,
+ proportion = 1, clust = 0, vart0 = 0, tmp = 0, CA = FALSE,
+ p.table = 0,plot=TRUE,lmax=NULL,color="red",F=NULL,K=NULL,type_of_plot="plot",ladd=NULL)
+{
+ if (is.null(index.filter)) {
+ ind.function = which(!is.na(data))
+ }
+ else ind.function = index.filter
+ ind.function = ind.function[which(!is.na(data[ind.function]))]
+ if (!(method == "regress" | method == "median" | method ==
+ "ks.test")) {
+ print("Error. I do not know this method")
+ break
+ }
+ if (CA) {
+ p00 <- p.table[1, 1, ]
+ p01 <- p.table[1, 2, ]
+ p02 <- p.table[1, 3, ]
+ n0 <- p.table[1, 4, ]
+ p10 <- p.table[2, 1, ]
+ p11 <- p.table[2, 2, ]
+ p12 <- p.table[2, 3, ]
+ n1 <- p.table[2, 4, ]
+ p0 <- p.table[3, 1, ]
+ p1 <- p.table[3, 2, ]
+ p2 <- p.table[3, 3, ]
+ n <- p.table[3, 4, ]
+ Zx = 0
+ exeps = 0
+ Dx = (p12 - p02) + x * (p11 - p01)
+ i = 1
+ while (i <= length(n)) {
+ vart = ((p2[i] + p1[i] * x^2) - (p2[i] + x * p1[i])^2) *
+ ((1/n1[i]) + (1/n0[i]))
+ if (vart == 0) {
+ Zx[i] = NA
+ exeps[i] = T
+ }
+ else {
+ Zx[i] = ((Dx[i])^2)/(vart)
+ exeps[i] = F
+ }
+ i = i + 1
+ }
+ inf = which(!is.na(Zx))
+ notinf = which(is.na(Zx))
+ }
+ else {
+ Zx = data
+ inf = which(!is.na(Zx))
+ notinf = which(is.na(Zx))
+ }
+ Clust <- function(k) {
+ data.dist <- as.dist(0.5 - tmp)
+ data.mds <- cmdscale(data.dist)
+ return(data.mds)
+ }
+ ClustN <- function(k, data1.mds) {
+ km <- kmeans(data1.mds, centers = k, nstart = 1000)
+ i = 1
+ cl = 0
+ while (i <= k) {
+ cl[i] <- length(which(km$cluster == i))
+ i = i + 1
+ }
+ return(cl)
+ }
+ VIF <- function(p, N, F, K) {
+ q = 1 - p
+ Var = ((2 * p * (1 - F) * q * x^2 + F * p + (1 - F) *
+ p^2) - (2 * p * q * (1 - F) * x + F * p + (1 - F) *
+ p^2)^2)
+ S = (1 + F) * (1 + 2 * F)
+ Cov1 = ((6 * F^3 * p + 11 * (1 - F) * F^2 * p^2 + (1 -
+ F)^3 * p^4 + 6 * (1 - F)^2 * F * p^3)/S) - ((1 -
+ F) * p^2 + F * p)^2
+ Cov2 = x * (((4 * (2 * (1 - F) * F^2 * p * q + (1 - F)^3 *
+ p^3 * q + 3 * (1 - F)^2 * F * p^2 * q))/S) - 2 *
+ ((1 - F) * p^2 + F * p) * (2 * (1 - F) * p * q))
+ Cov3 = x^2 * ((4 * ((1 - F)^3 * p^2 * q^2 + F * (1 -
+ F) * p * q))/S - (2 * (1 - F) * p * q)^2)
+ Cov = Cov1 + Cov2 + Cov3
+ if (vart0 == 1) {
+ VarT0 = N * Var - N * Cov
+ }
+ if (vart0 == 0) {
+ VarT0 = N * Var
+ }
+ VarT = VarT0 + Cov * K
+ VIF = VarT/VarT0
+ return(VIF)
+ }
+ GC_VIF_nlm = function(FK) {
+ F = FK[1]
+ K = FK[2]
+ vector_vif = VIF(p, n, F, K)
+ Zxl = Zx/vector_vif
+ Zxl = sort(Zxl[ind.function])
+ Zxl_r = Zxl[1:ntp]
+ if (method == "ks.test") {
+ dMedian = -log(ks.test(Zxl, "pchisq", df = 1)$p.value)
+ }
+ if (method == "median") {
+ dMedian = abs(qchisq(0.5, 1) - median(Zxl))
+ }
+ if (method == "regress") {
+ dMedian = sum((Zxl_r - Chi2)^2)
+ }
+ return(1 * dMedian)
+ }
+ GC_VIF = function(F, K) {
+ vector_vif = VIF(p, n, F, K)
+ Zxl = Zx/vector_vif
+ Zxl = sort(Zxl[ind.function])
+ Zxl_r = Zxl[1:ntp]
+ if (method == "ks.test") {
+ dMedian = -log(ks.test(Zxl, "pchisq", df = 1)$p.value)
+ }
+ if (method == "median") {
+ dMedian = abs(qchisq(0.5, 1) - median(Zxl))
+ }
+ if (method == "regress") {
+ dMedian = sum((Zxl_r - Chi2)^2)
+ }
+ return(1 * dMedian)
+ }
+ data_p <- data[ind.function]
+ if (proportion > 1 || proportion <= 0)
+ stop("proportion argument should be greater then zero and less than or equal to one")
+ ntp <- round(proportion * length(data_p))
+ if (ntp < 1)
+ stop("no valid measurments")
+ if (ntp == 1) {
+ warning(paste("One measurment, Lambda = 1 returned"))
+ return(list(estimate = 1, se = 999.99))
+ }
+ if (ntp < 10)
+ warning(paste("number of points is too small:", ntp))
+ if (min(data_p) < 0)
+ stop("data argument has values <0")
+ if (max(data_p) <= 1) {
+ data_p <- qchisq(data_p, 1, lower.tail = FALSE)
+ }
+ lambda=(median(data, na.rm = T)/qchisq(0.5, 1))
+ data_p <- sort(data_p)
+ ppoi <- ppoints(data_p)
+ Chi2 <- sort(qchisq(1 - ppoi, 1))
+ Chi2 <- Chi2[1:ntp]
+ if (clust == 1) {
+ F = 0.5
+ K = n[1]
+ data.mds0 <- Clust(0)
+ k = 2
+ cl0 <- ClustN(k, data.mds0)
+ data.mds1 <- Clust(1)
+ k = 2
+ cl1 <- ClustN(k, data.mds1)
+ if (length(cl1) > length(cl0)) {
+ cl0[(length(cl0) + 1):length(cl1)] = 0
+ }
+ if (length(cl0) > length(cl1)) {
+ cl1[(length(cl1) + 1):length(cl0)] = 0
+ }
+ S = sum(cl0)
+ R = sum(cl1)
+ K = sum((cl1 * S - cl0 * R)^2)
+ K1 = K/(R * S)
+ opt_2 = optimize(GC_VIF, c(0, 1), tol = 1e-04, K = K1)
+ F = opt_2$minimum
+ K = K1
+ }
+ if (clust == 0) {
+
+ if (!is.null(F) & !is.null(K)){
+ FK=0
+ FK[1]=F
+ FK[2]=K
+ } else{
+
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/genabel -r 1412
More information about the Genabel-commits
mailing list