[Genabel-commits] r788 - in pkg/PredictABEL: . R

Fri Sep 30 12:02:35 CEST 2011

Author: lckarssen
Date: 2011-09-30 12:02:34 +0200 (Fri, 30 Sep 2011)
New Revision: 788

Added:
   pkg/PredictABEL/R/simulation_codes.R
Modified:
   pkg/PredictABEL/DESCRIPTION
   pkg/PredictABEL/R/PredictABEL.R
Log:
Several updates to PredictABEL from Suman Kundu

Modified: pkg/PredictABEL/DESCRIPTION
===================================================================

--- pkg/PredictABEL/DESCRIPTION	2011-09-30 08:28:23 UTC (rev 787)
+++ pkg/PredictABEL/DESCRIPTION	2011-09-30 10:02:34 UTC (rev 788)
@@ -1,18 +1,27 @@
 Package: PredictABEL
 Title: Assessment of risk prediction models
-Version: 1.1
+Version: 1.1.1
 Date: 2011-02-09
 Author: Suman Kundu, Yurii S. Aulchenko, A. Cecile J.W. Janssens
-Maintainer: Suman Kundu <s.kundu at erasmusmc.nl>, A. Cecile J.W. Janssens <a.janssens at erasmusmc.nl>
+Maintainer: Suman Kundu <s.kundu at erasmusmc.nl>, 
+A. Cecile J.W. Janssens <a.janssens at erasmusmc.nl>
 Depends: R (>= 2.9.0), Hmisc, ROCR, epitools, PBSmodelling
 Suggests: GenABEL
-Description: PredictABEL includes functions to assess the performance of risk models. The package contains functions for the various measures that are used
-    in empirical studies, including univariate and multivariate odds ratios (OR) of the predictors, the c-statistic (or area under the receiver operating
-    characteristic (ROC) curve (AUC)), Hosmer-Lemeshow goodness of fit test, reclassification table, net reclassification improvement (NRI) and integrated
-    discrimination improvement (IDI). Also included are functions to create plots, such as risk distributions, ROC curves, calibration plot, discrimination
-    box plot and predictiveness curves. In addition to functions to assess the performance of risk models, the package includes functions to obtain weighted
-    and unweighted risk scores as well as predicted risks using logistic regression analysis. These logistic regression functions are specifically written
-    for models that include genetic variables, but they can also be applied to models that are based on non-genetic risk factors only.
+Description: PredictABEL includes functions to assess the performance of 
+risk models. The package contains functions for the various measures that are 
+used in empirical studies, including univariate and multivariate odds ratios
+ (OR) of the predictors, the c-statistic (or area under the receiver operating
+  characteristic (ROC) curve (AUC)), Hosmer-Lemeshow goodness of fit test,
+   reclassification table, net reclassification improvement (NRI) and 
+   integrated discrimination improvement (IDI). Also included are functions 
+   to create plots, such as risk distributions, ROC curves, calibration plot, 
+   discrimination box plot and predictiveness curves. In addition to functions 
+   to assess the performance of risk models, the package includes functions to 
+   obtain weighted and unweighted risk scores as well as predicted risks using 
+   logistic regression analysis. These logistic regression functions are 
+   specifically written for models that include genetic variables, but they 
+   can also be applied to models that are based on non-genetic risk factors only. 
 License: GPL (>= 2)
-Collate: 'PredictABEL.R'
-Packaged: 2011-02-09 12:39:12 UTC; 488810
+
+
+

Modified: pkg/PredictABEL/R/PredictABEL.R
===================================================================
--- pkg/PredictABEL/R/PredictABEL.R	2011-09-30 08:28:23 UTC (rev 787)
+++ pkg/PredictABEL/R/PredictABEL.R	2011-09-30 10:02:34 UTC (rev 788)
@@ -112,18 +112,18 @@
 #' 
 #' @references S Kundu, YS Aulchenko, CM van Duijn, ACJW Janssens. PredictABEL:
 #' an R package for the assessment of risk prediction models.
-#' Eur J Epidemiol 2011. In press. \cr
+#' Eur J Epidemiol. 2011;26:261-4. \cr
 #'
 #' ACJW Janssens, JPA Ioannidis, CM van Duijn, J Little, MJ Khoury.
 #' Strengthening the Reporting of Genetic Risk Prediction Studies: The GRIPS
-#' Statement Proposal. Eur J Epidemiol 2011. In press. \cr
+#' Statement Proposal. Eur J Epidemiol. 2011;26:255-9. \cr
 #'
 #' ACJW Janssens, JPA Ioannidis, S Bedrosian, P Boffetta, SM Dolan, N Dowling,
 #' I Fortier, AN. Freedman, JM Grimshaw, J Gulcher, M Gwinn, MA Hlatky, H Janes,
 #' P Kraft, S Melillo, CJ O'Donnell, MJ Pencina, D Ransohoff, SD Schully,
 #' D Seminara, DM Winn, CF Wright, CM van Duijn, J Little, MJ Khoury.
 #' Strengthening the reporting of genetic risk prediction studies
-#' (GRIPS)-Elaboration and explanation. Eur J Epidemiol 2011. In press. \cr  
+#' (GRIPS)-Elaboration and explanation. Eur J Epidemiol. 2011;26:313-37. \cr   
 #'
 #' Aulchenko YS, Ripke S, Isaacs A, van Duijn CM. GenABEL: an R package for genome-wide
 #' association analysis. Bioinformatics 2007;23(10):1294-6.
@@ -969,7 +969,7 @@
  p=predRisk
  y=data[,cOutcome]
  if (length(unique(y))!=2) {
-                            stop(" The outcome is a binary variable.\n")
+                            stop(" The specified outcome is not a binary variable.\n")
                             }
 else{
 
@@ -1046,6 +1046,11 @@
 #'
 #' @references Hanley JA, McNeil BJ. The meaning and use of the area under a  
 #' receiver operating characteristic (ROC) curve. Radiology 1982;143:29-36.
+#' 
+#' 
+#' Tobias Sing, Oliver Sander, Niko Beerenwinkel, Thomas Lengauer.
+#' ROCR: visualizing classifier performance in R.
+#' Bioinformatics 2005;21(20):3940-3941.  
 #'
 #' @seealso \code{\link{predRisk}}, \code{\link{plotRiskDistribution}}       
 #' @examples
@@ -1099,7 +1104,7 @@
 
    }
   lines(x=c(0,1), y=c(0,1), lwd=1,col=8)
- cat("AUC [95% CI] using predicted risks from model",i, ": ", round(rAllele[1],3),
+ cat("AUC [95% CI] for the model",i, ": ", round(rAllele[1],3),
  "[", round(rAllele[1]-1.96/2*rAllele[3],3)," - ",
  round(rAllele[1]+1.96/2*rAllele[3],3), "] \n")
   	}
@@ -1320,7 +1325,7 @@
 function(data, cOutcome, risks, interval, rangexaxis, rangeyaxis, plottitle,
 xlabel, ylabel, labels, fileplot, plottype)
  {
-   if (missing(plottitle)) {plottitle <- "Histigram of risks"}
+   if (missing(plottitle)) {plottitle <- "Histogram of risks"}
   if (missing(xlabel)) {xlabel<- "Risk score"}
   if (missing(ylabel)) {ylabel<- "Percentage"}
     if (missing(labels)) {labels<- c("Without outcome", "With outcome")}
@@ -1601,26 +1606,26 @@
 #'  plotDiscriminationBox(data=ExampleData, cOutcome=cOutcome, predrisk=predRisk, 
 #'  labels=labels) 
 #'
-"plotDiscriminationBox" <- 
 function(data, cOutcome, predrisk, labels, plottitle, ylabel, fileplot, plottype)
  {
   if (missing(labels)) {label <- c("Without disease", "With disease")}
   if (missing(plottitle)) {plottitle <- "Box plot"}
   if (missing(ylabel)) {ylabel<- "Predicted risks"}
 risk <- predrisk
-boxplot(risk~data[,cOutcome],ylab=ylabel,ylim=c(0,1),cex.lab=1.2,las=1 ,
+a<-0;b<-1
+if((max(predrisk)>1)|(min(predrisk)<0)){a<-min(predrisk);b<-max(predrisk)}
+boxplot(risk~data[,cOutcome],ylab=ylabel,ylim=c(a,b),cex.lab=1.2,las=1 ,
  main= plottitle, cex.axis=1.1, names=labels)
 boxplot(c(mean(risk[data[,cOutcome]==0]),mean(risk[data[,cOutcome]==1]))~c(0,1),
-add=TRUE, boxlty=0, staplelty=0, medlty=0, medlwd=0, medpch=15,las=1, 
+add=TRUE, boxlty=0, staplelty=0, medlty=0, medlwd=0, medpch=15,las=1,
 cex.axis=1.1, xaxt='n')
 p<- list(Discrim_Slope = round(mean(risk[data[,cOutcome]==1]) -
- mean(risk[data[,cOutcome]==0]),3)) 
+ mean(risk[data[,cOutcome]==0]),3))
 
  if (missing(plottype)) {plottype<- "jpg"}
  	if (!missing(fileplot))
       savePlot(filename = fileplot,type =plottype,device = dev.cur(),restoreConsole = TRUE)
 return(p)
-
 }
 #' An example code to construct a risk model using logistic regression analysis.
 #' \code{ExampleModels} constructs two risk models using logistic regression analysis. 

Added: pkg/PredictABEL/R/simulation_codes.R
===================================================================
--- pkg/PredictABEL/R/simulation_codes.R	                        (rev 0)
+++ pkg/PredictABEL/R/simulation_codes.R	2011-09-30 10:02:34 UTC (rev 788)
@@ -0,0 +1,278 @@
+#' Function to construct a simulated dataset containing individual genotype data, genetic risks and disease status.   
+#' Construct a dataset that contains genotype data, estimated risk based on 
+#' genetic variants, and disease status for a hypothetical population. 
+#' The dataset is constructed using simulation in such a way that the frequencies 
+#' and odds ratios (OR) of the genetic variants and the population disease risk 
+#' computed from this dataset are the same as specified by the input parameters.
+#' 
+#' The function will execute when the matrix with odds ratios and frequencies, 
+#' population disease risk and the number of individuals are specified. \cr 
+#'
+#' The simulation method is described in detail in the references. \cr 
+#'
+#'
+#' The method assumes that (i) the combined effect of the genetic variants 
+#' on disease risk follows a multiplicative (log additive) risk model; 
+#' (ii) genetic variants inherit independently, that is no linkage disequilibrium 
+#' between the variants; (iii) genetic variants have independent effects on the 
+#' disease risk, which indicates no interaction among variants; and (iv) all 
+#' genotypes and allele proportions are in Hardy-Weinberg equilibrium. 
+#' Assumption (ii) and (iv) are used to generate the genotype data, and assumption 
+#'(i), (ii) and (iii) are used to calculate disease risk.
+#'
+#'
+#' Simulating the dataset involves three steps: (1) modelling genotype data, 
+#' (2) modelling disease risks, and (3) modelling disease status. Brief 
+#' descriptions of these steps are as follows:
+#'
+#'
+#' (1) Modelling genotype data: For each genetic variant the genotype 
+#' frequencies are either specified or calculated from the allele frequencies 
+#' using Hardy-Weinberg equilibrium. Then, the genotypes for each genetic 
+#' variant are randomly distributed without replacement over all individuals.
+#'
+#'
+#' (2) Modelling disease risks: For the calculation of the individual disease 
+#' risk, Bayes' theorem is used, which states that the posterior odds of disease 
+#' are obtained by multiplying the prior odds by the likelihood ratio (LR) of 
+#' the individual genotype data. The prior odds are calculated from the 
+#' population disease risk or disease prevalence 
+#' (prior odds= prior risk/ (1- prior risk)) and the posterior odds are converted 
+#' back into disease risk (disease risk= posterior odds/ (1+ posterior odds)). 
+#' Under the no linkage disequilibrium (LD) assumption, the LR is obtained 
+#' by multiplying the LRs of all individual genotypes that are included in 
+#' the risk model. The LR of each genotype is calculated using frequencies 
+#' and ORs of genetic variants and population disease risk. See references 
+#' for more details.
+#'
+#'
+#' (3) Modelling disease status: To model disease status, we used a procedure 
+#' that compares the disease risk of each subject to a randomly drawn value 
+#' between 0 and 1 from a uniform distribution. A subject was assigned to the 
+#' group who will develop the disease when the disease risk was higher than the 
+#' random value and to the group who will not develop the disease when the risk 
+#' was lower than the random value. 
+#'
+#'
+#' This procedure ensures that for each genomic profile, the percentage of 
+#' people who will develop the disease equals the population disease risk 
+#' associated with that profile, when the subgroup of individuals with that 
+#' profile is sufficiently large.
+#'
+#'
+#' @param ORfreq Matrix with ORs and frequencies of the genetic variants. 
+#' The matrix contains four columns in which the first two describe ORs and the 
+#' last two describe the corresponding frequencies. The number of rows in this 
+#' matrix is same as the number of genetic variants included. Genetic variants 
+#' can be specified as per genotype, per allele, or per dominant/ recessive 
+#' effect of the risk allele. When per genotype data are used, OR of the 
+#' heterozygous and homozygous risk genotypes are mentioned in the first two 
+#' columns and the corresponding genotype frequencies are mentioned in the last 
+#' two columns. When per allele data are used, the OR and frequency of the risk 
+#' allele are specified in the first and third column and the remaining two cells 
+#' are coded as '1'.  Similarly, when per dominant/ recessive data are used, the 
+#' OR and frequency of the dominant/ recessive variant are specified in the first 
+#' and third column, and the remaining two cells are coded as '0'. \cr
+#' Note that, when OR of a genetic variant is less than 1, modify the reference 
+#' group such that the OR for the new reference group is 1 and above 1 for 
+#' other groups. Also, change the corresponding frequencies accordingly.    
+#' @param poprisk Population disease risk (expressed in proportion).
+#' @param popsize  Total number of individuals included in the dataset.
+#' @param filename Name of the file in which the dataset will be saved. 
+#' The file is saved in the working directory as a txt file. When no filename 
+#' is specified, the output is not saved. 
+#' 
+#'  @return 
+#'   The function returns:
+#'   \item{Dataset}{A data frame or matrix that includes genotype data, 
+#' estimated genetic risk and disease status for a hypothetical population. 
+#' The dataset contains (4+number of genetic variants included) columns. The 
+#' first column of this dataset is the unweighted risk score, which is the sum 
+#' of the number of risk alleles for each individual, the third column is the 
+#' estimated genetic risk, the forth column is the individual disease status with '1' 
+#' indicates with and '0' as without the outcome of interest, and the fifth until 
+#' the end column are genotype data for the variants expressed as '0', '1' or '2', 
+#' which indicate the number of risk alleles for that genetic variant.}
+#'
+#'
+#' @keywords models
+#' 
+#'
+#' @references Hanley JA, McNeil BJ. The meaning and use of the area under a  
+#' receiver operating characteristic (ROC) curve. Radiology 1982;143:29-36.
+#'
+#'
+#'  Janssens AC, Aulchenko YS, Elefante S, Borsboom GJ, Steyerberg EW, 
+#' van Duijn CM. Predictive testing for complex diseases using multiple genes: 
+#' fact or fiction? Genet Med. 2006;8:395-400.
+#'
+#'
+#' Janssens AC, Moonesinghe R, Yang Q, Steyerberg EW, van Duijn CM, Khoury MJ. 
+#' The impact of genotype frequencies on the clinical validity of genomic 
+#' profiling for predicting common chronic diseases. Genet Med. 2007;9:528-35.
+#'
+#'
+#' van der Net JB, Janssens AC, Sijbrands EJ, Steyerberg EW. Value of genetic 
+#' profiling for the prediction of coronary heart disease. 
+#' Am Heart J. 2009;158:105-10.
+#'
+#'
+#' van Zitteren M, van der Net JB, Kundu S, Freedman AN, van Duijn CM, 
+#' Janssens AC. Genome-based prediction of breast cancer risk in the general 
+#' population: a modeling study based on meta-analyses of genetic associations. 
+#' Cancer Epidemiol Biomarkers Prev. 2011;20:9-22.
+#'
+#'      
+#' @examples
+#' # specify the matrix containing the ORs and frequencies of genetic variants. 
+#' # In this example we used per allele genetic variants
+#' ORfreq<-cbind(c( 1.35,1.20,1.24,1.16), rep(1,4), c(.41,.29,.28,.51),rep(1,4))
+#'
+#' # Obtain the dataset
+#' Data <- simulatedDataset(ORfreq=ORfreq, poprisk=.3, popsize=1000)
+#'
+#' # Obtain the AUC and produce ROC curve
+#' plotROC(data=Data, cOutcome=4, predrisk=Data[,3]) 
+#'
+"simulatedDataset" <- function(ORfreq, poprisk, popsize, filename) 
+{
+if (missing(poprisk)) {stop("Population disease risk is not specified")}
+if (missing(popsize)) {stop("Total number of individuals is not mentioned")}
+
+g <- nrow(ORfreq)
+reconstruct.2x2table <- function(p,d,OR,s)
+{
+a <- 0
+b <- 0
+c <- (OR*p*s*(1-d)*d*s)/((1-p)*s*(1-d)+OR*p*s*(1-d))
+dd <- p*s-c
+e <- d*s-c
+f <- (1-p)*s-e
+tabel <- cbind(a,b,c,dd,e,f,g,OR)
+tabel
+}
+###################################################################
+# Reconstruct 2*3 table from OR - no rare disease assumption
+###################################################################
+reconstruct.2x3table <- function(OR1,OR2,p1,p2,d,s){
+	a	<- 1
+	eOR	<- 0
+	while (eOR<=OR2){
+		b	<- p2*s*(1-d)
+		snew <- s-a-b
+		p1new <-p1/(1-p2)
+		dnew <- (d-(a/s))/((d-(a/s))+ ((1-d)-b/s))
+		c	<-	(OR1*p1new*snew*(1-dnew)*dnew*snew)/((1-p1new)*snew*(1-dnew)+OR1*p1new*snew*(1-dnew))
+		dd	<-	p1new*((1-d)-b/s)*s
+		e	<-	(d-(a/s))*s-c
+		f	<- ((1-d)-b/s)*s-dd
+		eOR	<- (a*f)/(b*e)
+		tabel <- cbind(a,b,c,dd,e,f,g,OR1,OR2)
+		a	<- a+1
+		tabel
+		}
+	tabel
+}
+###################################################################
+# Reconstruct 2*3 table from OR - based on HWE - no rare disease assumption
+###################################################################
+reconstruct.2x3tableHWE <- function(OR,p,d,s){
+  OR1 <- OR
+  OR2 <- OR^2
+	p1 <-  2*p*(1-p)
+	p2 <-  p*p
+
+	a	<- 1
+	eOR	<- 0
+	while (eOR<=OR2){
+		b	<- p2*s*(1-d)
+		snew <- s-a-b
+		p1new <-p1/(1-p2)
+		dnew <- (d-(a/s))/((d-(a/s))+ ((1-d)-b/s))
+		c	<-	(OR1*p1new*snew*(1-dnew)*dnew*snew)/((1-p1new)*snew*(1-dnew)+OR1*p1new*snew*(1-dnew))
+		dd	<-	p1new*((1-d)-b/s)*s
+		e	<-	(d-(a/s))*s-c
+		f	<- ((1-d)-b/s)*s-dd
+		eOR	<- (a*f)/(b*e)
+		tabel <- cbind(a,b,c,dd,e,f,g,OR1,OR2)
+		a	<- a+1
+		tabel
+		}
+	tabel
+}
+###################################################################
+# Adjust such that mean (postp) = pd
+###################################################################
+# this correction is needed when the number of genes or ORs get large to ensure that the mean (postp) equals the prior risk
+
+adjust.postp <- function (pd, LR){		
+	odds.diff <- 0
+	prior.odds <- pd/(1-pd)	
+	for (i in (1:100000)) {
+	Postp <- (prior.odds*LR)/(1+(prior.odds*LR))
+	odds.diff <- (pd-mean(Postp))/ (1-(pd-mean(Postp)))
+	prior.odds	<- prior.odds+odds.diff
+	if (odds.diff < .0001) break
+	}
+	Postp
+}
+
+################################################################################
+
+func.data <- function(p,d,OR,s,g){
+  Data <- matrix (NA,s,4+g)
+  Data[,1] <- rep(0,s)                   
+	Data[,2] <- rep(1,s)										
+	Data[,3] <- rep(0,s)
+	i <- 0
+	while (i < g){
+    i <- i+1
+    cells2x3 <- rep(NA,9)
+    cells2x3 <- if(p[i,2]==0) {reconstruct.2x2table(p=p[i,1],d,OR=OR[i,1],s)} else {if(p[i,2]==1) {reconstruct.2x3tableHWE(OR=OR[i,1],p=p[i,1],d,s)}
+  else {reconstruct.2x3table(OR1=OR[i,1],OR2=OR[i,2],p1=p[i,1],p2=p[i,2],d,s)}}   # reconstruct table for calculation of likelihood ratios for genotypes
+      LREE 	  <- ((cells2x3[1]/d*s)/(cells2x3[2]/(1-d)*s))			# calculate likelihood ratios
+      LREe	  <- ((cells2x3[3]/d*s)/(cells2x3[4]/(1-d)*s))
+      LRee	  <- ((cells2x3[5]/d*s)/(cells2x3[6]/(1-d)*s))
+ 
+ Gene <- if(p[i,2]==0){c(rep(0,((1-p[i,1]-p[i,2])*s)),rep(1,p[i,1]*s),rep(2,p[i,2]*s))} 
+     else {c(rep(0,((1-p[i,1]*p[i,1]-2*p[i,1]*(1-p[i,1]))*s)),rep(1,2*p[i,1]*(1-p[i,1])*s),rep(2,p[i,1]*p[i,1]*s))}		# create vector of genotypes for all subjects based on hardy-weinberg distribution of alleles
+		Filler <- s-length(Gene)                               #soms is Gene 1 te subject te kort en dan werkt het niet
+		Gene <- sample(c(Gene,rep(0,Filler)),s,rep=F)
+    Data[,4+i] <- Gene
+    GeneLR <- ifelse(Gene==0,LRee,ifelse(Gene==1,LREe,LREE))
+   
+    Data[,1] <- Data[,1]+Gene
+    Data[,2] <- Data[,2]*GeneLR	
+			
+#	 cat(i,"")										# report proces on screen
+		}
+		
+		Data[,3] <- adjust.postp(pd=d, LR=Data[,2])				
+		Data[,4]  <- ifelse(runif(s)<=(Data[,3]), 1, 0)  					          	
+    Data <- as.data.frame(Data)
+    Data
+    }
+    
+ simulatedData <- func.data (p=ORfreq[,c(3,4)],d=poprisk,OR=ORfreq[,c(1,2)],s=popsize,g=nrow(ORfreq))   
+
+
+if (!missing(filename)) 
+	{write.table( simulatedData,file=filename, row.names=TRUE,sep = "\t")  }
+
+ return(simulatedData)
+}   
+
+#-----------------------
+######  run the function
+#-----------------------
+
+func.AUC <- function(cOutcome, cPredrisk )
+{
+  rAllele <- rcorr.cens(cPredrisk, cOutcome, outx=FALSE)
+  rAllele[1]
+}
+
+ORfreq<-cbind(c( 1.35,1.20,1.24,1.16,1.08,1.18,1.11), rep(1,7), c(.41,.29,.28,.51,.42,.40,.32),rep(1,7))
+Data <- simulatedDataset(ORfreq=ORfreq, poprisk=.3, popsize=1000)
+dim(Data)
+func.AUC (cOutcome=Data[,4], cPredrisk=Data[,3] )  
\ No newline at end of file