[Patchwork-commits] r189 - .git .git/logs .git/logs/refs/heads .git/logs/refs/remotes/origin .git/refs/heads .git/refs/remotes/origin pkg/TAPS pkg/TAPS/R pkg/TAPS/man pkg/patchwork pkg/patchwork/R pkg/patchwork/man www

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Wed Sep 11 14:49:44 CEST 2013


Author: sebastian_d
Date: 2013-09-11 14:49:43 +0200 (Wed, 11 Sep 2013)
New Revision: 189

Added:
   pkg/TAPS/man/TAPS_compare.Rd
   pkg/TAPS/man/TAPS_freq.Rd
Modified:
   .git/COMMIT_EDITMSG
   .git/index
   .git/logs/HEAD
   .git/logs/refs/heads/master
   .git/logs/refs/remotes/origin/master
   .git/refs/heads/master
   .git/refs/remotes/origin/master
   pkg/TAPS/DESCRIPTION
   pkg/TAPS/NAMESPACE
   pkg/TAPS/R/TAPS.r
   pkg/TAPS/R/zzz.r
   pkg/TAPS/man/TAPS_call.Rd
   pkg/patchwork/DESCRIPTION
   pkg/patchwork/NAMESPACE
   pkg/patchwork/R/karyotype.r
   pkg/patchwork/R/karyotype_check.r
   pkg/patchwork/R/karyotype_chroms.r
   pkg/patchwork/R/karyotype_chromsCN.r
   pkg/patchwork/R/patchwork.alleledata.r
   pkg/patchwork/R/patchwork.copynumbers.r
   pkg/patchwork/R/patchwork.plot.r
   pkg/patchwork/man/karyotype.Rd
   pkg/patchwork/man/karyotype_chroms.Rd
   pkg/patchwork/man/karyotype_chromsCN.Rd
   www/TAPS_inst.php
   www/changelog.php
Log:
Extensive changes to patchwork plotting. New functions, TAPS_compare and TAPS_freq, to TAPS

Modified: .git/COMMIT_EDITMSG
===================================================================
--- .git/COMMIT_EDITMSG	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/COMMIT_EDITMSG	2013-09-11 12:49:43 UTC (rev 189)
@@ -1 +1 @@
-homepage info update pysam
+some updates to taps homepage info

Modified: .git/index
===================================================================
(Binary files differ)

Modified: .git/logs/HEAD
===================================================================
--- .git/logs/HEAD	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/logs/HEAD	2013-09-11 12:49:43 UTC (rev 189)
@@ -58,3 +58,10 @@
 5a2afd158f70d21ed8bf6a45f61a19dbbb6b79c2 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372256038 +0200	commit: Move pysam out of package
 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372346578 +0200	commit: homepage info update pysam
 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 f626f1a4b4965df8008d349712318cedf6dde9eb Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372427760 +0200	pull : Fast-forward
+f626f1a4b4965df8008d349712318cedf6dde9eb 9327100077803d89d70ede58fe98f0949fe726e8 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376640199 +0200	commit: Updated plotting procedures significantly of patchwork.
+9327100077803d89d70ede58fe98f0949fe726e8 43c186b570d94002d0576a8f805be76267cf2b6a Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376644958 +0200	commit: deleted two unused pngs
+43c186b570d94002d0576a8f805be76267cf2b6a d63069fedd7b5d39e37a64432b37a149711b9b21 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1377264167 +0200	commit: some changes to pw_requ to avoid missunderstandings
+d63069fedd7b5d39e37a64432b37a149711b9b21 74262096bdcb8c53bcf9f4719ff9ae6b23565262 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378892982 +0200	commit: some updates to taps homepage info
+74262096bdcb8c53bcf9f4719ff9ae6b23565262 884409fb1057881cbdbd2db1c86c85d72bf477c6 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378894727 +0200	pull : Fast-forward
+884409fb1057881cbdbd2db1c86c85d72bf477c6 7375bc63263a290cbfb537af8c38af2df6ba0f0c Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378902806 +0200	pull : Fast-forward
+7375bc63263a290cbfb537af8c38af2df6ba0f0c 6f560c07eeb2bc5a9c449af56829257933ebcf87 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378903472 +0200	pull : Fast-forward

Modified: .git/logs/refs/heads/master
===================================================================
--- .git/logs/refs/heads/master	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/logs/refs/heads/master	2013-09-11 12:49:43 UTC (rev 189)
@@ -58,3 +58,10 @@
 5a2afd158f70d21ed8bf6a45f61a19dbbb6b79c2 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372256038 +0200	commit: Move pysam out of package
 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372346578 +0200	commit: homepage info update pysam
 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 f626f1a4b4965df8008d349712318cedf6dde9eb Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372427760 +0200	pull : Fast-forward
+f626f1a4b4965df8008d349712318cedf6dde9eb 9327100077803d89d70ede58fe98f0949fe726e8 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376640199 +0200	commit: Updated plotting procedures significantly of patchwork.
+9327100077803d89d70ede58fe98f0949fe726e8 43c186b570d94002d0576a8f805be76267cf2b6a Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376644958 +0200	commit: deleted two unused pngs
+43c186b570d94002d0576a8f805be76267cf2b6a d63069fedd7b5d39e37a64432b37a149711b9b21 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1377264167 +0200	commit: some changes to pw_requ to avoid missunderstandings
+d63069fedd7b5d39e37a64432b37a149711b9b21 74262096bdcb8c53bcf9f4719ff9ae6b23565262 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378892982 +0200	commit: some updates to taps homepage info
+74262096bdcb8c53bcf9f4719ff9ae6b23565262 884409fb1057881cbdbd2db1c86c85d72bf477c6 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378894727 +0200	pull : Fast-forward
+884409fb1057881cbdbd2db1c86c85d72bf477c6 7375bc63263a290cbfb537af8c38af2df6ba0f0c Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378902806 +0200	pull : Fast-forward
+7375bc63263a290cbfb537af8c38af2df6ba0f0c 6f560c07eeb2bc5a9c449af56829257933ebcf87 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378903472 +0200	pull : Fast-forward

Modified: .git/logs/refs/remotes/origin/master
===================================================================
--- .git/logs/refs/remotes/origin/master	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/logs/refs/remotes/origin/master	2013-09-11 12:49:43 UTC (rev 189)
@@ -56,3 +56,10 @@
 5a2afd158f70d21ed8bf6a45f61a19dbbb6b79c2 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372256058 +0200	update by push
 c08551d489e8bc28692c600dfffa7e7d4cb37ca5 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1372346594 +0200	update by push
 b617a63c7595a2cb1b2a85e2ed73a67eccac8b32 f626f1a4b4965df8008d349712318cedf6dde9eb Sebastian DiLorenzo <S_D at imv096.medsci.uu.se> 1372427760 +0200	pull : fast-forward
+f626f1a4b4965df8008d349712318cedf6dde9eb 9327100077803d89d70ede58fe98f0949fe726e8 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376640235 +0200	update by push
+9327100077803d89d70ede58fe98f0949fe726e8 43c186b570d94002d0576a8f805be76267cf2b6a Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1376644972 +0200	update by push
+43c186b570d94002d0576a8f805be76267cf2b6a d63069fedd7b5d39e37a64432b37a149711b9b21 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1377264183 +0200	update by push
+d63069fedd7b5d39e37a64432b37a149711b9b21 74262096bdcb8c53bcf9f4719ff9ae6b23565262 Sebastian DiLorenzo <dilorenzo.sebastian at gmail.com> 1378893000 +0200	update by push
+74262096bdcb8c53bcf9f4719ff9ae6b23565262 884409fb1057881cbdbd2db1c86c85d72bf477c6 Sebastian DiLorenzo <S_D at imv092.medsci.uu.se> 1378894674 +0200	fetch: fast-forward
+884409fb1057881cbdbd2db1c86c85d72bf477c6 7375bc63263a290cbfb537af8c38af2df6ba0f0c Sebastian DiLorenzo <S_D at imv092.medsci.uu.se> 1378902806 +0200	pull : fast-forward
+7375bc63263a290cbfb537af8c38af2df6ba0f0c 6f560c07eeb2bc5a9c449af56829257933ebcf87 Sebastian DiLorenzo <S_D at imv092.medsci.uu.se> 1378903403 +0200	pull : fast-forward

Modified: .git/refs/heads/master
===================================================================
--- .git/refs/heads/master	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/refs/heads/master	2013-09-11 12:49:43 UTC (rev 189)
@@ -1 +1 @@
-f626f1a4b4965df8008d349712318cedf6dde9eb
+6f560c07eeb2bc5a9c449af56829257933ebcf87

Modified: .git/refs/remotes/origin/master
===================================================================
--- .git/refs/remotes/origin/master	2013-09-05 07:39:10 UTC (rev 188)
+++ .git/refs/remotes/origin/master	2013-09-11 12:49:43 UTC (rev 189)
@@ -1 +1 @@
-f626f1a4b4965df8008d349712318cedf6dde9eb
+6f560c07eeb2bc5a9c449af56829257933ebcf87

Modified: pkg/TAPS/DESCRIPTION
===================================================================
--- pkg/TAPS/DESCRIPTION	2013-09-05 07:39:10 UTC (rev 188)
+++ pkg/TAPS/DESCRIPTION	2013-09-11 12:49:43 UTC (rev 189)
@@ -1,8 +1,8 @@
 Package: TAPS
 Type: Package
 Title: Tumor Abberation Prediction Suite
-Version: 1.0
-Date: 2013-05-11
+Version: 2.0
+Date: 2013-09-10
 Author: Markus Mayrhofer, Hanna Goransson-Kultima, Sebastian DiLorenzo
 Maintainer: Sebastian DiLorenzo <Sebastian.DiLorenzo at medsci.uu.se> #Markus Mayrhofer <Markus.Mayrhofer at medsci.uu.se>
 Description: Performs a allele-specific copy number analysis of array data.

Modified: pkg/TAPS/NAMESPACE
===================================================================
--- pkg/TAPS/NAMESPACE	2013-09-05 07:39:10 UTC (rev 188)
+++ pkg/TAPS/NAMESPACE	2013-09-11 12:49:43 UTC (rev 189)
@@ -1,5 +1,7 @@
 export(TAPS_plot,
 		TAPS_call,
-		TAPS_region)
+		TAPS_region,
+		TAPS_freq,
+		TAPS_compare)
 
 

Modified: pkg/TAPS/R/TAPS.r
===================================================================
--- pkg/TAPS/R/TAPS.r	2013-09-05 07:39:10 UTC (rev 188)
+++ pkg/TAPS/R/TAPS.r	2013-09-11 12:49:43 UTC (rev 189)
@@ -16,6 +16,7 @@
     suppressPackageStartupMessages(library(stats))
     suppressPackageStartupMessages(library(DNAcopy))
     suppressPackageStartupMessages(library(fields))
+    suppressPackageStartupMessages(library(xlsx))    
 
 
     #list.of.packages <- c("stats", "fields")
@@ -34,23 +35,25 @@
     ## This function takes a directory as input, then builds short-segment TAPS scatter plots for each sample (subdirectory) in the directory.
     setwd(directory)
     subs <- getSubdirs()
+    subs=subs[subs!='frequencies' & subs!='frequencies_comp']
     if (is.null(subs)) {                                         ## check samples = subdirectories or a single sample = current directory
         subs=thisSubdir()
         setwd('..')
     }
     
-    # create SampleData file if there is none.
-    if (length(grep('SampleData.txt',dir()))==0) {
-        sampleData=data.frame(Sample=subs,cn2='',delta='',loh='', MAPD=NA, MHOF=NA, calculate.copynumbers='yes')
+    # create SampleData file if there is none.   
+    if (length(grep('SampleData.xlsx',dir()))==0) {
+        sampleData <- data.frame(Sample=subs,cn1= -0.5, cn2=0, cn3=NA, loh=0.7, MAPD=NA, MHOF=NA)
+        write.xlsx(sampleData,'SampleData.xlsx',row.names=F)
     } else {
-        sampleData=load.txt('SampleData.txt')
+        sampleData=read.xlsx('SampleData.xlsx',1)
     }
         
     
     for (i in 1:length(subs)) {
         setwd(subs[i])
         name <- subs[i]
-        #if (length(grep('SampleData.txt',dir()))==0) save.txt(data.frame(Sample=name,cn2='',delta='',loh='',completed.analysis='no'),file='SampleData.txt')
+        #if (length(grep('sampleData.csv',dir()))==0) save.txt(data.frame(Sample=name,cn2='',delta='',loh='',completed.analysis='no'),file='sampleData.csv')
         cat(' ..loading', subs[i])
         
         if(length(grep("*.cyhd.cychp",dir()))==1)				##cyhd sample
@@ -137,13 +140,13 @@
             save.txt(segments,'_segments.txt') 
         }
         
-        segments$Value <- segments$Value-median(Log2$Value)     ## Median-centering
-        Log2$Value <- Log2$Value-median(Log2$Value)             ## Median-centering
+        segments$Value <- segments$Value-mean(Log2$Value)     ## Median-centering
+        Log2$Value <- Log2$Value-mean(Log2$Value)             ## Median-centering
         
-        allRegions=NULL; try(load('allRegions.Rdata'),silent=T)
+        allRegions=NULL; if ('allRegions.Rdata' %in% dir()) load('allRegions.Rdata')
         if (is.null(allRegions)) allRegions <- makeRegions(Log2, alf, segments)            ## Calculates necessary data for segments (all functions are in this file)
         save(allRegions,file='allRegions.Rdata')
-        regs=NULL; try(load('shortRegions.Rdata'),silent=T)
+        regs=NULL; if ('shortRegions.Rdata' %in% dir()) load('shortRegions.Rdata')
         if (is.null(regs)) {
             regs <- regsFromSegs(Log2,alf,segments,bin=bin,min=5)    ## Calculates the same data for shortened segments
             save(regs,file='shortRegions.Rdata')
@@ -151,14 +154,14 @@
         
         ## Sample QC 
         sampleData$MAPD[i] <- MAPD <- round(median(abs(diff(Log2$Value[Log2$Chromosome=='chr1'][order(Log2$Start[Log2$Chromosome=='chr1'])]))),2)
-        sampleData$MHOF[i] <- MHOF <- round(100*median(0.5+abs(0.5-alf$Value)),1)
+        sampleData$MHOF[i] <- MHOF <- round(100*median(0.5+abs(0.5-alf$Value[alf$Chromosome!='chrX'])),1)
             #round(100*median(0.5+regs$hom[regs$scores<0.5],na.rm=T),1)        
         #MAID=round(median(abs(diff(regs$scores[!is.na(regs$scores)]))),3)
         
         #Save for TAPS_region()
         save(regs,Log2,alf,segments,file="TAPS_plot_output.Rdata")
         
-        save.txt(sampleData,file='../SampleData.txt')
+        #save.txt(sampleData,file='../sampleData.csv')
         
         #Clear warnings generated previously so hopefully I can see what is actually causing the program to fail.
         #assign("last.warning", NULL, envir = baseenv())
@@ -179,12 +182,16 @@
         cat('..done\n')
         setwd('..')
     }
+    write.xlsx(sampleData,'SampleData.xlsx',row.names=F)
 }
 ###
 
 ###
-TAPS_call <- function(directory=NULL,#xlim=c(-1,1),ylim=c(0,1),
-                      minseg=1,maxCn=12) {
+TAPS_call <- function(samples='all',directory=getwd()) {
+    minseg=1
+    maxCn=12
+    suppressPackageStartupMessages(library(xlsx))    
+    
     ## TAPS_call outputs the total and minor allele copy numbers of all segments as a text file, and as images for visual confirmation.
     ## sampleInfo_TAPS.txt must be present in each sample folder. If TAPS_plot could not make a good guess of the Log-R of copy number 2 
     ## and the Log-R difference to a deletion, you must interpret the scatter plots and edit sampleInfo_TAPS.txt.
@@ -197,28 +204,33 @@
         #directory = readline("Please supply such a directory now: ")
     }
     
-    
     setwd(directory)
     #subs <- getSubdirs()
     
-    if (length(grep('SampleData.txt',dir()))==1)
+    if (length(grep('SampleData.xlsx',dir()))==1)
     {
-        sampleData <- load.txt('SampleData.txt')
+        sampleData=read.xlsx('SampleData.xlsx',1)
     }
     else
     {
-        sampleData <- load.txt('../SampleData.txt')
+        sampleData <- read.xlsx('../SampleData.xlsx',1)
     }
     subs=as.character(sampleData$Sample)
     
     if (is.null(subs)) {
         subs=thisSubdir()
+        subs=subs[subs!='frequencies' & subs!='frequencies_comp']
         setwd('..')
     }
-    for (i in 1:length(subs)) if (sampleData$calculate.copynumbers[i]=='yes') {
+    
+    if (samples[1]=='all') samples=rep(T,length(subs))
+    if (is.logical(samples)) samples=which(samples)
+    subs=subs[samples]
+    
+    for (i in 1:length(subs)) {
         setwd(subs[i])
         name <- subs[i]
-        sampleInfo <- sampleData[sampleData$Sample==subs[i],]
+        sampleInfo <- sampleData[sampleData$Sample==subs[i],2:5]
         if (nrow(sampleInfo)==1) {
             
             cat(' ..loading', subs[i])
@@ -236,38 +248,47 @@
             segments <- segments[!is.nan(segments$Value),]
             segments <- segments[!is.na(segments$Value),]    
             
-            segments$Value <- segments$Value-median(Log2$Value) 
-            Log2$Value <- Log2$Value-median(Log2$Value)
+            segments$Value <- segments$Value-mean(Log2$Value) 
+            Log2$Value <- Log2$Value-mean(Log2$Value)
             
             cat(' ..processing.\n')
             
             load('allRegions.Rdata')                            ## These were prepared in TAPS_plot
+            load('shortRegions.Rdata')
             #allRegions <- makeRegions(Log2, alf, segments)
             
             ## estimates the Log-R and Allelic Imbalance Ration of all variants up to maxCn
-            t <- findCNs(Log2,alf,allRegions,dmin=0.9,maxCn=maxCn,ceiling=1,sampleInfo=sampleInfo) 
+            t <- findCNs(Log2,alf,allRegions,regs,name,maxCn=maxCn,ceiling=1,sampleInfo=sampleInfo) 
+            save(t,file='t.Rdata')
             
-            u <- setCNs(allRegions,t$int,t$ai,maxCn)            ## Assigns copy number variant for all segments
+            u <- setCNs(allRegions,t$int,t$ai,t$model,maxCn)            ## Assigns copy number variant for all segments
             allRegions$regions <- u$regions
             ## adjacent segments with idendical copy number are merged (except over centromere) and all are saved to a text file
             save.txt(u$regions,file=paste(name,'_segmentCN.txt',sep='')) 
             regions=allRegions$regions
+<<<<<<< HEAD
+            #save(u$model,file="model.Rdata")
+            write.table(t(as.data.frame(u$model)),file='model.txt',row.names=T)
+=======
             save(t,regions,file="regions_t.Rdata")
+
+            #save parameters as strings
+            parameters=paste("Parameters given: cn2:",sampleInfo$cn2," delta:",sampleInfo$delta," loh:",sampleInfo$loh)
             
+>>>>>>> d63069fedd7b5d39e37a64432b37a149711b9b21
             karyotype_check(regions$Chromosome,regions$Start,regions$End,regions$log2,regions$imba,regions$Cn,regions$mCn,t,ideogram=NULL,name=name)
             
             karyotype_chromsCN(regions$Chromosome,regions$Start,regions$End,regions$log2,
                                regions$imba,regions$Cn,regions$mCn,ideogram=NULL,
                                as.character(Log2$Chromosome),Log2$Start,Log2$Value,as.character(alf$Chromosome),
-                               alf$Start,alf$Value,t,name=name,xlim=c(-1,1),ylim=c(0,1))
+                               alf$Start,alf$Value,t,name=name,xlim=c(-1,1),ylim=c(0,1),parameters=parameters)
             
             cat('..done\n')
-            sampleData$completed.analysis[i] <- ''
         } else cat('Skipped',name,'\n')
         
         setwd('..')
     }
-    save.txt(sampleData,file='SampleData.txt')
+    #save.txt(sampleData,file='sampleData.csv')
 }
 ###
 regsFromSegs <- function (Log2,alf, segments, bin=200,min=1) {
@@ -602,96 +623,109 @@
 
 ## 
 ###
-findCNs <- function(Log2,alf,allRegions,name=thisSubdir(),dmin=0.9,maxCn=10,ceiling=1,sampleInfo=NULL) {
-    ## This function takes an estimate of the Log-R of copy number two (shift) and the difference in log-R between copy numbers 2 and 1 (delta)
-    ## (3 and 2 works too). Then, the Log-R and Allelic Imbalance Ratio of all possible copy number variants up to maxCn are estimated from
-    ## the Log-R and Allelic Imbalance Ratio of all the segments. This function will NOT be useful unless there is already a solid estimate 
-    ## of 'shift' and 'delta'. See the previous function.
+findCNs <- function(Log2,alf,allRegions,regs,name=thisSubdir(),maxCn=10,ceiling=1,sampleInfo=NULL) {
+    ## This function takes an estimate of the Log-R of copy numbers 1, 2 and 3. At least two of these should be entered.
+    ## Then, the Log-R and Allelic Imbalance Ratio of all possible copy number variants up to maxCn are estimated from
+    ## the Log-R and Allelic Imbalance Ratio of all the segments. 
+    if (is.null(sampleInfo)) cat ('there was no estimation available for',name)
     
-    shift=sampleInfo$cn2
-    delta=sampleInfo$delta
+    cns=1:maxCn; est=sampleInfo[1:3]; est[est==' ']=NA; est=as.numeric(est)
+    m <- lm(2^est ~ cns[1:3])$coefficients # can handle one NA in est. This model is for "ratio as a function of copy number".
+    est[is.na(est)] = log2(m[1]+cns[which(is.na(est))]*m[2]) # simple linear regression to fill the missing
     
     tix=NULL     #temporary index
+    probes=NULL  #number of probes at each copy number, for weighting
     int=NULL     ## contains Log-R estimate of each (total) copy number
     ai=NULL         ## contains Allelic Imbalance Ratio estimate of each copy number variant.
     regions <- allRegions$regions
     regions <- regions[(is.autosome(regions$Chromosome)&regions$lengthMB>1)&(!is.na(regions$imba)),] ## will use these regions
     
-    ## likely cn2 regions sit within delta/3 from shift.
-    expectedAt <- shift
-    tix$cn2 <- abs(regions$log2 - expectedAt) < (delta/3)    ## index of likely cn2 regions
+    ## likely cn2 regions sit near the estimate.
+    expectedAt <- est[2]
+    tix$cn2 <- abs(regions$log2 - expectedAt) < diff(est)[2]/3    ## index of likely cn2 regions
     temp <- regions[tix$cn2,]                                ## cn2 regions
     med <- weightedMedian(temp$log2,temp$probes)            ## improved value of Log-R at cn2 (returns NULL if theres nothing there)
-    int$cn2 <- ifelse(!is.null(med),med,expectedAt)            ## saved to int.
+    probes[2] <- sum(temp$probes)
+    int[2] <- ifelse(!is.null(med),med,expectedAt)            ## saved to int.
     
-    ## likely cn1 regions sit at about cn2 - delta:
-    d <- delta                                                 ## the (Log-R) distance to cn1
-    expectedAt <- int$cn2-d                                     ## cn1 is expected here
-    tix$cn1 <- abs(regions$log2 - expectedAt) < (d/3)        ## index of likely cn1 regions
+    ## likely cn1 regions sit near estimate                                         
+    expectedAt <- est[1]                                     ## cn1 is expected here
+    tix$cn1 <- abs(regions$log2 - expectedAt) < diff(est)[1]/3        ## index of likely cn1 regions
     temp <- regions[tix$cn1,]
     med <- weightedMedian(temp$log2,temp$probes)
-    int$cn1 <- ifelse(!is.null(med),med,expectedAt)
+    probes[1] <- sum(temp$probes)
+    int[1] <- ifelse(!is.null(med),med,expectedAt)
     
-    ## likely cn0 regions sit below cn1 - delta:
-    d <- int$cn2-int$cn1
-    expectedAt <- int$cn1-d                                     
-    tix$cn0 <- regions$log2 < expectedAt
-    temp <- regions[tix$cn0,]
-    med <- weightedMedian(temp$log2,temp$probes)
-    int$cn0 <- ifelse(!is.null(med),med,expectedAt)
-    
-    ## likely cn3 regions sit at about cn2+delta*dmin (dmin is about 0.9, the factor by which the distance between consecutive copy numbers diminish)
-    d <- delta*dmin
-    expectedAt <- int$cn2+d
-    tix$cn3 <- abs(regions$log2 - expectedAt) < (d/3)
+    ## likely cn3 regions sit near estimate
+    expectedAt <- est[3]
+    tix$cn3 <- abs(regions$log2 - expectedAt) < diff(est)[2]/3
     temp <- regions[tix$cn3,]
     med <- weightedMedian(temp$log2,temp$probes)
-    int$cn3 <- ifelse(!is.null(med),med,expectedAt)
+    probes[3] <- sum(temp$probes)
+    int[3] <- ifelse(!is.null(med),med,expectedAt)
     
     ## cn4 follows at ...
-    d <- dmin*(int$cn3-int$cn2) 
-    expectedAt <- int$cn3+d
-    tix$cn4 <- abs(regions$log2 - expectedAt) < (d/4)
+    m <- lm(2^int[1:3] ~ cns[1:3], weights=probes[1:3])$coefficients ## use regression to estimate.
+    expectedAt <- log2(m[1]+4*m[2])
+    tix$cn4 <- abs(regions$log2 - expectedAt) < mean(diff(int))/3
     temp <- regions[tix$cn4,]
     med <- weightedMedian(temp$log2,temp$probes)
-    int$cn4 <- ifelse(!is.null(med),med,expectedAt)
+    probes[4] <- sum(temp$probes)
+    int[4] <- ifelse(!is.null(med),med,expectedAt)
     
     ## generalized for higher cns
     for (cn in 5:maxCn) {
         thisCn <- paste('cn',cn,sep='')
         prevCn <- paste('cn',cn-1,sep='')
         pprevCn <- paste('cn',cn-2,sep='')
-        d <- dmin*(int[prevCn][[1]]-int[pprevCn][[1]])
-        expectedAt <- int[prevCn][[1]]+d
-        tix[[thisCn]] <- abs(regions$log2 - expectedAt) < (d/5)
+        m <- lm(2^int[1:(cn-1)] ~ cns[1:(cn-1)], weights=probes[1:(cn-1)])$coefficients
+        expectedAt <- log2(m[1]+cn*m[2])
+        tix[[thisCn]] <- abs(regions$log2 - expectedAt) < mean(diff(int))/5
         temp <- regions[tix[thisCn][[1]],]
         med <- weightedMedian(temp$log2,temp$probes)
-        int[thisCn] <- ifelse(!is.null(med),med,expectedAt)
+        probes[cn] <- sum(temp$probes)
+        int[cn] <- ifelse(!is.null(med),med,expectedAt)
     }
     
+    ## likely cn0 regions sit below cn1 - delta:
+    expectedAt <- log2(m[1]+0*m[2])
+    tix$cn0 <- abs(regions$log2 - expectedAt) < 0.5*(int[2]-int[1])
+    temp <- regions[tix$cn0,]
+    med <- weightedMedian(temp$log2,temp$probes)
+    int0 <- ifelse(!is.null(med),med,expectedAt) ## "int0"
     
+    
+    ## Estimate tumor dna content from intensity-cn relationship and average ploidy --UNRELIABLE
+    md <- lm(2^int ~ cns, weights=probes)$coefficients
+    m=NULL; m$intercept=md[1][[1]]; m$k=md[2][[1]]
+    probes[is.na(probes)]=0
+    m$meanCn <- mean(rep(cns, probes), na.rm=T)
+    m$theoretical_delta=1/m$meanCn
+    #m$real_delta=0.57*m$theoretical_delta ## The 0.57 is empirical from cancer cell lines    
+    #m$dnafrac=m$k/m$real_delta
+    #m$cellfrac=1/(1+m$meanCn/2*(1/m$dnafrac-1))    
+    
+
+    
+    loh_exp <- as.numeric(sampleInfo[4])
     ## at cn2, find the variant clusters (normal and CNNLOH)
-    ix <- (abs(regions$log2 - int$cn2) < 0.2*(int$cn3-int$cn2) ) # taking only closely-matching segments
+    ix <- (abs(regions$log2 - int[2]) < 0.2*(int[3]-int[2]) ) # taking only closely-matching segments
     data <- regions[ix,]
     data <- data[!is.na(data$imba),] # ...with a calculated allelic imbalance.
-    
-    ix <- (abs(regions$log2 - int$cn3) < 0.2*(int$cn4-int$cn3) )
-    data3 <- regions[ix,]
-    data3 <- data[!is.na(data$imba),]
-    expectedAt <- 0.1
-    ix <- data$imba<min(data3$imba)
+    ## 2m1
+    expectedAt=0.1
+    ix <- abs(data$imba-.1) < abs(data$imba-loh_exp)
     med <- weightedMedian(data$imba[ix],data$snps[ix]) ## Average allelic imbalance weighted on snp count
     ai$cn2m1 <- ifelse (!is.null(med),med,expectedAt)
-    
-    expectedAt <- sampleInfo$loh
-    ix <- abs(data$imba-sampleInfo$loh) < 0.2* (sampleInfo$loh-ai$cn2m1)
+    ## loh
+    expextedAt=loh_exp
+    ix <- !ix
     med <- weightedMedian(data$imba[ix],data$snps[ix]) 
     ai$cn2m0 <- ifelse (!is.null(med),med,expectedAt)
     
     
-    
     ## for cn1 (and 0)
-    ix <- (abs(regions$log2 - int$cn1) < 0.2*(int$cn2-int$cn1) )
+    ix <- (abs(regions$log2 - int[1]) < 0.2*(int[2]-int[1]) )
     data <- regions[ix,]
     data <- data[!is.na(data$imba),]
     expectedAt <- (ai$cn2m0+ai$cn2m1)*3/5     ## Decent estimate.
@@ -700,7 +734,7 @@
     ai$cn0m0 <- NA #unimportant
     
     ## for cn3:
-    ix <- (abs(regions$log2 - int$cn3) < 0.2*(int$cn4-int$cn3) )
+    ix <- (abs(regions$log2 - int[3]) < 0.2*(int[4]-int[3]) )
     data <- regions[ix,]
     data <- data[!is.na(data$imba),]
     
@@ -714,7 +748,7 @@
         ai$cn3m1 <- ifelse (!is.null(med),med,expectedAt)
         
         # now for cn3m0
-        expectedAt <- ai$cn3m1 + range*dmin # approx of cn3m0
+        expectedAt <- ai$cn3m1 + range*0.9 # approx of cn3m0
         
         ix <- (abs(data$imba-expectedAt) / abs(data$imba-ai$cn3m1)) < 0.5  # take those much closer to exp than to cn3m1
         med <- weightedMedian(data$imba[ix],data$snps[ix]) 
@@ -737,7 +771,7 @@
     if (is.na(ai$cn1m0)) ai$cn1m0 <- (ai$cn3m1+ai$cn2m0)/2 ## If deletions were missing, place an estimate from cn3 
     
     ## now for cn4
-    ix <- abs(regions$log2 - int$cn4) < 0.2*(int$cn4-int$cn3) 
+    ix <- abs(regions$log2 - int[4]) < 0.2*(int[4]-int[3]) 
     data <- regions[ix,]
     data <- data[!is.na(data$imba),]
     
@@ -769,17 +803,17 @@
         prevCn <- paste('cn',cn-1,sep='')
         pprevCn <- paste('cn',cn-2,sep='')
         
-        ix <- (abs(regions$log2 - int[thisCn][[1]]) < 0.2*(int[thisCn][[1]]-int[prevCn][[1]]) )
+        ix <- (abs(regions$log2 - int[cn]) < 0.2*(int[cn]-int[cn-1]) )
         data <- regions[ix,]
         data <- data[!is.na(data$imba),]
         data <- data[data$lengthMB>3,] # long regions for safety
         
         ## try to find variants, starting with LOH
         # LOH such as 5(0)
-        m <- 0
+        mi <- 0
         thisVariant=paste(thisCn,'m',0,sep='')
-        c4m0 <- ai[paste(prevCn,'m',m,sep='')][[1]] # relative naming for clarity
-        c3m0 <- ai[paste(pprevCn,'m',m,sep='')][[1]]
+        c4m0 <- ai[paste(prevCn,'m',mi,sep='')][[1]] # relative naming for clarity
+        c3m0 <- ai[paste(pprevCn,'m',mi,sep='')][[1]]
         
         expectedAt <- ceiling-((ceiling-c4m0)*(ceiling-max(c4m0,c3m0))/(ceiling-min(c3m0,c4m0)))
         #ai[thisVariant] <- expectedAt
@@ -792,44 +826,48 @@
         ## then from balanced to less balanced
         minorVariants=trunc(cn/2):1 
         first <- T
-        for (m in minorVariants) {
-            thisVariant=paste(thisCn,'m',m,sep='')
-            if (m==cn/2) {
+        for (mi in minorVariants) {
+            thisVariant=paste(thisCn,'m',mi,sep='')
+            if (mi==cn/2) {
                 # We have balanced variant, so rather easy.
-                expectedAt <- ai[paste(pprevCn,'m',m-1,sep='')][[1]] # this is a good approx, balanced at cn-2
-                ix <- data$imba<ai[paste(prevCn,'m',m-1,sep='')][[1]] # let all below (cn-1, mcn-1) in
+                expectedAt <- ai[paste(pprevCn,'m',mi-1,sep='')][[1]] # this is a good approx, balanced at cn-2
+                ix <- data$imba<ai[paste(prevCn,'m',mi-1,sep='')][[1]] # let all below (cn-1, mcn-1) in
                 med <- weightedMedian(data$imba[ix],data$snps[ix]) 
                 ai[thisVariant] <- ifelse (!is.null(med),med,expectedAt)
-                if (ai[thisVariant] > ai[paste(pprevCn,'m',m-1,sep='')][[1]]) ai[thisVariant] <- ai[paste(pprevCn,'m',m-1,sep='')][[1]] # don't let it sneak off.'
+                if (ai[thisVariant] > ai[paste(pprevCn,'m',mi-1,sep='')][[1]]) ai[thisVariant] <- ai[paste(pprevCn,'m',mi-1,sep='')][[1]] # don't let it sneak off.'
                 first <- F
             } else if (first) { 
                 # its not balanced but its the most balanced of the unbalanced. something like 5(2)
-                expectedAt <- 0.5*( ai[paste(prevCn,'m',m,sep='')][[1]] + ai[paste(pprevCn,'m',m-1,sep='')][[1]] ) # that means between 4(2) and 3(1)
-                ix <- abs(data$imba-expectedAt) < ( ai[paste(prevCn,'m',m,sep='')][[1]] - ai[paste(pprevCn,'m',m-1,sep='')][[1]] ) /3 # let all "between" 4(2) and 3(1) in
+                expectedAt <- 0.5*( ai[paste(prevCn,'m',mi,sep='')][[1]] + ai[paste(pprevCn,'m',mi-1,sep='')][[1]] ) # that means between 4(2) and 3(1)
+                ix <- abs(data$imba-expectedAt) < ( ai[paste(prevCn,'m',mi,sep='')][[1]] - ai[paste(pprevCn,'m',mi-1,sep='')][[1]] ) /3 # let all "between" 4(2) and 3(1) in
                 med <- weightedMedian(data$imba[ix],data$snps[ix]) 
                 ai[thisVariant] <- ifelse (!is.null(med),med,expectedAt)
                 first <- F
             } else {
                 # not the most balanced unbalanced variant, for example 5(1):
                 expectedAt <- ai[paste(thisCn,'m',minorVariants[1],sep='')][[1]] + 
-                    (minorVariants[1]-m) * (ai[paste(thisCn,'m',0,sep='')][[1]] - ai[paste(thisCn,'m',minorVariants[1],sep='')][[1]]) / trunc(cn/2) # 5(2) + (which)* 5(0)-5(2) /(n)
-                ix <- abs(data$imba-expectedAt) < ( ai[paste(prevCn,'m',m,sep='')][[1]] - ai[paste(pprevCn,'m',m-1,sep='')][[1]] ) /3 # let all "between" 4(1) and 3(0) in
+                    (minorVariants[1]-mi) * (ai[paste(thisCn,'m',0,sep='')][[1]] - 
+                                                 ai[paste(thisCn,'m',minorVariants[1],sep='')][[1]]) / trunc(cn/2) # 5(2) + (which)* 5(0)-5(2) /(n)
+                ix <- abs(data$imba-expectedAt) < ( ai[paste(prevCn,'m',mi,sep='')][[1]] - ai[paste(pprevCn,'m',mi-1,sep='')][[1]] ) /3 # let all "between" 4(1) and 3(0) in
                 med <- weightedMedian(data$imba[ix],data$snps[ix]) 
                 ai[thisVariant] <- ifelse (!is.null(med),med,expectedAt)
             } 
         } # done with minor variants
     } # done with copy numbers
-    
-    return(list('int'=int,'ai'=ai))
+    int=as.list(c(int0,int)); names(int)=paste('cn',0:maxCn,sep='')
+    return(list('int'=int,'ai'=ai,'model'=m))
 }
+
+
+
 ###
-setCNs <- function(allRegions,int,ai,maxCn=12) {
+setCNs <- function(allRegions,int,ai,model,maxCn=12) {
     ##  Assign total and minor copy numbers to all segments.
     regions <- allRegions$regions[,-4]    ## This time, work on all segments available.
     
     Cn <- NULL            ## Total copy number
     mCn <- NULL            ## Minor allele copy number
-    fullCN <- NULL        ## Variant label. ('cnXmY')
+    Cnx <- NULL        ## Variant label. ('cnXmY')
     
     intDist <- NULL        ## distance to certain Log-R
     imbaDist <- NULL    ## distance to certain allelic imbalance
@@ -839,48 +877,58 @@
         # set total copy number
         distance <- Inf
         for (cn in 0:maxCn) {
-            t_int <- int[paste('cn',cn,sep='')][[1]]    ## get Log-R of particular cn from 'int'
+            t_int <- int[[paste('cn',cn,sep='')]]   ## get Log-R of particular cn from 'int'
             t_dis <- abs(regions$log2[i]-t_int)            ## distance to that particular cn
             if (t_dis < distance) {                        ## nearest so far, save.
                 distance <- t_dis -> intDist[i]
                 Cn[i] <- cn
             }
         }
-        
-        # Y makes CN0 sit very low, this is a fix on non-Y Cn0
-        #if ((Cn[i] == 1)&(intDist[i] > (int$cn2-int$cn1))) Cn[i] <- 0 ## currently not needed.
-        
+    }        
+
+    ### Calculate model based Cns
+    Cnx=Cn; for (cn in 0:maxCn) {
+        Cnx[Cn==cn]=cn + (2^regions$log2[Cn==cn]-2^int[[paste('cn',cn,sep='')]])/model$k
+    }; Cnx[Cnx<0]=0
+    Cn=round(Cnx)
+    
+    ## Set minor CN
+    for (i in 1:nrow(regions)) {       
         # set minor CN
         distance <- Inf
         if (Cn[i]<=1) {
             mCn[i] <- 0
-        } else if (!is.na(regions$imba[i])) for (m in 0:trunc(Cn[i]/2)) {
+        } else if (Cn[i]<=maxCn & !is.na(regions$imba[i])) for (m in 0:trunc(Cn[i]/2)) {
             t_ai <- ai[paste('cn',Cn[i],'m',m,sep='')][[1]]
             t_dis <- abs(regions$imba[i]-t_ai)
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/patchwork -r 189


More information about the Patchwork-commits mailing list