[Genabel-commits] r960 - in pkg/ProbABEL: doc src

Fri Sep 28 13:54:42 CEST 2012

Author: lckarssen
Date: 2012-09-28 13:54:41 +0200 (Fri, 28 Sep 2012)
New Revision: 960

Modified:
   pkg/ProbABEL/doc/CHANGES.LOG
   pkg/ProbABEL/src/probabel.pl
   pkg/ProbABEL/src/probabel_config.cfg.example
Log:
Updates to probabel.pl and probabel_config.cfg:
- The _._chr_._ replacement can now occur multiple times in the path inthe config file. Thanks to Marijn Verkerk for the patch!
- The .cfg file now accepts comments (lines that start with #) and ignores empty lines.
- Some clean up of probabel.pl code. Use Perl's -W switch and "use strict".
- Made the usage message shorter. The full message is available by using the -h or -help option
- Added a check for the Y chromosome. Don't know if anyone uses it, but this looks more consistent. 


Modified: pkg/ProbABEL/doc/CHANGES.LOG
===================================================================

--- pkg/ProbABEL/doc/CHANGES.LOG	2012-09-17 11:31:09 UTC (rev 959)
+++ pkg/ProbABEL/doc/CHANGES.LOG	2012-09-28 11:54:41 UTC (rev 960)
@@ -1,3 +1,10 @@
+*****
+* Update of the probabel.pl script and probabel_config.cfg.
+  The .cfg file now accepts the chr separator in multiple locations in the path
+  (thanks to Marijn Verkerk).
+  probabel.pl can now also run Y chromosome analysis and the help
+  message has been updated.
+
 ***** v.0.2.0 (2012.06.10)
 * The v.0.1-9e fix for working with prob files in pacoxph has been
   forward-ported to this branch as well (Lennart and Yurii).

Modified: pkg/ProbABEL/src/probabel.pl
===================================================================
--- pkg/ProbABEL/src/probabel.pl	2012-09-17 11:31:09 UTC (rev 959)
+++ pkg/ProbABEL/src/probabel.pl	2012-09-28 11:54:41 UTC (rev 960)
@@ -1,5 +1,4 @@
-#! /usr/bin/perl
-
+#! /usr/bin/perl -W
 #==========================================================================
 #
 #           Filename:  probabel.pl
@@ -7,228 +6,254 @@
 #        Description: Handy perl wrapper for ProbABEL functions
 #
 #==========================================================================
+use strict;
 
 #==========================================================================
 # Set variables
-$version="PROBABEL_VERSION";
+my $version="PROBABEL_VERSION";
 
 # Define some filename postfixes
-$_2df_file_postfix = "_2df.out.txt";
-$_add_file_postfix = "_add.out.txt";
-$_domin_file_postfix = "_domin.out.txt";
-$_recess_file_postfix = "_recess.out.txt";
-$_over_domin_file_postfix = "_over_domin.out.txt";
+my $_2df_file_postfix = "_2df.out.txt";
+my $_add_file_postfix = "_add.out.txt";
+my $_domin_file_postfix = "_domin.out.txt";
+my $_recess_file_postfix = "_recess.out.txt";
+my $_over_domin_file_postfix = "_over_domin.out.txt";
 
 
 # Separators in the config file
-$separator_cfg = ",";
-$separator_filename = "_._chr_._";
+my $separator_cfg = ",";
+my $chr_replacement = "_._chr_._";
 
 # Set file locations
-$base_path = "./";
- at anprog = ($base_path . "palinear",
+my $base_path = "./";
+my @anprog = ($base_path . "palinear",
 	   $base_path . "palogist",
 	   $base_path . "pacoxph");
-$config = "probabel_config.cfg";
+my $config = "probabel_config.cfg";
 
 # Define the regression methods that are implemented
- at method = ("linear", "logistic", "coxph");
+my @method = ("linear", "logistic", "coxph");
 
-%cohorts;
- at mlinfo;
- at mldose;
- at mlprobe;
- at legend;
+my %cohorts;
+my @mlinfos;
+my @mldoses;
+my @mlprobes;
+my @legends;
 
 
 #==========================================================================
 # Read config file
-open(CFG,"$config") or die "Reading configuration file failed: $!" .
+open(CFG, "$config") or die "Reading configuration file failed: $!" .
     "\nDid you forget to edit and rename the probabel_config.cfg.example file?\n";
 
 <CFG>; #skip the first line (header)
 
 for(my $i=0 ; my $line = <CFG> ; $i++)
-	{
-	next if (/^#/);
-	chomp($line);
-	@line_array = split(/$separator_cfg/, $line);
-	$cohorts{$line_array[0]} = $i;
-	$mlinfo[$i]  = $line_array[1];
-	$mldose[$i]  = $line_array[2];
-	$mlprobe[$i] = $line_array[3];
-	$legend[$i]  = $line_array[4];
-	}
+{
+    chomp($line);
+    next if ($line =~ /^#/);
+    next if ($line =~ /^$/);
+    my @line_array = split(/$separator_cfg/, $line);
+    $cohorts{$line_array[0]} = $i;
+    $mlinfos[$i]  = $line_array[1];
+    $mldoses[$i]  = $line_array[2];
+    $mlprobes[$i] = $line_array[3];
+    $legends[$i]  = $line_array[4];
+}
 close(CFG);
 
 
 #==========================================================================
 # Print usage info if arguments are not correct
-if(@ARGV<6 || $ARGV[0] eq "--help")
-	{
-	print "\nUsage:
-	probabel.pl chrom-start chrom-end method cohort <--allmodels OR --additive> trait <other available keys of ProbABEL functions>\n";
-	print "\n	* chrom-start - the first chromosome number, chrom-end - the last one";
-	print "\n	* method can be ";
-	foreach $me(@method) {print "\"".$me."\", "};
-	print "\n	* use --allmodels if you need dominant, recessive and heterozygous models
+if(@ARGV<6 || $ARGV[0] eq "--help" || $ARGV[0] eq "-h") {
+    print "Usage:
+	probabel.pl chrom-start chrom-end method cohort <--allmodels OR --additive> trait <other available options of ProbABEL functions>\n";
+    print "\n	* chrom-start - the first chromosome number, chrom-end - the last one; X or Y have to be run separately (specify them twice, once as chrom-start and once as chrom-end)";
+    print "\n	* method can be ";
+    foreach my $me(@method) {print "\"".$me."\", "};
+    print "\n	* use --allmodels if you need dominant, recessive and heterozygous models
 	  and --additive if additive only\n";
-	print "	* Available cohorts is ";
- 	foreach $coh(keys %cohorts) {print "\"".$coh."\", "};
-	print "\n	* example:
-	  probabel.pl 1 22 linear \"ERF\" --additive filename1 filename2
+    print "	* Available cohorts are ";
+    foreach my $coh(keys %cohorts) {
+	print "\"".$coh."\", "
+    };
+    print "\n	* example:
+	  probabel.pl 1 22 linear \"ERF\" --additive filename
 	  (filename has to be saved as filename.PHE)\n\n";
 
+    if(@ARGV == 1 && ($ARGV[0] eq "--help" || $ARGV[0] eq "-h")) {
+	print "\nDetails:\n";
+	print " The probabel.pl script is used for analysis of imputed data. First you have to create a file with the phenotype values that you are going to use. The first column contains ids in special order, the second one contains the trait which you are going	analyze, the others contain covariates.  For example:
 
-	print "\n Details:\n";
-	print "   The probabel.pl script is used for analysis of imputed data.
-   First you have to create a file with the phenotypes that you are going to use.
-   The first column contains ids in special order, the second: the trait which
-   you are going analyze, the others contains covariates.
-   Like this =>
+	id         phen1 covariate1  covariate2
+	1_2094  0     334         0
+	1_5060  1     56          1
+	1_4077  1     346         6
+	.
+	.
+	.
 
-   id         phen1 covariate1  covariate2
-   1_2094  0     334         0
-   1_5060  1     56          1
-   1_4077  1     346         6
-   .
-   .
-   .
+	This implies the model:
+	phen1 ~ covariate1 + covariate2 + SNP
 
-   This implies the model:
-   phen1 ~ covariate1 + covariate2 + SNP
 
+	Then save it to folder where you are doing the analysis. The name of the file must be name_of_file.PHE, where name_of_file is any name.
 
-   Then save it to folder where you are doing the analysis. The name of the file must be name_of_file.PHE, where name_of_file is any name.
-
-   Then run the following on the command line:
-   probabel.pl 1 22 \"method\" \"cohort\" --model name_of_file
-   Change \"method\" \"cohort\" --model to appropriate values\n";
+	Then run the following on the command line:
+	  probabel.pl 1 22 \"method\" \"cohort\" --model name_of_file
+	Change \"method\", \"cohort\" and --model to appropriate values\n";
 	print "\n	Version: $version";
 	print "\n\n	Authors: Lennart Karssen   - l.karssen\@erasmusmc.nl,
  		 Maksim Struchalin - m.struchalin\@erasmusmc.nl,
-		 Yurii Aulchenko   - yurii.aulchenko at gmail.com.\n\n";
-	exit;
-	}
+		 Yurii Aulchenko   - yurii.aulchenko\@gmail.com.\n\n";
+    }
+    else {
+	print "Type probabel.pl --help for more details.\n";
+    }
+    exit;
+}
 
 
 #==========================================================================
 # Put the command line arguments into variables and verify them
-$startchr = $ARGV[0];
-$endchr = $ARGV[1];
-$method = $ARGV[2];
-$chohort = $ARGV[3];
-$model = $ARGV[4];
+my $startchr = $ARGV[0];
+my $endchr = $ARGV[1];
+my $method = $ARGV[2];
+my $chohort = $ARGV[3];
+my $model = $ARGV[4];
 
 die "error: chrom-start is > 22" if($startchr > 22 && $startchr != "X") ;
 die "error: chrom-end is > 22" if($endchr > 22 && $endchr != "X");
 die "error: chrom-start > chrom-end" if($startchr > $endchr);
 
 
-$cohort_position = $cohorts{$chohort};
+my $cohort_position = $cohorts{$chohort};
 
 
-if(!defined($cohort_position))
-{
-print "\nerror: Wrong cohort name, \"$chohort\" is not an available cohort.
+if(!defined($cohort_position)) {
+    print "\nerror: Wrong cohort name, \"$chohort\" is not an available cohort.
 Available cohorts are ";
-foreach $coh(keys %cohorts) {print "\"".$coh."\", "};
-print "\n\n";
-exit;
+    foreach my $coh(keys %cohorts) {
+	print "\"".$coh."\", ";
+    }
+    print "\n\n";
+    exit;
 }
 
+my $mlinfo = $mlinfos[$cohort_position];
+my $mldose = $mldoses[$cohort_position];
+my $mlprobe = $mlprobes[$cohort_position];
+my $legend = $legends[$cohort_position];
 
- at mlinfo_split = split(/$separator_filename/, $mlinfo[$cohort_position]);
- at mldose_split = split(/$separator_filename/, $mldose[$cohort_position]);
- at mlprobe_split = split(/$separator_filename/, $mlprobe[$cohort_position]);
- at legend_split = split(/$separator_filename/, $legend[$cohort_position]);
 
-
-
-$passed = 0;
-for (my $i=0;$i<@method;$i++) {
-		if ($ARGV[2] eq $method[$i]) {
-				$passed = 1;
-				$prog = $anprog[$i];
-		}
+my $passed = 0;
+my $prog;
+for (my $i=0; $i<@method; $i++) {
+    if ($ARGV[2] eq $method[$i]) {
+	$passed = 1;
+	$prog = $anprog[$i];
+    }
 }
 die "error: Wrong method. method has to be one of: @method\n" if (!$passed);
 
 
-$phename = $ARGV[5];
-$keys="";
-
-
-for ($i=6;$i<@ARGV;$i++)
-	{
-	$keys = $keys.$ARGV[$i]." ";
-	}
+my $phename = $ARGV[5];
+my $keys="";
+for (my $i=6; $i<@ARGV; $i++) {
+    $keys = $keys.$ARGV[$i]." ";
+}
 chop($keys);
 
 
+my $model_option_num=0;
 
-$model_option_num=0;
+if($model eq "--additive") {
+    my $mldose_probe = $mldose;
+    $model_option_num=1;
+} elsif($model eq "--allmodels") {
+    my $mldose_probe = $mlprobe;
+    $model_option_num=2;
+} else {
+    die "error: Wrong key for model. You can use \"--additive\" or \"--allmodels\" only\n";
+}
 
-if($model eq "--additive")
-	{
-	$mldose_probe_split1 = $mldose_split[0];
-	$mldose_probe_split2 = $mldose_split[1];
-	$model_option_num=1;
-	}
-elsif($model eq "--allmodels")
-	{
-	$mldose_probe_split1 = $mlprobe_split[0];
-	$mldose_probe_split2 = $mlprobe_split[1];
-	$model_option_num=2;
-	}
-else
-	{
-	die "error: Wrong key for model. You can use \"--additive\" or \"--allmodels\" only\n";
-	}
 
-
 #==========================================================================
 # Start the analysis now that the input has been validated
 print "Start...\n";
 
-$chr = $startchr;
+my $chr = $startchr;
+my $hadhead=0;
+my $head;
+my $mlinfo_arg;
+my $mldose_arg;
+my $legend_arg;
 
-# Separate command for the X chromosome.
-if ($chr eq "X") {
-    print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]$mlinfo_split[1] -d $mldose_probe_split1$mldose_probe_split2 -m $legend_split[0]$legend_split[1] --chrom $chr -o $phename $keys`;
+# Separate command for the sex chromosomes.
+if ($chr eq "X" || $chr eq "Y") {
+    $mlinfo_arg = $mlinfo;
+    $mlinfo_arg =~ s/$chr_replacement/$chr/g;
+
+    $mldose_arg = $mldose;
+    $mldose_arg =~ s/$chr_replacement/$chr/g;
+
+    $legend_arg = $legend;
+    $legend_arg =~ s/$chr_replacement/$chr/g;
+
+    if($hadhead==0) {
+	$head="";
+	$hadhead=1;
+    } else {
+	my $head="--no-head";
+    }
+
+    print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_arg -d $mldose_arg -m $legend_arg --chrom $chr -o $phename $head $keys`;
+
     exit;
 }
 
 # Commands for the autosomes
-print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]${chr}$mlinfo_split[1] -d $mldose_probe_split1${chr}$mldose_probe_split2 -m $legend_split[0]${chr}$legend_split[1] --chrom $chr -o $phename $keys`;
-	for($chr=($startchr+1);$chr<=$endchr;$chr++)
-		{
-		print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]${chr}$mlinfo_split[1] -d $mldose_probe_split1${chr}$mldose_probe_split2 -m $legend_split[0]${chr}$legend_split[1] --chrom $chr -o $phename.$chr --no-head $keys`;
+for($chr=$startchr; $chr<=$endchr; $chr++) {
+    $mlinfo_arg = $mlinfo;
+    $mlinfo_arg =~ s/$chr_replacement/$chr/g;
 
-		if($model_option_num==2)
-			{
-			`cat $phename.${chr}$_2df_file_postfix >> ${phename}${_2df_file_postfix}`;
-			`rm $phename.${chr}$_2df_file_postfix`;
+    $mldose_arg = $mldose;
+    $mldose_arg =~ s/$chr_replacement/$chr/g;
 
-			`cat $phename.${chr}$_add_file_postfix >> ${phename}${_add_file_postfix}`;
-			`rm $phename.${chr}$_add_file_postfix`;
+    $legend_arg = $legend;
+    $legend_arg =~ s/$chr_replacement/$chr/g;
 
-			`cat $phename.${chr}$_domin_file_postfix >> ${phename}${_domin_file_postfix}`;
-			`rm $phename.${chr}$_domin_file_postfix`;
+    if($hadhead==0) {
+	$head="";
+	$hadhead=1;
+    } else {
+	$head="--no-head";
+    }
 
-			`cat $phename.${chr}$_recess_file_postfix >> ${phename}${_recess_file_postfix}`;
-			`rm $phename.${chr}$_recess_file_postfix`;
+    print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_arg -d $mldose_arg -m $legend_arg --chrom $chr -o $phename.$chr $head $keys`;
 
-			`cat $phename.${chr}$_over_domin_file_postfix >> ${phename}${_over_domin_file_postfix}`;
-			`rm $phename.${chr}$_over_domin_file_postfix`;
-			}
-		else
-			{
-			`cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}`;
-			print "cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}\n";
-			`rm $phename.${chr}$_add_file_postfix`;
-			print "rm $phename.${chr}$_add_file_postfix\n";
-			}
-		}
+    if($model_option_num==2)
+    {
+	`cat $phename.${chr}$_2df_file_postfix >> ${phename}${_2df_file_postfix}`;
+	`rm $phename.${chr}$_2df_file_postfix`;
 
+	`cat $phename.${chr}$_add_file_postfix >> ${phename}${_add_file_postfix}`;
+	`rm $phename.${chr}$_add_file_postfix`;
+
+	`cat $phename.${chr}$_domin_file_postfix >> ${phename}${_domin_file_postfix}`;
+	`rm $phename.${chr}$_domin_file_postfix`;
+
+	`cat $phename.${chr}$_recess_file_postfix >> ${phename}${_recess_file_postfix}`;
+	`rm $phename.${chr}$_recess_file_postfix`;
+
+	`cat $phename.${chr}$_over_domin_file_postfix >> ${phename}${_over_domin_file_postfix}`;
+	`rm $phename.${chr}$_over_domin_file_postfix`;
+    } else {
+	`cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}`;
+	print "cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}\n";
+	`rm $phename.${chr}$_add_file_postfix`;
+	print "rm $phename.${chr}$_add_file_postfix\n";
+    }
+}
+
 print "Finished all chromosomes ...\n";

Modified: pkg/ProbABEL/src/probabel_config.cfg.example
===================================================================
--- pkg/ProbABEL/src/probabel_config.cfg.example	2012-09-17 11:31:09 UTC (rev 959)
+++ pkg/ProbABEL/src/probabel_config.cfg.example	2012-09-28 11:54:41 UTC (rev 960)
@@ -1,3 +1,25 @@
-cohort,mlinfo_path,mldose_path,mlprobe_path,legend_path
+cohort,mlinfo_path,mldose_path,mlprob_path,legend_path
+# Configuration file for the probabel.pl wrapper script
+#
+# This file contains the location of the files with imputed data for the
+# cohorts/imputations that are available to the user.
+#
+# Use one line per cohort configuration. Each line consists of the following
+# entries, separated by a comma:
+# - cohort: a discriptive name for the cohort/imputation, e.g. MyCohort_HM2
+# - mlinfo_path: path to the file with the SNP imputation information
+# - mldose_path: path to the file with imputed genotype dosages
+# - mlprob_path: path to the file with imputed genotype probabilities
+# - legend_path: path to the legend file used for the imputations
+#
+# Use "_._chr_._" to specify a chromosome number in the configured paths. This
+# can be used multiple times per line, e.g.:
+#
+# mldose_path: /data/Cohort1/MachFiles/_._chr_._/mach_._chr_._-out.mldose
+#
+# For chromosome 1 this will be replaced with: /data/Cohort1/MachFiles/1/1-out.mldose
+#
+#
+
 ERGO,script_test/merlin._._chr_._.collected.ped.out.mlinfo,script_test/merlin._._chr_._.collected.ped.out.mldose,script_test/merlin._._chr_._.collected.ped.out.mlprob,script_test/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt
 ERGOPLUS,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mlinfo,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mldose,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mlprob,/mnt/impute/impute_b36/collected/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt