[Genabel-commits] r1017 - in branches/ProbABEL-refactoring/ProbABEL: . src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Nov 8 22:36:15 CET 2012


Author: lckarssen
Date: 2012-11-08 22:36:15 +0100 (Thu, 08 Nov 2012)
New Revision: 1017

Added:
   branches/ProbABEL-refactoring/ProbABEL/src/extract-snp.cpp
Modified:
   branches/ProbABEL-refactoring/ProbABEL/Makefile.am
   branches/ProbABEL-refactoring/ProbABEL/configure.ac
   branches/ProbABEL-refactoring/ProbABEL/src/Makefile.am
   branches/ProbABEL-refactoring/ProbABEL/src/probabel.pl
   branches/ProbABEL-refactoring/ProbABEL/src/probabel_config.cfg.example
Log:
Trunk merged into ProbABEL-refactoring branch. For reg1.h, data.h and main.cpp, which had conflicts, I selected the "mine-full" resolution (rejecting all changes from trunk and keeping the ones from the branch). In the other cases of conflicts I merges the changes. 

This version builds succesfully (make distcheck finishes without errors). 


Modified: branches/ProbABEL-refactoring/ProbABEL/Makefile.am
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/Makefile.am	2012-11-08 08:47:27 UTC (rev 1016)
+++ branches/ProbABEL-refactoring/ProbABEL/Makefile.am	2012-11-08 21:36:15 UTC (rev 1017)
@@ -1,4 +1,4 @@
 ## Process this file with automake to produce Makefile.in
 
 AUTOMAKE_OPTIONS = foreign
-SUBDIRS = src doc examples
+SUBDIRS = src doc examples tests

Modified: branches/ProbABEL-refactoring/ProbABEL/configure.ac
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/configure.ac	2012-11-08 08:47:27 UTC (rev 1016)
+++ branches/ProbABEL-refactoring/ProbABEL/configure.ac	2012-11-08 21:36:15 UTC (rev 1017)
@@ -2,8 +2,8 @@
 # Process this file with autoconf to produce a configure script.
 
 AC_PREREQ([2.67])
-AC_INIT(ProbABEL, 0.2.0, genabel-devel at r-forge.wu-wien.ac.at)
-AM_INIT_AUTOMAKE(ProbABEL, 0.2.0)
+AC_INIT(ProbABEL, 0.2.2, genabel-devel at r-forge.wu-wien.ac.at)
+AM_INIT_AUTOMAKE(ProbABEL, 0.2.2)
 AC_CONFIG_SRCDIR([src/data.h])
 AC_CONFIG_HEADERS([src/config.h])
 
@@ -28,7 +28,6 @@
 # for the subsequent checks
 AC_LANG_PUSH([C++])
 
-
 # Checks for libraries.
 
 # Checks for header files.
@@ -93,6 +92,9 @@
 # Check for presence of sed. Needed to replace path to /etc in probabel.pl
 AC_PROG_SED
 
+# Check for the presence of awk. Needed in the test suite
+AC_PROG_AWK
+
 # Check for presence of pdfLaTeX
 AC_CHECK_PROG(PDFLATEX, pdflatex, pdflatex)
 if test -z "$PDFLATEX"; then
@@ -103,7 +105,8 @@
 
 # Since pacoxph is buggy it needs to be enabled explixitly
 AC_ARG_ENABLE([pacoxph],
-    [AS_HELP_STRING([--enable-pacoxph], [build the pacoxph binary (still contains lots of bugs)])],
+    [AS_HELP_STRING([--enable-pacoxph], [build the pacoxph binary (still \
+contains lots of bugs)])],
     [pacoxph=${enableval}],
     [pacoxph=no])
 
@@ -119,6 +122,7 @@
 	src/Makefile
 	doc/Makefile
 	examples/Makefile
+	tests/Makefile
 ])
 # Create output files
 AC_OUTPUT

Modified: branches/ProbABEL-refactoring/ProbABEL/src/Makefile.am
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/src/Makefile.am	2012-11-08 08:47:27 UTC (rev 1016)
+++ branches/ProbABEL-refactoring/ProbABEL/src/Makefile.am	2012-11-08 21:36:15 UTC (rev 1017)
@@ -29,9 +29,7 @@
  include/R_ext/Print.h include/R_ext/Random.h include/R_ext/Utils.h	\
  include/R_ext/RS.h
 
-AM_CXXFLAGS = -Wall
-
-bin_PROGRAMS = palinear palogist
+bin_PROGRAMS = palinear palogist extract-snp
 if BUILD_pacoxph
 ## Extra files for pacoxph
 COXBASE = coxfit2 chinv2 cholesky2 chsolve2 dmatrix
@@ -42,15 +40,20 @@
 
 palinear_SOURCES = $(REGFILES) $(FVSRC) $(FVHEADERS)
 palinear_CXXFLAGS = -DLINEAR $(AM_CXXFLAGS)
+palinear_CPPFLAGS = $(AM_CPPFLAGS)
 
 palogist_SOURCES = $(REGFILES) $(FVSRC) $(FVHEADERS)
 palogist_CXXFLAGS = -DLOGISTIC $(AM_CXXFLAGS)
+palogist_CPPFLAGS = $(AM_CPPFLAGS)
 
 pacoxph_SOURCES = $(COXSRC) $(REGFILES) $(FVSRC) $(FVHEADERS)	\
  $(RHEADERS) survS.h survproto.h
 pacoxph_CXXFLAGS = -DCOXPH -I $(top_srcdir)/src/include $(AM_CXXFLAGS)
+pacoxph_CPPFLAGS = $(AM_CPPFLAGS)
 pacoxph_CFLAGS = -DCOXPH -I $(top_srcdir)/src/include $(AM_CFLAGS)
 
+extract_snp_SOURCES = extract-snp.cpp $(FVSRC) $(FVHEADERS)
+
 ## Install these scripts in the bin directory as well:
 dist_bin_SCRIPTS = probabel.pl extIDS.pl
 

Copied: branches/ProbABEL-refactoring/ProbABEL/src/extract-snp.cpp (from rev 1015, pkg/ProbABEL/src/extract-snp.cpp)
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/src/extract-snp.cpp	                        (rev 0)
+++ branches/ProbABEL-refactoring/ProbABEL/src/extract-snp.cpp	2012-11-08 21:36:15 UTC (rev 1017)
@@ -0,0 +1,209 @@
+/*******************************************************************************
+ *
+ * This program extracts a given row (dosages for all individuals for a given
+ * SNP) from filevector files.
+ *
+ * (C) 2012 L.C. Karssen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include <stdio.h>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <getopt.h>
+
+using namespace std;
+
+#include "fvlib/Logger.h"
+#include "fvlib/FileVector.h"
+#include "fvlib/CastUtils.h"
+
+
+void info(char *program_name)
+{
+    cout << program_name
+	 << " extracts the dosage for a SNP for one or more individuals from a"
+	 << "file in filevector format (.fvi/.fvd)." << endl;
+    cout << endl;
+    cout << "Usage: " << program_name << " --file <fv file> --snp <snpname>"
+	 << endl;
+    cout << "   or: " << program_name << " -f <fv file> -s <snpname>" << endl;
+    cout << endl;
+    cout << "Additional options:" << endl;
+    cout << "\t--id (-i) <idname>: only print the dosage for the given SNP and ID"
+	 << endl;
+    cout << "\t--debug (-d): show debugging output" << endl;
+    cout << "\t--help (-h): show this information" << endl;
+}
+
+
+int main(int argc, char* argv[])
+{
+    if (argc < 2)
+    {
+	info(argv[0]);
+	exit(3);
+    }
+
+    int next_option;
+    const char * const short_options = "df:hi:s:";
+    const struct option long_options [] =
+	{
+	    {"debug",  0, NULL, 'd'},
+	    {"file",  1, NULL, 'f'},
+	    {"help",  0, NULL, 'h'},
+	    {"id",   1, NULL, 'i'},
+	    {"snp",  1, NULL, 's'},
+	    {NULL  ,   0, NULL, 0  }
+	};
+    char *program_name = argv[0];
+    bool debug = false;
+    string inputFileName = "";
+    string idname = "";
+    string snpname = "";
+    do
+    {
+	next_option = getopt_long(argc, argv, short_options, long_options, NULL);
+	switch (next_option)
+	{
+	case 'h':
+	    info(program_name);
+	    exit(0);
+	case 'd':
+	    debug = true;
+	    break;
+	case 'f':
+	    inputFileName = string(optarg);
+	    break;
+	case 'i':
+	    idname = string(optarg);
+	    break;
+	case 's':
+	    snpname = string(optarg);
+	    break;
+	case '?': break;
+	case -1 : break;
+	}
+    }
+    while (next_option != -1);
+
+    if (snpname.compare("") == 0)
+    {
+	cerr << "Error: No SNP name given" << endl << endl;
+	info(program_name);
+	exit(2);
+    }
+
+    unsigned long int snprow = 0;
+    bool snpfound = false;
+    bool idfound  = false;
+
+    if (debug) cout << "Input file is '" << inputFileName << "'." << endl;
+
+    FileVector fv(inputFileName, 64, true);
+
+    if (debug)
+    {
+	for (unsigned long int col=0; col<fv.getNumObservations(); col++)
+	{
+	    cout << fv.readObservationName(col).name << " ";
+	}
+
+	cout << endl;
+	cout << "----------------------" << endl;
+    }
+
+    // Look at the SNPs (rows) first
+    for (unsigned long int row=0; row<fv.getNumVariables(); row++)
+    {
+	string current_snpname = fv.readVariableName(row).name;
+	if (current_snpname.compare(snpname) == 0)
+	{
+	    snpfound = true;
+	    snprow = row;
+	    if (debug)
+	    {
+		cout << "*" << current_snpname << "* ";
+	    }
+	} else {
+	    if (debug)
+	    {
+		cout << current_snpname << " ";
+	    }
+	}
+    }
+    if (debug)
+    {
+	cout << endl;
+	cout << "----------------------" << endl;
+
+	cout << "N_obs = " << fv.getNumObservations() << "\telement size: "
+	     <<  fv.getElementSize() << "\tDataType: "
+	     << dataTypeToString(fv.getElementType()) << endl;
+    }
+
+    if(!snpfound)
+    {
+	cerr << "SNP name '" << snpname << "' not found in data file "
+	     << inputFileName << endl;
+	exit(1);
+    }
+
+    char * data = new (nothrow) char[fv.getNumObservations() *
+					 fv.getElementSize()];
+    if (!data)
+    {
+	cerr << "Cannot allocate memory for data vector. Exiting..." << endl;
+	exit(2);
+    }
+
+    fv.readVariable(snprow, data);
+    for (unsigned long int col=0; col<fv.getNumObservations(); col++)
+    {
+	if (idname.compare("") != 0)
+	{
+	    // An ID name has been set, only print the dosage for that ID
+	    // and SNP combination
+	    if(idname.compare(fv.readObservationName(col).name) == 0)
+	    {
+		idfound = true;
+		cout << fv.readObservationName(col).name << "\t"
+		     << bufToString(fv.getElementType(),
+				    &data[col*fv.getElementSize()],
+				    string("NA"))
+		     << endl;
+	    }
+	} else {
+	    // Print the dosages for all IDs
+	    cout << fv.readObservationName(col).name << "\t"
+		 << bufToString(fv.getElementType(),
+				&data[col*fv.getElementSize()],
+				string("NA"))
+		 << endl;
+	}
+    }
+
+    if (idfound == false && idname.compare("") != 0)
+    {
+	cerr << "Id '" << idname << "' not found in data file "
+	     << inputFileName << endl;
+	exit(1);
+    }
+
+    delete [] data;
+}

Modified: branches/ProbABEL-refactoring/ProbABEL/src/probabel.pl
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/src/probabel.pl	2012-11-08 08:47:27 UTC (rev 1016)
+++ branches/ProbABEL-refactoring/ProbABEL/src/probabel.pl	2012-11-08 21:36:15 UTC (rev 1017)
@@ -1,5 +1,4 @@
-#! /usr/bin/perl
-
+#! /usr/bin/perl -W
 #==========================================================================
 #
 #           Filename:  probabel.pl
@@ -7,228 +6,312 @@
 #        Description: Handy perl wrapper for ProbABEL functions
 #
 #==========================================================================
+use strict;
 
 #==========================================================================
 # Set variables
-$version="PROBABEL_VERSION";
+my $version="PROBABEL_VERSION";
 
 # Define some filename postfixes
-$_2df_file_postfix = "_2df.out.txt";
-$_add_file_postfix = "_add.out.txt";
-$_domin_file_postfix = "_domin.out.txt";
-$_recess_file_postfix = "_recess.out.txt";
-$_over_domin_file_postfix = "_over_domin.out.txt";
+my $_2df_file_postfix = "_2df.out.txt";
+my $_add_file_postfix = "_add.out.txt";
+my $_domin_file_postfix = "_domin.out.txt";
+my $_recess_file_postfix = "_recess.out.txt";
+my $_over_domin_file_postfix = "_over_domin.out.txt";
 
-
 # Separators in the config file
-$separator_cfg = ",";
-$separator_filename = "_._chr_._";
+my $separator_cfg = ",";
+my $chr_replacement = "_._chr_._";
+my $chunk_replacement = "_._chunk_._";
 
 # Set file locations
-$base_path = "./";
- at anprog = ($base_path . "palinear",
-	   $base_path . "palogist",
-	   $base_path . "pacoxph");
-$config = "probabel_config.cfg";
+my $base_path = "./";
+my @anprog = ($base_path . "palinear",
+	      $base_path . "palogist",
+	      $base_path . "pacoxph");
+my $config = "probabel_config.cfg";
 
 # Define the regression methods that are implemented
- at method = ("linear", "logistic", "coxph");
+my @method = ("linear", "logistic", "coxph");
 
-%cohorts;
- at mlinfo;
- at mldose;
- at mlprobe;
- at legend;
+my %cohorts;
+my @mlinfos;
+my @mldoses;
+my @mlprobs;
+my @legends;
 
 
 #==========================================================================
 # Read config file
-open(CFG,"$config") or die "Reading configuration file failed: $!" .
+open(CFG, "$config") or die "Reading configuration file $config failed: $!" .
     "\nDid you forget to edit and rename the probabel_config.cfg.example file?\n";
 
 <CFG>; #skip the first line (header)
 
 for(my $i=0 ; my $line = <CFG> ; $i++)
-	{
-	next if (/^#/);
-	chomp($line);
-	@line_array = split(/$separator_cfg/, $line);
-	$cohorts{$line_array[0]} = $i;
-	$mlinfo[$i]  = $line_array[1];
-	$mldose[$i]  = $line_array[2];
-	$mlprobe[$i] = $line_array[3];
-	$legend[$i]  = $line_array[4];
-	}
+{
+    chomp($line);
+    next if ($line =~ /^#/);
+    next if ($line =~ /^$/);
+    my @line_array = split(/$separator_cfg/, $line);
+    $cohorts{$line_array[0]} = $i;
+    $mlinfos[$i]  = $line_array[1];
+    $mldoses[$i]  = $line_array[2];
+    $mlprobs[$i] = $line_array[3];
+    $legends[$i]  = $line_array[4];
+}
 close(CFG);
 
 
 #==========================================================================
 # Print usage info if arguments are not correct
-if(@ARGV<6 || $ARGV[0] eq "--help")
-	{
-	print "\nUsage:
-	probabel.pl chrom-start chrom-end method cohort <--allmodels OR --additive> trait <other available keys of ProbABEL functions>\n";
-	print "\n	* chrom-start - the first chromosome number, chrom-end - the last one";
-	print "\n	* method can be ";
-	foreach $me(@method) {print "\"".$me."\", "};
-	print "\n	* use --allmodels if you need dominant, recessive and heterozygous models
+if(@ARGV<6 || $ARGV[0] eq "--help" || $ARGV[0] eq "-h") {
+    print "Usage:
+	probabel.pl chrom-start chrom-end method cohort <--allmodels OR --additive> trait <other available options of ProbABEL functions>\n";
+    print "\n	* chrom-start - the first chromosome number, chrom-end - the last one; X or Y have to be run separately (specify them twice, once as chrom-start and once as chrom-end)";
+    print "\n	* method can be ";
+    foreach my $me(@method) {print "\"".$me."\", "};
+    print "\n	* use --allmodels if you need dominant, recessive and heterozygous models
 	  and --additive if additive only\n";
-	print "	* Available cohorts is ";
- 	foreach $coh(keys %cohorts) {print "\"".$coh."\", "};
-	print "\n	* example:
-	  probabel.pl 1 22 linear \"ERF\" --additive filename1 filename2
+    print "	* Available cohorts are ";
+    foreach my $coh(keys %cohorts) {
+	print "\"".$coh."\", "
+    };
+    print "\n	* example:
+	  probabel.pl 1 22 linear \"ERF\" --additive filename
 	  (filename has to be saved as filename.PHE)\n\n";
 
+    if(@ARGV == 1 && ($ARGV[0] eq "--help" || $ARGV[0] eq "-h")) {
+	print "\nDetails:\n";
+	print " The probabel.pl script is used for analysis of imputed data. First you have to create a file with the phenotype values that you are going to use. The first column contains ids in special order, the second one contains the trait which you are going	analyze, the others contain covariates.  For example:
 
-	print "\n Details:\n";
-	print "   The probabel.pl script is used for analysis of imputed data.
-   First you have to create a file with the phenotypes that you are going to use.
-   The first column contains ids in special order, the second: the trait which
-   you are going analyze, the others contains covariates.
-   Like this =>
+	id         phen1 covariate1  covariate2
+	1_2094  0     334         0
+	1_5060  1     56          1
+	1_4077  1     346         6
+	.
+	.
+	.
 
-   id         phen1 covariate1  covariate2
-   1_2094  0     334         0
-   1_5060  1     56          1
-   1_4077  1     346         6
-   .
-   .
-   .
+	This implies the model:
+	phen1 ~ covariate1 + covariate2 + SNP
 
-   This implies the model:
-   phen1 ~ covariate1 + covariate2 + SNP
 
+	Then save it to folder where you are doing the analysis. The name of the file must be name_of_file.PHE, where name_of_file is any name.
 
-   Then save it to folder where you are doing the analysis. The name of the file must be name_of_file.PHE, where name_of_file is any name.
-
-   Then run the following on the command line:
-   probabel.pl 1 22 \"method\" \"cohort\" --model name_of_file
-   Change \"method\" \"cohort\" --model to appropriate values\n";
+	Then run the following on the command line:
+	  probabel.pl 1 22 \"method\" \"cohort\" --model name_of_file
+	Change \"method\", \"cohort\" and --model to appropriate values\n";
 	print "\n	Version: $version";
 	print "\n\n	Authors: Lennart Karssen   - l.karssen\@erasmusmc.nl,
  		 Maksim Struchalin - m.struchalin\@erasmusmc.nl,
-		 Yurii Aulchenko   - yurii.aulchenko at gmail.com.\n\n";
-	exit;
-	}
+		 Yurii Aulchenko   - yurii.aulchenko\@gmail.com.\n\n";
+    }
+    else {
+	print "Type probabel.pl --help for more details.\n";
+    }
+    exit;
+}
 
 
 #==========================================================================
 # Put the command line arguments into variables and verify them
-$startchr = $ARGV[0];
-$endchr = $ARGV[1];
-$method = $ARGV[2];
-$chohort = $ARGV[3];
-$model = $ARGV[4];
+my $startchr = $ARGV[0];
+my $endchr = $ARGV[1];
+my $method = $ARGV[2];
+my $chohort = $ARGV[3];
+my $model = $ARGV[4];
 
 die "error: chrom-start is > 22" if($startchr > 22 && $startchr != "X") ;
 die "error: chrom-end is > 22" if($endchr > 22 && $endchr != "X");
 die "error: chrom-start > chrom-end" if($startchr > $endchr);
 
 
-$cohort_position = $cohorts{$chohort};
+my $cohort_position = $cohorts{$chohort};
 
 
-if(!defined($cohort_position))
-{
-print "\nerror: Wrong cohort name, \"$chohort\" is not an available cohort.
+if(!defined($cohort_position)) {
+    print "\nerror: Wrong cohort name, \"$chohort\" is not an available cohort.
 Available cohorts are ";
-foreach $coh(keys %cohorts) {print "\"".$coh."\", "};
-print "\n\n";
-exit;
+    foreach my $coh(keys %cohorts) {
+	print "\"".$coh."\", ";
+    }
+    print "\n\n";
+    exit;
 }
 
+my $mlinfo = $mlinfos[$cohort_position];
+my $mldose = $mldoses[$cohort_position];
+my $mlprob = $mlprobs[$cohort_position];
+my $legend = $legends[$cohort_position];
 
- at mlinfo_split = split(/$separator_filename/, $mlinfo[$cohort_position]);
- at mldose_split = split(/$separator_filename/, $mldose[$cohort_position]);
- at mlprobe_split = split(/$separator_filename/, $mlprobe[$cohort_position]);
- at legend_split = split(/$separator_filename/, $legend[$cohort_position]);
 
-
-
-$passed = 0;
-for (my $i=0;$i<@method;$i++) {
-		if ($ARGV[2] eq $method[$i]) {
-				$passed = 1;
-				$prog = $anprog[$i];
-		}
+my $passed = 0;
+my $prog;
+for (my $i=0; $i<@method; $i++) {
+    if ($ARGV[2] eq $method[$i]) {
+	$passed = 1;
+	$prog = $anprog[$i];
+    }
 }
 die "error: Wrong method. method has to be one of: @method\n" if (!$passed);
 
 
-$phename = $ARGV[5];
-$keys="";
+my $phename = $ARGV[5];
+my $outfile_prefix = $phename;
+my $keys="";
+for (my $i=6; $i<@ARGV; $i++) {
+    if ($ARGV[$i] eq "-o")
+    {
+	# Apparently the user wants to change the output file name
+	# Let's interpret this as an addition to our own prefix
+	$outfile_prefix = $outfile_prefix.$ARGV[$i+1];
 
-
-for ($i=6;$i<@ARGV;$i++)
-	{
+	# Skip the next argument (supposedly the addition to the
+	# output file name).
+	$i++;
+    }
+    else
+    {
 	$keys = $keys.$ARGV[$i]." ";
-	}
+    }
+}
 chop($keys);
 
+my $model_option_num = 0;
+my $mldose_prob;
+if($model eq "--additive") {
+    $mldose_prob = $mldose;
+    $model_option_num = 1;
+} elsif($model eq "--allmodels") {
+    $mldose_prob = $mlprob;
+    $model_option_num = 2;
+} else {
+    die "error: Wrong key for model. You can use \"--additive\" or \"--allmodels\" only\n";
+}
 
 
-$model_option_num=0;
-
-if($model eq "--additive")
-	{
-	$mldose_probe_split1 = $mldose_split[0];
-	$mldose_probe_split2 = $mldose_split[1];
-	$model_option_num=1;
-	}
-elsif($model eq "--allmodels")
-	{
-	$mldose_probe_split1 = $mlprobe_split[0];
-	$mldose_probe_split2 = $mlprobe_split[1];
-	$model_option_num=2;
-	}
-else
-	{
-	die "error: Wrong key for model. You can use \"--additive\" or \"--allmodels\" only\n";
-	}
-
-
 #==========================================================================
 # Start the analysis now that the input has been validated
 print "Start...\n";
 
-$chr = $startchr;
+my $chr = $startchr;
+my $hadhead=0;
+my $head;
+my $mlinfo_arg;
+my $mldose_arg;
+my $legend_arg;
 
-# Separate command for the X chromosome.
-if ($chr eq "X") {
-    print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]$mlinfo_split[1] -d $mldose_probe_split1$mldose_probe_split2 -m $legend_split[0]$legend_split[1] --chrom $chr -o $phename $keys`;
+# Separate command for the sex chromosomes.
+if ($chr eq "X" || $chr eq "Y") {
+    $mlinfo_arg = $mlinfo;
+    $mlinfo_arg =~ s/$chr_replacement/$chr/g;
+
+    $mldose_arg = $mldose_prob;
+    $mldose_arg =~ s/$chr_replacement/$chr/g;
+
+    $legend_arg = $legend;
+    $legend_arg =~ s/$chr_replacement/$chr/g;
+
+    if($hadhead==0) {
+	$head="";
+	$hadhead=1;
+    } else {
+	my $head="--no-head";
+    }
+
+    system "$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_arg -d $mldose_arg -m $legend_arg --chrom $chr -o $outfile_prefix $head $keys";
+
     exit;
 }
 
 # Commands for the autosomes
-print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]${chr}$mlinfo_split[1] -d $mldose_probe_split1${chr}$mldose_probe_split2 -m $legend_split[0]${chr}$legend_split[1] --chrom $chr -o $phename $keys`;
-	for($chr=($startchr+1);$chr<=$endchr;$chr++)
-		{
-		print `$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_split[0]${chr}$mlinfo_split[1] -d $mldose_probe_split1${chr}$mldose_probe_split2 -m $legend_split[0]${chr}$legend_split[1] --chrom $chr -o $phename.$chr --no-head $keys`;
+for($chr=$startchr; $chr<=$endchr; $chr++) {
 
-		if($model_option_num==2)
-			{
-			`cat $phename.${chr}$_2df_file_postfix >> ${phename}${_2df_file_postfix}`;
-			`rm $phename.${chr}$_2df_file_postfix`;
+    my $nrchunks = 0;
+    # Find out the number of chunks for the current chromosome
+    my $infofiles = $mlinfo;
+    $infofiles =~ s/$chr_replacement/$chr/g;
+    $infofiles =~ s/$chunk_replacement/*/g;
+    $nrchunks = `ls $infofiles 2>/dev/null | wc -l`;
+    if ($nrchunks==0) {
+	# If no chunked info files exist the 'wc -l' command returns 0
+	# so that actually means 1 chunk containing all data.
+	$nrchunks = 1;
+    }
+    print "Nr. of chunks: $nrchunks";
 
-			`cat $phename.${chr}$_add_file_postfix >> ${phename}${_add_file_postfix}`;
-			`rm $phename.${chr}$_add_file_postfix`;
+    # Loop over all chunks
+    for (my $chunk=1; $chunk <= $nrchunks; $chunk++)
+    {
+	if($hadhead==0) {
+	    $head="";
+	    $hadhead=1;
+	} else {
+	    $head="--no-head";
+	}
+	$mlinfo_arg = $mlinfo;
+	$mlinfo_arg =~ s/$chr_replacement/$chr/g;
+	$mlinfo_arg =~ s/$chunk_replacement/$chunk/g;
 
-			`cat $phename.${chr}$_domin_file_postfix >> ${phename}${_domin_file_postfix}`;
-			`rm $phename.${chr}$_domin_file_postfix`;
+	$mldose_arg = $mldose_prob;
+	$mldose_arg =~ s/$chr_replacement/$chr/g;
+	$mldose_arg =~ s/$chunk_replacement/$chunk/g;
 
-			`cat $phename.${chr}$_recess_file_postfix >> ${phename}${_recess_file_postfix}`;
-			`rm $phename.${chr}$_recess_file_postfix`;
+	$legend_arg = $legend;
+	$legend_arg =~ s/$chr_replacement/$chr/g;
+	$legend_arg =~ s/$chunk_replacement/$chunk/g;
 
-			`cat $phename.${chr}$_over_domin_file_postfix >> ${phename}${_over_domin_file_postfix}`;
-			`rm $phename.${chr}$_over_domin_file_postfix`;
-			}
-		else
-			{
-			`cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}`;
-			print "cat $phename.${chr}$_add_file_postfix >> $phename${_add_file_postfix}\n";
-			`rm $phename.${chr}$_add_file_postfix`;
-			print "rm $phename.${chr}$_add_file_postfix\n";
-			}
-		}
+	my $command = "$prog -p $phename.PHE --ngpreds $model_option_num -i $mlinfo_arg -d $mldose_arg -m $legend_arg --chrom $chr -o $outfile_prefix.chunk$chunk.chr$chr $head $keys";
+	print "$command \n";
+	system $command;
 
+	if($model_option_num==2)
+	{
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_2df_file_postfix >> ${outfile_prefix}.${chr}${_2df_file_postfix}`;
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_2df_file_postfix`;
+
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix >> ${outfile_prefix}.${chr}${_add_file_postfix}`;
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix`;
+
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_domin_file_postfix >> ${outfile_prefix}.${chr}${_domin_file_postfix}`;
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_domin_file_postfix`;
+
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_recess_file_postfix >> ${outfile_prefix}.${chr}${_recess_file_postfix}`;
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_recess_file_postfix`;
+
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_over_domin_file_postfix >> ${outfile_prefix}.${chr}${_over_domin_file_postfix}`;
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_over_domin_file_postfix`;
+	} else {
+	    `cat $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix >> $outfile_prefix.${chr}${_add_file_postfix}`;
+	    print "cat $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix >> $outfile_prefix.chr${chr}${_add_file_postfix}\n";
+	    `rm $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix`;
+	    print "rm $outfile_prefix.chunk${chunk}.chr${chr}$_add_file_postfix\n";
+	}
+    }
+
+    if($model_option_num==2)
+    {
+	`cat $outfile_prefix.${chr}$_2df_file_postfix >> ${outfile_prefix}${_2df_file_postfix}`;
+	`rm $outfile_prefix.${chr}$_2df_file_postfix`;
+
+	`cat $outfile_prefix.${chr}$_add_file_postfix >> ${outfile_prefix}${_add_file_postfix}`;
+	`rm $outfile_prefix.${chr}$_add_file_postfix`;
+
+	`cat $outfile_prefix.${chr}$_domin_file_postfix >> ${outfile_prefix}${_domin_file_postfix}`;
+	`rm $outfile_prefix.${chr}$_domin_file_postfix`;
+
+	`cat $outfile_prefix.${chr}$_recess_file_postfix >> ${outfile_prefix}${_recess_file_postfix}`;
+	`rm $outfile_prefix.${chr}$_recess_file_postfix`;
+
+	`cat $outfile_prefix.${chr}$_over_domin_file_postfix >> ${outfile_prefix}${_over_domin_file_postfix}`;
+	`rm $outfile_prefix.${chr}$_over_domin_file_postfix`;
+    } else {
+	`cat $outfile_prefix.${chr}$_add_file_postfix >> $outfile_prefix${_add_file_postfix}`;
+	print "cat $outfile_prefix.${chr}$_add_file_postfix >> $outfile_prefix${_add_file_postfix}\n";
+	`rm $outfile_prefix.${chr}$_add_file_postfix`;
+	print "rm $outfile_prefix.${chr}$_add_file_postfix\n";
+    }
+}
+
 print "Finished all chromosomes ...\n";

Modified: branches/ProbABEL-refactoring/ProbABEL/src/probabel_config.cfg.example
===================================================================
--- branches/ProbABEL-refactoring/ProbABEL/src/probabel_config.cfg.example	2012-11-08 08:47:27 UTC (rev 1016)
+++ branches/ProbABEL-refactoring/ProbABEL/src/probabel_config.cfg.example	2012-11-08 21:36:15 UTC (rev 1017)
@@ -1,3 +1,29 @@
-cohort,mlinfo_path,mldose_path,mlprobe_path,legend_path
-ERGO,script_test/merlin._._chr_._.collected.ped.out.mlinfo,script_test/merlin._._chr_._.collected.ped.out.mldose,script_test/merlin._._chr_._.collected.ped.out.mlprob,script_test/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt
-ERGOPLUS,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mlinfo,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mldose,/mnt/space/ergoplus/ergoplus.chr_._chr_._.mach.mlprob,/mnt/impute/impute_b36/collected/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt
+cohort,info_path,dose_path,prob_path,legend_path
+# Configuration file for the probabel.pl wrapper script
+#
+# This file contains the location of the files with imputed data for the
+# cohorts/imputations that are available to the user.
+#
+# Use one line per cohort configuration. Each line consists of the following
+# entries, separated by a comma:
+# - cohort: a discriptive name for the cohort/imputation, e.g. MyCohort_HM2
+# - mlinfo_path: path to the file with the SNP imputation information
+# - mldose_path: path to the file with imputed genotype dosages
+# - mlprob_path: path to the file with imputed genotype probabilities
+# - legend_path: path to the legend file used for the imputations
+#
+# If the dosage and probability files are in DatABEL format (i.e. filenames
+# ending in .fvi and .fvd), either of these extensions can be used.
+#
+# Use "_._chr_._" to specify a chromosome number in the configured paths. This
+# can be used multiple times per line, e.g.:
+#
+# mldose_path: /data/Cohort1/MachFiles/_._chr_._/mach_._chr_._-out.mldose
+#
+# For chromosome 1 this will be replaced with:
+#             /data/Cohort1/MachFiles/1/mach1-out.mldose
+#
+#
+
+STUDY_1,script_test/merlin._._chr_._.collected.ped.out.mlinfo,script_test/merlin._._chr_._.collected.ped.out.mldose,script_test/merlin._._chr_._.collected.ped.out.mlprob,script_test/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt
+STUDY_2,/tmp/PAtest/chunk_._chunk_._-chr_._chr_._.mlinfo,/tmp/PAtest/chunk_._chunk_._-chr_._chr_._.mldose,/tmp/PAtest/chunk_._chunk_._-chr_._chr_._.mlprob,/tmp/PAtest/genotypes_chr_._chr_.__CEU_r22_nr.b36_fwd_legend.txt



More information about the Genabel-commits mailing list