[Genabel-commits] r929 - pkg/ProbABEL/src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Jul 12 14:16:01 CEST 2012


Author: lckarssen
Date: 2012-07-12 14:16:00 +0200 (Thu, 12 Jul 2012)
New Revision: 929

Added:
   pkg/ProbABEL/src/extract-snp.cpp
Modified:
   pkg/ProbABEL/src/Makefile.am
Log:
ProbABEL: added the extract-snp binary that extracts SNP dosages from filevector files (from the command line). It can do this for all IDs or just one. 

Of course, this program is not necessarily looking at SNPs and IDs. In filevector parlance SNPs are Variables (rows) and IDs are Observations (rows).

At the moment it can only extract one SNP at a time (given as command line option). This may change in the future. For example, we may want the program to read SNP names from stdin too. 
The -d option for debug info prints out all SNP names and IDs in the filevector file, which may be too much for your screen.


Modified: pkg/ProbABEL/src/Makefile.am
===================================================================
--- pkg/ProbABEL/src/Makefile.am	2012-07-06 05:14:23 UTC (rev 928)
+++ pkg/ProbABEL/src/Makefile.am	2012-07-12 12:16:00 UTC (rev 929)
@@ -24,7 +24,7 @@
  include/R_ext/Print.h include/R_ext/Random.h include/R_ext/Utils.h	\
  include/R_ext/RS.h
 
-bin_PROGRAMS = palinear palogist
+bin_PROGRAMS = palinear palogist extract-snp
 if BUILD_pacoxph
 ## Extra files for pacoxph
 COXBASE = coxfit2 chinv2 cholesky2 chsolve2 dmatrix
@@ -47,6 +47,8 @@
 pacoxph_CPPFLAGS = $(AM_CPPFLAGS)
 pacoxph_CFLAGS = -DCOXPH -I $(top_srcdir)/src/include $(AM_CFLAGS)
 
+extract_snp_SOURCES = extract-snp.cpp $(FVSRC) $(FVHEADERS)
+
 ## Install these scripts in the bin directory as well:
 dist_bin_SCRIPTS = probabel.pl extIDS.pl
 

Added: pkg/ProbABEL/src/extract-snp.cpp
===================================================================
--- pkg/ProbABEL/src/extract-snp.cpp	                        (rev 0)
+++ pkg/ProbABEL/src/extract-snp.cpp	2012-07-12 12:16:00 UTC (rev 929)
@@ -0,0 +1,209 @@
+/*******************************************************************************
+ *
+ * This program extracts a given row (dosages for all individuals for a given
+ * SNP) from filevector files.
+ *
+ * (C) 2012 L.C. Karssen
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ ******************************************************************************/
+
+#include <stdio.h>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <map>
+#include <getopt.h>
+
+using namespace std;
+
+#include "fvlib/Logger.h"
+#include "fvlib/FileVector.h"
+#include "fvlib/CastUtils.h"
+
+
+void info(char *program_name)
+{
+    cout << program_name
+	 << " extracts the dosage for a SNP for one or more individuals from a"
+	 << "file in filevector format (.fvi/.fvd)." << endl;
+    cout << endl;
+    cout << "Usage: " << program_name << " --file <fv file> --snp <snpname>"
+	 << endl;
+    cout << "   or: " << program_name << " -f <fv file> -s <snpname>" << endl;
+    cout << endl;
+    cout << "Additional options:" << endl;
+    cout << "\t--id (-i) <idname>: only print the dosage for the given SNP and ID"
+	 << endl;
+    cout << "\t--debug (-d): show debugging output" << endl;
+    cout << "\t--help (-h): show this information" << endl;
+}
+
+
+int main(int argc, char* argv[])
+{
+    if (argc < 2)
+    {
+	info(argv[0]);
+	exit(3);
+    }
+
+    int next_option;
+    const char * const short_options = "df:hi:s:";
+    const struct option long_options [] =
+	{
+	    {"debug",  0, NULL, 'd'},
+	    {"file",  1, NULL, 'f'},
+	    {"help",  0, NULL, 'h'},
+	    {"id",   1, NULL, 'i'},
+	    {"snp",  1, NULL, 's'},
+	    {NULL  ,   0, NULL, 0  }
+	};
+    char *program_name = argv[0];
+    bool debug = false;
+    string inputFileName = "";
+    string idname = "";
+    string snpname = "";
+    do
+    {
+	next_option = getopt_long(argc, argv, short_options, long_options, NULL);
+	switch (next_option)
+	{
+	case 'h':
+	    info(program_name);
+	    exit(0);
+	case 'd':
+	    debug = true;
+	    break;
+	case 'f':
+	    inputFileName = string(optarg);
+	    break;
+	case 'i':
+	    idname = string(optarg);
+	    break;
+	case 's':
+	    snpname = string(optarg);
+	    break;
+	case '?': break;
+	case -1 : break;
+	}
+    }
+    while (next_option != -1);
+
+    if (snpname.compare("") == 0)
+    {
+	cerr << "Error: No SNP name given" << endl << endl;
+	info(program_name);
+	exit(2);
+    }
+
+    unsigned long int snprow = 0;
+    bool snpfound = false;
+    bool idfound  = false;
+
+    if (debug) cout << "Input file is '" << inputFileName << "'." << endl;
+
+    FileVector fv(inputFileName, 64, true);
+
+    if (debug)
+    {
+	for (unsigned long int col=0; col<fv.getNumObservations(); col++)
+	{
+	    cout << fv.readObservationName(col).name << " ";
+	}
+
+	cout << endl;
+	cout << "----------------------" << endl;
+    }
+
+    // Look at the SNPs (rows) first
+    for (unsigned long int row=0; row<fv.getNumVariables(); row++)
+    {
+	string current_snpname = fv.readVariableName(row).name;
+	if (current_snpname.compare(snpname) == 0)
+	{
+	    snpfound = true;
+	    snprow = row;
+	    if (debug)
+	    {
+		cout << "*" << current_snpname << "* ";
+	    }
+	} else {
+	    if (debug)
+	    {
+		cout << current_snpname << " ";
+	    }
+	}
+    }
+    if (debug)
+    {
+	cout << endl;
+	cout << "----------------------" << endl;
+
+	cout << "N_obs = " << fv.getNumObservations() << "\telement size: "
+	     <<  fv.getElementSize() << "\tDataType: "
+	     << dataTypeToString(fv.getElementType()) << endl;
+    }
+
+    if(!snpfound)
+    {
+	cerr << "SNP name '" << snpname << "' not found in data file "
+	     << inputFileName << endl;
+	exit(1);
+    }
+
+    char * data = new (nothrow) char[fv.getNumObservations() *
+					 fv.getElementSize()];
+    if (!data)
+    {
+	cerr << "Cannot allocate memory for data vector. Exiting..." << endl;
+	exit(2);
+    }
+
+    fv.readVariable(snprow, data);
+    for (unsigned long int col=0; col<fv.getNumObservations(); col++)
+    {
+	if (idname.compare("") != 0)
+	{
+	    // An ID name has been set, only print the dosage for that ID
+	    // and SNP combination
+	    if(idname.compare(fv.readObservationName(col).name) == 0)
+	    {
+		idfound = true;
+		cout << fv.readObservationName(col).name << "\t"
+		     << bufToString(fv.getElementType(),
+				    &data[col*fv.getElementSize()],
+				    string("NA"))
+		     << endl;
+	    }
+	} else {
+	    // Print the dosages for all IDs
+	    cout << fv.readObservationName(col).name << "\t"
+		 << bufToString(fv.getElementType(),
+				&data[col*fv.getElementSize()],
+				string("NA"))
+		 << endl;
+	}
+    }
+
+    if (idfound == false && idname.compare("") != 0)
+    {
+	cerr << "Id '" << idname << "' not found in data file "
+	     << inputFileName << endl;
+	exit(1);
+    }
+
+    delete data;
+}



More information about the Genabel-commits mailing list