[Genabel-commits] r1203 - pkg/ProbABEL/src

Fri Apr 26 23:03:28 CEST 2013

Author: lckarssen
Date: 2013-04-26 23:03:28 +0200 (Fri, 26 Apr 2013)
New Revision: 1203

Modified:
   pkg/ProbABEL/src/coxph_data.cpp
   pkg/ProbABEL/src/gendata.cpp
   pkg/ProbABEL/src/gendata.h
   pkg/ProbABEL/src/main.cpp
   pkg/ProbABEL/src/reg1.cpp
   pkg/ProbABEL/src/regdata.cpp
Log:
In ProbABEL: change all remaining occurences of float to double. With this change we can finally compare the results from prob files with dose files again. 
This is now done with a dirty hack in gendata.cpp, where we read in the data. The data is first converted to a string (stream), which is then read back as a double using strtod() (just like we do when reading text files). 

I've also set the precision of the output of beta and se_beta to 9 significant digits. We should discuss if we want to keep it this precise and/or want to switch to scientific notation for these variables. 

main.cpp: also some small code layout cleanup/added comments/etc.

reg1.cpp: cotains some commented (using and preprocessor #if statement) code in which I used the functions from the Eigen library for solving the linear system instead of running transposition and Choleski decomposition ourselves. The output from this should be checked at a later time and then replace the current way of solving the linear system.



Modified: pkg/ProbABEL/src/coxph_data.cpp
===================================================================

--- pkg/ProbABEL/src/coxph_data.cpp	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/coxph_data.cpp	2013-04-26 21:03:28 UTC (rev 1203)
@@ -109,7 +109,7 @@
     if (snpnum > 0)
         for (int j = 0; j < ngpreds; j++)
         {
-            float snpdata[nids];
+            double snpdata[nids];
             gend.get_var(snpnum * ngpreds + j, snpdata);
             for (int i = 0; i < nids; i++)
                 X.put(snpdata[i], i, (ncov - ngpreds + j));
@@ -184,7 +184,7 @@
 
     for (int j = 0; j < ngpreds; j++)
     {
-        float snpdata[nids];
+        double snpdata[nids];
         for (int i = 0; i < nids; i++)
             masked_data[i] = 0;
 

Modified: pkg/ProbABEL/src/gendata.cpp
===================================================================
--- pkg/ProbABEL/src/gendata.cpp	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/gendata.cpp	2013-04-26 21:03:28 UTC (rev 1203)
@@ -16,29 +16,34 @@
 #endif
 #include "utilities.h"
 
-void gendata::get_var(int var, float * data)
+void gendata::get_var(int var, double * data)
 {
-    if (DAG == NULL)
+    if (DAG == NULL)            // Read from text file
     {
         for (int i = 0; i < G.nrow; i++)
         {
             data[i] = G.get(i, var);
         }
     }
-    else if (DAG != NULL)
+    else if (DAG != NULL)       // Read from fv file
     {
-        float tmpdata[DAG->getNumObservations()];
+        double tmpdata[DAG->getNumObservations()];
         DAG->readVariableAs((unsigned long int) var, tmpdata);
+
         unsigned int j = 0;
         for (unsigned int i = 0; i < DAG->getNumObservations(); i++)
         {
             if (!DAGmask[i])
             {
-                data[j++] = tmpdata[i];
+                // A dirty trick to get rid of conversion
+                // errors. Instead of casting float data to double we
+                // convert the data to string and then do strtod()
+                std::ostringstream strs;
+                strs << tmpdata[i];
+                std::string str = strs.str();
+                data[j++] = strtod(str.c_str(), (char **) NULL);
             }
         }
-        // std::cout << j << " " << DAG->get_nobservations() << " "
-        //           << nids << "\n";
     }
     else
     {
@@ -165,15 +170,15 @@
                 {
                     infile >> tmpstr;
                     // tmpstr contains the dosage in string form. Convert
-                    // it to float (if tmpstr is NaN it will be set to nan).
+                    // it to double (if tmpstr is NaN it will be set to nan).
                     // Note that Valgrind 3.7.0 gives "Invalid read of
                     // size 8" error messages here. A bug in Valgrind?!
-                    float dosage = strtod(tmpstr.c_str(), (char **) NULL);
+                    double dosage = strtod(tmpstr.c_str(), (char **) NULL);
                     G.put(dosage, k, j);
                 }
                 else
                 {
-                    std::cerr << "cannot read dose-file: "
+                    std::cerr << "cannot read dose-file: " << fname
                               << "check skipd and ngpreds parameters\n";
                     infile.close();
                     exit(1);

Modified: pkg/ProbABEL/src/gendata.h
===================================================================
--- pkg/ProbABEL/src/gendata.h	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/gendata.h	2013-04-26 21:03:28 UTC (rev 1203)
@@ -32,7 +32,7 @@
             unsigned int npeople, unsigned int nmeasured,
             unsigned short int * allmeasured, std::string * idnames);
 
-    void get_var(int var, float * data);
+    void get_var(int var, double * data);
 
     ~gendata();
 
@@ -40,10 +40,9 @@
     // ANOTHER PRIVATE OBJECT IS A POINTER TO DATABELBASECPP
     // UPDATE SNP, ALL REGRESSION METHODS: ACCOUNT FOR MISSING
 private:
-    mematrix<float> G;
+    mematrix<double> G;
     AbstractMatrix * DAG;
     unsigned short int * DAGmask;
-    //	mematrix<double> G;
 };
 
 #endif /* GENDATA_H_ */

Modified: pkg/ProbABEL/src/main.cpp
===================================================================
--- pkg/ProbABEL/src/main.cpp	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/main.cpp	2013-04-26 21:03:28 UTC (rev 1203)
@@ -74,6 +74,7 @@
         }
         std::cout.flush();
     }
+    std::cout << setprecision(6);
 }
 
 void open_files_for_output(std::vector<std::ofstream*>& outfile,
@@ -491,20 +492,25 @@
     for (int i = 0; i < maxmod; i++)
     {
         beta_sebeta.push_back(new std::ostringstream());
+        beta_sebeta[i]->precision(9);
         //Han Chen
         covvalue.push_back(new std::ostringstream());
+        covvalue[i]->precision(9);
         //Oct 26, 2009
         chi2.push_back(new std::ostringstream());
     }
 
+
+    // Here we start the analysis for each SNP.
     for (int csnp = 0; csnp < nsnps; csnp++)
     {
         rgd.update_snp(gtd, csnp);
         double freq = 0.;
-        int gcount = 0;
-        float snpdata1[gtd.nids];
-        float snpdata2[gtd.nids];
-        if (input_var.getNgpreds() == 2)
+        unsigned int gcount = 0;
+        double snpdata1[gtd.nids];
+        double snpdata2[gtd.nids];
+
+        if (input_var.getNgpreds() == 2) // Two predictors (probs)
         {
             //freq = ((gtd.G).column_mean(csnp*2)*2. +
             //        (gtd.G).column_mean(csnp*2+1))/2.;
@@ -516,7 +522,8 @@
                     gcount++;
                     freq += snpdata1[ii] + snpdata2[ii] * 0.5;
                 }
-        } else
+        }
+        else // Only one predictor (dosage data)
         {
             // freq = (gtd.G).column_mean(csnp)/2.;
             gtd.get_var(csnp, snpdata1);
@@ -533,34 +540,33 @@
         {
             poly = 0;
         }
+
         if (fabs(mli.Rsq[csnp]) < 1.e-16)
         {
             poly = 0;
         }
+
+        // All models output. One file per model
         //
-        //All models output. One file per each model
-        //
         if (input_var.getNgpreds() == 2)
         {
-            //Write mlinfo to output:
+            // Write mlinfo to output:
             for (unsigned int file = 0; file < outfile.size(); file++)
             {
                 write_mlinfo(outfile, file, mli, csnp, input_var, gcount, freq);
             }
         } else
         {
-
             int file = 0;
             write_mlinfo(outfile, file, mli, csnp, input_var, gcount, freq);
             maxmod = 1;
-
         }
 
+        // Run regression for each model
         for (int model = 0; model < maxmod; model++)
         {
-            if (poly) //allel freq is not to rare
+            if (poly) // Allele freq is not too rare; still ngpreds=2
             {
-
 #if LOGISTIC
                 logistic_reg rd(rgd);
                 if (input_var.getScore())
@@ -660,9 +666,10 @@
                         *chi2[model] << "nan";
                     }
                 }
-                //________________________________
-            } else //beta, sebeta = nan
-            {
+            } // END first part of if(poly); allele not too rare
+            else
+            {   // SNP is rare: beta, sebeta = nan
+
                 int number_of_rows_or_columns = rgd.X.ncol;
                 start_pos = get_start_position(input_var, model,
                         number_of_rows_or_columns);
@@ -730,13 +737,11 @@
                         *chi2[model] << "nan";
                     }
                 }
-
             }
         }
         //end of model cycle
         if (input_var.getNgpreds() == 2)
         {
-
             //Han Chen
             *outfile[0] << beta_sebeta[0]->str() << input_var.getSep();
 #if !COXPH
@@ -783,10 +788,9 @@
 #endif
             *outfile[4] << chi2[4]->str() << "\n";
             //Oct 26, 2009
-
-        } else  //ngpreds == 1
+        }
+        else // Dose data: only additive model. Only one output file
         {
-
             if (input_var.getInverseFilename() == NULL)
             {
                 //Han Chen
@@ -819,7 +823,7 @@
         }
         update_progress_to_cmd_line(csnp, nsnps);
     }
-
+    std::cout << setprecision(2) << fixed;
     std::cout << "\b\b\b\b\b\b\b\b\b" << 100.;
     std::cout << "%... done\n";
 

Modified: pkg/ProbABEL/src/reg1.cpp
===================================================================
--- pkg/ProbABEL/src/reg1.cpp	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/reg1.cpp	2013-04-26 21:03:28 UTC (rev 1203)
@@ -1,5 +1,6 @@
 #include "reg1.h"
 
+
 mematrix<double> apply_model(mematrix<double>& X, int model, int interaction,
                              int ngpreds, bool is_interaction_excluded,
                              bool iscox, int nullmodel)
@@ -294,8 +295,8 @@
     {
         cout << rdata.is_interaction_excluded
                 << " <-irdata.is_interaction_excluded\n";
-        std::cout << "invvarmatrix:\n";
-        invvarmatrixin.masked_data->print();
+        // std::cout << "invvarmatrix:\n";
+        // invvarmatrixin.masked_data->print();
         std::cout << "rdata.X:\n";
         rdata.X.print();
     }
@@ -306,6 +307,8 @@
     {
         std::cout << "X:\n";
         X.print();
+        std::cout << "Y:\n";
+        rdata.Y.print();
     }
     int length_beta = X.ncol;
     beta.reinit(length_beta, 1);
@@ -333,9 +336,39 @@
         // = invvarmatrix*X;
         // std::cout<<"new tX.nrow="<<X.nrow<<" tX.ncol="<<X.ncol<<"\n";
     }
+
     mematrix<double> tXX = tX * X;
-    //      double N = tXX.get(0,0);
     double N = X.nrow;
+
+#if EIGEN_COMMENTEDOUT
+    MatrixXd Xeigen    = X.data;
+    MatrixXd tXeigen   = Xeigen.transpose();
+    MatrixXd tXXeigen  = tXeigen * Xeigen;
+    VectorXd Yeigen    = rdata.Y.data;
+    VectorXd tXYeigen  = tXeigen * Yeigen;
+    // Solve X^T * X * beta = X^T * Y for beta:
+    VectorXd betaeigen = tXXeigen.fullPivLu().solve(tXYeigen);
+    beta.data = betaeigen;
+
+    if (verbose)
+    {
+        std::cout << setprecision(9) << "Xeigen:\n"  << Xeigen  << endl;
+        std::cout << setprecision(9) << "tX:\n"  << tXeigen  << endl;
+        std::cout << setprecision(9) << "tXX:\n" << tXXeigen << endl;
+        std::cout << setprecision(9) << "tXY:\n" << tXYeigen << endl;
+        std::cout << setprecision(9) << "beta:\n"<< betaeigen << endl;
+        printf("----\n");
+        printf("beta[0] = %e\n", betaeigen.data()[0]);
+        printf("----\n");
+//        (beta).print();
+        double relative_error = (tXXeigen * betaeigen - tXYeigen).norm() / tXYeigen.norm(); // norm() is L2 norm
+        cout << "The relative error is:\n" << relative_error << endl;
+
+    }
+
+    // This one is needed later on in this function
+    mematrix<double> tXX_i = invert(tXX);
+#else
     //
     // use cholesky to invert
     //
@@ -344,8 +377,10 @@
     chinv2_mm(tXX_i);
     // before was
     // mematrix<double> tXX_i = invert(tXX);
+
     mematrix<double> tXY = tX * (rdata.Y);
     beta = tXX_i * tXY;
+
     if (verbose)
     {
         std::cout << "tX:\n";
@@ -361,6 +396,7 @@
         std::cout << "beta:\n";
         (beta).print();
     }
+#endif
     // now compute residual variance
     sigma2 = 0.;
     mematrix<double> ttX = transpose(tX);

Modified: pkg/ProbABEL/src/regdata.cpp
===================================================================
--- pkg/ProbABEL/src/regdata.cpp	2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/regdata.cpp	2013-04-26 21:03:28 UTC (rev 1203)
@@ -82,7 +82,7 @@
     if (snpnum > 0)
         for (int j = 0; j < ngpreds; j++)
         {
-            float snpdata[nids];
+            double snpdata[nids];
             gend.get_var(snpnum * ngpreds + j, snpdata);
             for (int i = 0; i < nids; i++)
                 X.put(snpdata[i], i, (ncov - ngpreds + 1 + j));
@@ -97,7 +97,7 @@
 {
     for (int j = 0; j < ngpreds; j++)
     {
-        float snpdata[nids];
+        double snpdata[nids];
         for (int i = 0; i < nids; i++)
         {
             masked_data[i] = 0;