[Genabel-commits] r1203 - pkg/ProbABEL/src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Fri Apr 26 23:03:28 CEST 2013
Author: lckarssen
Date: 2013-04-26 23:03:28 +0200 (Fri, 26 Apr 2013)
New Revision: 1203
Modified:
pkg/ProbABEL/src/coxph_data.cpp
pkg/ProbABEL/src/gendata.cpp
pkg/ProbABEL/src/gendata.h
pkg/ProbABEL/src/main.cpp
pkg/ProbABEL/src/reg1.cpp
pkg/ProbABEL/src/regdata.cpp
Log:
In ProbABEL: change all remaining occurences of float to double. With this change we can finally compare the results from prob files with dose files again.
This is now done with a dirty hack in gendata.cpp, where we read in the data. The data is first converted to a string (stream), which is then read back as a double using strtod() (just like we do when reading text files).
I've also set the precision of the output of beta and se_beta to 9 significant digits. We should discuss if we want to keep it this precise and/or want to switch to scientific notation for these variables.
main.cpp: also some small code layout cleanup/added comments/etc.
reg1.cpp: cotains some commented (using and preprocessor #if statement) code in which I used the functions from the Eigen library for solving the linear system instead of running transposition and Choleski decomposition ourselves. The output from this should be checked at a later time and then replace the current way of solving the linear system.
Modified: pkg/ProbABEL/src/coxph_data.cpp
===================================================================
--- pkg/ProbABEL/src/coxph_data.cpp 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/coxph_data.cpp 2013-04-26 21:03:28 UTC (rev 1203)
@@ -109,7 +109,7 @@
if (snpnum > 0)
for (int j = 0; j < ngpreds; j++)
{
- float snpdata[nids];
+ double snpdata[nids];
gend.get_var(snpnum * ngpreds + j, snpdata);
for (int i = 0; i < nids; i++)
X.put(snpdata[i], i, (ncov - ngpreds + j));
@@ -184,7 +184,7 @@
for (int j = 0; j < ngpreds; j++)
{
- float snpdata[nids];
+ double snpdata[nids];
for (int i = 0; i < nids; i++)
masked_data[i] = 0;
Modified: pkg/ProbABEL/src/gendata.cpp
===================================================================
--- pkg/ProbABEL/src/gendata.cpp 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/gendata.cpp 2013-04-26 21:03:28 UTC (rev 1203)
@@ -16,29 +16,34 @@
#endif
#include "utilities.h"
-void gendata::get_var(int var, float * data)
+void gendata::get_var(int var, double * data)
{
- if (DAG == NULL)
+ if (DAG == NULL) // Read from text file
{
for (int i = 0; i < G.nrow; i++)
{
data[i] = G.get(i, var);
}
}
- else if (DAG != NULL)
+ else if (DAG != NULL) // Read from fv file
{
- float tmpdata[DAG->getNumObservations()];
+ double tmpdata[DAG->getNumObservations()];
DAG->readVariableAs((unsigned long int) var, tmpdata);
+
unsigned int j = 0;
for (unsigned int i = 0; i < DAG->getNumObservations(); i++)
{
if (!DAGmask[i])
{
- data[j++] = tmpdata[i];
+ // A dirty trick to get rid of conversion
+ // errors. Instead of casting float data to double we
+ // convert the data to string and then do strtod()
+ std::ostringstream strs;
+ strs << tmpdata[i];
+ std::string str = strs.str();
+ data[j++] = strtod(str.c_str(), (char **) NULL);
}
}
- // std::cout << j << " " << DAG->get_nobservations() << " "
- // << nids << "\n";
}
else
{
@@ -165,15 +170,15 @@
{
infile >> tmpstr;
// tmpstr contains the dosage in string form. Convert
- // it to float (if tmpstr is NaN it will be set to nan).
+ // it to double (if tmpstr is NaN it will be set to nan).
// Note that Valgrind 3.7.0 gives "Invalid read of
// size 8" error messages here. A bug in Valgrind?!
- float dosage = strtod(tmpstr.c_str(), (char **) NULL);
+ double dosage = strtod(tmpstr.c_str(), (char **) NULL);
G.put(dosage, k, j);
}
else
{
- std::cerr << "cannot read dose-file: "
+ std::cerr << "cannot read dose-file: " << fname
<< "check skipd and ngpreds parameters\n";
infile.close();
exit(1);
Modified: pkg/ProbABEL/src/gendata.h
===================================================================
--- pkg/ProbABEL/src/gendata.h 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/gendata.h 2013-04-26 21:03:28 UTC (rev 1203)
@@ -32,7 +32,7 @@
unsigned int npeople, unsigned int nmeasured,
unsigned short int * allmeasured, std::string * idnames);
- void get_var(int var, float * data);
+ void get_var(int var, double * data);
~gendata();
@@ -40,10 +40,9 @@
// ANOTHER PRIVATE OBJECT IS A POINTER TO DATABELBASECPP
// UPDATE SNP, ALL REGRESSION METHODS: ACCOUNT FOR MISSING
private:
- mematrix<float> G;
+ mematrix<double> G;
AbstractMatrix * DAG;
unsigned short int * DAGmask;
- // mematrix<double> G;
};
#endif /* GENDATA_H_ */
Modified: pkg/ProbABEL/src/main.cpp
===================================================================
--- pkg/ProbABEL/src/main.cpp 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/main.cpp 2013-04-26 21:03:28 UTC (rev 1203)
@@ -74,6 +74,7 @@
}
std::cout.flush();
}
+ std::cout << setprecision(6);
}
void open_files_for_output(std::vector<std::ofstream*>& outfile,
@@ -491,20 +492,25 @@
for (int i = 0; i < maxmod; i++)
{
beta_sebeta.push_back(new std::ostringstream());
+ beta_sebeta[i]->precision(9);
//Han Chen
covvalue.push_back(new std::ostringstream());
+ covvalue[i]->precision(9);
//Oct 26, 2009
chi2.push_back(new std::ostringstream());
}
+
+ // Here we start the analysis for each SNP.
for (int csnp = 0; csnp < nsnps; csnp++)
{
rgd.update_snp(gtd, csnp);
double freq = 0.;
- int gcount = 0;
- float snpdata1[gtd.nids];
- float snpdata2[gtd.nids];
- if (input_var.getNgpreds() == 2)
+ unsigned int gcount = 0;
+ double snpdata1[gtd.nids];
+ double snpdata2[gtd.nids];
+
+ if (input_var.getNgpreds() == 2) // Two predictors (probs)
{
//freq = ((gtd.G).column_mean(csnp*2)*2. +
// (gtd.G).column_mean(csnp*2+1))/2.;
@@ -516,7 +522,8 @@
gcount++;
freq += snpdata1[ii] + snpdata2[ii] * 0.5;
}
- } else
+ }
+ else // Only one predictor (dosage data)
{
// freq = (gtd.G).column_mean(csnp)/2.;
gtd.get_var(csnp, snpdata1);
@@ -533,34 +540,33 @@
{
poly = 0;
}
+
if (fabs(mli.Rsq[csnp]) < 1.e-16)
{
poly = 0;
}
+
+ // All models output. One file per model
//
- //All models output. One file per each model
- //
if (input_var.getNgpreds() == 2)
{
- //Write mlinfo to output:
+ // Write mlinfo to output:
for (unsigned int file = 0; file < outfile.size(); file++)
{
write_mlinfo(outfile, file, mli, csnp, input_var, gcount, freq);
}
} else
{
-
int file = 0;
write_mlinfo(outfile, file, mli, csnp, input_var, gcount, freq);
maxmod = 1;
-
}
+ // Run regression for each model
for (int model = 0; model < maxmod; model++)
{
- if (poly) //allel freq is not to rare
+ if (poly) // Allele freq is not too rare; still ngpreds=2
{
-
#if LOGISTIC
logistic_reg rd(rgd);
if (input_var.getScore())
@@ -660,9 +666,10 @@
*chi2[model] << "nan";
}
}
- //________________________________
- } else //beta, sebeta = nan
- {
+ } // END first part of if(poly); allele not too rare
+ else
+ { // SNP is rare: beta, sebeta = nan
+
int number_of_rows_or_columns = rgd.X.ncol;
start_pos = get_start_position(input_var, model,
number_of_rows_or_columns);
@@ -730,13 +737,11 @@
*chi2[model] << "nan";
}
}
-
}
}
//end of model cycle
if (input_var.getNgpreds() == 2)
{
-
//Han Chen
*outfile[0] << beta_sebeta[0]->str() << input_var.getSep();
#if !COXPH
@@ -783,10 +788,9 @@
#endif
*outfile[4] << chi2[4]->str() << "\n";
//Oct 26, 2009
-
- } else //ngpreds == 1
+ }
+ else // Dose data: only additive model. Only one output file
{
-
if (input_var.getInverseFilename() == NULL)
{
//Han Chen
@@ -819,7 +823,7 @@
}
update_progress_to_cmd_line(csnp, nsnps);
}
-
+ std::cout << setprecision(2) << fixed;
std::cout << "\b\b\b\b\b\b\b\b\b" << 100.;
std::cout << "%... done\n";
Modified: pkg/ProbABEL/src/reg1.cpp
===================================================================
--- pkg/ProbABEL/src/reg1.cpp 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/reg1.cpp 2013-04-26 21:03:28 UTC (rev 1203)
@@ -1,5 +1,6 @@
#include "reg1.h"
+
mematrix<double> apply_model(mematrix<double>& X, int model, int interaction,
int ngpreds, bool is_interaction_excluded,
bool iscox, int nullmodel)
@@ -294,8 +295,8 @@
{
cout << rdata.is_interaction_excluded
<< " <-irdata.is_interaction_excluded\n";
- std::cout << "invvarmatrix:\n";
- invvarmatrixin.masked_data->print();
+ // std::cout << "invvarmatrix:\n";
+ // invvarmatrixin.masked_data->print();
std::cout << "rdata.X:\n";
rdata.X.print();
}
@@ -306,6 +307,8 @@
{
std::cout << "X:\n";
X.print();
+ std::cout << "Y:\n";
+ rdata.Y.print();
}
int length_beta = X.ncol;
beta.reinit(length_beta, 1);
@@ -333,9 +336,39 @@
// = invvarmatrix*X;
// std::cout<<"new tX.nrow="<<X.nrow<<" tX.ncol="<<X.ncol<<"\n";
}
+
mematrix<double> tXX = tX * X;
- // double N = tXX.get(0,0);
double N = X.nrow;
+
+#if EIGEN_COMMENTEDOUT
+ MatrixXd Xeigen = X.data;
+ MatrixXd tXeigen = Xeigen.transpose();
+ MatrixXd tXXeigen = tXeigen * Xeigen;
+ VectorXd Yeigen = rdata.Y.data;
+ VectorXd tXYeigen = tXeigen * Yeigen;
+ // Solve X^T * X * beta = X^T * Y for beta:
+ VectorXd betaeigen = tXXeigen.fullPivLu().solve(tXYeigen);
+ beta.data = betaeigen;
+
+ if (verbose)
+ {
+ std::cout << setprecision(9) << "Xeigen:\n" << Xeigen << endl;
+ std::cout << setprecision(9) << "tX:\n" << tXeigen << endl;
+ std::cout << setprecision(9) << "tXX:\n" << tXXeigen << endl;
+ std::cout << setprecision(9) << "tXY:\n" << tXYeigen << endl;
+ std::cout << setprecision(9) << "beta:\n"<< betaeigen << endl;
+ printf("----\n");
+ printf("beta[0] = %e\n", betaeigen.data()[0]);
+ printf("----\n");
+// (beta).print();
+ double relative_error = (tXXeigen * betaeigen - tXYeigen).norm() / tXYeigen.norm(); // norm() is L2 norm
+ cout << "The relative error is:\n" << relative_error << endl;
+
+ }
+
+ // This one is needed later on in this function
+ mematrix<double> tXX_i = invert(tXX);
+#else
//
// use cholesky to invert
//
@@ -344,8 +377,10 @@
chinv2_mm(tXX_i);
// before was
// mematrix<double> tXX_i = invert(tXX);
+
mematrix<double> tXY = tX * (rdata.Y);
beta = tXX_i * tXY;
+
if (verbose)
{
std::cout << "tX:\n";
@@ -361,6 +396,7 @@
std::cout << "beta:\n";
(beta).print();
}
+#endif
// now compute residual variance
sigma2 = 0.;
mematrix<double> ttX = transpose(tX);
Modified: pkg/ProbABEL/src/regdata.cpp
===================================================================
--- pkg/ProbABEL/src/regdata.cpp 2013-04-26 20:50:32 UTC (rev 1202)
+++ pkg/ProbABEL/src/regdata.cpp 2013-04-26 21:03:28 UTC (rev 1203)
@@ -82,7 +82,7 @@
if (snpnum > 0)
for (int j = 0; j < ngpreds; j++)
{
- float snpdata[nids];
+ double snpdata[nids];
gend.get_var(snpnum * ngpreds + j, snpdata);
for (int i = 0; i < nids; i++)
X.put(snpdata[i], i, (ncov - ngpreds + 1 + j));
@@ -97,7 +97,7 @@
{
for (int j = 0; j < ngpreds; j++)
{
- float snpdata[nids];
+ double snpdata[nids];
for (int i = 0; i < nids; i++)
{
masked_data[i] = 0;
More information about the Genabel-commits
mailing list