[Genabel-commits] r1664 - branches/ProbABEL-0.50/src
Yury Aulchenko
yurii.aulchenko at gmail.com
Fri Mar 28 20:15:09 CET 2014
10 fold is good speed up. An order of magnitude :)
Wonder how it compares now to the reading from plain text files?
Y
----------------
Sent from mobile device, please excuse possible typos
> On 28 Mar 2014, at 20:12, noreply at r-forge.r-project.org wrote:
>
> Author: maartenk
> Date: 2014-03-28 20:12:41 +0100 (Fri, 28 Mar 2014)
> New Revision: 1664
>
> Modified:
> branches/ProbABEL-0.50/src/gendata.cpp
> branches/ProbABEL-0.50/src/gendata.h
> Log:
> new implementation of reading in numbers of mldose file: this version is about a 10(!) fold faster than in ProABEL 0.42
>
> Modified: branches/ProbABEL-0.50/src/gendata.cpp
> ===================================================================
> --- branches/ProbABEL-0.50/src/gendata.cpp 2014-03-27 21:16:16 UTC (rev 1663)
> +++ branches/ProbABEL-0.50/src/gendata.cpp 2014-03-28 19:12:41 UTC (rev 1664)
> @@ -40,58 +40,69 @@
> #endif
> #include "utilities.h"
>
> -double mldose_strtod(const char *str_pointer) {
> - // This function is inspired on some answers found at stackoverflow :
> - // eg question 5678932
> - int sign = 0;
> - double result = 0;
> - //check if not a null pointer or NaN (right now checks only first character)
> -//TODO: make catching of NaN more rigid
> - if (!*str_pointer | *str_pointer == 'N'){
> - return std::numeric_limits<double>::quiet_NaN();
> +
> +void gendata::mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers){
> + int j = 0;
> + //check if not a null pointer
> + if (!*all_numbers){
> + perror("Error while reading genetic data (expected pointer to char but found a null pointer)");
> + exit(EXIT_FAILURE);
> }
> - //skip whitespace
> - while (*str_pointer == ' ')
> + while (j<amount_of_numbers)
> {
> - str_pointer++;
> - }
> - //set sign to -1 if negative: multiply by sign just before return
> - if (*str_pointer == '-')
> - {
> - str_pointer++;
> - sign = -1;
> - }
> - //read digits before dot
> - while (*str_pointer <= '9' && *str_pointer >= '0'){
> - result = result * 10 + (*str_pointer++ - '0');
> - }
> - //read digit after dot
> - if (*str_pointer == '.')
> - {
> - double decimal_counter = 1.0;
> - str_pointer++;
> - while (*str_pointer <= '9' && *str_pointer >= '0')
> + double result = 0;
> + //skip whitespace
> + while (*all_numbers == ' ')
> {
> - decimal_counter *= 0.1;
> - result += (*str_pointer++ - '0') * decimal_counter;
> + all_numbers++;
> }
> + //check NaN (right now checks only first character)
> + //TODO: make catching of NaN more rigid
> + if (*all_numbers == 'N')
> + {
> + result = std::numeric_limits<double>::quiet_NaN();
> + //skip other characters of NaN
> + while ((*all_numbers == 'a') | (*all_numbers == 'N'))
> + {
> + all_numbers++;
> + }
> + }
> + else
> + {
> + int sign = 0;
> + //set sign to -1 if negative: multiply by sign just before return
> + if (*all_numbers == '-')
> + {
> + all_numbers++;
> + sign = -1;
> + }
> + //read digits before dot
> + while (*all_numbers <= '9' && *all_numbers >= '0')
> + {
> + result = result * 10 + (*all_numbers++ - '0');
> + }
> + //read digit after dot
> + if (*all_numbers == '.')
> + {
> + double decimal_counter = 1.0;
> + all_numbers++;
> + while (*all_numbers <= '9' && *all_numbers >= '0')
> + {
> + decimal_counter *= 0.1;
> + result += (*all_numbers++ - '0') * decimal_counter;
> + }
> + }
> + //correct for negative number
> + if (sign == -1)
> + {
> + result = sign * result;
> + }
> + }
> + G.put(result, k, j);
> + j++;
> }
> - //str_pointer should be null since all characters are read.
> - if (*str_pointer){
> - perror("Error while reading genetic data (mldose_strtod)");
> - exit(EXIT_FAILURE);
> - }
> - //correct for negative number
> - if (sign == -1){
> - return sign * result;
> - }else{
> - return result;
> - }
> -
> }
>
> -
> -
> void gendata::get_var(int var, double * data)
> {
> // Read the genetic data for SNP 'var' and store in the array 'data'
> @@ -246,7 +257,7 @@
> size_t strpos = tmpstr.find("->");
> if (strpos != string::npos)
> {
> - tmpid = tmpstr.substr(strpos+2, string::npos);
> + tmpid = tmpstr.substr(strpos + 2, string::npos);
> }
> else
> {
> @@ -255,8 +266,8 @@
> if (tmpid != idnames[k])
> {
> cerr << "phenotype file and dose or probability file "
> - << "did not match at line " << i + 2 << " (" << tmpid
> - << " != " << idnames[k] << ")" << endl;
> + << "did not match at line " << i + 2 << " ("
> + << tmpid << " != " << idnames[k] << ")" << endl;
> infile.close();
> exit(1);
> }
> @@ -267,47 +278,58 @@
> infile >> tmpstr;
> }
>
> - for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
> + int oldstyle = 0;
> + if (oldstyle == 1)
> {
> - if (infile.good())
> + for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
> {
> - infile >> inStr;
> - // tmpstr contains the dosage/probability in
> - // string form. Convert it to double (if tmpstr is
> - // NaN it will be set to nan).
> - double dosage;
> - char *endptr;
> - errno = 0; // To distinguish success/failure
> - // after strtod()
> + if (infile.good())
> + {
> + infile >> inStr;
> + // tmpstr contains the dosage/probability in
> + // string form. Convert it to double (if tmpstr is
> + // NaN it will be set to nan).
> + double dosage;
> + char *endptr;
> + errno = 0; // To distinguish success/failure
> + // after strtod()
>
> - dosage = mldose_strtod(inStr);
> - //dosage = strtod(tmpstr.c_str(), &endptr);
> -// if ((errno == ERANGE &&
> -// (dosage == HUGE_VALF || dosage == HUGE_VALL))
> -// || (errno != 0 && dosage == 0)) {
> -// perror("Error while reading genetic data (strtod)");
> -// exit(EXIT_FAILURE);
> -// }
> + dosage = strtod(inStr, &endptr);
> + if ((errno == ERANGE
> + && (dosage == HUGE_VALF || dosage == HUGE_VALL))
> + || (errno != 0 && dosage == 0))
> + {
> + perror("Error while reading genetic data (strtod)");
> + exit(EXIT_FAILURE);
> + }
>
> - if (endptr == tmpstr.c_str()) {
> - cerr << "No digits were found while reading genetic data"
> - << " (individual " << i + 1
> - << ", position " << j + 1 << ")"
> - << endl;
> - exit(EXIT_FAILURE);
> + if (endptr == tmpstr.c_str())
> + {
> + cerr
> + << "No digits were found while reading genetic data"
> + << " (individual " << i + 1 << ", position "
> + << j + 1 << ")" << endl;
> + exit(EXIT_FAILURE);
> + }
> + /* If we got here, strtod() successfully parsed a number */
> + G.put(dosage, k, j);
> }
> -
> - /* If we got here, strtod() successfully parsed a number */
> - G.put(dosage, k, j);
> + else
> + {
> + std::cerr << "cannot read dose-file: " << fname
> + << "check skipd and ngpreds parameters\n";
> + infile.close();
> + exit(1);
> + }
> }
> - else
> - {
> - std::cerr << "cannot read dose-file: " << fname
> - << "check skipd and ngpreds parameters\n";
> - infile.close();
> - exit(1);
> - }
> }
> + else
> + {
> + std::string all_numbers;
> + all_numbers.reserve(nsnps * ngpreds * 7);
> + std::getline(infile, all_numbers);
> + mldose_line_to_matrix(k, all_numbers.c_str(), nsnps * ngpreds);
> + }
> k++;
> }
> else
>
> Modified: branches/ProbABEL-0.50/src/gendata.h
> ===================================================================
> --- branches/ProbABEL-0.50/src/gendata.h 2014-03-27 21:16:16 UTC (rev 1663)
> +++ branches/ProbABEL-0.50/src/gendata.h 2014-03-28 19:12:41 UTC (rev 1664)
> @@ -44,7 +44,7 @@
> unsigned int nids;
> unsigned int ngpreds;
> gendata();
> - double convert( char* source, char** endPtr );
> + void mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers);
>
> void re_gendata(char * fname, unsigned int insnps, unsigned int ingpreds,
> unsigned int npeople, unsigned int nmeasured,
>
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
More information about the Genabel-commits
mailing list