[Genabel-commits] r1664 - branches/ProbABEL-0.50/src

Maarten Kooyman kooyman at gmail.com
Fri Mar 28 20:30:23 CET 2014


I tested speed of ProbABEL on a dataset 33815 snp / 3485 people adjusted 
for sex and age (I did not run it in triplet but gives an idea)

version 0.42 0.50_branch
FV         58     52
mldose  48    12
all times ate in seconds.

As you can see the filevector format in the part that slows down the 
program. When profiling the reading from FV takes up 86% of all the time 
the program takes.

On 28-03-14 20:15, Yury Aulchenko wrote:
> 10 fold is good speed up. An order of magnitude :)
>
> Wonder how it compares now to the reading from plain text files?
>
> Y
>
> ----------------
> Sent from mobile device, please excuse possible typos
>
>> On 28 Mar 2014, at 20:12, noreply at r-forge.r-project.org wrote:
>>
>> Author: maartenk
>> Date: 2014-03-28 20:12:41 +0100 (Fri, 28 Mar 2014)
>> New Revision: 1664
>>
>> Modified:
>>    branches/ProbABEL-0.50/src/gendata.cpp
>>    branches/ProbABEL-0.50/src/gendata.h
>> Log:
>> new implementation of reading in numbers of mldose file: this version is about a 10(!) fold faster than in ProABEL 0.42
>>
>> Modified: branches/ProbABEL-0.50/src/gendata.cpp
>> ===================================================================
>> --- branches/ProbABEL-0.50/src/gendata.cpp    2014-03-27 21:16:16 UTC (rev 1663)
>> +++ branches/ProbABEL-0.50/src/gendata.cpp    2014-03-28 19:12:41 UTC (rev 1664)
>> @@ -40,58 +40,69 @@
>> #endif
>> #include "utilities.h"
>>
>> -double mldose_strtod(const char *str_pointer) {
>> -    // This function is inspired on some answers found at stackoverflow :
>> -    // eg question 5678932
>> -    int sign = 0;
>> -    double result = 0;
>> -    //check if not a null pointer or NaN (right now checks only first character)
>> -//TODO: make catching of NaN more rigid
>> -    if (!*str_pointer | *str_pointer == 'N'){
>> -        return std::numeric_limits<double>::quiet_NaN();
>> +
>> +void gendata::mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers){
>> +    int j = 0;
>> +    //check if not a null pointer
>> +    if (!*all_numbers){
>> +        perror("Error while reading genetic data (expected pointer to char but found a null pointer)");
>> +                       exit(EXIT_FAILURE);
>>      }
>> -    //skip whitespace
>> -    while (*str_pointer == ' ')
>> +    while (j<amount_of_numbers)
>>      {
>> -        str_pointer++;
>> -    }
>> -    //set sign to -1 if negative: multiply by sign just before return
>> -    if (*str_pointer == '-')
>> -    {
>> -        str_pointer++;
>> -        sign = -1;
>> -    }
>> -    //read digits before dot
>> -    while (*str_pointer <= '9' && *str_pointer >= '0'){
>> -        result = result * 10 + (*str_pointer++ - '0');
>> -    }
>> -    //read digit after dot
>> -    if (*str_pointer == '.')
>> -    {
>> -        double decimal_counter = 1.0;
>> -        str_pointer++;
>> -        while (*str_pointer <= '9' && *str_pointer >= '0')
>> +        double result = 0;
>> +        //skip whitespace
>> +        while (*all_numbers == ' ')
>>          {
>> -            decimal_counter *= 0.1;
>> -            result += (*str_pointer++ - '0') * decimal_counter;
>> +            all_numbers++;
>>          }
>> +        //check NaN (right now checks only first character)
>> +        //TODO: make catching of NaN more rigid
>> +        if (*all_numbers == 'N')
>> +        {
>> +            result = std::numeric_limits<double>::quiet_NaN();
>> +            //skip other characters of NaN
>> +            while ((*all_numbers == 'a') | (*all_numbers == 'N'))
>> +            {
>> +                all_numbers++;
>> +            }
>> +        }
>> +        else
>> +        {
>> +            int sign = 0;
>> +            //set sign to -1 if negative: multiply by sign just before return
>> +            if (*all_numbers == '-')
>> +            {
>> +                all_numbers++;
>> +                sign = -1;
>> +            }
>> +            //read digits before dot
>> +            while (*all_numbers <= '9' && *all_numbers >= '0')
>> +            {
>> +                result = result * 10 + (*all_numbers++ - '0');
>> +            }
>> +            //read digit after dot
>> +            if (*all_numbers == '.')
>> +            {
>> +                double decimal_counter = 1.0;
>> +                all_numbers++;
>> +                while (*all_numbers <= '9' && *all_numbers >= '0')
>> +                {
>> +                    decimal_counter *= 0.1;
>> +                    result += (*all_numbers++ - '0') * decimal_counter;
>> +                }
>> +            }
>> +            //correct for negative number
>> +            if (sign == -1)
>> +            {
>> +                result = sign * result;
>> +            }
>> +        }
>> +        G.put(result, k, j);
>> +        j++;
>>      }
>> -    //str_pointer should be null since all characters are read.
>> -    if (*str_pointer){
>> -        perror("Error while reading genetic data (mldose_strtod)");
>> -                          exit(EXIT_FAILURE);
>> -    }
>> -    //correct for negative number
>> -    if (sign == -1){
>> -        return sign * result;
>> -    }else{
>> -        return result;
>> -    }
>> -
>> }
>>
>> -
>> -
>> void gendata::get_var(int var, double * data)
>> {
>>      // Read the genetic data for SNP 'var' and store in the array 'data'
>> @@ -246,7 +257,7 @@
>>                  size_t strpos = tmpstr.find("->");
>>                  if (strpos != string::npos)
>>                  {
>> -                    tmpid = tmpstr.substr(strpos+2, string::npos);
>> +                    tmpid = tmpstr.substr(strpos + 2, string::npos);
>>                  }
>>                  else
>>                  {
>> @@ -255,8 +266,8 @@
>>                  if (tmpid != idnames[k])
>>                  {
>>                      cerr << "phenotype file and dose or probability file "
>> -                         << "did not match at line " << i + 2 << " (" << tmpid
>> -                         << " != " << idnames[k] << ")" << endl;
>> +                            << "did not match at line " << i + 2 << " ("
>> +                            << tmpid << " != " << idnames[k] << ")" << endl;
>>                      infile.close();
>>                      exit(1);
>>                  }
>> @@ -267,47 +278,58 @@
>>                  infile >> tmpstr;
>>              }
>>
>> -            for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
>> +            int oldstyle = 0;
>> +            if (oldstyle == 1)
>>              {
>> -                if (infile.good())
>> +                for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
>>                  {
>> -                    infile >> inStr;
>> -                    // tmpstr contains the dosage/probability in
>> -                    // string form. Convert it to double (if tmpstr is
>> -                    // NaN it will be set to nan).
>> -                    double dosage;
>> -                    char *endptr;
>> -                    errno = 0;      // To distinguish success/failure
>> -                                    // after strtod()
>> +                    if (infile.good())
>> +                    {
>> +                        infile >> inStr;
>> +                        // tmpstr contains the dosage/probability in
>> +                        // string form. Convert it to double (if tmpstr is
>> +                        // NaN it will be set to nan).
>> +                        double dosage;
>> +                        char *endptr;
>> +                        errno = 0;      // To distinguish success/failure
>> +                                        // after strtod()
>>
>> -                    dosage = mldose_strtod(inStr);
>> -                    //dosage = strtod(tmpstr.c_str(), &endptr);
>> -//                    if ((errno == ERANGE &&
>> -//                         (dosage == HUGE_VALF || dosage == HUGE_VALL))
>> -//                        || (errno != 0 && dosage == 0)) {
>> -//                        perror("Error while reading genetic data (strtod)");
>> -//                        exit(EXIT_FAILURE);
>> -//                    }
>> +                        dosage = strtod(inStr, &endptr);
>> +                        if ((errno == ERANGE
>> +                                && (dosage == HUGE_VALF || dosage == HUGE_VALL))
>> +                                || (errno != 0 && dosage == 0))
>> +                        {
>> +                            perror("Error while reading genetic data (strtod)");
>> +                            exit(EXIT_FAILURE);
>> +                        }
>>
>> -                    if (endptr == tmpstr.c_str()) {
>> -                        cerr << "No digits were found while reading genetic data"
>> -                             << " (individual " << i + 1
>> -                             << ", position " << j + 1 << ")"
>> -                             << endl;
>> -                        exit(EXIT_FAILURE);
>> +                        if (endptr == tmpstr.c_str())
>> +                        {
>> +                            cerr
>> +                                    << "No digits were found while reading genetic data"
>> +                                    << " (individual " << i + 1 << ", position "
>> +                                    << j + 1 << ")" << endl;
>> +                            exit(EXIT_FAILURE);
>> +                        }
>> +                        /* If we got here, strtod() successfully parsed a number */
>> +                        G.put(dosage, k, j);
>>                      }
>> -
>> -                    /* If we got here, strtod() successfully parsed a number */
>> -                    G.put(dosage, k, j);
>> +                    else
>> +                    {
>> +                        std::cerr << "cannot read dose-file: " << fname
>> +                                << "check skipd and ngpreds parameters\n";
>> +                        infile.close();
>> +                        exit(1);
>> +                    }
>>                  }
>> -                else
>> -                {
>> -                    std::cerr << "cannot read dose-file: " << fname
>> -                              << "check skipd and ngpreds parameters\n";
>> -                    infile.close();
>> -                    exit(1);
>> -                }
>>              }
>> +            else
>> +            {
>> +                std::string all_numbers;
>> +                all_numbers.reserve(nsnps * ngpreds * 7);
>> +                std::getline(infile, all_numbers);
>> +                mldose_line_to_matrix(k, all_numbers.c_str(), nsnps * ngpreds);
>> +            }
>>              k++;
>>          }
>>          else
>>
>> Modified: branches/ProbABEL-0.50/src/gendata.h
>> ===================================================================
>> --- branches/ProbABEL-0.50/src/gendata.h    2014-03-27 21:16:16 UTC (rev 1663)
>> +++ branches/ProbABEL-0.50/src/gendata.h    2014-03-28 19:12:41 UTC (rev 1664)
>> @@ -44,7 +44,7 @@
>>      unsigned int nids;
>>      unsigned int ngpreds;
>>      gendata();
>> -    double convert(   char* source,  char** endPtr );
>> +    void mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers);
>>
>>      void re_gendata(char * fname, unsigned int insnps, unsigned int ingpreds,
>>              unsigned int npeople, unsigned int nmeasured,
>>
>> _______________________________________________
>> Genabel-commits mailing list
>> Genabel-commits at lists.r-forge.r-project.org
>> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits



More information about the Genabel-commits mailing list