[Genabel-commits] r1664 - branches/ProbABEL-0.50/src

Yury Aulchenko yurii.aulchenko at gmail.com
Fri Mar 28 20:15:09 CET 2014


10 fold is good speed up. An order of magnitude :)

Wonder how it compares now to the reading from plain text files?

Y

----------------
Sent from mobile device, please excuse possible typos

> On 28 Mar 2014, at 20:12, noreply at r-forge.r-project.org wrote:
> 
> Author: maartenk
> Date: 2014-03-28 20:12:41 +0100 (Fri, 28 Mar 2014)
> New Revision: 1664
> 
> Modified:
>   branches/ProbABEL-0.50/src/gendata.cpp
>   branches/ProbABEL-0.50/src/gendata.h
> Log:
> new implementation of reading in numbers of mldose file: this version is about a 10(!) fold faster than in ProABEL 0.42 
> 
> Modified: branches/ProbABEL-0.50/src/gendata.cpp
> ===================================================================
> --- branches/ProbABEL-0.50/src/gendata.cpp    2014-03-27 21:16:16 UTC (rev 1663)
> +++ branches/ProbABEL-0.50/src/gendata.cpp    2014-03-28 19:12:41 UTC (rev 1664)
> @@ -40,58 +40,69 @@
> #endif
> #include "utilities.h"
> 
> -double mldose_strtod(const char *str_pointer) {
> -    // This function is inspired on some answers found at stackoverflow :
> -    // eg question 5678932
> -    int sign = 0;
> -    double result = 0;
> -    //check if not a null pointer or NaN (right now checks only first character)
> -//TODO: make catching of NaN more rigid
> -    if (!*str_pointer | *str_pointer == 'N'){
> -        return std::numeric_limits<double>::quiet_NaN();
> +
> +void gendata::mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers){
> +    int j = 0;
> +    //check if not a null pointer
> +    if (!*all_numbers){
> +        perror("Error while reading genetic data (expected pointer to char but found a null pointer)");
> +                       exit(EXIT_FAILURE);
>     }
> -    //skip whitespace
> -    while (*str_pointer == ' ')
> +    while (j<amount_of_numbers)
>     {
> -        str_pointer++;
> -    }
> -    //set sign to -1 if negative: multiply by sign just before return
> -    if (*str_pointer == '-')
> -    {
> -        str_pointer++;
> -        sign = -1;
> -    }
> -    //read digits before dot
> -    while (*str_pointer <= '9' && *str_pointer >= '0'){
> -        result = result * 10 + (*str_pointer++ - '0');
> -    }
> -    //read digit after dot
> -    if (*str_pointer == '.')
> -    {
> -        double decimal_counter = 1.0;
> -        str_pointer++;
> -        while (*str_pointer <= '9' && *str_pointer >= '0')
> +        double result = 0;
> +        //skip whitespace
> +        while (*all_numbers == ' ')
>         {
> -            decimal_counter *= 0.1;
> -            result += (*str_pointer++ - '0') * decimal_counter;
> +            all_numbers++;
>         }
> +        //check NaN (right now checks only first character)
> +        //TODO: make catching of NaN more rigid
> +        if (*all_numbers == 'N')
> +        {
> +            result = std::numeric_limits<double>::quiet_NaN();
> +            //skip other characters of NaN
> +            while ((*all_numbers == 'a') | (*all_numbers == 'N'))
> +            {
> +                all_numbers++;
> +            }
> +        }
> +        else
> +        {
> +            int sign = 0;
> +            //set sign to -1 if negative: multiply by sign just before return
> +            if (*all_numbers == '-')
> +            {
> +                all_numbers++;
> +                sign = -1;
> +            }
> +            //read digits before dot
> +            while (*all_numbers <= '9' && *all_numbers >= '0')
> +            {
> +                result = result * 10 + (*all_numbers++ - '0');
> +            }
> +            //read digit after dot
> +            if (*all_numbers == '.')
> +            {
> +                double decimal_counter = 1.0;
> +                all_numbers++;
> +                while (*all_numbers <= '9' && *all_numbers >= '0')
> +                {
> +                    decimal_counter *= 0.1;
> +                    result += (*all_numbers++ - '0') * decimal_counter;
> +                }
> +            }
> +            //correct for negative number
> +            if (sign == -1)
> +            {
> +                result = sign * result;
> +            }
> +        }
> +        G.put(result, k, j);
> +        j++;
>     }
> -    //str_pointer should be null since all characters are read.
> -    if (*str_pointer){
> -        perror("Error while reading genetic data (mldose_strtod)");
> -                          exit(EXIT_FAILURE);
> -    }
> -    //correct for negative number
> -    if (sign == -1){
> -        return sign * result;
> -    }else{
> -        return result;
> -    }
> -
> }
> 
> -
> -
> void gendata::get_var(int var, double * data)
> {
>     // Read the genetic data for SNP 'var' and store in the array 'data'
> @@ -246,7 +257,7 @@
>                 size_t strpos = tmpstr.find("->");
>                 if (strpos != string::npos)
>                 {
> -                    tmpid = tmpstr.substr(strpos+2, string::npos);
> +                    tmpid = tmpstr.substr(strpos + 2, string::npos);
>                 }
>                 else
>                 {
> @@ -255,8 +266,8 @@
>                 if (tmpid != idnames[k])
>                 {
>                     cerr << "phenotype file and dose or probability file "
> -                         << "did not match at line " << i + 2 << " (" << tmpid
> -                         << " != " << idnames[k] << ")" << endl;
> +                            << "did not match at line " << i + 2 << " ("
> +                            << tmpid << " != " << idnames[k] << ")" << endl;
>                     infile.close();
>                     exit(1);
>                 }
> @@ -267,47 +278,58 @@
>                 infile >> tmpstr;
>             }
> 
> -            for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
> +            int oldstyle = 0;
> +            if (oldstyle == 1)
>             {
> -                if (infile.good())
> +                for (unsigned int j = 0; j < (nsnps * ngpreds); j++)
>                 {
> -                    infile >> inStr;
> -                    // tmpstr contains the dosage/probability in
> -                    // string form. Convert it to double (if tmpstr is
> -                    // NaN it will be set to nan).
> -                    double dosage;
> -                    char *endptr;
> -                    errno = 0;      // To distinguish success/failure
> -                                    // after strtod()
> +                    if (infile.good())
> +                    {
> +                        infile >> inStr;
> +                        // tmpstr contains the dosage/probability in
> +                        // string form. Convert it to double (if tmpstr is
> +                        // NaN it will be set to nan).
> +                        double dosage;
> +                        char *endptr;
> +                        errno = 0;      // To distinguish success/failure
> +                                        // after strtod()
> 
> -                    dosage = mldose_strtod(inStr);
> -                    //dosage = strtod(tmpstr.c_str(), &endptr);
> -//                    if ((errno == ERANGE &&
> -//                         (dosage == HUGE_VALF || dosage == HUGE_VALL))
> -//                        || (errno != 0 && dosage == 0)) {
> -//                        perror("Error while reading genetic data (strtod)");
> -//                        exit(EXIT_FAILURE);
> -//                    }
> +                        dosage = strtod(inStr, &endptr);
> +                        if ((errno == ERANGE
> +                                && (dosage == HUGE_VALF || dosage == HUGE_VALL))
> +                                || (errno != 0 && dosage == 0))
> +                        {
> +                            perror("Error while reading genetic data (strtod)");
> +                            exit(EXIT_FAILURE);
> +                        }
> 
> -                    if (endptr == tmpstr.c_str()) {
> -                        cerr << "No digits were found while reading genetic data"
> -                             << " (individual " << i + 1
> -                             << ", position " << j + 1 << ")"
> -                             << endl;
> -                        exit(EXIT_FAILURE);
> +                        if (endptr == tmpstr.c_str())
> +                        {
> +                            cerr
> +                                    << "No digits were found while reading genetic data"
> +                                    << " (individual " << i + 1 << ", position "
> +                                    << j + 1 << ")" << endl;
> +                            exit(EXIT_FAILURE);
> +                        }
> +                        /* If we got here, strtod() successfully parsed a number */
> +                        G.put(dosage, k, j);
>                     }
> -
> -                    /* If we got here, strtod() successfully parsed a number */
> -                    G.put(dosage, k, j);
> +                    else
> +                    {
> +                        std::cerr << "cannot read dose-file: " << fname
> +                                << "check skipd and ngpreds parameters\n";
> +                        infile.close();
> +                        exit(1);
> +                    }
>                 }
> -                else
> -                {
> -                    std::cerr << "cannot read dose-file: " << fname
> -                              << "check skipd and ngpreds parameters\n";
> -                    infile.close();
> -                    exit(1);
> -                }
>             }
> +            else
> +            {
> +                std::string all_numbers;
> +                all_numbers.reserve(nsnps * ngpreds * 7);
> +                std::getline(infile, all_numbers);
> +                mldose_line_to_matrix(k, all_numbers.c_str(), nsnps * ngpreds);
> +            }
>             k++;
>         }
>         else
> 
> Modified: branches/ProbABEL-0.50/src/gendata.h
> ===================================================================
> --- branches/ProbABEL-0.50/src/gendata.h    2014-03-27 21:16:16 UTC (rev 1663)
> +++ branches/ProbABEL-0.50/src/gendata.h    2014-03-28 19:12:41 UTC (rev 1664)
> @@ -44,7 +44,7 @@
>     unsigned int nids;
>     unsigned int ngpreds;
>     gendata();
> -    double convert(   char* source,  char** endPtr );
> +    void mldose_line_to_matrix(int k,const char *all_numbers,int amount_of_numbers);
> 
>     void re_gendata(char * fname, unsigned int insnps, unsigned int ingpreds,
>             unsigned int npeople, unsigned int nmeasured,
> 
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits


More information about the Genabel-commits mailing list