[GenABEL-dev] [Genabel-commits] r1609 - pkg/OmicABELnoMM/src

L.C. Karssen lennart at karssen.org
Fri Feb 14 11:32:20 CET 2014


Hi Alvaro,

Thanks for your contribution and congratulations on your first commit in
the GenABEL project.

Glad to see that the number of has been reduced a lot! I still get a few
errors after running
 ./configure  LDFLAGS="-L/usr/lib/openblas-base"
 make -j4
but they seem to be minor and easy to fix. Have a look at
http://jenkins.genabel.org/jenkins/job/OmicABELnoMM/15/warnings23Result/
for the warning messages and corresponding lines in the code.
Also try to have a look at the CPPcheck results in Jenkins. It warns
about three (possible) memory leaks.


Thanks a lot and looking forward to your next commits.


Lennart.


On 13-02-14 21:42, noreply at r-forge.r-project.org wrote:
> Author: afrank
> Date: 2014-02-13 21:42:52 +0100 (Thu, 13 Feb 2014)
> New Revision: 1609
> 
> Modified:
>    pkg/OmicABELnoMM/src/AIOwrapper.cpp
>    pkg/OmicABELnoMM/src/Algorithm.cpp
>    pkg/OmicABELnoMM/src/Algorithm.h
>    pkg/OmicABELnoMM/src/Definitions.h
>    pkg/OmicABELnoMM/src/Utility.cpp
>    pkg/OmicABELnoMM/src/Utility.h
>    pkg/OmicABELnoMM/src/main.cpp
> Log:
> Warnings from c/c++ inconsistencies and -Wall fixed. Legacy code from unused functionality removed. A few bugs with indices fixed. Stable code.
> 
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-02-13 09:24:32 UTC (rev 1608)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-02-13 20:42:52 UTC (rev 1609)
> @@ -29,9 +29,9 @@
>  
>      Fhandler->fakefiles = params.use_fake_files;
>  
> -    databel_fvi* Yfvi;
> -    databel_fvi* ALfvi;
> -    databel_fvi* ARfvi;
> +//    databel_fvi* Yfvi;
> +//    databel_fvi* ALfvi;
> +//    databel_fvi* ARfvi;
>  
>      if (!Fhandler->fakefiles)
>      {
> @@ -49,7 +49,7 @@
>          params.l = ALfvi->fvi_header.numVariables;
>  
>          //block size to keep mem under 1 gigabyte
> -        int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
> +        //int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
>          int opt_tb = 1000;
>          int opt_mb = 1000;
>  
> @@ -81,7 +81,7 @@
>  void AIOwrapper::finalize()
>  {
>      //cout << "f";
> -    void *status;
> +    //void *status;
>  
>      Fhandler->not_done = false;
>      pthread_mutex_lock(&(Fhandler->m_more));
> @@ -368,13 +368,15 @@
>      }
>  //
>  //            //!induce realistic fileread delay
> +
> +    return 0;
>  }
>  
>  
>  void AIOwrapper::load_ARblock(type_precision** Ar, int &Ar_blockSize)
>  {
> -    int status;
> -    int createstatus = 0;
> +    //int status;
> +    //int createstatus = 0;
>      //cout<<"^";
>  
>      while (Fhandler->ar_full_buffers.empty())
> @@ -428,8 +430,8 @@
>  
>  void AIOwrapper::load_Yblock(type_precision** Y, int &y_blockSize)
>  {
> -    int status;
> -    int createstatus = 0;
> +    //int status;
> +    //int createstatus = 0;
>  
>      while (Fhandler->full_buffers.empty())
>      {
> @@ -528,7 +530,7 @@
>  
>  void AIOwrapper::reset_Y()
>  {
> -    void *status;
> +    //void *status;
>  
>      Fhandler->seed = 1337;
>  
> @@ -569,7 +571,7 @@
>  
>  void AIOwrapper::reset_AR()
>  {
> -    void *status;
> +    //void *status;
>  
>  
>      //cout << "ra" << flush;
> @@ -697,7 +699,7 @@
>  FILE * AIOwrapper::fgls_fopen( const char *path, const char *mode )
>  {
>          FILE * f;
> -        char err[100];
> +        //char err[100];
>  
>          f = fopen(path, mode );
>          if ( f == NULL )
> 
> Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/Algorithm.cpp	2014-02-13 09:24:32 UTC (rev 1608)
> +++ pkg/OmicABELnoMM/src/Algorithm.cpp	2014-02-13 20:42:52 UTC (rev 1609)
> @@ -16,30 +16,12 @@
>  {
>      switch (type)
>      {
> -    case FULL_NEQ:
> -        fullNEQ(params, out);
> -        break;
> -    case P_NEQ:
> -        partialNEQ(params, out);
> -        break;
> -    case P_NEQ_B_OPT:
> -        partialNEQ_Blocked_STL(params, out);
> -        break;
> -    case FULL_QR:
> -        fullQR(params, out);
> -        break;
> -    case P_QR:
> -        partialQR(params, out);
> -        break;
> -    case P_QR_B_OPT:
> -        partialQR_Blocked_Rtl(params, out);
> -        break;
> -    case P_NEQ_B_OPT_MD:
> -        partialNEQ_Blocked_STL_MD(params, out);
> -        break;
> +        case P_NEQ_B_OPT_MD:
> +            partialNEQ_Blocked_STL_MD(params, out);
> +            break;
>  
> -    default:
> -        break;
> +        default:
> +            break;
>      }
>  }
>  
> @@ -74,7 +56,7 @@
>                                 int dim2_b, int dim1_b_bot)
>  {
>      // memcpy are faster version of the fors
> -    int i, k, w, top_idx, bot_idx, max = dim1_b*dim2_b;
> +    int i, k, w, top_idx, bot_idx;
>      int size;
>      top_idx = 0;
>      bot_idx = 0;
> @@ -111,7 +93,7 @@
>                             type_precision* bot, int dim1_QY,
>                             int dim2_QY, int dim1_qy_bot, int bot_blocks )
>  {
> -    int i, k, w, top_idx, bot_idx, max = dim1_QY*dim2_QY;
> +    int i, k, w, top_idx, bot_idx;
>      top_idx = 0;
>      bot_idx = 0;
>      for (k = 0; k < dim2_QY; k++)
> @@ -326,12 +308,9 @@
>      blas_set_num_threads(max_threads);
>  
>  
> -    type_precision *Ytemp;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> +    //type_precision *Ytemp;
> +    lapack_int info, n, lda, l, r, p;
>  
> -    int i, j, w;
> -    int m;
> -
>      cputime_type start_tick, start_tick2, end_tick;
>  
>      AIOfile.initialize(params);
> @@ -349,12 +328,12 @@
>      int y_iters = (y_amount + y_block_size - 1) / y_block_size;
>  
>  
> -    lda = n; ldy = n;
> -    k = p;
> +    lda = n;
>  
> +
>      for (int j = 0; j < y_iters; j++)
>      {
> -        if (y_iters >= 40 && (i%(y_iters/40)) == 0)
> +        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
>          {
>              cout << "*" << flush;
>          }
> @@ -389,7 +368,9 @@
>  
>  
>      AIOfile.load_AL(&backupAL);
> -    int total_al_nans = replace_nans(0, backupAL, n, l);
> +    //int total_al_nans = replace_nans(0, backupAL, n, l);
> +    replace_nans(0, backupAL, n, l);
> +
>      copy_vec(backupAL, AL, n*l);
>  
>  
> @@ -402,7 +383,7 @@
>  
>      for (int j = 0; j < y_iters; j++)
>      {
> -        if (y_iters >= 40 && (i%(y_iters/40)) == 0)
> +        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
>          {
>              cout << AIOfile.io_overhead << flush;
>              AIOfile.io_overhead = "*";
> @@ -412,7 +393,8 @@
>  
>          list<long int> y_nan_idxs[y_block_size];
>  
> -        int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> +        //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> +        replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
>  
>  
>          //! Ay_top = AL'*Y
> @@ -424,7 +406,8 @@
>          for (int i = 0; i < a_iters; i++)
>          {
>              AIOfile.load_ARblock(&backupAR, a_block_size);
> -            int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> +            //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> +            replace_nans(0, backupAR, n, a_block_size * r);
>              copy_vec(backupAR, AR, n *  r * a_block_size);
>  
>              get_ticks(start_tick2);
> @@ -501,7 +484,7 @@
>                                   &Ay[l+ii*p], r);
>  
>  
> -                        type_precision* B = Ay;
> +                        //type_precision* B = Ay;
>                          type_precision S[p * p];
>  
>                          //! Rebuild S
> @@ -564,1194 +547,187 @@
>  
>  //!*************************************************!//
>  
> -
> +//for reference
>  void Algorithm::partialNEQ_Blocked_STL(struct Settings params,
>                                         struct Outputs &out)
>  {
> -    int max_threads = params.threads;
> -
> -    srand(time(NULL));
> -
> -    blas_set_num_threads(max_threads);
> -
> -    type_precision *Ytemp;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -    int y_amount = params.m;
> -    int y_block_size = params.mb;  // kk
> -
> -    int a_amount = params.t;
> -    int a_block_size = params.tb;
> -
> -    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> -
> -    lda = n; ldy = n;
> -    k = p;
> -
> -
> -
> -    type_precision* backupAL;
> -
> -    AIOfile.load_AL(&backupAL);
> -
> -
> -    type_precision Stl[l * l];
> -    type_precision Str[l * r * a_block_size];
> -
> -
> -    type_precision* Ay_top = new type_precision[l * y_amount];
> -    type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
> -    type_precision* A = new type_precision[n * p];
> -    type_precision* AR = new type_precision[n * r * a_block_size];
> -    type_precision* AL = new type_precision[n * l];
> -
> -    copy_vec(backupAL, AL, n * l);
> -    copy_vec(backupAL, A, n * l);
> -
> -
> -    int y_iters = y_amount/y_block_size;
> -
> -    type_precision* Y;
> -    type_precision* backupAR;
> -
> -    // printf("\n\n%%Computations\n%%");
> -
> -
> -    get_ticks(start_tick);
> -
> -
> -    //! Generate S
> -    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> -                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> -
> -    for (j = 0; j < y_iters; j++)
> -    {
> -        AIOfile.load_Yblock(&Y, y_block_size);
> -
> -        //! Ay_top = AL'*Y
> -        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                    l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> -                    &Ay_top[j * l * y_block_size], l);
> -    }
> -
> -
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters > 10 && (i%(a_iters/10)) == 0)
> -        {
> -            cout << "%" << flush;
> -        }
> -
> -
> -        AIOfile.load_ARblock(&backupAR, a_block_size);
> -        AIOfile.reset_Y();
> -        copy_vec(backupAR, AR, n*r*a_block_size);
> -
> -        // matlab_print_matrix("A", n, p, A);
> -        //! Generate Str
> -        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                    l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> -
> -        type_precision Sbr[r*r*a_block_size];
> -
> -        #pragma omp parallel for default(shared) schedule(static)
> -        for (int ii= 0; ii < a_block_size; ii++)
> -        {
> -            //! Generate Sbr
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
> -                        n, 0.0, &Sbr[ii*r*r], r);
> -        }
> -
> -        type_precision Ay[y_block_size*p];
> -        type_precision S[p*p];
> -
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> -            {
> -                cout << "%" << flush;
> -            }
> -
> -
> -            AIOfile.load_Yblock(&Y, y_block_size);
> -
> -            //! Ay_bot = AR'*Y
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
> -                        0.0, Ay_bot, r * a_block_size);
> -
> -            #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
> -            for (int ii= 0; ii < a_block_size; ii++)
> -            {
> -                // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> -                //! Rebuild AY
> -
> -                for (int jj = 0; jj < y_block_size; jj++)
> -                {
> -                    copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[jj*p], l);
> -                    copy_vec(&Ay_bot[ii*r + jj*r*a_block_size], &Ay[l+jj*p], r);
> -                }
> -
> -                type_precision* B = Ay;
> -                // matlab_print_matrix("Ay_top", l, y_block_size,&Ay_top[j*l*y_block_size]);
> -                // matlab_print_matrix("Ay_bot", r, y_block_size, Ay_bot);
> -                // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> -                //! Rebuild S
> -                build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> -                // matlab_print_matrix("S", p, p, S);
> -
> -                //! b = S\Ay
> -                info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> -                                     S, p, Ay, p);
> -                // assert(info == 0,"POSV");
> -
> -
> -                if (ForceCheck)
> -                {
> -                    check_result(backupAL, &AR[ii * r * n], n, p,
> -                                 y_block_size, r, Y, B);
> -                }
> -            }
> -        }
> -    }
> -
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
> -    out.gflops = gemm_flops(l, n, l, 0) + gemm_flops(l, n, y_amount, 0) +
> -                 a_amount * (gemm_flops(l, n, r, 0) +
> -                             gemm_flops(r, n, r, 0) +
> -                             gemm_flops(r, n, y_amount, 0) +
> -                             (y_amount/1000.0) *
> -                             (p * p * p / 3.0) / 1000.0/1000.0);
> -
> -
> -    AIOfile.finalize();
> -
> -    delete []Ay_top;
> -    delete []Ay_bot;
> -    delete []A;
> -    delete []AL;
> -    delete []AR;
> -}
> -
> -
> -void Algorithm::partialNEQ(struct Settings params, struct Outputs &out)
> -{
> -    int max_threads = params.threads;
> -
> -
> -    srand(time(NULL));
> -
> -
> -    blas_set_num_threads(max_threads);
> -
> -
> -    type_precision *Ytemp;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -    int y_amount = params.m;
> -    int y_block_size = params.mb;// kk
> -
> -    int a_amount = params.t;
> -    int a_block_size = 1;
> -
> -    int a_iters = a_amount/a_block_size;
> -
> -    lda = n; ldy = n;
> -    k = p;
> -
> -
> -
> -    type_precision* backupAL;
> -
> -    AIOfile.load_AL(&backupAL);
> -
> -    type_precision* S = new type_precision[p * p];
> -    type_precision Stl[l * l];
> -    type_precision Str[l * r];
> -    type_precision Sbr[r * r];
> -    type_precision* A = new type_precision[n * p];
> -    type_precision* AL = A;
> -    type_precision* AR = &A[l * n];
> -
> -    copy_vec(backupAL, AL, n*l);
> -
> -
> -    int y_iters = y_amount / y_block_size;
> -
> -    type_precision* Y;
> -    type_precision* backupAR;
> -
> -    // printf("\n\n%%Computations\n%%");
> -
> -    get_ticks(start_tick);
> -
> -    //! Generate S
> -    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> -                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> -
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters < 10 || (i%(a_iters/10)) == 0)
> -        {
> -            cout << "%" << flush;
> -        }
> -
> -        AIOfile.load_ARblock(&backupAR, a_block_size);
> -        AIOfile.reset_Y();
> -        copy_vec(backupAR, AR, n*r);
> -
> -        // matlab_print_matrix("A", n, p, A);
> -        //! Generate Str
> -        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                    l, r, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> -
> -        //! Generate Sbr
> -        cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> -                    r, n, 1.0, backupAR, lda, 0.0, Sbr, r);
> -
> -
> -
> -        // matlab_print_matrix("S", p, p, S);
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            build_S(S, Stl, Str, Sbr, l, r);
> -
> -            AIOfile.load_Yblock(&Y, y_block_size);
> -            type_precision Ay[y_block_size * p];
> -
> -            // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> -            //! Ay = A'*Y
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
> -
> -            type_precision* B = Ay;
> -            // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> -            //! b = S\qy
> -            info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> -                                 S, p, Ay, p);
> -            assert(info == 0, "POSV");
> -
> -            // matlab_print_matrix("B", p, y_block_size, B);
> -            // exit(0);
> -            if ( ForceCheck)
> -            {
> -                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> -            }
> -        }  // END for over j < y_iters
> -    }  // END for over i < a_iters
> -
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    out.gflops = 2.0 * (n * l * l / 1000.0 + a_amount / 1000.0 *
> -                        (n * l * r + n * r * r +
> -                         y_amount * (p * n + p * p * p))) /1000.0 /1000.0;
> -
> -
> -
> -    AIOfile.finalize();
> -
> -    delete []A;
> -    delete []S;
> -}
> -
> -
> -void Algorithm::fullNEQ(struct Settings params, struct Outputs &out)
> -{
> -    int max_threads = params.threads;
> -
> -
> -    srand(time(NULL));
> -
> -
> -    blas_set_num_threads(max_threads);
> -
> -
> -    type_precision *Ytemp;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -    int y_amount = params.m;
> -    int y_block_size = params.mb;// kk
> -
> -    int a_amount = params.t;
> -    int a_block_size = 1;
> -
> -    int a_iters = a_amount/a_block_size;
> -
> -    lda = n; ldy = n;
> -    k = p;
> -
> -
> -
> -    type_precision* backupAL;
> -
> -    AIOfile.load_AL(&backupAL);
> -
> -    type_precision* S = new type_precision[p * p];
> -    type_precision* Stemp = new type_precision[p * p];
> -    type_precision* A = new type_precision[n * p];
> -    type_precision* AL = A;
> -    type_precision* AR = &A[l * n];
> -
> -    int y_iters = y_amount / y_block_size;
> -
> -    type_precision* Y;
> -    type_precision* backupAR;
> -
> -    // printf("\n\n%%Computations\n%%");
> -
> -    get_ticks(start_tick);
> -
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters < 10 || (i%(a_iters/10)) == 0)
> -        {
> -            cout << "%" << flush;
> -        }
> -
> -        copy_vec(backupAL, AL, n * l);
> -
> -        AIOfile.load_ARblock(&backupAR, a_block_size);
> -        AIOfile.reset_Y();
> -        copy_vec(backupAR, AR, n*r);
> -
> -        // matlab_print_matrix("A", n, p, A);
> -        //! Generate S
> -        cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> -                    p, n, 1.0, A, lda, 0.0, S, p);
> -        // matlab_print_matrix("S", p, p, S);
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            AIOfile.load_Yblock(&Y, y_block_size);
> -            type_precision Ay[y_block_size * p];
> -            copy_vec(S, Stemp, p*p);
> -
> -            // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> -            //! Ay = A'*Y
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
> -
> -            type_precision* B = Ay;
> -            // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> -            //! b = S\qy
> -            info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> -                                 Stemp, p, Ay, p);
> -            assert(info == 0, "POSV");
> -
> -            // matlab_print_matrix("B", p, y_block_size, B);
> -            // exit(0);
> -            if ( ForceCheck)
> -            {
> -                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> -            }
> -        }
> -    }
> -
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    out.gflops = 2.0 * a_amount * (n * p * p / 1000.0 + y_amount / 1000.0 *
> -                                   (p * n + p * p * p))/1000.0/1000.0;
> -
> -
> -    AIOfile.finalize();
> -
> -
> -    free(AL);
> -    delete []A;
> -    delete []S;
> -    delete []Stemp;
> -}
> -
> -
> -//!*************************************************!//
> -void Algorithm::fullQR(struct Settings params, struct Outputs &out)
> -{
> -    int max_threads = params.threads;
> -
> -
> -    srand(time(NULL));
> -
> -    blas_set_num_threads(max_threads);
> -
> -
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -    double acc_gemm = 0;
> -    double acc_trsm = 0;
> -    double acc_other = 0;
> -    double acc_atemp = 0;
> -    double acc_rtr = 0;
> -    double acc_rqr = 0;
> -
> -
> -    type_precision *Ytemp;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -    int y_amount = params.m;
> -    int y_block_size = 1;// kk
> -
> -    int a_amount = params.t;
> -    int a_block_size = 1;
> -
> -    int a_iters = a_amount/a_block_size;
> -
> -    lda = n; ldy = n;
> -    k = p;
> -
> -
> -
> -    type_precision* backupAL;
> -
> -    AIOfile.load_AL(&backupAL);
> -
> -    type_precision* Q = new type_precision[n*p];
> -    type_precision* A = Q;
> -    type_precision* AL = A;
> -    type_precision* AR = &A[l*n];
> -
> -    type_precision tau[k];
> -
> -
> -    int y_iters = y_amount / y_block_size;
> -
> -
> -    type_precision* Y;
> -    type_precision* backupAR;
> -
> -
> -
> -    // printf("\n\n%%Computations\n%%");
> -
> -    get_ticks(start_tick);
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters < 10 || (i%(a_iters/10)) == 0)
> -        {
> -            cout << "%" << flush;
> -        }
> -
> -        copy_vec(backupAL, AL, n*l);
> -
> -        AIOfile.load_ARblock(&backupAR, a_block_size);
> -        AIOfile.reset_Y();
> -        copy_vec(backupAR, AR, n*r);
> -
> -        //! Generate R
> -        info = LAPACKE_sgeqrf(STORAGE_TYPE, n, p, A, lda, tau);
> -        assert(info == 0, "QR decomp");
> -        type_precision* R = extract_R(A, n, p);
> -
> -        //! generate Q
> -        info = LAPACKE_sorgqr(STORAGE_TYPE, n, p, k, Q, lda, tau);
> -        assert(info == 0, "Q form");
> -
> -
> -
> -//        matlab_print_matrix("ALL", n, l, backupAL);
> -//        matlab_print_matrix("ARR", n, r, backupAR);
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            // matlab_print_matrix("RR", p, p, R);
> -            // matlab_print_matrix("QQ", n, p, Q);
> -
> -            AIOfile.load_Yblock(&Y, y_block_size);
> -            type_precision Qy[y_block_size*p];
> -
> -            //! qy = Q'*Y
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
> -
> -            type_precision* B = Qy;
> -
> -            //! b = R\qy
> -            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
> -                        CblasNoTrans, CblasNonUnit,
> -                        p, y_block_size, 1.0, R, p, Qy, p);
> -
> -            if (ForceCheck)
> -            {
> -                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> -            }
> -        }
> -
> -        free(R);
> -    }
> -
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    out.gflops = 2.0 * a_amount *
> -        (2.0 * n * p * p / 1000.0 + y_amount / 1000.0 *
> -         (p * n + p * p * p)) / 1000.0 / 1000.0;
> -
> -
> -    AIOfile.finalize();
> -
> -
> -    free(AL);
> -    delete[] Q;
> -}
> -
> -
> -void Algorithm::partialQR(struct Settings params, struct Outputs &out)
> -{
> -    int max_threads = params.threads;
> -
> -
> -    srand(time(NULL));
> -
> -    blas_set_num_threads(max_threads);
> -
> -
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -    double acc_gemm = 0;
> -    double acc_trsm = 0;
> -    double acc_other = 0;
> -    double acc_atemp = 0;
> -    double acc_rtr = 0;
> -    double acc_rqr = 0;
> -
> -
> -    type_precision *RL, *Ytemp, *RtlRtr;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -    int y_amount = params.m;
> -    int y_block_size = params.mb;// kk
> -
> -    int a_amount = params.t;
> -    int a_block_size = 1;
> -
> -    int a_iters = a_amount/a_block_size;
> -
> -    lda = n; ldy = n;
> -    k = l;
> -
> -
> -
> -    type_precision* backupAL;
> -
> -    AIOfile.load_AL(&backupAL);
> -
> -    type_precision* Q = new type_precision[n*p];
> -    type_precision* AL = Q;
> -    copy_vec(backupAL, AL, n*l);
> -
> -
> -    type_precision tau[k];
> -
> -    type_precision* QL;
> -    type_precision* QR = &Q[n*l];
> -
> -    int y_iters = y_amount / y_block_size;
> -
> -
> -    type_precision* Y;
> -
> -    type_precision Rtr[l*r];
> -    type_precision* Ar;
> -    type_precision Rbr[r*r];
> -
> -    get_ticks(start_tick);
> -    //! Generate RTL
> -    info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
> -    assert(info == 0, "QR decomp");
> -
> -    type_precision* R =prepare_R(AL, n, l, r);
> -    type_precision* Rtl = R;
> -
> -    QL = AL;// same as Q
> -    AL = backupAL;
> -    //! generate QL
> -    info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
> -
> -    assert(info == 0, "Q form");
> -
> -    // printf("\n\n%%Computations\n%%");
> -
> -
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters < 10 || (i%(a_iters/10)) == 0)
> -        {
> -            cout << "%" << flush;
> -        }
> -
> -        AIOfile.load_ARblock(&Ar, a_block_size);
> -        AIOfile.reset_Y();
> -
> -        //! Rtr = Q1'*AR
> -        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                    l, r*a_block_size, n, 1.0, QL, n, Ar, n, 0.0, Rtr, l);
> -
> -        type_precision* Atemp = replicate_vec(Ar, n*r*a_block_size);
> -
> -        //! Atemp = AR-Q1*Rtr
> -        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
> -                    n, r*a_block_size, l, -1.0, QL, n, Rtr, l, 1.0, Atemp, n);
> -
> -        type_precision* QrRbr = QR;
> -        copy_vec(Atemp, QrRbr, n*r);
> -        info = LAPACKE_sgeqrf(STORAGE_TYPE, n, r, QrRbr, lda, tau);
> -        assert(info == 0, "QR decomp of QrRbr");
> -
> -        // reuse of old function
> -        type_precision* Rbrtemp = prepare_R(QrRbr, n, r, 0);
> -
> -        copy_vec(Rbrtemp, Rbr, r*r);
> -        info = LAPACKE_sorgqr(STORAGE_TYPE, n, r, r, QrRbr, lda, tau);
> -        assert(info == 0, "QR form");
> -        free(Rbrtemp);
> -
> -        update_R(R, Rtr, Rbr, p, p, r);
> -
> -//        matlab_print_matrix("RL", p, l, Rtl);
> -//        matlab_print_matrix("Rtr", l, r, Rtr);
> -//        matlab_print_matrix("Rbr", r, r, Rbr);
> -//        matlab_print_matrix("RR", p, p, R);
> -//        matlab_print_matrix("QQ", n, p, Q);
> -        // matlab_print_matrix("ALL", n, l, AL);
> -        // matlab_print_matrix("ARR", n, r, Ar);
> -        // matlab_print_matrix("R", p, p, R);
> -        int top_idx = 0;
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            AIOfile.load_Yblock(&Y, y_block_size);
> -            type_precision Qy[y_block_size*p];
> -
> -
> -            //! qy = Q'*Y
> -            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                        p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
> -
> -            type_precision* B = Qy;
> -
> -            //! b = R\qy
> -            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
> -                        CblasNoTrans, CblasNonUnit,
> -                        p, y_block_size, 1.0, R, p, Qy, p);
> -
> -            // matlab_print_matrix("bcomp", p, y_block_size, B);
> -
> -
> -            if ( ForceCheck)
> -            {
> -                check_result(AL, Ar, n, p, y_block_size, r, Y, B);
> -            }
> -        }
> -
> -        free(Atemp);
> -    }
> -
> -
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    out.gflops = 2.0*(n*l*l/1000.0 + a_amount *
> -                      (2.0 * n * l * r / 1000.0 +
> -                       n *  r * r / 1000.0 +
> -                       y_amount / 1000.0 * (p * n + p * p * p))) /
> -        1000.0 / 1000.0;
> -
> -
> -    AIOfile.finalize();
> -
> -
> -    free(Rtl);
> -    free(AL);
> -    delete[] Q;
> -}
> -
> -
> -void  Algorithm::partialQR_Blocked_Rtl(struct Settings params,
> -                                       struct Outputs &out)
> -{
> -    int max_threads = params.threads;
> -
> -    // srand (time(NULL));
> -
> -    int cpu_cores = max_threads;
> -
> -
> -    blas_set_num_threads(max_threads);
> -    omp_set_num_threads(max_threads);
> -
> -
> -
> -
> -    cputime_type start_tick, start_tick2, end_tick;
> -    double acc_pre     = 0;
> -    double acc_trsm    = 0;
> -    double acc_other   = 0;
> -    double acc_atemp   = 0;
> -    double acc_rtr     = 0;
> -    double acc_rqr     = 0;
> -    double acc_RTL_QLY = 0;
> -
> -
> -    type_precision *AL, *RL, *RQy_top, *Ytemp, *RtlRtr;
> -    lapack_int info, n, lda, ldy, l, r, k, p;
> -
> -    int i, j, w;
> -    int m;
> -
> -
> -    AIOfile.initialize(params);
> -
> -    n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -
> -
> -    double lvl_3_cache_size = 6;
> -
> -
> -    int y_amount = params.t;
> -    int y_block_size = params.tb;// kk
> -
> -    int a_amount = params.m;
> -    int a_block_size = params.mb;
> -
> -    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> -
> -    int y_iters = (y_amount+ y_block_size - 1)/y_block_size;
> -
> -    int qy_idx = 0;
> -
> -    out.gflops = n/1000.0*l/1000.0*l/1000.0 +
> -        gemm_flops(l, n, y_amount, 0) +
> -        y_amount*l/1000.0*l/1000.0/1000.0 +
> -        a_amount * (gemm_flops(l, n, r, 0) +
> -                    l/1000.0*l/1000.0/1000.0 +
> -                    gemm_flops(n, l, r, 1) +
> -                    n/1000.0*r/1000.0*r/1000.0 +
> -                    gemm_flops(r, n, y_amount, 0) +
> -                    y_amount/1000.0*r/1000.0*r/1000.0 +
> -                    y_amount/1000.0*l/1000.0/1000.0);
> -
> -    int sch_block_size = max(1.0,
> -                             min((double)a_block_size / (double)max_threads,
> -                                 (1.0*1000*1000) /
> -                                 (double)((sizeof(type_precision) * n * r))));
> -
> -    // cout << endl<< "taskChunk "<< sch_block_size <<endl;
> -
> -    lda = n; ldy = n;
> -    k = l;
> -
> -    AIOfile.load_AL(&AL);
> -
> -    type_precision* backupAL = replicate_vec(AL, n*l);
> -
> -
> -
> -    RQy_top = new type_precision[l*y_amount];
> -
> -    type_precision* Bfinal = (type_precision*)malloc(max_threads * p *
> -                                                     y_block_size *
> -                                                     sizeof(type_precision));
> -
> -    type_precision tau[k];
> -
> -    type_precision* QL;
> -
> -    type_precision* Rtr = (type_precision*)malloc(l * r * a_block_size *
> -                                                  sizeof(type_precision));
> -    type_precision* Qy_bot = (type_precision*)malloc(a_block_size *
> -                                                     y_block_size * r *
> -                                                     sizeof(type_precision));
> -
> -    type_precision* B_top = new type_precision[max_threads*y_block_size*l];
> -    type_precision* B_bot = new type_precision[max_threads*y_block_size*r];
> -
> -    type_precision* Ar;
> -    type_precision Rbr[r*r*a_block_size];
> -
> -
> -
> -    // printf("\n\n%%Computations\n%%");
> -    for (i = 0; i < a_iters; i++)
> -    {
> -        if (a_iters >= 10 && (i%(a_iters/10)) == 0)
> -        {
> -            // cout << "*" << flush;
> -        }
> -
> -        for (j = 0; j < y_iters; j++)
> -        {
> -            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> -            {
> -                // cout << "*" << flush;
> -            }
> -        }
> -    }
> -    cout << endl;
> -
> -
> -    //! Start of Computations
> -
> -    get_ticks(start_tick);
> -
> -
> -    info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
> -    assert(info == 0, "QR decomp");
> -
> -    type_precision* Rtl = prepare_R(AL, n, l, 0);
> -
> -    QL = AL;
> -    AL = backupAL;
> -
> -    info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
> -
> -    assert(info == 0, "Q form");
> -
> -    matlab_print_matrix("QL", n, l, QL);
> -
> -    qy_idx = 0;
> -
> -    // printf("\n%%Preparing IO\n");
> -    //! Prepare File for the Y
> -
> -
> -    type_precision* Y;
> -
> -    // #pragma omp parallel default(shared)
> +//    int max_threads = params.threads;
> +//
> +//    srand(time(NULL));
> +//
> +//    blas_set_num_threads(max_threads);
> +//
> +//    type_precision *Ytemp;
> +//    lapack_int info, n, lda,ldy, l, r, p;
> +//
> +//    int i,k;
> +//
> +//    cputime_type start_tick, start_tick2, end_tick;
> +//
> +//    AIOfile.initialize(params);
> +//
> +//    n = params.n; l = params.l; r = params.r; p = l+r;
> +//
> +//    int y_amount = params.m;
> +//    int y_block_size = params.mb;  // kk
> +//
> +//    int a_amount = params.t;
> +//    int a_block_size = params.tb;
> +//
> +//    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> +//
> +//    lda = n; ldy = n;
> +//    k = p;
> +//
> +//
> +//
> +//    type_precision* backupAL;
> +//
> +//    AIOfile.load_AL(&backupAL);
> +//
> +//
> +//    type_precision Stl[l * l];
> +//    type_precision Str[l * r * a_block_size];
> +//
> +//
> +//    type_precision* Ay_top = new type_precision[l * y_amount];
> +//    type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
> +//    type_precision* A = new type_precision[n * p];
> +//    type_precision* AR = new type_precision[n * r * a_block_size];
> +//    type_precision* AL = new type_precision[n * l];
> +//
> +//    copy_vec(backupAL, AL, n * l);
> +//    copy_vec(backupAL, A, n * l);
> +//
> +//
> +//    int y_iters = y_amount/y_block_size;
> +//
> +//    type_precision* Y;
> +//    type_precision* backupAR;
> +//
> +//    // printf("\n\n%%Computations\n%%");
> +//
> +//
> +//    get_ticks(start_tick);
> +//
> +//
> +//    //! Generate S
> +//    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> +//                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> +//
> +//    for (int j = 0; j < y_iters; j++)
>  //    {
> -//        // #pragma omp for private(Y, y_block_size, qy_idx) nowait schedule(static, 1)
> +//        AIOfile.load_Yblock(&Y, y_block_size);
> +//
> +//        //! Ay_top = AL'*Y
> +//        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +//                    l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> +//                    &Ay_top[j * l * y_block_size], l);
> +//    }
> +//
> +//
> +//    for (int i = 0; i < a_iters; i++)
> +//    {
> +//        if (a_iters > 10 && (i%(a_iters/10)) == 0)
> +//        {
> +//            cout << "%" << flush;
> +//        }
> +//
> +//
> +//        AIOfile.load_ARblock(&backupAR, a_block_size);
> +//        AIOfile.reset_Y();
> +//        copy_vec(backupAR, AR, n*r*a_block_size);
> +//
> +//        // matlab_print_matrix("A", n, p, A);
> +//        //! Generate Str
> +//        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +//                    l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> +//
> +//        type_precision Sbr[r*r*a_block_size];
> +//
> +//        #pragma omp parallel for default(shared) schedule(static)
> +//        for (int ii= 0; ii < a_block_size; ii++)
> +//        {
> +//            //! Generate Sbr
> +//            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +//                        r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
> +//                        n, 0.0, &Sbr[ii*r*r], r);
> +//        }
> +//
> +//        type_precision Ay[y_block_size*p];
> +//        type_precision S[p*p];
> +//
> +//
>  //        for (int j = 0; j < y_iters; j++)
>  //        {
> -//            // qy_idx = j*y_block_size*l;
> -//            // #pragma omp critical
> -//            {AIOfile.load_Yblock(&Y, y_block_size);}
> +//            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> +//            {
> +//                cout << "%" << flush;
> +//            }
>  //
> -//            //! qy_top = QL'*Y
> +//
> +//            AIOfile.load_Yblock(&Y, y_block_size);
> +//
> +//            //! Ay_bot = AR'*Y
>  //            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -//                            l, y_block_size, n, 1.0, QL, n, Y, n, 0.0,&RQy_top[qy_idx], l);
> +//                        r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
> +//                        0.0, Ay_bot, r * a_block_size);
>  //
> -//            //! K | RtlQlY =RTL\qy_top
> -//            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
> -//                                            l, y_block_size, 1.0, Rtl, l,&RQy_top[qy_idx], l);
> -//            qy_idx += y_block_size*l;
> +//            #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
> +//            for (int ii= 0; ii < a_block_size; ii++)
> +//            {
> [TRUNCATED]
> 
> To get the complete diff run:
>     svnlook diff /svnroot/genabel -r 1609
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
> 

-- 
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
L.C. Karssen
Utrecht
The Netherlands

lennart at karssen.org
http://blog.karssen.org
GPG key ID: A88F554A
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 230 bytes
Desc: OpenPGP digital signature
URL: <http://lists.r-forge.r-project.org/pipermail/genabel-devel/attachments/20140214/e7a65565/attachment-0001.sig>


More information about the genabel-devel mailing list