[GenABEL-dev] [Genabel-commits] r1609 - pkg/OmicABELnoMM/src
L.C. Karssen
lennart at karssen.org
Mon Feb 17 12:03:54 CET 2014
Dear list,
As you may have noticed, I have given Alvaro Frank commit access to our
SVN repository. As announced in December [1], Alvaro is the main force
behind OmicABELnoMM. Alvaro will initially concentrate on removing the
compiler warnings and creating a test suite for OmicABELnoMM.
Welcome Alvaro, and looking forward to seeing more of your work/code!
Best regards,
Lennart.
[1]
http://lists.r-forge.r-project.org/pipermail/genabel-devel/2013-December/000891.html
On 14-02-14 11:32, L.C. Karssen wrote:
> Hi Alvaro,
>
> Thanks for your contribution and congratulations on your first commit in
> the GenABEL project.
>
> Glad to see that the number of has been reduced a lot! I still get a few
> errors after running
> ./configure LDFLAGS="-L/usr/lib/openblas-base"
> make -j4
> but they seem to be minor and easy to fix. Have a look at
> http://jenkins.genabel.org/jenkins/job/OmicABELnoMM/15/warnings23Result/
> for the warning messages and corresponding lines in the code.
> Also try to have a look at the CPPcheck results in Jenkins. It warns
> about three (possible) memory leaks.
>
>
> Thanks a lot and looking forward to your next commits.
>
>
> Lennart.
>
>
> On 13-02-14 21:42, noreply at r-forge.r-project.org wrote:
>> Author: afrank
>> Date: 2014-02-13 21:42:52 +0100 (Thu, 13 Feb 2014)
>> New Revision: 1609
>>
>> Modified:
>> pkg/OmicABELnoMM/src/AIOwrapper.cpp
>> pkg/OmicABELnoMM/src/Algorithm.cpp
>> pkg/OmicABELnoMM/src/Algorithm.h
>> pkg/OmicABELnoMM/src/Definitions.h
>> pkg/OmicABELnoMM/src/Utility.cpp
>> pkg/OmicABELnoMM/src/Utility.h
>> pkg/OmicABELnoMM/src/main.cpp
>> Log:
>> Warnings from c/c++ inconsistencies and -Wall fixed. Legacy code from unused functionality removed. A few bugs with indices fixed. Stable code.
>>
>> Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
>> ===================================================================
>> --- pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 09:24:32 UTC (rev 1608)
>> +++ pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 20:42:52 UTC (rev 1609)
>> @@ -29,9 +29,9 @@
>>
>> Fhandler->fakefiles = params.use_fake_files;
>>
>> - databel_fvi* Yfvi;
>> - databel_fvi* ALfvi;
>> - databel_fvi* ARfvi;
>> +// databel_fvi* Yfvi;
>> +// databel_fvi* ALfvi;
>> +// databel_fvi* ARfvi;
>>
>> if (!Fhandler->fakefiles)
>> {
>> @@ -49,7 +49,7 @@
>> params.l = ALfvi->fvi_header.numVariables;
>>
>> //block size to keep mem under 1 gigabyte
>> - int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
>> + //int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
>> int opt_tb = 1000;
>> int opt_mb = 1000;
>>
>> @@ -81,7 +81,7 @@
>> void AIOwrapper::finalize()
>> {
>> //cout << "f";
>> - void *status;
>> + //void *status;
>>
>> Fhandler->not_done = false;
>> pthread_mutex_lock(&(Fhandler->m_more));
>> @@ -368,13 +368,15 @@
>> }
>> //
>> // //!induce realistic fileread delay
>> +
>> + return 0;
>> }
>>
>>
>> void AIOwrapper::load_ARblock(type_precision** Ar, int &Ar_blockSize)
>> {
>> - int status;
>> - int createstatus = 0;
>> + //int status;
>> + //int createstatus = 0;
>> //cout<<"^";
>>
>> while (Fhandler->ar_full_buffers.empty())
>> @@ -428,8 +430,8 @@
>>
>> void AIOwrapper::load_Yblock(type_precision** Y, int &y_blockSize)
>> {
>> - int status;
>> - int createstatus = 0;
>> + //int status;
>> + //int createstatus = 0;
>>
>> while (Fhandler->full_buffers.empty())
>> {
>> @@ -528,7 +530,7 @@
>>
>> void AIOwrapper::reset_Y()
>> {
>> - void *status;
>> + //void *status;
>>
>> Fhandler->seed = 1337;
>>
>> @@ -569,7 +571,7 @@
>>
>> void AIOwrapper::reset_AR()
>> {
>> - void *status;
>> + //void *status;
>>
>>
>> //cout << "ra" << flush;
>> @@ -697,7 +699,7 @@
>> FILE * AIOwrapper::fgls_fopen( const char *path, const char *mode )
>> {
>> FILE * f;
>> - char err[100];
>> + //char err[100];
>>
>> f = fopen(path, mode );
>> if ( f == NULL )
>>
>> Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
>> ===================================================================
>> --- pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 09:24:32 UTC (rev 1608)
>> +++ pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 20:42:52 UTC (rev 1609)
>> @@ -16,30 +16,12 @@
>> {
>> switch (type)
>> {
>> - case FULL_NEQ:
>> - fullNEQ(params, out);
>> - break;
>> - case P_NEQ:
>> - partialNEQ(params, out);
>> - break;
>> - case P_NEQ_B_OPT:
>> - partialNEQ_Blocked_STL(params, out);
>> - break;
>> - case FULL_QR:
>> - fullQR(params, out);
>> - break;
>> - case P_QR:
>> - partialQR(params, out);
>> - break;
>> - case P_QR_B_OPT:
>> - partialQR_Blocked_Rtl(params, out);
>> - break;
>> - case P_NEQ_B_OPT_MD:
>> - partialNEQ_Blocked_STL_MD(params, out);
>> - break;
>> + case P_NEQ_B_OPT_MD:
>> + partialNEQ_Blocked_STL_MD(params, out);
>> + break;
>>
>> - default:
>> - break;
>> + default:
>> + break;
>> }
>> }
>>
>> @@ -74,7 +56,7 @@
>> int dim2_b, int dim1_b_bot)
>> {
>> // memcpy are faster version of the fors
>> - int i, k, w, top_idx, bot_idx, max = dim1_b*dim2_b;
>> + int i, k, w, top_idx, bot_idx;
>> int size;
>> top_idx = 0;
>> bot_idx = 0;
>> @@ -111,7 +93,7 @@
>> type_precision* bot, int dim1_QY,
>> int dim2_QY, int dim1_qy_bot, int bot_blocks )
>> {
>> - int i, k, w, top_idx, bot_idx, max = dim1_QY*dim2_QY;
>> + int i, k, w, top_idx, bot_idx;
>> top_idx = 0;
>> bot_idx = 0;
>> for (k = 0; k < dim2_QY; k++)
>> @@ -326,12 +308,9 @@
>> blas_set_num_threads(max_threads);
>>
>>
>> - type_precision *Ytemp;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> + //type_precision *Ytemp;
>> + lapack_int info, n, lda, l, r, p;
>>
>> - int i, j, w;
>> - int m;
>> -
>> cputime_type start_tick, start_tick2, end_tick;
>>
>> AIOfile.initialize(params);
>> @@ -349,12 +328,12 @@
>> int y_iters = (y_amount + y_block_size - 1) / y_block_size;
>>
>>
>> - lda = n; ldy = n;
>> - k = p;
>> + lda = n;
>>
>> +
>> for (int j = 0; j < y_iters; j++)
>> {
>> - if (y_iters >= 40 && (i%(y_iters/40)) == 0)
>> + if (y_iters >= 40 && (j%(y_iters/40)) == 0)
>> {
>> cout << "*" << flush;
>> }
>> @@ -389,7 +368,9 @@
>>
>>
>> AIOfile.load_AL(&backupAL);
>> - int total_al_nans = replace_nans(0, backupAL, n, l);
>> + //int total_al_nans = replace_nans(0, backupAL, n, l);
>> + replace_nans(0, backupAL, n, l);
>> +
>> copy_vec(backupAL, AL, n*l);
>>
>>
>> @@ -402,7 +383,7 @@
>>
>> for (int j = 0; j < y_iters; j++)
>> {
>> - if (y_iters >= 40 && (i%(y_iters/40)) == 0)
>> + if (y_iters >= 40 && (j%(y_iters/40)) == 0)
>> {
>> cout << AIOfile.io_overhead << flush;
>> AIOfile.io_overhead = "*";
>> @@ -412,7 +393,8 @@
>>
>> list<long int> y_nan_idxs[y_block_size];
>>
>> - int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
>> + //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
>> + replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
>>
>>
>> //! Ay_top = AL'*Y
>> @@ -424,7 +406,8 @@
>> for (int i = 0; i < a_iters; i++)
>> {
>> AIOfile.load_ARblock(&backupAR, a_block_size);
>> - int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
>> + //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
>> + replace_nans(0, backupAR, n, a_block_size * r);
>> copy_vec(backupAR, AR, n * r * a_block_size);
>>
>> get_ticks(start_tick2);
>> @@ -501,7 +484,7 @@
>> &Ay[l+ii*p], r);
>>
>>
>> - type_precision* B = Ay;
>> + //type_precision* B = Ay;
>> type_precision S[p * p];
>>
>> //! Rebuild S
>> @@ -564,1194 +547,187 @@
>>
>> //!*************************************************!//
>>
>> -
>> +//for reference
>> void Algorithm::partialNEQ_Blocked_STL(struct Settings params,
>> struct Outputs &out)
>> {
>> - int max_threads = params.threads;
>> -
>> - srand(time(NULL));
>> -
>> - blas_set_num_threads(max_threads);
>> -
>> - type_precision *Ytemp;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> -
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> - int y_amount = params.m;
>> - int y_block_size = params.mb; // kk
>> -
>> - int a_amount = params.t;
>> - int a_block_size = params.tb;
>> -
>> - int a_iters = (a_amount + a_block_size - 1) / a_block_size;
>> -
>> - lda = n; ldy = n;
>> - k = p;
>> -
>> -
>> -
>> - type_precision* backupAL;
>> -
>> - AIOfile.load_AL(&backupAL);
>> -
>> -
>> - type_precision Stl[l * l];
>> - type_precision Str[l * r * a_block_size];
>> -
>> -
>> - type_precision* Ay_top = new type_precision[l * y_amount];
>> - type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
>> - type_precision* A = new type_precision[n * p];
>> - type_precision* AR = new type_precision[n * r * a_block_size];
>> - type_precision* AL = new type_precision[n * l];
>> -
>> - copy_vec(backupAL, AL, n * l);
>> - copy_vec(backupAL, A, n * l);
>> -
>> -
>> - int y_iters = y_amount/y_block_size;
>> -
>> - type_precision* Y;
>> - type_precision* backupAR;
>> -
>> - // printf("\n\n%%Computations\n%%");
>> -
>> -
>> - get_ticks(start_tick);
>> -
>> -
>> - //! Generate S
>> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
>> - l, n, 1.0, backupAL, lda, 0.0, Stl, l);
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> -
>> - //! Ay_top = AL'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
>> - &Ay_top[j * l * y_block_size], l);
>> - }
>> -
>> -
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters > 10 && (i%(a_iters/10)) == 0)
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> -
>> - AIOfile.load_ARblock(&backupAR, a_block_size);
>> - AIOfile.reset_Y();
>> - copy_vec(backupAR, AR, n*r*a_block_size);
>> -
>> - // matlab_print_matrix("A", n, p, A);
>> - //! Generate Str
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
>> -
>> - type_precision Sbr[r*r*a_block_size];
>> -
>> - #pragma omp parallel for default(shared) schedule(static)
>> - for (int ii= 0; ii < a_block_size; ii++)
>> - {
>> - //! Generate Sbr
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
>> - n, 0.0, &Sbr[ii*r*r], r);
>> - }
>> -
>> - type_precision Ay[y_block_size*p];
>> - type_precision S[p*p];
>> -
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> -
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> -
>> - //! Ay_bot = AR'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
>> - 0.0, Ay_bot, r * a_block_size);
>> -
>> - #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
>> - for (int ii= 0; ii < a_block_size; ii++)
>> - {
>> - // matlab_print_matrix("Y", n, y_block_size, Y);
>> -
>> - //! Rebuild AY
>> -
>> - for (int jj = 0; jj < y_block_size; jj++)
>> - {
>> - copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[jj*p], l);
>> - copy_vec(&Ay_bot[ii*r + jj*r*a_block_size], &Ay[l+jj*p], r);
>> - }
>> -
>> - type_precision* B = Ay;
>> - // matlab_print_matrix("Ay_top", l, y_block_size,&Ay_top[j*l*y_block_size]);
>> - // matlab_print_matrix("Ay_bot", r, y_block_size, Ay_bot);
>> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
>> -
>> - //! Rebuild S
>> - build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
>> - // matlab_print_matrix("S", p, p, S);
>> -
>> - //! b = S\Ay
>> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
>> - S, p, Ay, p);
>> - // assert(info == 0,"POSV");
>> -
>> -
>> - if (ForceCheck)
>> - {
>> - check_result(backupAL, &AR[ii * r * n], n, p,
>> - y_block_size, r, Y, B);
>> - }
>> - }
>> - }
>> - }
>> -
>> - get_ticks(end_tick);
>> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
>> - // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
>> - out.gflops = gemm_flops(l, n, l, 0) + gemm_flops(l, n, y_amount, 0) +
>> - a_amount * (gemm_flops(l, n, r, 0) +
>> - gemm_flops(r, n, r, 0) +
>> - gemm_flops(r, n, y_amount, 0) +
>> - (y_amount/1000.0) *
>> - (p * p * p / 3.0) / 1000.0/1000.0);
>> -
>> -
>> - AIOfile.finalize();
>> -
>> - delete []Ay_top;
>> - delete []Ay_bot;
>> - delete []A;
>> - delete []AL;
>> - delete []AR;
>> -}
>> -
>> -
>> -void Algorithm::partialNEQ(struct Settings params, struct Outputs &out)
>> -{
>> - int max_threads = params.threads;
>> -
>> -
>> - srand(time(NULL));
>> -
>> -
>> - blas_set_num_threads(max_threads);
>> -
>> -
>> - type_precision *Ytemp;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> - int y_amount = params.m;
>> - int y_block_size = params.mb;// kk
>> -
>> - int a_amount = params.t;
>> - int a_block_size = 1;
>> -
>> - int a_iters = a_amount/a_block_size;
>> -
>> - lda = n; ldy = n;
>> - k = p;
>> -
>> -
>> -
>> - type_precision* backupAL;
>> -
>> - AIOfile.load_AL(&backupAL);
>> -
>> - type_precision* S = new type_precision[p * p];
>> - type_precision Stl[l * l];
>> - type_precision Str[l * r];
>> - type_precision Sbr[r * r];
>> - type_precision* A = new type_precision[n * p];
>> - type_precision* AL = A;
>> - type_precision* AR = &A[l * n];
>> -
>> - copy_vec(backupAL, AL, n*l);
>> -
>> -
>> - int y_iters = y_amount / y_block_size;
>> -
>> - type_precision* Y;
>> - type_precision* backupAR;
>> -
>> - // printf("\n\n%%Computations\n%%");
>> -
>> - get_ticks(start_tick);
>> -
>> - //! Generate S
>> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
>> - l, n, 1.0, backupAL, lda, 0.0, Stl, l);
>> -
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> - AIOfile.load_ARblock(&backupAR, a_block_size);
>> - AIOfile.reset_Y();
>> - copy_vec(backupAR, AR, n*r);
>> -
>> - // matlab_print_matrix("A", n, p, A);
>> - //! Generate Str
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - l, r, n, 1.0, AL, n, AR, n, 0.0, Str, l);
>> -
>> - //! Generate Sbr
>> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
>> - r, n, 1.0, backupAR, lda, 0.0, Sbr, r);
>> -
>> -
>> -
>> - // matlab_print_matrix("S", p, p, S);
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - build_S(S, Stl, Str, Sbr, l, r);
>> -
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> - type_precision Ay[y_block_size * p];
>> -
>> - // matlab_print_matrix("Y", n, y_block_size, Y);
>> -
>> - //! Ay = A'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
>> -
>> - type_precision* B = Ay;
>> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
>> -
>> - //! b = S\qy
>> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
>> - S, p, Ay, p);
>> - assert(info == 0, "POSV");
>> -
>> - // matlab_print_matrix("B", p, y_block_size, B);
>> - // exit(0);
>> - if ( ForceCheck)
>> - {
>> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
>> - }
>> - } // END for over j < y_iters
>> - } // END for over i < a_iters
>> -
>> - get_ticks(end_tick);
>> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
>> - out.gflops = 2.0 * (n * l * l / 1000.0 + a_amount / 1000.0 *
>> - (n * l * r + n * r * r +
>> - y_amount * (p * n + p * p * p))) /1000.0 /1000.0;
>> -
>> -
>> -
>> - AIOfile.finalize();
>> -
>> - delete []A;
>> - delete []S;
>> -}
>> -
>> -
>> -void Algorithm::fullNEQ(struct Settings params, struct Outputs &out)
>> -{
>> - int max_threads = params.threads;
>> -
>> -
>> - srand(time(NULL));
>> -
>> -
>> - blas_set_num_threads(max_threads);
>> -
>> -
>> - type_precision *Ytemp;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> -
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> - int y_amount = params.m;
>> - int y_block_size = params.mb;// kk
>> -
>> - int a_amount = params.t;
>> - int a_block_size = 1;
>> -
>> - int a_iters = a_amount/a_block_size;
>> -
>> - lda = n; ldy = n;
>> - k = p;
>> -
>> -
>> -
>> - type_precision* backupAL;
>> -
>> - AIOfile.load_AL(&backupAL);
>> -
>> - type_precision* S = new type_precision[p * p];
>> - type_precision* Stemp = new type_precision[p * p];
>> - type_precision* A = new type_precision[n * p];
>> - type_precision* AL = A;
>> - type_precision* AR = &A[l * n];
>> -
>> - int y_iters = y_amount / y_block_size;
>> -
>> - type_precision* Y;
>> - type_precision* backupAR;
>> -
>> - // printf("\n\n%%Computations\n%%");
>> -
>> - get_ticks(start_tick);
>> -
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> - copy_vec(backupAL, AL, n * l);
>> -
>> - AIOfile.load_ARblock(&backupAR, a_block_size);
>> - AIOfile.reset_Y();
>> - copy_vec(backupAR, AR, n*r);
>> -
>> - // matlab_print_matrix("A", n, p, A);
>> - //! Generate S
>> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
>> - p, n, 1.0, A, lda, 0.0, S, p);
>> - // matlab_print_matrix("S", p, p, S);
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> - type_precision Ay[y_block_size * p];
>> - copy_vec(S, Stemp, p*p);
>> -
>> - // matlab_print_matrix("Y", n, y_block_size, Y);
>> -
>> - //! Ay = A'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
>> -
>> - type_precision* B = Ay;
>> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
>> -
>> - //! b = S\qy
>> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
>> - Stemp, p, Ay, p);
>> - assert(info == 0, "POSV");
>> -
>> - // matlab_print_matrix("B", p, y_block_size, B);
>> - // exit(0);
>> - if ( ForceCheck)
>> - {
>> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
>> - }
>> - }
>> - }
>> -
>> - get_ticks(end_tick);
>> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
>> - out.gflops = 2.0 * a_amount * (n * p * p / 1000.0 + y_amount / 1000.0 *
>> - (p * n + p * p * p))/1000.0/1000.0;
>> -
>> -
>> - AIOfile.finalize();
>> -
>> -
>> - free(AL);
>> - delete []A;
>> - delete []S;
>> - delete []Stemp;
>> -}
>> -
>> -
>> -//!*************************************************!//
>> -void Algorithm::fullQR(struct Settings params, struct Outputs &out)
>> -{
>> - int max_threads = params.threads;
>> -
>> -
>> - srand(time(NULL));
>> -
>> - blas_set_num_threads(max_threads);
>> -
>> -
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> - double acc_gemm = 0;
>> - double acc_trsm = 0;
>> - double acc_other = 0;
>> - double acc_atemp = 0;
>> - double acc_rtr = 0;
>> - double acc_rqr = 0;
>> -
>> -
>> - type_precision *Ytemp;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> - int y_amount = params.m;
>> - int y_block_size = 1;// kk
>> -
>> - int a_amount = params.t;
>> - int a_block_size = 1;
>> -
>> - int a_iters = a_amount/a_block_size;
>> -
>> - lda = n; ldy = n;
>> - k = p;
>> -
>> -
>> -
>> - type_precision* backupAL;
>> -
>> - AIOfile.load_AL(&backupAL);
>> -
>> - type_precision* Q = new type_precision[n*p];
>> - type_precision* A = Q;
>> - type_precision* AL = A;
>> - type_precision* AR = &A[l*n];
>> -
>> - type_precision tau[k];
>> -
>> -
>> - int y_iters = y_amount / y_block_size;
>> -
>> -
>> - type_precision* Y;
>> - type_precision* backupAR;
>> -
>> -
>> -
>> - // printf("\n\n%%Computations\n%%");
>> -
>> - get_ticks(start_tick);
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> - copy_vec(backupAL, AL, n*l);
>> -
>> - AIOfile.load_ARblock(&backupAR, a_block_size);
>> - AIOfile.reset_Y();
>> - copy_vec(backupAR, AR, n*r);
>> -
>> - //! Generate R
>> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, p, A, lda, tau);
>> - assert(info == 0, "QR decomp");
>> - type_precision* R = extract_R(A, n, p);
>> -
>> - //! generate Q
>> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, p, k, Q, lda, tau);
>> - assert(info == 0, "Q form");
>> -
>> -
>> -
>> -// matlab_print_matrix("ALL", n, l, backupAL);
>> -// matlab_print_matrix("ARR", n, r, backupAR);
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - // matlab_print_matrix("RR", p, p, R);
>> - // matlab_print_matrix("QQ", n, p, Q);
>> -
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> - type_precision Qy[y_block_size*p];
>> -
>> - //! qy = Q'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
>> -
>> - type_precision* B = Qy;
>> -
>> - //! b = R\qy
>> - cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
>> - CblasNoTrans, CblasNonUnit,
>> - p, y_block_size, 1.0, R, p, Qy, p);
>> -
>> - if (ForceCheck)
>> - {
>> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
>> - }
>> - }
>> -
>> - free(R);
>> - }
>> -
>> - get_ticks(end_tick);
>> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
>> - out.gflops = 2.0 * a_amount *
>> - (2.0 * n * p * p / 1000.0 + y_amount / 1000.0 *
>> - (p * n + p * p * p)) / 1000.0 / 1000.0;
>> -
>> -
>> - AIOfile.finalize();
>> -
>> -
>> - free(AL);
>> - delete[] Q;
>> -}
>> -
>> -
>> -void Algorithm::partialQR(struct Settings params, struct Outputs &out)
>> -{
>> - int max_threads = params.threads;
>> -
>> -
>> - srand(time(NULL));
>> -
>> - blas_set_num_threads(max_threads);
>> -
>> -
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> - double acc_gemm = 0;
>> - double acc_trsm = 0;
>> - double acc_other = 0;
>> - double acc_atemp = 0;
>> - double acc_rtr = 0;
>> - double acc_rqr = 0;
>> -
>> -
>> - type_precision *RL, *Ytemp, *RtlRtr;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> - int y_amount = params.m;
>> - int y_block_size = params.mb;// kk
>> -
>> - int a_amount = params.t;
>> - int a_block_size = 1;
>> -
>> - int a_iters = a_amount/a_block_size;
>> -
>> - lda = n; ldy = n;
>> - k = l;
>> -
>> -
>> -
>> - type_precision* backupAL;
>> -
>> - AIOfile.load_AL(&backupAL);
>> -
>> - type_precision* Q = new type_precision[n*p];
>> - type_precision* AL = Q;
>> - copy_vec(backupAL, AL, n*l);
>> -
>> -
>> - type_precision tau[k];
>> -
>> - type_precision* QL;
>> - type_precision* QR = &Q[n*l];
>> -
>> - int y_iters = y_amount / y_block_size;
>> -
>> -
>> - type_precision* Y;
>> -
>> - type_precision Rtr[l*r];
>> - type_precision* Ar;
>> - type_precision Rbr[r*r];
>> -
>> - get_ticks(start_tick);
>> - //! Generate RTL
>> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
>> - assert(info == 0, "QR decomp");
>> -
>> - type_precision* R =prepare_R(AL, n, l, r);
>> - type_precision* Rtl = R;
>> -
>> - QL = AL;// same as Q
>> - AL = backupAL;
>> - //! generate QL
>> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
>> -
>> - assert(info == 0, "Q form");
>> -
>> - // printf("\n\n%%Computations\n%%");
>> -
>> -
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
>> - {
>> - cout << "%" << flush;
>> - }
>> -
>> - AIOfile.load_ARblock(&Ar, a_block_size);
>> - AIOfile.reset_Y();
>> -
>> - //! Rtr = Q1'*AR
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - l, r*a_block_size, n, 1.0, QL, n, Ar, n, 0.0, Rtr, l);
>> -
>> - type_precision* Atemp = replicate_vec(Ar, n*r*a_block_size);
>> -
>> - //! Atemp = AR-Q1*Rtr
>> - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
>> - n, r*a_block_size, l, -1.0, QL, n, Rtr, l, 1.0, Atemp, n);
>> -
>> - type_precision* QrRbr = QR;
>> - copy_vec(Atemp, QrRbr, n*r);
>> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, r, QrRbr, lda, tau);
>> - assert(info == 0, "QR decomp of QrRbr");
>> -
>> - // reuse of old function
>> - type_precision* Rbrtemp = prepare_R(QrRbr, n, r, 0);
>> -
>> - copy_vec(Rbrtemp, Rbr, r*r);
>> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, r, r, QrRbr, lda, tau);
>> - assert(info == 0, "QR form");
>> - free(Rbrtemp);
>> -
>> - update_R(R, Rtr, Rbr, p, p, r);
>> -
>> -// matlab_print_matrix("RL", p, l, Rtl);
>> -// matlab_print_matrix("Rtr", l, r, Rtr);
>> -// matlab_print_matrix("Rbr", r, r, Rbr);
>> -// matlab_print_matrix("RR", p, p, R);
>> -// matlab_print_matrix("QQ", n, p, Q);
>> - // matlab_print_matrix("ALL", n, l, AL);
>> - // matlab_print_matrix("ARR", n, r, Ar);
>> - // matlab_print_matrix("R", p, p, R);
>> - int top_idx = 0;
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - AIOfile.load_Yblock(&Y, y_block_size);
>> - type_precision Qy[y_block_size*p];
>> -
>> -
>> - //! qy = Q'*Y
>> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> - p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
>> -
>> - type_precision* B = Qy;
>> -
>> - //! b = R\qy
>> - cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
>> - CblasNoTrans, CblasNonUnit,
>> - p, y_block_size, 1.0, R, p, Qy, p);
>> -
>> - // matlab_print_matrix("bcomp", p, y_block_size, B);
>> -
>> -
>> - if ( ForceCheck)
>> - {
>> - check_result(AL, Ar, n, p, y_block_size, r, Y, B);
>> - }
>> - }
>> -
>> - free(Atemp);
>> - }
>> -
>> -
>> - get_ticks(end_tick);
>> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
>> - out.gflops = 2.0*(n*l*l/1000.0 + a_amount *
>> - (2.0 * n * l * r / 1000.0 +
>> - n * r * r / 1000.0 +
>> - y_amount / 1000.0 * (p * n + p * p * p))) /
>> - 1000.0 / 1000.0;
>> -
>> -
>> - AIOfile.finalize();
>> -
>> -
>> - free(Rtl);
>> - free(AL);
>> - delete[] Q;
>> -}
>> -
>> -
>> -void Algorithm::partialQR_Blocked_Rtl(struct Settings params,
>> - struct Outputs &out)
>> -{
>> - int max_threads = params.threads;
>> -
>> - // srand (time(NULL));
>> -
>> - int cpu_cores = max_threads;
>> -
>> -
>> - blas_set_num_threads(max_threads);
>> - omp_set_num_threads(max_threads);
>> -
>> -
>> -
>> -
>> - cputime_type start_tick, start_tick2, end_tick;
>> - double acc_pre = 0;
>> - double acc_trsm = 0;
>> - double acc_other = 0;
>> - double acc_atemp = 0;
>> - double acc_rtr = 0;
>> - double acc_rqr = 0;
>> - double acc_RTL_QLY = 0;
>> -
>> -
>> - type_precision *AL, *RL, *RQy_top, *Ytemp, *RtlRtr;
>> - lapack_int info, n, lda, ldy, l, r, k, p;
>> -
>> - int i, j, w;
>> - int m;
>> -
>> -
>> - AIOfile.initialize(params);
>> -
>> - n = params.n; l = params.l; r = params.r; p = l+r;
>> -
>> -
>> -
>> - double lvl_3_cache_size = 6;
>> -
>> -
>> - int y_amount = params.t;
>> - int y_block_size = params.tb;// kk
>> -
>> - int a_amount = params.m;
>> - int a_block_size = params.mb;
>> -
>> - int a_iters = (a_amount + a_block_size - 1) / a_block_size;
>> -
>> - int y_iters = (y_amount+ y_block_size - 1)/y_block_size;
>> -
>> - int qy_idx = 0;
>> -
>> - out.gflops = n/1000.0*l/1000.0*l/1000.0 +
>> - gemm_flops(l, n, y_amount, 0) +
>> - y_amount*l/1000.0*l/1000.0/1000.0 +
>> - a_amount * (gemm_flops(l, n, r, 0) +
>> - l/1000.0*l/1000.0/1000.0 +
>> - gemm_flops(n, l, r, 1) +
>> - n/1000.0*r/1000.0*r/1000.0 +
>> - gemm_flops(r, n, y_amount, 0) +
>> - y_amount/1000.0*r/1000.0*r/1000.0 +
>> - y_amount/1000.0*l/1000.0/1000.0);
>> -
>> - int sch_block_size = max(1.0,
>> - min((double)a_block_size / (double)max_threads,
>> - (1.0*1000*1000) /
>> - (double)((sizeof(type_precision) * n * r))));
>> -
>> - // cout << endl<< "taskChunk "<< sch_block_size <<endl;
>> -
>> - lda = n; ldy = n;
>> - k = l;
>> -
>> - AIOfile.load_AL(&AL);
>> -
>> - type_precision* backupAL = replicate_vec(AL, n*l);
>> -
>> -
>> -
>> - RQy_top = new type_precision[l*y_amount];
>> -
>> - type_precision* Bfinal = (type_precision*)malloc(max_threads * p *
>> - y_block_size *
>> - sizeof(type_precision));
>> -
>> - type_precision tau[k];
>> -
>> - type_precision* QL;
>> -
>> - type_precision* Rtr = (type_precision*)malloc(l * r * a_block_size *
>> - sizeof(type_precision));
>> - type_precision* Qy_bot = (type_precision*)malloc(a_block_size *
>> - y_block_size * r *
>> - sizeof(type_precision));
>> -
>> - type_precision* B_top = new type_precision[max_threads*y_block_size*l];
>> - type_precision* B_bot = new type_precision[max_threads*y_block_size*r];
>> -
>> - type_precision* Ar;
>> - type_precision Rbr[r*r*a_block_size];
>> -
>> -
>> -
>> - // printf("\n\n%%Computations\n%%");
>> - for (i = 0; i < a_iters; i++)
>> - {
>> - if (a_iters >= 10 && (i%(a_iters/10)) == 0)
>> - {
>> - // cout << "*" << flush;
>> - }
>> -
>> - for (j = 0; j < y_iters; j++)
>> - {
>> - if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
>> - {
>> - // cout << "*" << flush;
>> - }
>> - }
>> - }
>> - cout << endl;
>> -
>> -
>> - //! Start of Computations
>> -
>> - get_ticks(start_tick);
>> -
>> -
>> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
>> - assert(info == 0, "QR decomp");
>> -
>> - type_precision* Rtl = prepare_R(AL, n, l, 0);
>> -
>> - QL = AL;
>> - AL = backupAL;
>> -
>> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
>> -
>> - assert(info == 0, "Q form");
>> -
>> - matlab_print_matrix("QL", n, l, QL);
>> -
>> - qy_idx = 0;
>> -
>> - // printf("\n%%Preparing IO\n");
>> - //! Prepare File for the Y
>> -
>> -
>> - type_precision* Y;
>> -
>> - // #pragma omp parallel default(shared)
>> +// int max_threads = params.threads;
>> +//
>> +// srand(time(NULL));
>> +//
>> +// blas_set_num_threads(max_threads);
>> +//
>> +// type_precision *Ytemp;
>> +// lapack_int info, n, lda,ldy, l, r, p;
>> +//
>> +// int i,k;
>> +//
>> +// cputime_type start_tick, start_tick2, end_tick;
>> +//
>> +// AIOfile.initialize(params);
>> +//
>> +// n = params.n; l = params.l; r = params.r; p = l+r;
>> +//
>> +// int y_amount = params.m;
>> +// int y_block_size = params.mb; // kk
>> +//
>> +// int a_amount = params.t;
>> +// int a_block_size = params.tb;
>> +//
>> +// int a_iters = (a_amount + a_block_size - 1) / a_block_size;
>> +//
>> +// lda = n; ldy = n;
>> +// k = p;
>> +//
>> +//
>> +//
>> +// type_precision* backupAL;
>> +//
>> +// AIOfile.load_AL(&backupAL);
>> +//
>> +//
>> +// type_precision Stl[l * l];
>> +// type_precision Str[l * r * a_block_size];
>> +//
>> +//
>> +// type_precision* Ay_top = new type_precision[l * y_amount];
>> +// type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
>> +// type_precision* A = new type_precision[n * p];
>> +// type_precision* AR = new type_precision[n * r * a_block_size];
>> +// type_precision* AL = new type_precision[n * l];
>> +//
>> +// copy_vec(backupAL, AL, n * l);
>> +// copy_vec(backupAL, A, n * l);
>> +//
>> +//
>> +// int y_iters = y_amount/y_block_size;
>> +//
>> +// type_precision* Y;
>> +// type_precision* backupAR;
>> +//
>> +// // printf("\n\n%%Computations\n%%");
>> +//
>> +//
>> +// get_ticks(start_tick);
>> +//
>> +//
>> +// //! Generate S
>> +// cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
>> +// l, n, 1.0, backupAL, lda, 0.0, Stl, l);
>> +//
>> +// for (int j = 0; j < y_iters; j++)
>> // {
>> -// // #pragma omp for private(Y, y_block_size, qy_idx) nowait schedule(static, 1)
>> +// AIOfile.load_Yblock(&Y, y_block_size);
>> +//
>> +// //! Ay_top = AL'*Y
>> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> +// l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
>> +// &Ay_top[j * l * y_block_size], l);
>> +// }
>> +//
>> +//
>> +// for (int i = 0; i < a_iters; i++)
>> +// {
>> +// if (a_iters > 10 && (i%(a_iters/10)) == 0)
>> +// {
>> +// cout << "%" << flush;
>> +// }
>> +//
>> +//
>> +// AIOfile.load_ARblock(&backupAR, a_block_size);
>> +// AIOfile.reset_Y();
>> +// copy_vec(backupAR, AR, n*r*a_block_size);
>> +//
>> +// // matlab_print_matrix("A", n, p, A);
>> +// //! Generate Str
>> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> +// l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
>> +//
>> +// type_precision Sbr[r*r*a_block_size];
>> +//
>> +// #pragma omp parallel for default(shared) schedule(static)
>> +// for (int ii= 0; ii < a_block_size; ii++)
>> +// {
>> +// //! Generate Sbr
>> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> +// r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
>> +// n, 0.0, &Sbr[ii*r*r], r);
>> +// }
>> +//
>> +// type_precision Ay[y_block_size*p];
>> +// type_precision S[p*p];
>> +//
>> +//
>> // for (int j = 0; j < y_iters; j++)
>> // {
>> -// // qy_idx = j*y_block_size*l;
>> -// // #pragma omp critical
>> -// {AIOfile.load_Yblock(&Y, y_block_size);}
>> +// if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
>> +// {
>> +// cout << "%" << flush;
>> +// }
>> //
>> -// //! qy_top = QL'*Y
>> +//
>> +// AIOfile.load_Yblock(&Y, y_block_size);
>> +//
>> +// //! Ay_bot = AR'*Y
>> // cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>> -// l, y_block_size, n, 1.0, QL, n, Y, n, 0.0,&RQy_top[qy_idx], l);
>> +// r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
>> +// 0.0, Ay_bot, r * a_block_size);
>> //
>> -// //! K | RtlQlY =RTL\qy_top
>> -// cblas_strsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
>> -// l, y_block_size, 1.0, Rtl, l,&RQy_top[qy_idx], l);
>> -// qy_idx += y_block_size*l;
>> +// #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
>> +// for (int ii= 0; ii < a_block_size; ii++)
>> +// {
>> [TRUNCATED]
>>
>> To get the complete diff run:
>> svnlook diff /svnroot/genabel -r 1609
>> _______________________________________________
>> Genabel-commits mailing list
>> Genabel-commits at lists.r-forge.r-project.org
>> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
>>
>
>
>
> _______________________________________________
> genabel-devel mailing list
> genabel-devel at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-devel
>
--
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
L.C. Karssen
Utrecht
The Netherlands
lennart at karssen.org
http://blog.karssen.org
GPG key ID: A88F554A
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 230 bytes
Desc: OpenPGP digital signature
URL: <http://lists.r-forge.r-project.org/pipermail/genabel-devel/attachments/20140217/1c5dfbc2/attachment-0001.sig>
More information about the genabel-devel
mailing list