[GenABEL-dev] [Genabel-commits] r1609 - pkg/OmicABELnoMM/src
L.C. Karssen
lennart at karssen.org
Fri Feb 14 11:32:20 CET 2014
Hi Alvaro,
Thanks for your contribution and congratulations on your first commit in
the GenABEL project.
Glad to see that the number of has been reduced a lot! I still get a few
errors after running
./configure LDFLAGS="-L/usr/lib/openblas-base"
make -j4
but they seem to be minor and easy to fix. Have a look at
http://jenkins.genabel.org/jenkins/job/OmicABELnoMM/15/warnings23Result/
for the warning messages and corresponding lines in the code.
Also try to have a look at the CPPcheck results in Jenkins. It warns
about three (possible) memory leaks.
Thanks a lot and looking forward to your next commits.
Lennart.
On 13-02-14 21:42, noreply at r-forge.r-project.org wrote:
> Author: afrank
> Date: 2014-02-13 21:42:52 +0100 (Thu, 13 Feb 2014)
> New Revision: 1609
>
> Modified:
> pkg/OmicABELnoMM/src/AIOwrapper.cpp
> pkg/OmicABELnoMM/src/Algorithm.cpp
> pkg/OmicABELnoMM/src/Algorithm.h
> pkg/OmicABELnoMM/src/Definitions.h
> pkg/OmicABELnoMM/src/Utility.cpp
> pkg/OmicABELnoMM/src/Utility.h
> pkg/OmicABELnoMM/src/main.cpp
> Log:
> Warnings from c/c++ inconsistencies and -Wall fixed. Legacy code from unused functionality removed. A few bugs with indices fixed. Stable code.
>
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 09:24:32 UTC (rev 1608)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 20:42:52 UTC (rev 1609)
> @@ -29,9 +29,9 @@
>
> Fhandler->fakefiles = params.use_fake_files;
>
> - databel_fvi* Yfvi;
> - databel_fvi* ALfvi;
> - databel_fvi* ARfvi;
> +// databel_fvi* Yfvi;
> +// databel_fvi* ALfvi;
> +// databel_fvi* ARfvi;
>
> if (!Fhandler->fakefiles)
> {
> @@ -49,7 +49,7 @@
> params.l = ALfvi->fvi_header.numVariables;
>
> //block size to keep mem under 1 gigabyte
> - int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
> + //int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
> int opt_tb = 1000;
> int opt_mb = 1000;
>
> @@ -81,7 +81,7 @@
> void AIOwrapper::finalize()
> {
> //cout << "f";
> - void *status;
> + //void *status;
>
> Fhandler->not_done = false;
> pthread_mutex_lock(&(Fhandler->m_more));
> @@ -368,13 +368,15 @@
> }
> //
> // //!induce realistic fileread delay
> +
> + return 0;
> }
>
>
> void AIOwrapper::load_ARblock(type_precision** Ar, int &Ar_blockSize)
> {
> - int status;
> - int createstatus = 0;
> + //int status;
> + //int createstatus = 0;
> //cout<<"^";
>
> while (Fhandler->ar_full_buffers.empty())
> @@ -428,8 +430,8 @@
>
> void AIOwrapper::load_Yblock(type_precision** Y, int &y_blockSize)
> {
> - int status;
> - int createstatus = 0;
> + //int status;
> + //int createstatus = 0;
>
> while (Fhandler->full_buffers.empty())
> {
> @@ -528,7 +530,7 @@
>
> void AIOwrapper::reset_Y()
> {
> - void *status;
> + //void *status;
>
> Fhandler->seed = 1337;
>
> @@ -569,7 +571,7 @@
>
> void AIOwrapper::reset_AR()
> {
> - void *status;
> + //void *status;
>
>
> //cout << "ra" << flush;
> @@ -697,7 +699,7 @@
> FILE * AIOwrapper::fgls_fopen( const char *path, const char *mode )
> {
> FILE * f;
> - char err[100];
> + //char err[100];
>
> f = fopen(path, mode );
> if ( f == NULL )
>
> Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 09:24:32 UTC (rev 1608)
> +++ pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 20:42:52 UTC (rev 1609)
> @@ -16,30 +16,12 @@
> {
> switch (type)
> {
> - case FULL_NEQ:
> - fullNEQ(params, out);
> - break;
> - case P_NEQ:
> - partialNEQ(params, out);
> - break;
> - case P_NEQ_B_OPT:
> - partialNEQ_Blocked_STL(params, out);
> - break;
> - case FULL_QR:
> - fullQR(params, out);
> - break;
> - case P_QR:
> - partialQR(params, out);
> - break;
> - case P_QR_B_OPT:
> - partialQR_Blocked_Rtl(params, out);
> - break;
> - case P_NEQ_B_OPT_MD:
> - partialNEQ_Blocked_STL_MD(params, out);
> - break;
> + case P_NEQ_B_OPT_MD:
> + partialNEQ_Blocked_STL_MD(params, out);
> + break;
>
> - default:
> - break;
> + default:
> + break;
> }
> }
>
> @@ -74,7 +56,7 @@
> int dim2_b, int dim1_b_bot)
> {
> // memcpy are faster version of the fors
> - int i, k, w, top_idx, bot_idx, max = dim1_b*dim2_b;
> + int i, k, w, top_idx, bot_idx;
> int size;
> top_idx = 0;
> bot_idx = 0;
> @@ -111,7 +93,7 @@
> type_precision* bot, int dim1_QY,
> int dim2_QY, int dim1_qy_bot, int bot_blocks )
> {
> - int i, k, w, top_idx, bot_idx, max = dim1_QY*dim2_QY;
> + int i, k, w, top_idx, bot_idx;
> top_idx = 0;
> bot_idx = 0;
> for (k = 0; k < dim2_QY; k++)
> @@ -326,12 +308,9 @@
> blas_set_num_threads(max_threads);
>
>
> - type_precision *Ytemp;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> + //type_precision *Ytemp;
> + lapack_int info, n, lda, l, r, p;
>
> - int i, j, w;
> - int m;
> -
> cputime_type start_tick, start_tick2, end_tick;
>
> AIOfile.initialize(params);
> @@ -349,12 +328,12 @@
> int y_iters = (y_amount + y_block_size - 1) / y_block_size;
>
>
> - lda = n; ldy = n;
> - k = p;
> + lda = n;
>
> +
> for (int j = 0; j < y_iters; j++)
> {
> - if (y_iters >= 40 && (i%(y_iters/40)) == 0)
> + if (y_iters >= 40 && (j%(y_iters/40)) == 0)
> {
> cout << "*" << flush;
> }
> @@ -389,7 +368,9 @@
>
>
> AIOfile.load_AL(&backupAL);
> - int total_al_nans = replace_nans(0, backupAL, n, l);
> + //int total_al_nans = replace_nans(0, backupAL, n, l);
> + replace_nans(0, backupAL, n, l);
> +
> copy_vec(backupAL, AL, n*l);
>
>
> @@ -402,7 +383,7 @@
>
> for (int j = 0; j < y_iters; j++)
> {
> - if (y_iters >= 40 && (i%(y_iters/40)) == 0)
> + if (y_iters >= 40 && (j%(y_iters/40)) == 0)
> {
> cout << AIOfile.io_overhead << flush;
> AIOfile.io_overhead = "*";
> @@ -412,7 +393,8 @@
>
> list<long int> y_nan_idxs[y_block_size];
>
> - int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> + //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> + replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
>
>
> //! Ay_top = AL'*Y
> @@ -424,7 +406,8 @@
> for (int i = 0; i < a_iters; i++)
> {
> AIOfile.load_ARblock(&backupAR, a_block_size);
> - int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> + //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> + replace_nans(0, backupAR, n, a_block_size * r);
> copy_vec(backupAR, AR, n * r * a_block_size);
>
> get_ticks(start_tick2);
> @@ -501,7 +484,7 @@
> &Ay[l+ii*p], r);
>
>
> - type_precision* B = Ay;
> + //type_precision* B = Ay;
> type_precision S[p * p];
>
> //! Rebuild S
> @@ -564,1194 +547,187 @@
>
> //!*************************************************!//
>
> -
> +//for reference
> void Algorithm::partialNEQ_Blocked_STL(struct Settings params,
> struct Outputs &out)
> {
> - int max_threads = params.threads;
> -
> - srand(time(NULL));
> -
> - blas_set_num_threads(max_threads);
> -
> - type_precision *Ytemp;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> - cputime_type start_tick, start_tick2, end_tick;
> -
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> - int y_amount = params.m;
> - int y_block_size = params.mb; // kk
> -
> - int a_amount = params.t;
> - int a_block_size = params.tb;
> -
> - int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> -
> - lda = n; ldy = n;
> - k = p;
> -
> -
> -
> - type_precision* backupAL;
> -
> - AIOfile.load_AL(&backupAL);
> -
> -
> - type_precision Stl[l * l];
> - type_precision Str[l * r * a_block_size];
> -
> -
> - type_precision* Ay_top = new type_precision[l * y_amount];
> - type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
> - type_precision* A = new type_precision[n * p];
> - type_precision* AR = new type_precision[n * r * a_block_size];
> - type_precision* AL = new type_precision[n * l];
> -
> - copy_vec(backupAL, AL, n * l);
> - copy_vec(backupAL, A, n * l);
> -
> -
> - int y_iters = y_amount/y_block_size;
> -
> - type_precision* Y;
> - type_precision* backupAR;
> -
> - // printf("\n\n%%Computations\n%%");
> -
> -
> - get_ticks(start_tick);
> -
> -
> - //! Generate S
> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> - l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - AIOfile.load_Yblock(&Y, y_block_size);
> -
> - //! Ay_top = AL'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> - &Ay_top[j * l * y_block_size], l);
> - }
> -
> -
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters > 10 && (i%(a_iters/10)) == 0)
> - {
> - cout << "%" << flush;
> - }
> -
> -
> - AIOfile.load_ARblock(&backupAR, a_block_size);
> - AIOfile.reset_Y();
> - copy_vec(backupAR, AR, n*r*a_block_size);
> -
> - // matlab_print_matrix("A", n, p, A);
> - //! Generate Str
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> -
> - type_precision Sbr[r*r*a_block_size];
> -
> - #pragma omp parallel for default(shared) schedule(static)
> - for (int ii= 0; ii < a_block_size; ii++)
> - {
> - //! Generate Sbr
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
> - n, 0.0, &Sbr[ii*r*r], r);
> - }
> -
> - type_precision Ay[y_block_size*p];
> - type_precision S[p*p];
> -
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> - {
> - cout << "%" << flush;
> - }
> -
> -
> - AIOfile.load_Yblock(&Y, y_block_size);
> -
> - //! Ay_bot = AR'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
> - 0.0, Ay_bot, r * a_block_size);
> -
> - #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
> - for (int ii= 0; ii < a_block_size; ii++)
> - {
> - // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> - //! Rebuild AY
> -
> - for (int jj = 0; jj < y_block_size; jj++)
> - {
> - copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[jj*p], l);
> - copy_vec(&Ay_bot[ii*r + jj*r*a_block_size], &Ay[l+jj*p], r);
> - }
> -
> - type_precision* B = Ay;
> - // matlab_print_matrix("Ay_top", l, y_block_size,&Ay_top[j*l*y_block_size]);
> - // matlab_print_matrix("Ay_bot", r, y_block_size, Ay_bot);
> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> - //! Rebuild S
> - build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> - // matlab_print_matrix("S", p, p, S);
> -
> - //! b = S\Ay
> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> - S, p, Ay, p);
> - // assert(info == 0,"POSV");
> -
> -
> - if (ForceCheck)
> - {
> - check_result(backupAL, &AR[ii * r * n], n, p,
> - y_block_size, r, Y, B);
> - }
> - }
> - }
> - }
> -
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
> - out.gflops = gemm_flops(l, n, l, 0) + gemm_flops(l, n, y_amount, 0) +
> - a_amount * (gemm_flops(l, n, r, 0) +
> - gemm_flops(r, n, r, 0) +
> - gemm_flops(r, n, y_amount, 0) +
> - (y_amount/1000.0) *
> - (p * p * p / 3.0) / 1000.0/1000.0);
> -
> -
> - AIOfile.finalize();
> -
> - delete []Ay_top;
> - delete []Ay_bot;
> - delete []A;
> - delete []AL;
> - delete []AR;
> -}
> -
> -
> -void Algorithm::partialNEQ(struct Settings params, struct Outputs &out)
> -{
> - int max_threads = params.threads;
> -
> -
> - srand(time(NULL));
> -
> -
> - blas_set_num_threads(max_threads);
> -
> -
> - type_precision *Ytemp;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> - cputime_type start_tick, start_tick2, end_tick;
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> - int y_amount = params.m;
> - int y_block_size = params.mb;// kk
> -
> - int a_amount = params.t;
> - int a_block_size = 1;
> -
> - int a_iters = a_amount/a_block_size;
> -
> - lda = n; ldy = n;
> - k = p;
> -
> -
> -
> - type_precision* backupAL;
> -
> - AIOfile.load_AL(&backupAL);
> -
> - type_precision* S = new type_precision[p * p];
> - type_precision Stl[l * l];
> - type_precision Str[l * r];
> - type_precision Sbr[r * r];
> - type_precision* A = new type_precision[n * p];
> - type_precision* AL = A;
> - type_precision* AR = &A[l * n];
> -
> - copy_vec(backupAL, AL, n*l);
> -
> -
> - int y_iters = y_amount / y_block_size;
> -
> - type_precision* Y;
> - type_precision* backupAR;
> -
> - // printf("\n\n%%Computations\n%%");
> -
> - get_ticks(start_tick);
> -
> - //! Generate S
> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> - l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> -
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
> - {
> - cout << "%" << flush;
> - }
> -
> - AIOfile.load_ARblock(&backupAR, a_block_size);
> - AIOfile.reset_Y();
> - copy_vec(backupAR, AR, n*r);
> -
> - // matlab_print_matrix("A", n, p, A);
> - //! Generate Str
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - l, r, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> -
> - //! Generate Sbr
> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> - r, n, 1.0, backupAR, lda, 0.0, Sbr, r);
> -
> -
> -
> - // matlab_print_matrix("S", p, p, S);
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - build_S(S, Stl, Str, Sbr, l, r);
> -
> - AIOfile.load_Yblock(&Y, y_block_size);
> - type_precision Ay[y_block_size * p];
> -
> - // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> - //! Ay = A'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
> -
> - type_precision* B = Ay;
> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> - //! b = S\qy
> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> - S, p, Ay, p);
> - assert(info == 0, "POSV");
> -
> - // matlab_print_matrix("B", p, y_block_size, B);
> - // exit(0);
> - if ( ForceCheck)
> - {
> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> - }
> - } // END for over j < y_iters
> - } // END for over i < a_iters
> -
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - out.gflops = 2.0 * (n * l * l / 1000.0 + a_amount / 1000.0 *
> - (n * l * r + n * r * r +
> - y_amount * (p * n + p * p * p))) /1000.0 /1000.0;
> -
> -
> -
> - AIOfile.finalize();
> -
> - delete []A;
> - delete []S;
> -}
> -
> -
> -void Algorithm::fullNEQ(struct Settings params, struct Outputs &out)
> -{
> - int max_threads = params.threads;
> -
> -
> - srand(time(NULL));
> -
> -
> - blas_set_num_threads(max_threads);
> -
> -
> - type_precision *Ytemp;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> - cputime_type start_tick, start_tick2, end_tick;
> -
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> - int y_amount = params.m;
> - int y_block_size = params.mb;// kk
> -
> - int a_amount = params.t;
> - int a_block_size = 1;
> -
> - int a_iters = a_amount/a_block_size;
> -
> - lda = n; ldy = n;
> - k = p;
> -
> -
> -
> - type_precision* backupAL;
> -
> - AIOfile.load_AL(&backupAL);
> -
> - type_precision* S = new type_precision[p * p];
> - type_precision* Stemp = new type_precision[p * p];
> - type_precision* A = new type_precision[n * p];
> - type_precision* AL = A;
> - type_precision* AR = &A[l * n];
> -
> - int y_iters = y_amount / y_block_size;
> -
> - type_precision* Y;
> - type_precision* backupAR;
> -
> - // printf("\n\n%%Computations\n%%");
> -
> - get_ticks(start_tick);
> -
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
> - {
> - cout << "%" << flush;
> - }
> -
> - copy_vec(backupAL, AL, n * l);
> -
> - AIOfile.load_ARblock(&backupAR, a_block_size);
> - AIOfile.reset_Y();
> - copy_vec(backupAR, AR, n*r);
> -
> - // matlab_print_matrix("A", n, p, A);
> - //! Generate S
> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> - p, n, 1.0, A, lda, 0.0, S, p);
> - // matlab_print_matrix("S", p, p, S);
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - AIOfile.load_Yblock(&Y, y_block_size);
> - type_precision Ay[y_block_size * p];
> - copy_vec(S, Stemp, p*p);
> -
> - // matlab_print_matrix("Y", n, y_block_size, Y);
> -
> - //! Ay = A'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
> -
> - type_precision* B = Ay;
> - // matlab_print_matrix("Ay", p, y_block_size, Ay);
> -
> - //! b = S\qy
> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
> - Stemp, p, Ay, p);
> - assert(info == 0, "POSV");
> -
> - // matlab_print_matrix("B", p, y_block_size, B);
> - // exit(0);
> - if ( ForceCheck)
> - {
> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> - }
> - }
> - }
> -
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - out.gflops = 2.0 * a_amount * (n * p * p / 1000.0 + y_amount / 1000.0 *
> - (p * n + p * p * p))/1000.0/1000.0;
> -
> -
> - AIOfile.finalize();
> -
> -
> - free(AL);
> - delete []A;
> - delete []S;
> - delete []Stemp;
> -}
> -
> -
> -//!*************************************************!//
> -void Algorithm::fullQR(struct Settings params, struct Outputs &out)
> -{
> - int max_threads = params.threads;
> -
> -
> - srand(time(NULL));
> -
> - blas_set_num_threads(max_threads);
> -
> -
> -
> - cputime_type start_tick, start_tick2, end_tick;
> - double acc_gemm = 0;
> - double acc_trsm = 0;
> - double acc_other = 0;
> - double acc_atemp = 0;
> - double acc_rtr = 0;
> - double acc_rqr = 0;
> -
> -
> - type_precision *Ytemp;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> - int y_amount = params.m;
> - int y_block_size = 1;// kk
> -
> - int a_amount = params.t;
> - int a_block_size = 1;
> -
> - int a_iters = a_amount/a_block_size;
> -
> - lda = n; ldy = n;
> - k = p;
> -
> -
> -
> - type_precision* backupAL;
> -
> - AIOfile.load_AL(&backupAL);
> -
> - type_precision* Q = new type_precision[n*p];
> - type_precision* A = Q;
> - type_precision* AL = A;
> - type_precision* AR = &A[l*n];
> -
> - type_precision tau[k];
> -
> -
> - int y_iters = y_amount / y_block_size;
> -
> -
> - type_precision* Y;
> - type_precision* backupAR;
> -
> -
> -
> - // printf("\n\n%%Computations\n%%");
> -
> - get_ticks(start_tick);
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
> - {
> - cout << "%" << flush;
> - }
> -
> - copy_vec(backupAL, AL, n*l);
> -
> - AIOfile.load_ARblock(&backupAR, a_block_size);
> - AIOfile.reset_Y();
> - copy_vec(backupAR, AR, n*r);
> -
> - //! Generate R
> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, p, A, lda, tau);
> - assert(info == 0, "QR decomp");
> - type_precision* R = extract_R(A, n, p);
> -
> - //! generate Q
> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, p, k, Q, lda, tau);
> - assert(info == 0, "Q form");
> -
> -
> -
> -// matlab_print_matrix("ALL", n, l, backupAL);
> -// matlab_print_matrix("ARR", n, r, backupAR);
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - // matlab_print_matrix("RR", p, p, R);
> - // matlab_print_matrix("QQ", n, p, Q);
> -
> - AIOfile.load_Yblock(&Y, y_block_size);
> - type_precision Qy[y_block_size*p];
> -
> - //! qy = Q'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
> -
> - type_precision* B = Qy;
> -
> - //! b = R\qy
> - cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
> - CblasNoTrans, CblasNonUnit,
> - p, y_block_size, 1.0, R, p, Qy, p);
> -
> - if (ForceCheck)
> - {
> - check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
> - }
> - }
> -
> - free(R);
> - }
> -
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - out.gflops = 2.0 * a_amount *
> - (2.0 * n * p * p / 1000.0 + y_amount / 1000.0 *
> - (p * n + p * p * p)) / 1000.0 / 1000.0;
> -
> -
> - AIOfile.finalize();
> -
> -
> - free(AL);
> - delete[] Q;
> -}
> -
> -
> -void Algorithm::partialQR(struct Settings params, struct Outputs &out)
> -{
> - int max_threads = params.threads;
> -
> -
> - srand(time(NULL));
> -
> - blas_set_num_threads(max_threads);
> -
> -
> -
> - cputime_type start_tick, start_tick2, end_tick;
> - double acc_gemm = 0;
> - double acc_trsm = 0;
> - double acc_other = 0;
> - double acc_atemp = 0;
> - double acc_rtr = 0;
> - double acc_rqr = 0;
> -
> -
> - type_precision *RL, *Ytemp, *RtlRtr;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> - int y_amount = params.m;
> - int y_block_size = params.mb;// kk
> -
> - int a_amount = params.t;
> - int a_block_size = 1;
> -
> - int a_iters = a_amount/a_block_size;
> -
> - lda = n; ldy = n;
> - k = l;
> -
> -
> -
> - type_precision* backupAL;
> -
> - AIOfile.load_AL(&backupAL);
> -
> - type_precision* Q = new type_precision[n*p];
> - type_precision* AL = Q;
> - copy_vec(backupAL, AL, n*l);
> -
> -
> - type_precision tau[k];
> -
> - type_precision* QL;
> - type_precision* QR = &Q[n*l];
> -
> - int y_iters = y_amount / y_block_size;
> -
> -
> - type_precision* Y;
> -
> - type_precision Rtr[l*r];
> - type_precision* Ar;
> - type_precision Rbr[r*r];
> -
> - get_ticks(start_tick);
> - //! Generate RTL
> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
> - assert(info == 0, "QR decomp");
> -
> - type_precision* R =prepare_R(AL, n, l, r);
> - type_precision* Rtl = R;
> -
> - QL = AL;// same as Q
> - AL = backupAL;
> - //! generate QL
> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
> -
> - assert(info == 0, "Q form");
> -
> - // printf("\n\n%%Computations\n%%");
> -
> -
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters < 10 || (i%(a_iters/10)) == 0)
> - {
> - cout << "%" << flush;
> - }
> -
> - AIOfile.load_ARblock(&Ar, a_block_size);
> - AIOfile.reset_Y();
> -
> - //! Rtr = Q1'*AR
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - l, r*a_block_size, n, 1.0, QL, n, Ar, n, 0.0, Rtr, l);
> -
> - type_precision* Atemp = replicate_vec(Ar, n*r*a_block_size);
> -
> - //! Atemp = AR-Q1*Rtr
> - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
> - n, r*a_block_size, l, -1.0, QL, n, Rtr, l, 1.0, Atemp, n);
> -
> - type_precision* QrRbr = QR;
> - copy_vec(Atemp, QrRbr, n*r);
> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, r, QrRbr, lda, tau);
> - assert(info == 0, "QR decomp of QrRbr");
> -
> - // reuse of old function
> - type_precision* Rbrtemp = prepare_R(QrRbr, n, r, 0);
> -
> - copy_vec(Rbrtemp, Rbr, r*r);
> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, r, r, QrRbr, lda, tau);
> - assert(info == 0, "QR form");
> - free(Rbrtemp);
> -
> - update_R(R, Rtr, Rbr, p, p, r);
> -
> -// matlab_print_matrix("RL", p, l, Rtl);
> -// matlab_print_matrix("Rtr", l, r, Rtr);
> -// matlab_print_matrix("Rbr", r, r, Rbr);
> -// matlab_print_matrix("RR", p, p, R);
> -// matlab_print_matrix("QQ", n, p, Q);
> - // matlab_print_matrix("ALL", n, l, AL);
> - // matlab_print_matrix("ARR", n, r, Ar);
> - // matlab_print_matrix("R", p, p, R);
> - int top_idx = 0;
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - AIOfile.load_Yblock(&Y, y_block_size);
> - type_precision Qy[y_block_size*p];
> -
> -
> - //! qy = Q'*Y
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
> -
> - type_precision* B = Qy;
> -
> - //! b = R\qy
> - cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
> - CblasNoTrans, CblasNonUnit,
> - p, y_block_size, 1.0, R, p, Qy, p);
> -
> - // matlab_print_matrix("bcomp", p, y_block_size, B);
> -
> -
> - if ( ForceCheck)
> - {
> - check_result(AL, Ar, n, p, y_block_size, r, Y, B);
> - }
> - }
> -
> - free(Atemp);
> - }
> -
> -
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - out.gflops = 2.0*(n*l*l/1000.0 + a_amount *
> - (2.0 * n * l * r / 1000.0 +
> - n * r * r / 1000.0 +
> - y_amount / 1000.0 * (p * n + p * p * p))) /
> - 1000.0 / 1000.0;
> -
> -
> - AIOfile.finalize();
> -
> -
> - free(Rtl);
> - free(AL);
> - delete[] Q;
> -}
> -
> -
> -void Algorithm::partialQR_Blocked_Rtl(struct Settings params,
> - struct Outputs &out)
> -{
> - int max_threads = params.threads;
> -
> - // srand (time(NULL));
> -
> - int cpu_cores = max_threads;
> -
> -
> - blas_set_num_threads(max_threads);
> - omp_set_num_threads(max_threads);
> -
> -
> -
> -
> - cputime_type start_tick, start_tick2, end_tick;
> - double acc_pre = 0;
> - double acc_trsm = 0;
> - double acc_other = 0;
> - double acc_atemp = 0;
> - double acc_rtr = 0;
> - double acc_rqr = 0;
> - double acc_RTL_QLY = 0;
> -
> -
> - type_precision *AL, *RL, *RQy_top, *Ytemp, *RtlRtr;
> - lapack_int info, n, lda, ldy, l, r, k, p;
> -
> - int i, j, w;
> - int m;
> -
> -
> - AIOfile.initialize(params);
> -
> - n = params.n; l = params.l; r = params.r; p = l+r;
> -
> -
> -
> - double lvl_3_cache_size = 6;
> -
> -
> - int y_amount = params.t;
> - int y_block_size = params.tb;// kk
> -
> - int a_amount = params.m;
> - int a_block_size = params.mb;
> -
> - int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> -
> - int y_iters = (y_amount+ y_block_size - 1)/y_block_size;
> -
> - int qy_idx = 0;
> -
> - out.gflops = n/1000.0*l/1000.0*l/1000.0 +
> - gemm_flops(l, n, y_amount, 0) +
> - y_amount*l/1000.0*l/1000.0/1000.0 +
> - a_amount * (gemm_flops(l, n, r, 0) +
> - l/1000.0*l/1000.0/1000.0 +
> - gemm_flops(n, l, r, 1) +
> - n/1000.0*r/1000.0*r/1000.0 +
> - gemm_flops(r, n, y_amount, 0) +
> - y_amount/1000.0*r/1000.0*r/1000.0 +
> - y_amount/1000.0*l/1000.0/1000.0);
> -
> - int sch_block_size = max(1.0,
> - min((double)a_block_size / (double)max_threads,
> - (1.0*1000*1000) /
> - (double)((sizeof(type_precision) * n * r))));
> -
> - // cout << endl<< "taskChunk "<< sch_block_size <<endl;
> -
> - lda = n; ldy = n;
> - k = l;
> -
> - AIOfile.load_AL(&AL);
> -
> - type_precision* backupAL = replicate_vec(AL, n*l);
> -
> -
> -
> - RQy_top = new type_precision[l*y_amount];
> -
> - type_precision* Bfinal = (type_precision*)malloc(max_threads * p *
> - y_block_size *
> - sizeof(type_precision));
> -
> - type_precision tau[k];
> -
> - type_precision* QL;
> -
> - type_precision* Rtr = (type_precision*)malloc(l * r * a_block_size *
> - sizeof(type_precision));
> - type_precision* Qy_bot = (type_precision*)malloc(a_block_size *
> - y_block_size * r *
> - sizeof(type_precision));
> -
> - type_precision* B_top = new type_precision[max_threads*y_block_size*l];
> - type_precision* B_bot = new type_precision[max_threads*y_block_size*r];
> -
> - type_precision* Ar;
> - type_precision Rbr[r*r*a_block_size];
> -
> -
> -
> - // printf("\n\n%%Computations\n%%");
> - for (i = 0; i < a_iters; i++)
> - {
> - if (a_iters >= 10 && (i%(a_iters/10)) == 0)
> - {
> - // cout << "*" << flush;
> - }
> -
> - for (j = 0; j < y_iters; j++)
> - {
> - if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> - {
> - // cout << "*" << flush;
> - }
> - }
> - }
> - cout << endl;
> -
> -
> - //! Start of Computations
> -
> - get_ticks(start_tick);
> -
> -
> - info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
> - assert(info == 0, "QR decomp");
> -
> - type_precision* Rtl = prepare_R(AL, n, l, 0);
> -
> - QL = AL;
> - AL = backupAL;
> -
> - info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
> -
> - assert(info == 0, "Q form");
> -
> - matlab_print_matrix("QL", n, l, QL);
> -
> - qy_idx = 0;
> -
> - // printf("\n%%Preparing IO\n");
> - //! Prepare File for the Y
> -
> -
> - type_precision* Y;
> -
> - // #pragma omp parallel default(shared)
> +// int max_threads = params.threads;
> +//
> +// srand(time(NULL));
> +//
> +// blas_set_num_threads(max_threads);
> +//
> +// type_precision *Ytemp;
> +// lapack_int info, n, lda,ldy, l, r, p;
> +//
> +// int i,k;
> +//
> +// cputime_type start_tick, start_tick2, end_tick;
> +//
> +// AIOfile.initialize(params);
> +//
> +// n = params.n; l = params.l; r = params.r; p = l+r;
> +//
> +// int y_amount = params.m;
> +// int y_block_size = params.mb; // kk
> +//
> +// int a_amount = params.t;
> +// int a_block_size = params.tb;
> +//
> +// int a_iters = (a_amount + a_block_size - 1) / a_block_size;
> +//
> +// lda = n; ldy = n;
> +// k = p;
> +//
> +//
> +//
> +// type_precision* backupAL;
> +//
> +// AIOfile.load_AL(&backupAL);
> +//
> +//
> +// type_precision Stl[l * l];
> +// type_precision Str[l * r * a_block_size];
> +//
> +//
> +// type_precision* Ay_top = new type_precision[l * y_amount];
> +// type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
> +// type_precision* A = new type_precision[n * p];
> +// type_precision* AR = new type_precision[n * r * a_block_size];
> +// type_precision* AL = new type_precision[n * l];
> +//
> +// copy_vec(backupAL, AL, n * l);
> +// copy_vec(backupAL, A, n * l);
> +//
> +//
> +// int y_iters = y_amount/y_block_size;
> +//
> +// type_precision* Y;
> +// type_precision* backupAR;
> +//
> +// // printf("\n\n%%Computations\n%%");
> +//
> +//
> +// get_ticks(start_tick);
> +//
> +//
> +// //! Generate S
> +// cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> +// l, n, 1.0, backupAL, lda, 0.0, Stl, l);
> +//
> +// for (int j = 0; j < y_iters; j++)
> // {
> -// // #pragma omp for private(Y, y_block_size, qy_idx) nowait schedule(static, 1)
> +// AIOfile.load_Yblock(&Y, y_block_size);
> +//
> +// //! Ay_top = AL'*Y
> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +// l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> +// &Ay_top[j * l * y_block_size], l);
> +// }
> +//
> +//
> +// for (int i = 0; i < a_iters; i++)
> +// {
> +// if (a_iters > 10 && (i%(a_iters/10)) == 0)
> +// {
> +// cout << "%" << flush;
> +// }
> +//
> +//
> +// AIOfile.load_ARblock(&backupAR, a_block_size);
> +// AIOfile.reset_Y();
> +// copy_vec(backupAR, AR, n*r*a_block_size);
> +//
> +// // matlab_print_matrix("A", n, p, A);
> +// //! Generate Str
> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +// l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
> +//
> +// type_precision Sbr[r*r*a_block_size];
> +//
> +// #pragma omp parallel for default(shared) schedule(static)
> +// for (int ii= 0; ii < a_block_size; ii++)
> +// {
> +// //! Generate Sbr
> +// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +// r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
> +// n, 0.0, &Sbr[ii*r*r], r);
> +// }
> +//
> +// type_precision Ay[y_block_size*p];
> +// type_precision S[p*p];
> +//
> +//
> // for (int j = 0; j < y_iters; j++)
> // {
> -// // qy_idx = j*y_block_size*l;
> -// // #pragma omp critical
> -// {AIOfile.load_Yblock(&Y, y_block_size);}
> +// if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
> +// {
> +// cout << "%" << flush;
> +// }
> //
> -// //! qy_top = QL'*Y
> +//
> +// AIOfile.load_Yblock(&Y, y_block_size);
> +//
> +// //! Ay_bot = AR'*Y
> // cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -// l, y_block_size, n, 1.0, QL, n, Y, n, 0.0,&RQy_top[qy_idx], l);
> +// r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
> +// 0.0, Ay_bot, r * a_block_size);
> //
> -// //! K | RtlQlY =RTL\qy_top
> -// cblas_strsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
> -// l, y_block_size, 1.0, Rtl, l,&RQy_top[qy_idx], l);
> -// qy_idx += y_block_size*l;
> +// #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
> +// for (int ii= 0; ii < a_block_size; ii++)
> +// {
> [TRUNCATED]
>
> To get the complete diff run:
> svnlook diff /svnroot/genabel -r 1609
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
>
--
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
L.C. Karssen
Utrecht
The Netherlands
lennart at karssen.org
http://blog.karssen.org
GPG key ID: A88F554A
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 230 bytes
Desc: OpenPGP digital signature
URL: <http://lists.r-forge.r-project.org/pipermail/genabel-devel/attachments/20140214/e7a65565/attachment-0001.sig>
More information about the genabel-devel
mailing list