[Genabel-commits] r1609 - pkg/OmicABELnoMM/src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Thu Feb 13 21:42:58 CET 2014
Author: afrank
Date: 2014-02-13 21:42:52 +0100 (Thu, 13 Feb 2014)
New Revision: 1609
Modified:
pkg/OmicABELnoMM/src/AIOwrapper.cpp
pkg/OmicABELnoMM/src/Algorithm.cpp
pkg/OmicABELnoMM/src/Algorithm.h
pkg/OmicABELnoMM/src/Definitions.h
pkg/OmicABELnoMM/src/Utility.cpp
pkg/OmicABELnoMM/src/Utility.h
pkg/OmicABELnoMM/src/main.cpp
Log:
Warnings from c/c++ inconsistencies and -Wall fixed. Legacy code from unused functionality removed. A few bugs with indices fixed. Stable code.
Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 09:24:32 UTC (rev 1608)
+++ pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-02-13 20:42:52 UTC (rev 1609)
@@ -29,9 +29,9 @@
Fhandler->fakefiles = params.use_fake_files;
- databel_fvi* Yfvi;
- databel_fvi* ALfvi;
- databel_fvi* ARfvi;
+// databel_fvi* Yfvi;
+// databel_fvi* ALfvi;
+// databel_fvi* ARfvi;
if (!Fhandler->fakefiles)
{
@@ -49,7 +49,7 @@
params.l = ALfvi->fvi_header.numVariables;
//block size to keep mem under 1 gigabyte
- int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
+ //int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
int opt_tb = 1000;
int opt_mb = 1000;
@@ -81,7 +81,7 @@
void AIOwrapper::finalize()
{
//cout << "f";
- void *status;
+ //void *status;
Fhandler->not_done = false;
pthread_mutex_lock(&(Fhandler->m_more));
@@ -368,13 +368,15 @@
}
//
// //!induce realistic fileread delay
+
+ return 0;
}
void AIOwrapper::load_ARblock(type_precision** Ar, int &Ar_blockSize)
{
- int status;
- int createstatus = 0;
+ //int status;
+ //int createstatus = 0;
//cout<<"^";
while (Fhandler->ar_full_buffers.empty())
@@ -428,8 +430,8 @@
void AIOwrapper::load_Yblock(type_precision** Y, int &y_blockSize)
{
- int status;
- int createstatus = 0;
+ //int status;
+ //int createstatus = 0;
while (Fhandler->full_buffers.empty())
{
@@ -528,7 +530,7 @@
void AIOwrapper::reset_Y()
{
- void *status;
+ //void *status;
Fhandler->seed = 1337;
@@ -569,7 +571,7 @@
void AIOwrapper::reset_AR()
{
- void *status;
+ //void *status;
//cout << "ra" << flush;
@@ -697,7 +699,7 @@
FILE * AIOwrapper::fgls_fopen( const char *path, const char *mode )
{
FILE * f;
- char err[100];
+ //char err[100];
f = fopen(path, mode );
if ( f == NULL )
Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
===================================================================
--- pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 09:24:32 UTC (rev 1608)
+++ pkg/OmicABELnoMM/src/Algorithm.cpp 2014-02-13 20:42:52 UTC (rev 1609)
@@ -16,30 +16,12 @@
{
switch (type)
{
- case FULL_NEQ:
- fullNEQ(params, out);
- break;
- case P_NEQ:
- partialNEQ(params, out);
- break;
- case P_NEQ_B_OPT:
- partialNEQ_Blocked_STL(params, out);
- break;
- case FULL_QR:
- fullQR(params, out);
- break;
- case P_QR:
- partialQR(params, out);
- break;
- case P_QR_B_OPT:
- partialQR_Blocked_Rtl(params, out);
- break;
- case P_NEQ_B_OPT_MD:
- partialNEQ_Blocked_STL_MD(params, out);
- break;
+ case P_NEQ_B_OPT_MD:
+ partialNEQ_Blocked_STL_MD(params, out);
+ break;
- default:
- break;
+ default:
+ break;
}
}
@@ -74,7 +56,7 @@
int dim2_b, int dim1_b_bot)
{
// memcpy are faster version of the fors
- int i, k, w, top_idx, bot_idx, max = dim1_b*dim2_b;
+ int i, k, w, top_idx, bot_idx;
int size;
top_idx = 0;
bot_idx = 0;
@@ -111,7 +93,7 @@
type_precision* bot, int dim1_QY,
int dim2_QY, int dim1_qy_bot, int bot_blocks )
{
- int i, k, w, top_idx, bot_idx, max = dim1_QY*dim2_QY;
+ int i, k, w, top_idx, bot_idx;
top_idx = 0;
bot_idx = 0;
for (k = 0; k < dim2_QY; k++)
@@ -326,12 +308,9 @@
blas_set_num_threads(max_threads);
- type_precision *Ytemp;
- lapack_int info, n, lda, ldy, l, r, k, p;
+ //type_precision *Ytemp;
+ lapack_int info, n, lda, l, r, p;
- int i, j, w;
- int m;
-
cputime_type start_tick, start_tick2, end_tick;
AIOfile.initialize(params);
@@ -349,12 +328,12 @@
int y_iters = (y_amount + y_block_size - 1) / y_block_size;
- lda = n; ldy = n;
- k = p;
+ lda = n;
+
for (int j = 0; j < y_iters; j++)
{
- if (y_iters >= 40 && (i%(y_iters/40)) == 0)
+ if (y_iters >= 40 && (j%(y_iters/40)) == 0)
{
cout << "*" << flush;
}
@@ -389,7 +368,9 @@
AIOfile.load_AL(&backupAL);
- int total_al_nans = replace_nans(0, backupAL, n, l);
+ //int total_al_nans = replace_nans(0, backupAL, n, l);
+ replace_nans(0, backupAL, n, l);
+
copy_vec(backupAL, AL, n*l);
@@ -402,7 +383,7 @@
for (int j = 0; j < y_iters; j++)
{
- if (y_iters >= 40 && (i%(y_iters/40)) == 0)
+ if (y_iters >= 40 && (j%(y_iters/40)) == 0)
{
cout << AIOfile.io_overhead << flush;
AIOfile.io_overhead = "*";
@@ -412,7 +393,8 @@
list<long int> y_nan_idxs[y_block_size];
- int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+ //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+ replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
//! Ay_top = AL'*Y
@@ -424,7 +406,8 @@
for (int i = 0; i < a_iters; i++)
{
AIOfile.load_ARblock(&backupAR, a_block_size);
- int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
+ //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
+ replace_nans(0, backupAR, n, a_block_size * r);
copy_vec(backupAR, AR, n * r * a_block_size);
get_ticks(start_tick2);
@@ -501,7 +484,7 @@
&Ay[l+ii*p], r);
- type_precision* B = Ay;
+ //type_precision* B = Ay;
type_precision S[p * p];
//! Rebuild S
@@ -564,1194 +547,187 @@
//!*************************************************!//
-
+//for reference
void Algorithm::partialNEQ_Blocked_STL(struct Settings params,
struct Outputs &out)
{
- int max_threads = params.threads;
-
- srand(time(NULL));
-
- blas_set_num_threads(max_threads);
-
- type_precision *Ytemp;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
- cputime_type start_tick, start_tick2, end_tick;
-
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
- int y_amount = params.m;
- int y_block_size = params.mb; // kk
-
- int a_amount = params.t;
- int a_block_size = params.tb;
-
- int a_iters = (a_amount + a_block_size - 1) / a_block_size;
-
- lda = n; ldy = n;
- k = p;
-
-
-
- type_precision* backupAL;
-
- AIOfile.load_AL(&backupAL);
-
-
- type_precision Stl[l * l];
- type_precision Str[l * r * a_block_size];
-
-
- type_precision* Ay_top = new type_precision[l * y_amount];
- type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
- type_precision* A = new type_precision[n * p];
- type_precision* AR = new type_precision[n * r * a_block_size];
- type_precision* AL = new type_precision[n * l];
-
- copy_vec(backupAL, AL, n * l);
- copy_vec(backupAL, A, n * l);
-
-
- int y_iters = y_amount/y_block_size;
-
- type_precision* Y;
- type_precision* backupAR;
-
- // printf("\n\n%%Computations\n%%");
-
-
- get_ticks(start_tick);
-
-
- //! Generate S
- cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
- l, n, 1.0, backupAL, lda, 0.0, Stl, l);
-
- for (j = 0; j < y_iters; j++)
- {
- AIOfile.load_Yblock(&Y, y_block_size);
-
- //! Ay_top = AL'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
- &Ay_top[j * l * y_block_size], l);
- }
-
-
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters > 10 && (i%(a_iters/10)) == 0)
- {
- cout << "%" << flush;
- }
-
-
- AIOfile.load_ARblock(&backupAR, a_block_size);
- AIOfile.reset_Y();
- copy_vec(backupAR, AR, n*r*a_block_size);
-
- // matlab_print_matrix("A", n, p, A);
- //! Generate Str
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
-
- type_precision Sbr[r*r*a_block_size];
-
- #pragma omp parallel for default(shared) schedule(static)
- for (int ii= 0; ii < a_block_size; ii++)
- {
- //! Generate Sbr
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
- n, 0.0, &Sbr[ii*r*r], r);
- }
-
- type_precision Ay[y_block_size*p];
- type_precision S[p*p];
-
-
- for (j = 0; j < y_iters; j++)
- {
- if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
- {
- cout << "%" << flush;
- }
-
-
- AIOfile.load_Yblock(&Y, y_block_size);
-
- //! Ay_bot = AR'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
- 0.0, Ay_bot, r * a_block_size);
-
- #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
- for (int ii= 0; ii < a_block_size; ii++)
- {
- // matlab_print_matrix("Y", n, y_block_size, Y);
-
- //! Rebuild AY
-
- for (int jj = 0; jj < y_block_size; jj++)
- {
- copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[jj*p], l);
- copy_vec(&Ay_bot[ii*r + jj*r*a_block_size], &Ay[l+jj*p], r);
- }
-
- type_precision* B = Ay;
- // matlab_print_matrix("Ay_top", l, y_block_size,&Ay_top[j*l*y_block_size]);
- // matlab_print_matrix("Ay_bot", r, y_block_size, Ay_bot);
- // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
- //! Rebuild S
- build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
- // matlab_print_matrix("S", p, p, S);
-
- //! b = S\Ay
- info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
- S, p, Ay, p);
- // assert(info == 0,"POSV");
-
-
- if (ForceCheck)
- {
- check_result(backupAL, &AR[ii * r * n], n, p,
- y_block_size, r, Y, B);
- }
- }
- }
- }
-
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
- out.gflops = gemm_flops(l, n, l, 0) + gemm_flops(l, n, y_amount, 0) +
- a_amount * (gemm_flops(l, n, r, 0) +
- gemm_flops(r, n, r, 0) +
- gemm_flops(r, n, y_amount, 0) +
- (y_amount/1000.0) *
- (p * p * p / 3.0) / 1000.0/1000.0);
-
-
- AIOfile.finalize();
-
- delete []Ay_top;
- delete []Ay_bot;
- delete []A;
- delete []AL;
- delete []AR;
-}
-
-
-void Algorithm::partialNEQ(struct Settings params, struct Outputs &out)
-{
- int max_threads = params.threads;
-
-
- srand(time(NULL));
-
-
- blas_set_num_threads(max_threads);
-
-
- type_precision *Ytemp;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
- cputime_type start_tick, start_tick2, end_tick;
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
- int y_amount = params.m;
- int y_block_size = params.mb;// kk
-
- int a_amount = params.t;
- int a_block_size = 1;
-
- int a_iters = a_amount/a_block_size;
-
- lda = n; ldy = n;
- k = p;
-
-
-
- type_precision* backupAL;
-
- AIOfile.load_AL(&backupAL);
-
- type_precision* S = new type_precision[p * p];
- type_precision Stl[l * l];
- type_precision Str[l * r];
- type_precision Sbr[r * r];
- type_precision* A = new type_precision[n * p];
- type_precision* AL = A;
- type_precision* AR = &A[l * n];
-
- copy_vec(backupAL, AL, n*l);
-
-
- int y_iters = y_amount / y_block_size;
-
- type_precision* Y;
- type_precision* backupAR;
-
- // printf("\n\n%%Computations\n%%");
-
- get_ticks(start_tick);
-
- //! Generate S
- cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
- l, n, 1.0, backupAL, lda, 0.0, Stl, l);
-
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters < 10 || (i%(a_iters/10)) == 0)
- {
- cout << "%" << flush;
- }
-
- AIOfile.load_ARblock(&backupAR, a_block_size);
- AIOfile.reset_Y();
- copy_vec(backupAR, AR, n*r);
-
- // matlab_print_matrix("A", n, p, A);
- //! Generate Str
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- l, r, n, 1.0, AL, n, AR, n, 0.0, Str, l);
-
- //! Generate Sbr
- cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
- r, n, 1.0, backupAR, lda, 0.0, Sbr, r);
-
-
-
- // matlab_print_matrix("S", p, p, S);
-
- for (j = 0; j < y_iters; j++)
- {
- build_S(S, Stl, Str, Sbr, l, r);
-
- AIOfile.load_Yblock(&Y, y_block_size);
- type_precision Ay[y_block_size * p];
-
- // matlab_print_matrix("Y", n, y_block_size, Y);
-
- //! Ay = A'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
-
- type_precision* B = Ay;
- // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
- //! b = S\qy
- info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
- S, p, Ay, p);
- assert(info == 0, "POSV");
-
- // matlab_print_matrix("B", p, y_block_size, B);
- // exit(0);
- if ( ForceCheck)
- {
- check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
- }
- } // END for over j < y_iters
- } // END for over i < a_iters
-
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- out.gflops = 2.0 * (n * l * l / 1000.0 + a_amount / 1000.0 *
- (n * l * r + n * r * r +
- y_amount * (p * n + p * p * p))) /1000.0 /1000.0;
-
-
-
- AIOfile.finalize();
-
- delete []A;
- delete []S;
-}
-
-
-void Algorithm::fullNEQ(struct Settings params, struct Outputs &out)
-{
- int max_threads = params.threads;
-
-
- srand(time(NULL));
-
-
- blas_set_num_threads(max_threads);
-
-
- type_precision *Ytemp;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
- cputime_type start_tick, start_tick2, end_tick;
-
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
- int y_amount = params.m;
- int y_block_size = params.mb;// kk
-
- int a_amount = params.t;
- int a_block_size = 1;
-
- int a_iters = a_amount/a_block_size;
-
- lda = n; ldy = n;
- k = p;
-
-
-
- type_precision* backupAL;
-
- AIOfile.load_AL(&backupAL);
-
- type_precision* S = new type_precision[p * p];
- type_precision* Stemp = new type_precision[p * p];
- type_precision* A = new type_precision[n * p];
- type_precision* AL = A;
- type_precision* AR = &A[l * n];
-
- int y_iters = y_amount / y_block_size;
-
- type_precision* Y;
- type_precision* backupAR;
-
- // printf("\n\n%%Computations\n%%");
-
- get_ticks(start_tick);
-
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters < 10 || (i%(a_iters/10)) == 0)
- {
- cout << "%" << flush;
- }
-
- copy_vec(backupAL, AL, n * l);
-
- AIOfile.load_ARblock(&backupAR, a_block_size);
- AIOfile.reset_Y();
- copy_vec(backupAR, AR, n*r);
-
- // matlab_print_matrix("A", n, p, A);
- //! Generate S
- cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
- p, n, 1.0, A, lda, 0.0, S, p);
- // matlab_print_matrix("S", p, p, S);
-
- for (j = 0; j < y_iters; j++)
- {
- AIOfile.load_Yblock(&Y, y_block_size);
- type_precision Ay[y_block_size * p];
- copy_vec(S, Stemp, p*p);
-
- // matlab_print_matrix("Y", n, y_block_size, Y);
-
- //! Ay = A'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
-
- type_precision* B = Ay;
- // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
- //! b = S\qy
- info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
- Stemp, p, Ay, p);
- assert(info == 0, "POSV");
-
- // matlab_print_matrix("B", p, y_block_size, B);
- // exit(0);
- if ( ForceCheck)
- {
- check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
- }
- }
- }
-
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- out.gflops = 2.0 * a_amount * (n * p * p / 1000.0 + y_amount / 1000.0 *
- (p * n + p * p * p))/1000.0/1000.0;
-
-
- AIOfile.finalize();
-
-
- free(AL);
- delete []A;
- delete []S;
- delete []Stemp;
-}
-
-
-//!*************************************************!//
-void Algorithm::fullQR(struct Settings params, struct Outputs &out)
-{
- int max_threads = params.threads;
-
-
- srand(time(NULL));
-
- blas_set_num_threads(max_threads);
-
-
-
- cputime_type start_tick, start_tick2, end_tick;
- double acc_gemm = 0;
- double acc_trsm = 0;
- double acc_other = 0;
- double acc_atemp = 0;
- double acc_rtr = 0;
- double acc_rqr = 0;
-
-
- type_precision *Ytemp;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
- int y_amount = params.m;
- int y_block_size = 1;// kk
-
- int a_amount = params.t;
- int a_block_size = 1;
-
- int a_iters = a_amount/a_block_size;
-
- lda = n; ldy = n;
- k = p;
-
-
-
- type_precision* backupAL;
-
- AIOfile.load_AL(&backupAL);
-
- type_precision* Q = new type_precision[n*p];
- type_precision* A = Q;
- type_precision* AL = A;
- type_precision* AR = &A[l*n];
-
- type_precision tau[k];
-
-
- int y_iters = y_amount / y_block_size;
-
-
- type_precision* Y;
- type_precision* backupAR;
-
-
-
- // printf("\n\n%%Computations\n%%");
-
- get_ticks(start_tick);
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters < 10 || (i%(a_iters/10)) == 0)
- {
- cout << "%" << flush;
- }
-
- copy_vec(backupAL, AL, n*l);
-
- AIOfile.load_ARblock(&backupAR, a_block_size);
- AIOfile.reset_Y();
- copy_vec(backupAR, AR, n*r);
-
- //! Generate R
- info = LAPACKE_sgeqrf(STORAGE_TYPE, n, p, A, lda, tau);
- assert(info == 0, "QR decomp");
- type_precision* R = extract_R(A, n, p);
-
- //! generate Q
- info = LAPACKE_sorgqr(STORAGE_TYPE, n, p, k, Q, lda, tau);
- assert(info == 0, "Q form");
-
-
-
-// matlab_print_matrix("ALL", n, l, backupAL);
-// matlab_print_matrix("ARR", n, r, backupAR);
-
- for (j = 0; j < y_iters; j++)
- {
- // matlab_print_matrix("RR", p, p, R);
- // matlab_print_matrix("QQ", n, p, Q);
-
- AIOfile.load_Yblock(&Y, y_block_size);
- type_precision Qy[y_block_size*p];
-
- //! qy = Q'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
-
- type_precision* B = Qy;
-
- //! b = R\qy
- cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
- CblasNoTrans, CblasNonUnit,
- p, y_block_size, 1.0, R, p, Qy, p);
-
- if (ForceCheck)
- {
- check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
- }
- }
-
- free(R);
- }
-
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- out.gflops = 2.0 * a_amount *
- (2.0 * n * p * p / 1000.0 + y_amount / 1000.0 *
- (p * n + p * p * p)) / 1000.0 / 1000.0;
-
-
- AIOfile.finalize();
-
-
- free(AL);
- delete[] Q;
-}
-
-
-void Algorithm::partialQR(struct Settings params, struct Outputs &out)
-{
- int max_threads = params.threads;
-
-
- srand(time(NULL));
-
- blas_set_num_threads(max_threads);
-
-
-
- cputime_type start_tick, start_tick2, end_tick;
- double acc_gemm = 0;
- double acc_trsm = 0;
- double acc_other = 0;
- double acc_atemp = 0;
- double acc_rtr = 0;
- double acc_rqr = 0;
-
-
- type_precision *RL, *Ytemp, *RtlRtr;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
- int y_amount = params.m;
- int y_block_size = params.mb;// kk
-
- int a_amount = params.t;
- int a_block_size = 1;
-
- int a_iters = a_amount/a_block_size;
-
- lda = n; ldy = n;
- k = l;
-
-
-
- type_precision* backupAL;
-
- AIOfile.load_AL(&backupAL);
-
- type_precision* Q = new type_precision[n*p];
- type_precision* AL = Q;
- copy_vec(backupAL, AL, n*l);
-
-
- type_precision tau[k];
-
- type_precision* QL;
- type_precision* QR = &Q[n*l];
-
- int y_iters = y_amount / y_block_size;
-
-
- type_precision* Y;
-
- type_precision Rtr[l*r];
- type_precision* Ar;
- type_precision Rbr[r*r];
-
- get_ticks(start_tick);
- //! Generate RTL
- info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
- assert(info == 0, "QR decomp");
-
- type_precision* R =prepare_R(AL, n, l, r);
- type_precision* Rtl = R;
-
- QL = AL;// same as Q
- AL = backupAL;
- //! generate QL
- info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
-
- assert(info == 0, "Q form");
-
- // printf("\n\n%%Computations\n%%");
-
-
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters < 10 || (i%(a_iters/10)) == 0)
- {
- cout << "%" << flush;
- }
-
- AIOfile.load_ARblock(&Ar, a_block_size);
- AIOfile.reset_Y();
-
- //! Rtr = Q1'*AR
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- l, r*a_block_size, n, 1.0, QL, n, Ar, n, 0.0, Rtr, l);
-
- type_precision* Atemp = replicate_vec(Ar, n*r*a_block_size);
-
- //! Atemp = AR-Q1*Rtr
- cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
- n, r*a_block_size, l, -1.0, QL, n, Rtr, l, 1.0, Atemp, n);
-
- type_precision* QrRbr = QR;
- copy_vec(Atemp, QrRbr, n*r);
- info = LAPACKE_sgeqrf(STORAGE_TYPE, n, r, QrRbr, lda, tau);
- assert(info == 0, "QR decomp of QrRbr");
-
- // reuse of old function
- type_precision* Rbrtemp = prepare_R(QrRbr, n, r, 0);
-
- copy_vec(Rbrtemp, Rbr, r*r);
- info = LAPACKE_sorgqr(STORAGE_TYPE, n, r, r, QrRbr, lda, tau);
- assert(info == 0, "QR form");
- free(Rbrtemp);
-
- update_R(R, Rtr, Rbr, p, p, r);
-
-// matlab_print_matrix("RL", p, l, Rtl);
-// matlab_print_matrix("Rtr", l, r, Rtr);
-// matlab_print_matrix("Rbr", r, r, Rbr);
-// matlab_print_matrix("RR", p, p, R);
-// matlab_print_matrix("QQ", n, p, Q);
- // matlab_print_matrix("ALL", n, l, AL);
- // matlab_print_matrix("ARR", n, r, Ar);
- // matlab_print_matrix("R", p, p, R);
- int top_idx = 0;
-
- for (j = 0; j < y_iters; j++)
- {
- AIOfile.load_Yblock(&Y, y_block_size);
- type_precision Qy[y_block_size*p];
-
-
- //! qy = Q'*Y
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
-
- type_precision* B = Qy;
-
- //! b = R\qy
- cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
- CblasNoTrans, CblasNonUnit,
- p, y_block_size, 1.0, R, p, Qy, p);
-
- // matlab_print_matrix("bcomp", p, y_block_size, B);
-
-
- if ( ForceCheck)
- {
- check_result(AL, Ar, n, p, y_block_size, r, Y, B);
- }
- }
-
- free(Atemp);
- }
-
-
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- out.gflops = 2.0*(n*l*l/1000.0 + a_amount *
- (2.0 * n * l * r / 1000.0 +
- n * r * r / 1000.0 +
- y_amount / 1000.0 * (p * n + p * p * p))) /
- 1000.0 / 1000.0;
-
-
- AIOfile.finalize();
-
-
- free(Rtl);
- free(AL);
- delete[] Q;
-}
-
-
-void Algorithm::partialQR_Blocked_Rtl(struct Settings params,
- struct Outputs &out)
-{
- int max_threads = params.threads;
-
- // srand (time(NULL));
-
- int cpu_cores = max_threads;
-
-
- blas_set_num_threads(max_threads);
- omp_set_num_threads(max_threads);
-
-
-
-
- cputime_type start_tick, start_tick2, end_tick;
- double acc_pre = 0;
- double acc_trsm = 0;
- double acc_other = 0;
- double acc_atemp = 0;
- double acc_rtr = 0;
- double acc_rqr = 0;
- double acc_RTL_QLY = 0;
-
-
- type_precision *AL, *RL, *RQy_top, *Ytemp, *RtlRtr;
- lapack_int info, n, lda, ldy, l, r, k, p;
-
- int i, j, w;
- int m;
-
-
- AIOfile.initialize(params);
-
- n = params.n; l = params.l; r = params.r; p = l+r;
-
-
-
- double lvl_3_cache_size = 6;
-
-
- int y_amount = params.t;
- int y_block_size = params.tb;// kk
-
- int a_amount = params.m;
- int a_block_size = params.mb;
-
- int a_iters = (a_amount + a_block_size - 1) / a_block_size;
-
- int y_iters = (y_amount+ y_block_size - 1)/y_block_size;
-
- int qy_idx = 0;
-
- out.gflops = n/1000.0*l/1000.0*l/1000.0 +
- gemm_flops(l, n, y_amount, 0) +
- y_amount*l/1000.0*l/1000.0/1000.0 +
- a_amount * (gemm_flops(l, n, r, 0) +
- l/1000.0*l/1000.0/1000.0 +
- gemm_flops(n, l, r, 1) +
- n/1000.0*r/1000.0*r/1000.0 +
- gemm_flops(r, n, y_amount, 0) +
- y_amount/1000.0*r/1000.0*r/1000.0 +
- y_amount/1000.0*l/1000.0/1000.0);
-
- int sch_block_size = max(1.0,
- min((double)a_block_size / (double)max_threads,
- (1.0*1000*1000) /
- (double)((sizeof(type_precision) * n * r))));
-
- // cout << endl<< "taskChunk "<< sch_block_size <<endl;
-
- lda = n; ldy = n;
- k = l;
-
- AIOfile.load_AL(&AL);
-
- type_precision* backupAL = replicate_vec(AL, n*l);
-
-
-
- RQy_top = new type_precision[l*y_amount];
-
- type_precision* Bfinal = (type_precision*)malloc(max_threads * p *
- y_block_size *
- sizeof(type_precision));
-
- type_precision tau[k];
-
- type_precision* QL;
-
- type_precision* Rtr = (type_precision*)malloc(l * r * a_block_size *
- sizeof(type_precision));
- type_precision* Qy_bot = (type_precision*)malloc(a_block_size *
- y_block_size * r *
- sizeof(type_precision));
-
- type_precision* B_top = new type_precision[max_threads*y_block_size*l];
- type_precision* B_bot = new type_precision[max_threads*y_block_size*r];
-
- type_precision* Ar;
- type_precision Rbr[r*r*a_block_size];
-
-
-
- // printf("\n\n%%Computations\n%%");
- for (i = 0; i < a_iters; i++)
- {
- if (a_iters >= 10 && (i%(a_iters/10)) == 0)
- {
- // cout << "*" << flush;
- }
-
- for (j = 0; j < y_iters; j++)
- {
- if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
- {
- // cout << "*" << flush;
- }
- }
- }
- cout << endl;
-
-
- //! Start of Computations
-
- get_ticks(start_tick);
-
-
- info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
- assert(info == 0, "QR decomp");
-
- type_precision* Rtl = prepare_R(AL, n, l, 0);
-
- QL = AL;
- AL = backupAL;
-
- info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
-
- assert(info == 0, "Q form");
-
- matlab_print_matrix("QL", n, l, QL);
-
- qy_idx = 0;
-
- // printf("\n%%Preparing IO\n");
- //! Prepare File for the Y
-
-
- type_precision* Y;
-
- // #pragma omp parallel default(shared)
+// int max_threads = params.threads;
+//
+// srand(time(NULL));
+//
+// blas_set_num_threads(max_threads);
+//
+// type_precision *Ytemp;
+// lapack_int info, n, lda,ldy, l, r, p;
+//
+// int i,k;
+//
+// cputime_type start_tick, start_tick2, end_tick;
+//
+// AIOfile.initialize(params);
+//
+// n = params.n; l = params.l; r = params.r; p = l+r;
+//
+// int y_amount = params.m;
+// int y_block_size = params.mb; // kk
+//
+// int a_amount = params.t;
+// int a_block_size = params.tb;
+//
+// int a_iters = (a_amount + a_block_size - 1) / a_block_size;
+//
+// lda = n; ldy = n;
+// k = p;
+//
+//
+//
+// type_precision* backupAL;
+//
+// AIOfile.load_AL(&backupAL);
+//
+//
+// type_precision Stl[l * l];
+// type_precision Str[l * r * a_block_size];
+//
+//
+// type_precision* Ay_top = new type_precision[l * y_amount];
+// type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
+// type_precision* A = new type_precision[n * p];
+// type_precision* AR = new type_precision[n * r * a_block_size];
+// type_precision* AL = new type_precision[n * l];
+//
+// copy_vec(backupAL, AL, n * l);
+// copy_vec(backupAL, A, n * l);
+//
+//
+// int y_iters = y_amount/y_block_size;
+//
+// type_precision* Y;
+// type_precision* backupAR;
+//
+// // printf("\n\n%%Computations\n%%");
+//
+//
+// get_ticks(start_tick);
+//
+//
+// //! Generate S
+// cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
+// l, n, 1.0, backupAL, lda, 0.0, Stl, l);
+//
+// for (int j = 0; j < y_iters; j++)
// {
-// // #pragma omp for private(Y, y_block_size, qy_idx) nowait schedule(static, 1)
+// AIOfile.load_Yblock(&Y, y_block_size);
+//
+// //! Ay_top = AL'*Y
+// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+// l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
+// &Ay_top[j * l * y_block_size], l);
+// }
+//
+//
+// for (int i = 0; i < a_iters; i++)
+// {
+// if (a_iters > 10 && (i%(a_iters/10)) == 0)
+// {
+// cout << "%" << flush;
+// }
+//
+//
+// AIOfile.load_ARblock(&backupAR, a_block_size);
+// AIOfile.reset_Y();
+// copy_vec(backupAR, AR, n*r*a_block_size);
+//
+// // matlab_print_matrix("A", n, p, A);
+// //! Generate Str
+// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+// l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
+//
+// type_precision Sbr[r*r*a_block_size];
+//
+// #pragma omp parallel for default(shared) schedule(static)
+// for (int ii= 0; ii < a_block_size; ii++)
+// {
+// //! Generate Sbr
+// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+// r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
+// n, 0.0, &Sbr[ii*r*r], r);
+// }
+//
+// type_precision Ay[y_block_size*p];
+// type_precision S[p*p];
+//
+//
// for (int j = 0; j < y_iters; j++)
// {
-// // qy_idx = j*y_block_size*l;
-// // #pragma omp critical
-// {AIOfile.load_Yblock(&Y, y_block_size);}
+// if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
+// {
+// cout << "%" << flush;
+// }
//
-// //! qy_top = QL'*Y
+//
+// AIOfile.load_Yblock(&Y, y_block_size);
+//
+// //! Ay_bot = AR'*Y
// cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-// l, y_block_size, n, 1.0, QL, n, Y, n, 0.0,&RQy_top[qy_idx], l);
+// r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
+// 0.0, Ay_bot, r * a_block_size);
//
-// //! K | RtlQlY =RTL\qy_top
-// cblas_strsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
-// l, y_block_size, 1.0, Rtl, l,&RQy_top[qy_idx], l);
-// qy_idx += y_block_size*l;
+// #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
+// for (int ii= 0; ii < a_block_size; ii++)
+// {
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/genabel -r 1609
More information about the Genabel-commits
mailing list