[Genabel-commits] r1609 - pkg/OmicABELnoMM/src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Feb 13 21:42:58 CET 2014


Author: afrank
Date: 2014-02-13 21:42:52 +0100 (Thu, 13 Feb 2014)
New Revision: 1609

Modified:
   pkg/OmicABELnoMM/src/AIOwrapper.cpp
   pkg/OmicABELnoMM/src/Algorithm.cpp
   pkg/OmicABELnoMM/src/Algorithm.h
   pkg/OmicABELnoMM/src/Definitions.h
   pkg/OmicABELnoMM/src/Utility.cpp
   pkg/OmicABELnoMM/src/Utility.h
   pkg/OmicABELnoMM/src/main.cpp
Log:
Warnings from c/c++ inconsistencies and -Wall fixed. Legacy code from unused functionality removed. A few bugs with indices fixed. Stable code.

Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-02-13 09:24:32 UTC (rev 1608)
+++ pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-02-13 20:42:52 UTC (rev 1609)
@@ -29,9 +29,9 @@
 
     Fhandler->fakefiles = params.use_fake_files;
 
-    databel_fvi* Yfvi;
-    databel_fvi* ALfvi;
-    databel_fvi* ARfvi;
+//    databel_fvi* Yfvi;
+//    databel_fvi* ALfvi;
+//    databel_fvi* ARfvi;
 
     if (!Fhandler->fakefiles)
     {
@@ -49,7 +49,7 @@
         params.l = ALfvi->fvi_header.numVariables;
 
         //block size to keep mem under 1 gigabyte
-        int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
+        //int opt_block = params.n / (4*1000^3) * (1/(2*params.r));
         int opt_tb = 1000;
         int opt_mb = 1000;
 
@@ -81,7 +81,7 @@
 void AIOwrapper::finalize()
 {
     //cout << "f";
-    void *status;
+    //void *status;
 
     Fhandler->not_done = false;
     pthread_mutex_lock(&(Fhandler->m_more));
@@ -368,13 +368,15 @@
     }
 //
 //            //!induce realistic fileread delay
+
+    return 0;
 }
 
 
 void AIOwrapper::load_ARblock(type_precision** Ar, int &Ar_blockSize)
 {
-    int status;
-    int createstatus = 0;
+    //int status;
+    //int createstatus = 0;
     //cout<<"^";
 
     while (Fhandler->ar_full_buffers.empty())
@@ -428,8 +430,8 @@
 
 void AIOwrapper::load_Yblock(type_precision** Y, int &y_blockSize)
 {
-    int status;
-    int createstatus = 0;
+    //int status;
+    //int createstatus = 0;
 
     while (Fhandler->full_buffers.empty())
     {
@@ -528,7 +530,7 @@
 
 void AIOwrapper::reset_Y()
 {
-    void *status;
+    //void *status;
 
     Fhandler->seed = 1337;
 
@@ -569,7 +571,7 @@
 
 void AIOwrapper::reset_AR()
 {
-    void *status;
+    //void *status;
 
 
     //cout << "ra" << flush;
@@ -697,7 +699,7 @@
 FILE * AIOwrapper::fgls_fopen( const char *path, const char *mode )
 {
         FILE * f;
-        char err[100];
+        //char err[100];
 
         f = fopen(path, mode );
         if ( f == NULL )

Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
===================================================================
--- pkg/OmicABELnoMM/src/Algorithm.cpp	2014-02-13 09:24:32 UTC (rev 1608)
+++ pkg/OmicABELnoMM/src/Algorithm.cpp	2014-02-13 20:42:52 UTC (rev 1609)
@@ -16,30 +16,12 @@
 {
     switch (type)
     {
-    case FULL_NEQ:
-        fullNEQ(params, out);
-        break;
-    case P_NEQ:
-        partialNEQ(params, out);
-        break;
-    case P_NEQ_B_OPT:
-        partialNEQ_Blocked_STL(params, out);
-        break;
-    case FULL_QR:
-        fullQR(params, out);
-        break;
-    case P_QR:
-        partialQR(params, out);
-        break;
-    case P_QR_B_OPT:
-        partialQR_Blocked_Rtl(params, out);
-        break;
-    case P_NEQ_B_OPT_MD:
-        partialNEQ_Blocked_STL_MD(params, out);
-        break;
+        case P_NEQ_B_OPT_MD:
+            partialNEQ_Blocked_STL_MD(params, out);
+            break;
 
-    default:
-        break;
+        default:
+            break;
     }
 }
 
@@ -74,7 +56,7 @@
                                int dim2_b, int dim1_b_bot)
 {
     // memcpy are faster version of the fors
-    int i, k, w, top_idx, bot_idx, max = dim1_b*dim2_b;
+    int i, k, w, top_idx, bot_idx;
     int size;
     top_idx = 0;
     bot_idx = 0;
@@ -111,7 +93,7 @@
                            type_precision* bot, int dim1_QY,
                            int dim2_QY, int dim1_qy_bot, int bot_blocks )
 {
-    int i, k, w, top_idx, bot_idx, max = dim1_QY*dim2_QY;
+    int i, k, w, top_idx, bot_idx;
     top_idx = 0;
     bot_idx = 0;
     for (k = 0; k < dim2_QY; k++)
@@ -326,12 +308,9 @@
     blas_set_num_threads(max_threads);
 
 
-    type_precision *Ytemp;
-    lapack_int info, n, lda, ldy, l, r, k, p;
+    //type_precision *Ytemp;
+    lapack_int info, n, lda, l, r, p;
 
-    int i, j, w;
-    int m;
-
     cputime_type start_tick, start_tick2, end_tick;
 
     AIOfile.initialize(params);
@@ -349,12 +328,12 @@
     int y_iters = (y_amount + y_block_size - 1) / y_block_size;
 
 
-    lda = n; ldy = n;
-    k = p;
+    lda = n;
 
+
     for (int j = 0; j < y_iters; j++)
     {
-        if (y_iters >= 40 && (i%(y_iters/40)) == 0)
+        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
         {
             cout << "*" << flush;
         }
@@ -389,7 +368,9 @@
 
 
     AIOfile.load_AL(&backupAL);
-    int total_al_nans = replace_nans(0, backupAL, n, l);
+    //int total_al_nans = replace_nans(0, backupAL, n, l);
+    replace_nans(0, backupAL, n, l);
+
     copy_vec(backupAL, AL, n*l);
 
 
@@ -402,7 +383,7 @@
 
     for (int j = 0; j < y_iters; j++)
     {
-        if (y_iters >= 40 && (i%(y_iters/40)) == 0)
+        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
         {
             cout << AIOfile.io_overhead << flush;
             AIOfile.io_overhead = "*";
@@ -412,7 +393,8 @@
 
         list<long int> y_nan_idxs[y_block_size];
 
-        int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+        //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+        replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
 
 
         //! Ay_top = AL'*Y
@@ -424,7 +406,8 @@
         for (int i = 0; i < a_iters; i++)
         {
             AIOfile.load_ARblock(&backupAR, a_block_size);
-            int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
+            //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
+            replace_nans(0, backupAR, n, a_block_size * r);
             copy_vec(backupAR, AR, n *  r * a_block_size);
 
             get_ticks(start_tick2);
@@ -501,7 +484,7 @@
                                  &Ay[l+ii*p], r);
 
 
-                        type_precision* B = Ay;
+                        //type_precision* B = Ay;
                         type_precision S[p * p];
 
                         //! Rebuild S
@@ -564,1194 +547,187 @@
 
 //!*************************************************!//
 
-
+//for reference
 void Algorithm::partialNEQ_Blocked_STL(struct Settings params,
                                        struct Outputs &out)
 {
-    int max_threads = params.threads;
-
-    srand(time(NULL));
-
-    blas_set_num_threads(max_threads);
-
-    type_precision *Ytemp;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-    cputime_type start_tick, start_tick2, end_tick;
-
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-    int y_amount = params.m;
-    int y_block_size = params.mb;  // kk
-
-    int a_amount = params.t;
-    int a_block_size = params.tb;
-
-    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
-
-    lda = n; ldy = n;
-    k = p;
-
-
-
-    type_precision* backupAL;
-
-    AIOfile.load_AL(&backupAL);
-
-
-    type_precision Stl[l * l];
-    type_precision Str[l * r * a_block_size];
-
-
-    type_precision* Ay_top = new type_precision[l * y_amount];
-    type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
-    type_precision* A = new type_precision[n * p];
-    type_precision* AR = new type_precision[n * r * a_block_size];
-    type_precision* AL = new type_precision[n * l];
-
-    copy_vec(backupAL, AL, n * l);
-    copy_vec(backupAL, A, n * l);
-
-
-    int y_iters = y_amount/y_block_size;
-
-    type_precision* Y;
-    type_precision* backupAR;
-
-    // printf("\n\n%%Computations\n%%");
-
-
-    get_ticks(start_tick);
-
-
-    //! Generate S
-    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
-                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
-
-    for (j = 0; j < y_iters; j++)
-    {
-        AIOfile.load_Yblock(&Y, y_block_size);
-
-        //! Ay_top = AL'*Y
-        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                    l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
-                    &Ay_top[j * l * y_block_size], l);
-    }
-
-
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters > 10 && (i%(a_iters/10)) == 0)
-        {
-            cout << "%" << flush;
-        }
-
-
-        AIOfile.load_ARblock(&backupAR, a_block_size);
-        AIOfile.reset_Y();
-        copy_vec(backupAR, AR, n*r*a_block_size);
-
-        // matlab_print_matrix("A", n, p, A);
-        //! Generate Str
-        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                    l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
-
-        type_precision Sbr[r*r*a_block_size];
-
-        #pragma omp parallel for default(shared) schedule(static)
-        for (int ii= 0; ii < a_block_size; ii++)
-        {
-            //! Generate Sbr
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
-                        n, 0.0, &Sbr[ii*r*r], r);
-        }
-
-        type_precision Ay[y_block_size*p];
-        type_precision S[p*p];
-
-
-        for (j = 0; j < y_iters; j++)
-        {
-            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
-            {
-                cout << "%" << flush;
-            }
-
-
-            AIOfile.load_Yblock(&Y, y_block_size);
-
-            //! Ay_bot = AR'*Y
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
-                        0.0, Ay_bot, r * a_block_size);
-
-            #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
-            for (int ii= 0; ii < a_block_size; ii++)
-            {
-                // matlab_print_matrix("Y", n, y_block_size, Y);
-
-                //! Rebuild AY
-
-                for (int jj = 0; jj < y_block_size; jj++)
-                {
-                    copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[jj*p], l);
-                    copy_vec(&Ay_bot[ii*r + jj*r*a_block_size], &Ay[l+jj*p], r);
-                }
-
-                type_precision* B = Ay;
-                // matlab_print_matrix("Ay_top", l, y_block_size,&Ay_top[j*l*y_block_size]);
-                // matlab_print_matrix("Ay_bot", r, y_block_size, Ay_bot);
-                // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
-                //! Rebuild S
-                build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
-                // matlab_print_matrix("S", p, p, S);
-
-                //! b = S\Ay
-                info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
-                                     S, p, Ay, p);
-                // assert(info == 0,"POSV");
-
-
-                if (ForceCheck)
-                {
-                    check_result(backupAL, &AR[ii * r * n], n, p,
-                                 y_block_size, r, Y, B);
-                }
-            }
-        }
-    }
-
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
-    out.gflops = gemm_flops(l, n, l, 0) + gemm_flops(l, n, y_amount, 0) +
-                 a_amount * (gemm_flops(l, n, r, 0) +
-                             gemm_flops(r, n, r, 0) +
-                             gemm_flops(r, n, y_amount, 0) +
-                             (y_amount/1000.0) *
-                             (p * p * p / 3.0) / 1000.0/1000.0);
-
-
-    AIOfile.finalize();
-
-    delete []Ay_top;
-    delete []Ay_bot;
-    delete []A;
-    delete []AL;
-    delete []AR;
-}
-
-
-void Algorithm::partialNEQ(struct Settings params, struct Outputs &out)
-{
-    int max_threads = params.threads;
-
-
-    srand(time(NULL));
-
-
-    blas_set_num_threads(max_threads);
-
-
-    type_precision *Ytemp;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-    cputime_type start_tick, start_tick2, end_tick;
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-    int y_amount = params.m;
-    int y_block_size = params.mb;// kk
-
-    int a_amount = params.t;
-    int a_block_size = 1;
-
-    int a_iters = a_amount/a_block_size;
-
-    lda = n; ldy = n;
-    k = p;
-
-
-
-    type_precision* backupAL;
-
-    AIOfile.load_AL(&backupAL);
-
-    type_precision* S = new type_precision[p * p];
-    type_precision Stl[l * l];
-    type_precision Str[l * r];
-    type_precision Sbr[r * r];
-    type_precision* A = new type_precision[n * p];
-    type_precision* AL = A;
-    type_precision* AR = &A[l * n];
-
-    copy_vec(backupAL, AL, n*l);
-
-
-    int y_iters = y_amount / y_block_size;
-
-    type_precision* Y;
-    type_precision* backupAR;
-
-    // printf("\n\n%%Computations\n%%");
-
-    get_ticks(start_tick);
-
-    //! Generate S
-    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
-                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
-
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters < 10 || (i%(a_iters/10)) == 0)
-        {
-            cout << "%" << flush;
-        }
-
-        AIOfile.load_ARblock(&backupAR, a_block_size);
-        AIOfile.reset_Y();
-        copy_vec(backupAR, AR, n*r);
-
-        // matlab_print_matrix("A", n, p, A);
-        //! Generate Str
-        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                    l, r, n, 1.0, AL, n, AR, n, 0.0, Str, l);
-
-        //! Generate Sbr
-        cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
-                    r, n, 1.0, backupAR, lda, 0.0, Sbr, r);
-
-
-
-        // matlab_print_matrix("S", p, p, S);
-
-        for (j = 0; j < y_iters; j++)
-        {
-            build_S(S, Stl, Str, Sbr, l, r);
-
-            AIOfile.load_Yblock(&Y, y_block_size);
-            type_precision Ay[y_block_size * p];
-
-            // matlab_print_matrix("Y", n, y_block_size, Y);
-
-            //! Ay = A'*Y
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
-
-            type_precision* B = Ay;
-            // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
-            //! b = S\qy
-            info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
-                                 S, p, Ay, p);
-            assert(info == 0, "POSV");
-
-            // matlab_print_matrix("B", p, y_block_size, B);
-            // exit(0);
-            if ( ForceCheck)
-            {
-                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
-            }
-        }  // END for over j < y_iters
-    }  // END for over i < a_iters
-
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    out.gflops = 2.0 * (n * l * l / 1000.0 + a_amount / 1000.0 *
-                        (n * l * r + n * r * r +
-                         y_amount * (p * n + p * p * p))) /1000.0 /1000.0;
-
-
-
-    AIOfile.finalize();
-
-    delete []A;
-    delete []S;
-}
-
-
-void Algorithm::fullNEQ(struct Settings params, struct Outputs &out)
-{
-    int max_threads = params.threads;
-
-
-    srand(time(NULL));
-
-
-    blas_set_num_threads(max_threads);
-
-
-    type_precision *Ytemp;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-    cputime_type start_tick, start_tick2, end_tick;
-
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-    int y_amount = params.m;
-    int y_block_size = params.mb;// kk
-
-    int a_amount = params.t;
-    int a_block_size = 1;
-
-    int a_iters = a_amount/a_block_size;
-
-    lda = n; ldy = n;
-    k = p;
-
-
-
-    type_precision* backupAL;
-
-    AIOfile.load_AL(&backupAL);
-
-    type_precision* S = new type_precision[p * p];
-    type_precision* Stemp = new type_precision[p * p];
-    type_precision* A = new type_precision[n * p];
-    type_precision* AL = A;
-    type_precision* AR = &A[l * n];
-
-    int y_iters = y_amount / y_block_size;
-
-    type_precision* Y;
-    type_precision* backupAR;
-
-    // printf("\n\n%%Computations\n%%");
-
-    get_ticks(start_tick);
-
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters < 10 || (i%(a_iters/10)) == 0)
-        {
-            cout << "%" << flush;
-        }
-
-        copy_vec(backupAL, AL, n * l);
-
-        AIOfile.load_ARblock(&backupAR, a_block_size);
-        AIOfile.reset_Y();
-        copy_vec(backupAR, AR, n*r);
-
-        // matlab_print_matrix("A", n, p, A);
-        //! Generate S
-        cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
-                    p, n, 1.0, A, lda, 0.0, S, p);
-        // matlab_print_matrix("S", p, p, S);
-
-        for (j = 0; j < y_iters; j++)
-        {
-            AIOfile.load_Yblock(&Y, y_block_size);
-            type_precision Ay[y_block_size * p];
-            copy_vec(S, Stemp, p*p);
-
-            // matlab_print_matrix("Y", n, y_block_size, Y);
-
-            //! Ay = A'*Y
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        p, y_block_size, n, 1.0, A, n, Y, n, 0.0, Ay, p);
-
-            type_precision* B = Ay;
-            // matlab_print_matrix("Ay", p, y_block_size, Ay);
-
-            //! b = S\qy
-            info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, y_block_size,
-                                 Stemp, p, Ay, p);
-            assert(info == 0, "POSV");
-
-            // matlab_print_matrix("B", p, y_block_size, B);
-            // exit(0);
-            if ( ForceCheck)
-            {
-                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
-            }
-        }
-    }
-
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    out.gflops = 2.0 * a_amount * (n * p * p / 1000.0 + y_amount / 1000.0 *
-                                   (p * n + p * p * p))/1000.0/1000.0;
-
-
-    AIOfile.finalize();
-
-
-    free(AL);
-    delete []A;
-    delete []S;
-    delete []Stemp;
-}
-
-
-//!*************************************************!//
-void Algorithm::fullQR(struct Settings params, struct Outputs &out)
-{
-    int max_threads = params.threads;
-
-
-    srand(time(NULL));
-
-    blas_set_num_threads(max_threads);
-
-
-
-    cputime_type start_tick, start_tick2, end_tick;
-    double acc_gemm = 0;
-    double acc_trsm = 0;
-    double acc_other = 0;
-    double acc_atemp = 0;
-    double acc_rtr = 0;
-    double acc_rqr = 0;
-
-
-    type_precision *Ytemp;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-    int y_amount = params.m;
-    int y_block_size = 1;// kk
-
-    int a_amount = params.t;
-    int a_block_size = 1;
-
-    int a_iters = a_amount/a_block_size;
-
-    lda = n; ldy = n;
-    k = p;
-
-
-
-    type_precision* backupAL;
-
-    AIOfile.load_AL(&backupAL);
-
-    type_precision* Q = new type_precision[n*p];
-    type_precision* A = Q;
-    type_precision* AL = A;
-    type_precision* AR = &A[l*n];
-
-    type_precision tau[k];
-
-
-    int y_iters = y_amount / y_block_size;
-
-
-    type_precision* Y;
-    type_precision* backupAR;
-
-
-
-    // printf("\n\n%%Computations\n%%");
-
-    get_ticks(start_tick);
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters < 10 || (i%(a_iters/10)) == 0)
-        {
-            cout << "%" << flush;
-        }
-
-        copy_vec(backupAL, AL, n*l);
-
-        AIOfile.load_ARblock(&backupAR, a_block_size);
-        AIOfile.reset_Y();
-        copy_vec(backupAR, AR, n*r);
-
-        //! Generate R
-        info = LAPACKE_sgeqrf(STORAGE_TYPE, n, p, A, lda, tau);
-        assert(info == 0, "QR decomp");
-        type_precision* R = extract_R(A, n, p);
-
-        //! generate Q
-        info = LAPACKE_sorgqr(STORAGE_TYPE, n, p, k, Q, lda, tau);
-        assert(info == 0, "Q form");
-
-
-
-//        matlab_print_matrix("ALL", n, l, backupAL);
-//        matlab_print_matrix("ARR", n, r, backupAR);
-
-        for (j = 0; j < y_iters; j++)
-        {
-            // matlab_print_matrix("RR", p, p, R);
-            // matlab_print_matrix("QQ", n, p, Q);
-
-            AIOfile.load_Yblock(&Y, y_block_size);
-            type_precision Qy[y_block_size*p];
-
-            //! qy = Q'*Y
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
-
-            type_precision* B = Qy;
-
-            //! b = R\qy
-            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
-                        CblasNoTrans, CblasNonUnit,
-                        p, y_block_size, 1.0, R, p, Qy, p);
-
-            if (ForceCheck)
-            {
-                check_result(backupAL, backupAR, n, p, y_block_size, r, Y, B);
-            }
-        }
-
-        free(R);
-    }
-
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    out.gflops = 2.0 * a_amount *
-        (2.0 * n * p * p / 1000.0 + y_amount / 1000.0 *
-         (p * n + p * p * p)) / 1000.0 / 1000.0;
-
-
-    AIOfile.finalize();
-
-
-    free(AL);
-    delete[] Q;
-}
-
-
-void Algorithm::partialQR(struct Settings params, struct Outputs &out)
-{
-    int max_threads = params.threads;
-
-
-    srand(time(NULL));
-
-    blas_set_num_threads(max_threads);
-
-
-
-    cputime_type start_tick, start_tick2, end_tick;
-    double acc_gemm = 0;
-    double acc_trsm = 0;
-    double acc_other = 0;
-    double acc_atemp = 0;
-    double acc_rtr = 0;
-    double acc_rqr = 0;
-
-
-    type_precision *RL, *Ytemp, *RtlRtr;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-    int y_amount = params.m;
-    int y_block_size = params.mb;// kk
-
-    int a_amount = params.t;
-    int a_block_size = 1;
-
-    int a_iters = a_amount/a_block_size;
-
-    lda = n; ldy = n;
-    k = l;
-
-
-
-    type_precision* backupAL;
-
-    AIOfile.load_AL(&backupAL);
-
-    type_precision* Q = new type_precision[n*p];
-    type_precision* AL = Q;
-    copy_vec(backupAL, AL, n*l);
-
-
-    type_precision tau[k];
-
-    type_precision* QL;
-    type_precision* QR = &Q[n*l];
-
-    int y_iters = y_amount / y_block_size;
-
-
-    type_precision* Y;
-
-    type_precision Rtr[l*r];
-    type_precision* Ar;
-    type_precision Rbr[r*r];
-
-    get_ticks(start_tick);
-    //! Generate RTL
-    info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
-    assert(info == 0, "QR decomp");
-
-    type_precision* R =prepare_R(AL, n, l, r);
-    type_precision* Rtl = R;
-
-    QL = AL;// same as Q
-    AL = backupAL;
-    //! generate QL
-    info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
-
-    assert(info == 0, "Q form");
-
-    // printf("\n\n%%Computations\n%%");
-
-
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters < 10 || (i%(a_iters/10)) == 0)
-        {
-            cout << "%" << flush;
-        }
-
-        AIOfile.load_ARblock(&Ar, a_block_size);
-        AIOfile.reset_Y();
-
-        //! Rtr = Q1'*AR
-        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                    l, r*a_block_size, n, 1.0, QL, n, Ar, n, 0.0, Rtr, l);
-
-        type_precision* Atemp = replicate_vec(Ar, n*r*a_block_size);
-
-        //! Atemp = AR-Q1*Rtr
-        cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
-                    n, r*a_block_size, l, -1.0, QL, n, Rtr, l, 1.0, Atemp, n);
-
-        type_precision* QrRbr = QR;
-        copy_vec(Atemp, QrRbr, n*r);
-        info = LAPACKE_sgeqrf(STORAGE_TYPE, n, r, QrRbr, lda, tau);
-        assert(info == 0, "QR decomp of QrRbr");
-
-        // reuse of old function
-        type_precision* Rbrtemp = prepare_R(QrRbr, n, r, 0);
-
-        copy_vec(Rbrtemp, Rbr, r*r);
-        info = LAPACKE_sorgqr(STORAGE_TYPE, n, r, r, QrRbr, lda, tau);
-        assert(info == 0, "QR form");
-        free(Rbrtemp);
-
-        update_R(R, Rtr, Rbr, p, p, r);
-
-//        matlab_print_matrix("RL", p, l, Rtl);
-//        matlab_print_matrix("Rtr", l, r, Rtr);
-//        matlab_print_matrix("Rbr", r, r, Rbr);
-//        matlab_print_matrix("RR", p, p, R);
-//        matlab_print_matrix("QQ", n, p, Q);
-        // matlab_print_matrix("ALL", n, l, AL);
-        // matlab_print_matrix("ARR", n, r, Ar);
-        // matlab_print_matrix("R", p, p, R);
-        int top_idx = 0;
-
-        for (j = 0; j < y_iters; j++)
-        {
-            AIOfile.load_Yblock(&Y, y_block_size);
-            type_precision Qy[y_block_size*p];
-
-
-            //! qy = Q'*Y
-            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                        p, y_block_size, n, 1.0, Q, n, Y, n, 0.0, Qy, p);
-
-            type_precision* B = Qy;
-
-            //! b = R\qy
-            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper,
-                        CblasNoTrans, CblasNonUnit,
-                        p, y_block_size, 1.0, R, p, Qy, p);
-
-            // matlab_print_matrix("bcomp", p, y_block_size, B);
-
-
-            if ( ForceCheck)
-            {
-                check_result(AL, Ar, n, p, y_block_size, r, Y, B);
-            }
-        }
-
-        free(Atemp);
-    }
-
-
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    out.gflops = 2.0*(n*l*l/1000.0 + a_amount *
-                      (2.0 * n * l * r / 1000.0 +
-                       n *  r * r / 1000.0 +
-                       y_amount / 1000.0 * (p * n + p * p * p))) /
-        1000.0 / 1000.0;
-
-
-    AIOfile.finalize();
-
-
-    free(Rtl);
-    free(AL);
-    delete[] Q;
-}
-
-
-void  Algorithm::partialQR_Blocked_Rtl(struct Settings params,
-                                       struct Outputs &out)
-{
-    int max_threads = params.threads;
-
-    // srand (time(NULL));
-
-    int cpu_cores = max_threads;
-
-
-    blas_set_num_threads(max_threads);
-    omp_set_num_threads(max_threads);
-
-
-
-
-    cputime_type start_tick, start_tick2, end_tick;
-    double acc_pre     = 0;
-    double acc_trsm    = 0;
-    double acc_other   = 0;
-    double acc_atemp   = 0;
-    double acc_rtr     = 0;
-    double acc_rqr     = 0;
-    double acc_RTL_QLY = 0;
-
-
-    type_precision *AL, *RL, *RQy_top, *Ytemp, *RtlRtr;
-    lapack_int info, n, lda, ldy, l, r, k, p;
-
-    int i, j, w;
-    int m;
-
-
-    AIOfile.initialize(params);
-
-    n = params.n; l = params.l; r = params.r; p = l+r;
-
-
-
-    double lvl_3_cache_size = 6;
-
-
-    int y_amount = params.t;
-    int y_block_size = params.tb;// kk
-
-    int a_amount = params.m;
-    int a_block_size = params.mb;
-
-    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
-
-    int y_iters = (y_amount+ y_block_size - 1)/y_block_size;
-
-    int qy_idx = 0;
-
-    out.gflops = n/1000.0*l/1000.0*l/1000.0 +
-        gemm_flops(l, n, y_amount, 0) +
-        y_amount*l/1000.0*l/1000.0/1000.0 +
-        a_amount * (gemm_flops(l, n, r, 0) +
-                    l/1000.0*l/1000.0/1000.0 +
-                    gemm_flops(n, l, r, 1) +
-                    n/1000.0*r/1000.0*r/1000.0 +
-                    gemm_flops(r, n, y_amount, 0) +
-                    y_amount/1000.0*r/1000.0*r/1000.0 +
-                    y_amount/1000.0*l/1000.0/1000.0);
-
-    int sch_block_size = max(1.0,
-                             min((double)a_block_size / (double)max_threads,
-                                 (1.0*1000*1000) /
-                                 (double)((sizeof(type_precision) * n * r))));
-
-    // cout << endl<< "taskChunk "<< sch_block_size <<endl;
-
-    lda = n; ldy = n;
-    k = l;
-
-    AIOfile.load_AL(&AL);
-
-    type_precision* backupAL = replicate_vec(AL, n*l);
-
-
-
-    RQy_top = new type_precision[l*y_amount];
-
-    type_precision* Bfinal = (type_precision*)malloc(max_threads * p *
-                                                     y_block_size *
-                                                     sizeof(type_precision));
-
-    type_precision tau[k];
-
-    type_precision* QL;
-
-    type_precision* Rtr = (type_precision*)malloc(l * r * a_block_size *
-                                                  sizeof(type_precision));
-    type_precision* Qy_bot = (type_precision*)malloc(a_block_size *
-                                                     y_block_size * r *
-                                                     sizeof(type_precision));
-
-    type_precision* B_top = new type_precision[max_threads*y_block_size*l];
-    type_precision* B_bot = new type_precision[max_threads*y_block_size*r];
-
-    type_precision* Ar;
-    type_precision Rbr[r*r*a_block_size];
-
-
-
-    // printf("\n\n%%Computations\n%%");
-    for (i = 0; i < a_iters; i++)
-    {
-        if (a_iters >= 10 && (i%(a_iters/10)) == 0)
-        {
-            // cout << "*" << flush;
-        }
-
-        for (j = 0; j < y_iters; j++)
-        {
-            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
-            {
-                // cout << "*" << flush;
-            }
-        }
-    }
-    cout << endl;
-
-
-    //! Start of Computations
-
-    get_ticks(start_tick);
-
-
-    info = LAPACKE_sgeqrf(STORAGE_TYPE, n, l, AL, lda, tau);
-    assert(info == 0, "QR decomp");
-
-    type_precision* Rtl = prepare_R(AL, n, l, 0);
-
-    QL = AL;
-    AL = backupAL;
-
-    info = LAPACKE_sorgqr(STORAGE_TYPE, n, l, k, QL, lda, tau);
-
-    assert(info == 0, "Q form");
-
-    matlab_print_matrix("QL", n, l, QL);
-
-    qy_idx = 0;
-
-    // printf("\n%%Preparing IO\n");
-    //! Prepare File for the Y
-
-
-    type_precision* Y;
-
-    // #pragma omp parallel default(shared)
+//    int max_threads = params.threads;
+//
+//    srand(time(NULL));
+//
+//    blas_set_num_threads(max_threads);
+//
+//    type_precision *Ytemp;
+//    lapack_int info, n, lda,ldy, l, r, p;
+//
+//    int i,k;
+//
+//    cputime_type start_tick, start_tick2, end_tick;
+//
+//    AIOfile.initialize(params);
+//
+//    n = params.n; l = params.l; r = params.r; p = l+r;
+//
+//    int y_amount = params.m;
+//    int y_block_size = params.mb;  // kk
+//
+//    int a_amount = params.t;
+//    int a_block_size = params.tb;
+//
+//    int a_iters = (a_amount + a_block_size - 1) / a_block_size;
+//
+//    lda = n; ldy = n;
+//    k = p;
+//
+//
+//
+//    type_precision* backupAL;
+//
+//    AIOfile.load_AL(&backupAL);
+//
+//
+//    type_precision Stl[l * l];
+//    type_precision Str[l * r * a_block_size];
+//
+//
+//    type_precision* Ay_top = new type_precision[l * y_amount];
+//    type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
+//    type_precision* A = new type_precision[n * p];
+//    type_precision* AR = new type_precision[n * r * a_block_size];
+//    type_precision* AL = new type_precision[n * l];
+//
+//    copy_vec(backupAL, AL, n * l);
+//    copy_vec(backupAL, A, n * l);
+//
+//
+//    int y_iters = y_amount/y_block_size;
+//
+//    type_precision* Y;
+//    type_precision* backupAR;
+//
+//    // printf("\n\n%%Computations\n%%");
+//
+//
+//    get_ticks(start_tick);
+//
+//
+//    //! Generate S
+//    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
+//                l, n, 1.0, backupAL, lda, 0.0, Stl, l);
+//
+//    for (int j = 0; j < y_iters; j++)
 //    {
-//        // #pragma omp for private(Y, y_block_size, qy_idx) nowait schedule(static, 1)
+//        AIOfile.load_Yblock(&Y, y_block_size);
+//
+//        //! Ay_top = AL'*Y
+//        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+//                    l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
+//                    &Ay_top[j * l * y_block_size], l);
+//    }
+//
+//
+//    for (int i = 0; i < a_iters; i++)
+//    {
+//        if (a_iters > 10 && (i%(a_iters/10)) == 0)
+//        {
+//            cout << "%" << flush;
+//        }
+//
+//
+//        AIOfile.load_ARblock(&backupAR, a_block_size);
+//        AIOfile.reset_Y();
+//        copy_vec(backupAR, AR, n*r*a_block_size);
+//
+//        // matlab_print_matrix("A", n, p, A);
+//        //! Generate Str
+//        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+//                    l, r * a_block_size, n, 1.0, AL, n, AR, n, 0.0, Str, l);
+//
+//        type_precision Sbr[r*r*a_block_size];
+//
+//        #pragma omp parallel for default(shared) schedule(static)
+//        for (int ii= 0; ii < a_block_size; ii++)
+//        {
+//            //! Generate Sbr
+//            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+//                        r, r, n, 1.0, &AR[ii*r*n], n, &AR[ii*r*n],
+//                        n, 0.0, &Sbr[ii*r*r], r);
+//        }
+//
+//        type_precision Ay[y_block_size*p];
+//        type_precision S[p*p];
+//
+//
 //        for (int j = 0; j < y_iters; j++)
 //        {
-//            // qy_idx = j*y_block_size*l;
-//            // #pragma omp critical
-//            {AIOfile.load_Yblock(&Y, y_block_size);}
+//            if (a_iters < 10 && (y_iters < 10 || (j%(y_iters/10)) == 0))
+//            {
+//                cout << "%" << flush;
+//            }
 //
-//            //! qy_top = QL'*Y
+//
+//            AIOfile.load_Yblock(&Y, y_block_size);
+//
+//            //! Ay_bot = AR'*Y
 //            cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-//                            l, y_block_size, n, 1.0, QL, n, Y, n, 0.0,&RQy_top[qy_idx], l);
+//                        r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
+//                        0.0, Ay_bot, r * a_block_size);
 //
-//            //! K | RtlQlY =RTL\qy_top
-//            cblas_strsm(CblasColMajor, CblasLeft, CblasUpper, CblasNoTrans, CblasNonUnit,
-//                                            l, y_block_size, 1.0, Rtl, l,&RQy_top[qy_idx], l);
-//            qy_idx += y_block_size*l;
+//            #pragma omp parallel for private(S, Ay) default(shared) schedule(static)
+//            for (int ii= 0; ii < a_block_size; ii++)
+//            {
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/genabel -r 1609


More information about the Genabel-commits mailing list