[Genabel-commits] r1763 - in pkg/OmicABELnoMM: . src tests

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Jul 15 10:17:30 CEST 2014


Author: afrank
Date: 2014-07-15 10:17:29 +0200 (Tue, 15 Jul 2014)
New Revision: 1763

Added:
   pkg/OmicABELnoMM/tests/Makefile.am
Modified:
   pkg/OmicABELnoMM/Makefile.am
   pkg/OmicABELnoMM/configure.ac
   pkg/OmicABELnoMM/src/AIOwrapper.cpp
   pkg/OmicABELnoMM/src/AIOwrapper.h
   pkg/OmicABELnoMM/src/Algorithm.cpp
   pkg/OmicABELnoMM/src/Algorithm.h
   pkg/OmicABELnoMM/src/Definitions.h
   pkg/OmicABELnoMM/src/Utility.cpp
   pkg/OmicABELnoMM/src/Utility.h
   pkg/OmicABELnoMM/src/main.cpp
   pkg/OmicABELnoMM/tests/Makefile
   pkg/OmicABELnoMM/tests/test.cpp
Log:
Added R2, T-stat, Pval functions, but inactive atm. Added exclusion list support to remove individuals. Fixed propagation of missing values. Performance Improvements.

Modified: pkg/OmicABELnoMM/Makefile.am
===================================================================
--- pkg/OmicABELnoMM/Makefile.am	2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/Makefile.am	2014-07-15 08:17:29 UTC (rev 1763)
@@ -2,7 +2,7 @@
 
 oanomm_headers = src/Definitions.h src/AIOwrapper.h src/Algorithm.h	\
  src/Utility.h
-oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp	\
+oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
  src/main.cpp
 
 omicabelnomm_SOURCES = $(oanomm_headers) $(oanomm_cpp)

Modified: pkg/OmicABELnoMM/configure.ac
===================================================================
--- pkg/OmicABELnoMM/configure.ac	2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/configure.ac	2014-07-15 08:17:29 UTC (rev 1763)
@@ -17,11 +17,12 @@
 # Set some default compile flags
 if test -z "$CXXFLAGS"; then
    # User did not set CXXFLAGS, so we can put in our own defaults
-    CXXFLAGS="-g -O2"
+    CXXFLAGS="-O3"
 fi
 if test -z "$CPPFLAGS"; then
    # User did not set CPPFLAGS, so we can put in our own defaults
     CPPFLAGS="-Wall -g -pedantic -Wunused-result -Wmaybe-uninitialized -Wformat"
+    #CPPFLAGS="-Wall"
 fi
 # If CXXFLAGS/CPPFLAGS are already set AC_PROG_CXX will not overwrite them
 # with its own defaults
@@ -45,13 +46,18 @@
    AC_MSG_ERROR([Unable to find a Lapack library])
 ])
 
+#Boost
 
+
+
 # Check for openMP. If found the OPENMP_CXXFLAGS is set automatically
 AC_OPENMP
 AC_SUBST(AM_CXXFLAGS, "$OPENMP_CFLAGS")
 
 
 
+
+
 # Checks for header files.
 AC_CHECK_HEADERS([limits.h stdlib.h string.h sys/time.h unistd.h])
 

Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-07-15 08:17:29 UTC (rev 1763)
@@ -28,16 +28,21 @@
     pthread_barrier_init(&(FHandler.finalize_barrier),NULL,2);
 
 
-    Fhandler->fakefiles = params.use_fake_files;
+    Fhandler->fakefiles = params.use_fake_files;
+
+
+
 
 
+
     if(!Fhandler->fakefiles)
     {
         Fhandler->fnameAL = params.fnameAL;
         Fhandler->fnameAR = params.fnameAR;
         Fhandler->fnameY = params.fnameY;
-        Fhandler->fnameOutB = params.fnameOutB;
+        Fhandler->fnameOutB = params.fnameOutB;
 
+
         Yfvi  = load_databel_fvi( (Fhandler->fnameY+".fvi").c_str() );
         ALfvi = load_databel_fvi( (Fhandler->fnameAL+".fvi").c_str() );
         ARfvi = load_databel_fvi( (Fhandler->fnameAR+".fvi").c_str() );
@@ -46,7 +51,7 @@
         params.t = Yfvi->fvi_header.numVariables;
         params.l = ALfvi->fvi_header.numVariables;
 
-	int opt_tb = 1000;
+        int opt_tb = 1000;
         int opt_mb = 1000;
 
         params.mb = min(params.m, opt_tb);
@@ -56,7 +61,22 @@
     else
     {
 
-    }
+    }
+
+    //params.fname_excludelist = "exclfile.txt";
+    int excl_count = 0;
+    Fhandler->excl_List = new list< pair<int,int> >();
+
+    if(params.fname_excludelist.size()==0)
+    {
+        (Fhandler->excl_List)->push_back( make_pair(0,params.n) );
+    }
+    else
+    {
+        read_excludeList( Fhandler->excl_List,excl_count,params.n,params.fname_excludelist);
+    }
+
+    params.n -= excl_count;
 
     params.p = params.l + params.r;
 
@@ -106,7 +126,9 @@
     pthread_cond_destroy(&(Fhandler->condition_more));
 
     pthread_mutex_destroy(&(Fhandler->m_read));
-    pthread_cond_destroy(&(Fhandler->condition_read));
+    pthread_cond_destroy(&(Fhandler->condition_read));
+
+    delete Fhandler->excl_List;
 
 
 }
@@ -186,12 +208,15 @@
         }
         //cout << "\nEnd preping files\n" << flush;
 
-    }
+    }
+
+    //pthread_barrier_wait(&(Fhandler->finalize_barrier));//for testing only
 
 
     Fhandler->not_done = true;
-    Fhandler->reset_wait = false;
+    Fhandler->reset_wait = false;
 
+
     while(Fhandler->not_done)
     {
 
@@ -226,9 +251,30 @@
                 Fhandler->seed += 75;
             }
             else
-            {
-                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
-                result++;
+            {
+                list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+
+                int chunk_size_buff;
+                int buff_pos=0;
+                int file_pos;
+
+                for(int i = 0; i < tmp_y_blockSize; i++)
+                {
+                    for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+                    {
+                        file_pos = Fhandler->n*i + it->first;
+                        fseek ( fp_Y , file_pos*sizeof(type_precision) , SEEK_SET );
+                        chunk_size_buff = it->second;
+
+                        size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Y); result++;
+                        buff_pos += chunk_size_buff;
+
+
+                    }
+                }
+//                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
+//                result++;
                 if(Fhandler->y_to_readSize <= 0)
                 {
                     fseek ( fp_Y , 0 , SEEK_SET );
@@ -267,17 +313,38 @@
             if(Fhandler->fakefiles)
             {
                 fseek ( fp_Ar , 0 , SEEK_SET );
-                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
-                result++;
+                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar); result++;
 
                 re_random_vec(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
                 re_random_vec_nan(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
 
             }
             else
-            {
-                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
-                result++;
+            {
+
+                list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+                int chunk_size_buff;
+                int buff_pos=0;
+                int file_pos;
+
+                for(int i = 0; i < tmp_ar_blockSize*Fhandler->r; i++)
+                {
+                    for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+                    {
+                        file_pos = Fhandler->n*i + it->first;
+                        fseek ( fp_Ar , file_pos*sizeof(type_precision) , SEEK_SET );
+
+                        chunk_size_buff = it->second;
+                        size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Ar); result++;
+                        buff_pos += chunk_size_buff;
+
+
+                    }
+                }
+
+//                size_t result = fread(tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
+//                result++;
                 if (Fhandler->Ar_to_readSize <= 0)
                 {
                     fseek ( fp_Ar , 0 , SEEK_SET );
@@ -307,8 +374,8 @@
             if(Fhandler->fakefiles)
             {
                 fseek ( fp_B , 0 , SEEK_SET );
-	  }
-		fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
+            }
+            fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
 
 
             Fhandler->b_empty_buffers.push(tobeWritten);
@@ -356,7 +423,8 @@
     //cout << "k" << flush;
     //barrier
     pthread_barrier_wait(&(Fhandler->finalize_barrier));
-
+
+    {
     type_buffElement* tmp;
     while(!Fhandler->full_buffers.empty())
     {
@@ -404,6 +472,7 @@
        Fhandler->b_empty_buffers.pop();
        delete []tmp->buff;
        delete tmp;
+    }
     }
 
 
@@ -544,13 +613,9 @@
 
 void AIOwrapper::write_B(type_precision* B, int p, int blockSize)
 {
-     //int status;
-    //int createstatus = 0;
-   // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
 
     while(Fhandler->b_empty_buffers.empty())
     {
-
         pthread_mutex_lock(&(Fhandler->m_more));
         pthread_cond_signal( &(Fhandler->condition_more ));
         pthread_mutex_unlock(&(Fhandler->m_more));
@@ -568,7 +633,7 @@
 
 
 
-
+        //cout << Fhandler->b_empty_buffers.size() << flush;
         Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
         Fhandler->b_empty_buffers.pop();
 
@@ -582,12 +647,9 @@
 
 
 
-    // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
-
     pthread_mutex_unlock(&(Fhandler->m_buff_upd));
 
 
-
     pthread_mutex_lock(&(Fhandler->m_more));
     pthread_cond_signal( &(Fhandler->condition_more ));
     pthread_mutex_unlock(&(Fhandler->m_more));
@@ -668,8 +730,12 @@
 //        {
 //            (tmp->buff)[i] = 0;
 //        }
-        Fhandler->b_empty_buffers.push(tmp);
+        Fhandler->b_empty_buffers.push(tmp);
+
+//        Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
+//        Fhandler->b_empty_buffers.pop();
 
+
     }
 
 }
@@ -819,17 +885,38 @@
         (*AL) = Fhandler->AL;
     }
     else
-    {
+    {
         FILE *fp;
         fp = fopen((Fhandler->fnameAL+".fvd").c_str(), "rb");
         if(fp == 0)
         {
             cout << "Error Reading File " << Fhandler->fnameAL << endl;
             exit(1);
-        }
+        }
+
+        list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+        int chunk_size_buff;
+        int buff_pos=0;
+        int file_pos;
+
+        for (int i=0; i < Fhandler->l; i++)
+        {
+            for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+            {
+                file_pos = i*Fhandler->n+ it->first;
+                fseek ( fp , file_pos*sizeof(type_precision) , SEEK_SET );
+                chunk_size_buff = it->second;
+
+                size_t result = fread (&(Fhandler->AL[buff_pos]),sizeof(type_precision),chunk_size_buff,fp); result++;
+                buff_pos += chunk_size_buff;
+            }
+        }
 
-        size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
-        result++;
+
+//        size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
+//
+//        result++;
         fclose(fp);
     }
 
@@ -862,6 +949,111 @@
 {
     delete []Fhandler->AL;
 }
+
+
+void AIOwrapper::read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList)
+{
+
+    ifstream fp_exL(fname_excludeList.c_str());
+    if(fp_exL == 0)
+    {
+        cout << "Error reading exclude list file."<< endl;
+        exit(1);
+    }
+
+
+    string line;
+    excl_count = 0;
+    bool early_EOF;
+    int first,second, prev_second;
+
+     cout << "Excluding Ids: \n";
+
+    std::getline(fp_exL, line);
+    std::istringstream iss(line);
+    iss >> first;
+    early_EOF = iss.eof();
+    iss >> prev_second;
+
+    if(prev_second < first || early_EOF)
+            prev_second = first;
+
+    second = prev_second;
+
+
+    if(first > max_excl)
+    {
+        excl->push_back( make_pair(0,max_excl) );
+        cout << "\nNothing to Exclude!\n";
+    }
+    else
+    {
+        if(second > max_excl)
+        {
+                excl_count += max_excl-first+1;
+                excl->push_back( make_pair(0,first-1) );
+                cout << first << "-" << second << ", ";
+        }
+        else
+        {
+
+            cout << first << "-" << second << ", ";
+            excl->push_back( make_pair(0,first - 1) );
+            excl_count += second-(first-1);
+
+            while (std::getline(fp_exL, line) && second < max_excl )
+            {
+                std::istringstream iss(line);
+
+                iss >> first;
+                early_EOF = iss.eof();
+                iss >> second;
+                if(prev_second >= first)
+                {
+                    cout << "\nPlease give an ordered Exlusion List!\n";
+                    cout << "?? " << prev_second << "\n" << first << " ??"<< endl;
+                    exit( 1 );
+                }
+
+
+                if(second < first || early_EOF )
+                    second = first;
+
+
+                if(second > max_excl)
+                    excl_count += max_excl-first+1;
+                else if(first < max_excl)
+                    excl_count += second-(first-1);
+
+
+
+                cout << first << "-" << second << ", ";
+
+                excl->push_back( make_pair(prev_second,first - prev_second - 1) );
+
+
+                prev_second = second;
+
+
+            }
+        }
+    }
+
+
+
+
+    if(excl_count >= max_excl)
+    {
+		cout << "\nExclusion List excluded all data!\n";
+		cout << "Total Ids: " << max_excl << "\nExcluded Ids: " << excl_count << endl;
+		exit( 1 );
+	}
+	 cout << "Excluded: "  << excl_count << " Using: " << max_excl-excl_count<< endl;
+
+
+
+}
+
 
 void AIOwrapper::free_databel_fvi( struct databel_fvi **fvi )
 {

Modified: pkg/OmicABELnoMM/src/AIOwrapper.h
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.h	2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/AIOwrapper.h	2014-07-15 08:17:29 UTC (rev 1763)
@@ -2,7 +2,9 @@
 #define AIOWRAPPER_H
 
 #include "Definitions.h"
-#include "Utility.h"
+#include "Utility.h"
+#include <fstream>
+#include <sstream>
 
 typedef struct BufferElement type_buffElement;
 
@@ -20,9 +22,15 @@
 {
     string fnameAL;
     string fnameAR;
-    string fnameY;
-    string fnameOutB;
+    string fnameY;
+
 
+    string fnameOutB;
+
+
+    list< pair<int,int> >* excl_List;
+
+
     bool doublefileType;
     bool fakefiles;
 
@@ -138,7 +146,9 @@
 
     protected:
 
-    private:
+    private:
+
+        void read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList);
 
 
         void prepare_AR( int desired_blockSize, int n, int totalR, int columnsR);
@@ -163,7 +173,7 @@
 
         void * fgls_malloc_impl( const char* file, long line, size_t size );
 
-        type_fileh FHandler;
+            public: type_fileh FHandler;
         type_fileh* Fhandler;
 
         FILE* fp_Ar;

Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
===================================================================
--- pkg/OmicABELnoMM/src/Algorithm.cpp	2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/Algorithm.cpp	2014-07-15 08:17:29 UTC (rev 1763)
@@ -1,5 +1,7 @@
-#include "Algorithm.h"
+#include "Algorithm.h"
+
 
+
 Algorithm::Algorithm()
 {
     // ctor
@@ -51,41 +53,39 @@
 }
 
 
-void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* top,
-                               type_precision* bot, int dim1_b,
-                               int dim2_b, int dim1_b_bot)
+void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* bsource, int a_amount, int y_amount, int p)
 {
-    // memcpy are faster version of the fors
-    int i, k, w, top_idx, bot_idx;
-    int size;
-    top_idx = 0;
-    bot_idx = 0;
-    for (k = 0; k < dim2_b; k++)
-    {
-        size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
-        memcpy( (type_precision*)&bfinal[k * dim1_b],
-                (type_precision*)&top[top_idx],
-                size * sizeof(type_precision) );
-//        for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
-//        {
-//            bfinal[i] = top[top_idx];
-//            top_idx++;
-//        }
-        top_idx += size;
-        i = k * dim1_b + size;
-        w = i;
-
-        size = w + dim1_b_bot - w;
-        memcpy( (type_precision*)&bfinal[w],
-                (type_precision*)&bot[bot_idx],
-                size * sizeof(type_precision) );
-//        for (i = w; i < w+dim1_b_bot; i++)
-//        {
-//            bfinal[i] = bot[bot_idx];
-//            bot_idx++;
-//        }
-        bot_idx += size;
-    }
+//    // memcpy are faster version of the fors
+//    int i, k, w, top_idx, bot_idx;
+//    int size;
+//    top_idx = 0;
+//    bot_idx = 0;
+//    for (k = 0; k < dim2_b; k++)
+//    {
+//        size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
+//        memcpy( (type_precision*)&bfinal[k * dim1_b],
+//                (type_precision*)&top[top_idx],
+//                size * sizeof(type_precision) );
+////        for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
+////        {
+////            bfinal[i] = top[top_idx];
+////            top_idx++;
+////        }
+//        top_idx += size;
+//        i = k * dim1_b + size;
+//        w = i;
+//
+//        size = w + dim1_b_bot - w;
+//        memcpy( (type_precision*)&bfinal[w],
+//                (type_precision*)&bot[bot_idx],
+//                size * sizeof(type_precision) );
+////        for (i = w; i < w+dim1_b_bot; i++)
+////        {
+////            bfinal[i] = bot[bot_idx];
+////            bot_idx++;
+////        }
+//        bot_idx += size;
+//    }
 }
 
 
@@ -249,12 +249,13 @@
 
     lapack_int info = LAPACKE_sgels(STORAGE_TYPE, 'N', rowsA, colsA, rhs, A,
                                     rowsA, ynew, rowsA);
-    assert(info == 0, "Error Check");
+    myassert(info == 0, "Error Check");
 
 
     int index = 0;
     int index_new = 0;
-    for (i = 0; i < rhs; i++)
+    for (i = 0; i < rhs; i++)
+
     {
          copy_vec(&ynew[index], &new_sol[index_new], colsA);
          index += rowsA;
@@ -305,13 +306,14 @@
 
     srand(time(NULL));
 
-    blas_set_num_threads(max_threads);
+    blas_set_num_threads(max_threads);
+    omp_set_num_threads(max_threads);
 
 
     //type_precision *Ytemp;
     lapack_int info, n, lda, l, r, p;
 
-    cputime_type start_tick, start_tick2, end_tick;
+    cputime_type start_tick, start_tick2, start_tick3, end_tick;
 
     AIOfile.initialize(params);
 
@@ -333,54 +335,70 @@
 
     for (int j = 0; j < y_iters && !params.ForceCheck; j++)
     {
-        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
+        if (!params.ForceCheck &&  y_iters >= 10 && (j%(y_iters/10)) == 0 )
         {
             cout << "*" << flush;
         }
 
         for (int i = 0; i < a_iters; i++)
         {
-            for (int jj = 0; jj < y_block_size; jj++)
-            {
-                if (y_iters < 40 &&
-                    (y_block_size < 3 || (jj%(y_block_size/3)) == 0)
-                    )
+
+                if ( !params.ForceCheck &&  y_iters < 10 &&
+                (  (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
                 {
                     cout << "*" << flush;
                 }
-            }
+
         }
     }
 
     if(!params.ForceCheck)
         cout << endl;
 
-
+    //add memalign
 
     //type_precision Stl[l*l];
     //type_precision Str[l*r*a_block_size];
-    type_precision* Stl = new type_precision[l*l];
-    type_precision* Str = new type_precision[l*r*a_block_size];
+    type_precision* Stl = new type_precision[l*l*1];
+    type_precision* Str = new type_precision[l*r*a_block_size*1];
 
     type_precision* Sbr = new type_precision[r *  r * a_block_size];
-    type_precision* Ay = new type_precision[p * a_block_size];
+    type_precision* Ay = new type_precision[p * a_block_size];
+    //type_precision* B_resorted = new type_precision[p * a_block_size*y_block_size];
 
-    type_precision* S = new type_precision[p * p];
+    type_precision* S = new type_precision[p * p];
 
+
     type_precision* Ay_top = new type_precision[l * y_amount];
     type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
 
-    list<long int>* y_nan_idxs = new list<long int>[y_block_size];
+    type_precision* y_residual = new type_precision[n * y_block_size ];
+    type_precision* y_res_norms = new type_precision[a_block_size];
+
+    list<long int>* al_nan_idxs = new list<long int>[1];
+    list<long int>* y_nan_idxs = new list<long int>[y_block_size];
+    list<long int>* ar_nan_idxs = new list<long int>[a_block_size];
 
+
+    type_precision* A = new type_precision[n * p * 1];
     type_precision* AR = new type_precision[n * r * a_block_size * 1];
-    type_precision* AL = new type_precision[n * l * 1];
+//  type_precision* AL = new type_precision[n * l * 1];
+    type_precision* AL = A;
+
+    type_precision* B = Ay;
+
     type_precision* backupAR;  // = new type_precision[n*r*a_block_size];
     type_precision* backupAL;  // = new type_precision[n*l];
 
 
     AIOfile.load_AL(&backupAL);
-    //int total_al_nans = replace_nans(0, backupAL, n, l);
-    replace_nans(0, backupAL, n, l);
+
+    //pthread_barrier_wait(&(AIOfile.Fhandler->finalize_barrier));
+
+    replace_nans(al_nan_idxs,1, backupAL, n, l);
+    al_nan_idxs->push_back(1);
+
+    //LAPACKE_dgesdd()
 
     copy_vec(backupAL, AL, n*l);
 
@@ -394,176 +412,369 @@
 
     for (int j = 0; j < y_iters; j++)
     {
-        if (y_iters >= 40 && (j%(y_iters/40)) == 0 && !params.ForceCheck)
+        if (!params.ForceCheck &&  y_iters >= 10 && (j%(y_iters/10)) == 0 && !params.ForceCheck)
         {
             cout << AIOfile.io_overhead << flush;
             AIOfile.io_overhead = "*";
         }
+
+        get_ticks(start_tick2);
 
-        AIOfile.load_Yblock(&Y, y_block_size);
+        AIOfile.load_Yblock(&Y, y_block_size);
+
+        get_ticks(end_tick);
+        out.acc_loady += ticks2sec(end_tick,start_tick2);
 
+        get_ticks(start_tick2);
+        replace_nans(&y_nan_idxs[0],y_block_size, Y, n,1);
+        //out.acc_other += ticks2sec(end_tick,start_tick2);
 
 
-        //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
-        replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+        get_ticks(start_tick2);
 
-
         //! Ay_top = AL'*Y
         cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                     l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
-                    &Ay_top[j * l * y_block_size], l);
+                    &Ay_top[j * l * y_block_size], l);
+
+        get_ticks(end_tick);
+        out.acc_gemm += ticks2sec(end_tick,start_tick2);
 
 
         for (int i = 0; i < a_iters; i++)
-        {
-            AIOfile.load_ARblock(&backupAR, a_block_size);
-            //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
-            replace_nans(0, backupAR, n, a_block_size * r);
-            copy_vec(backupAR, AR, n *  r * a_block_size);
+        {
+
+            if (!params.ForceCheck && y_iters < 10 &&
+                (  (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
+            {
+                cout << "*" << flush;
+            }
+
+            get_ticks(start_tick2);
 
-            get_ticks(start_tick2);
+            AIOfile.load_ARblock(&backupAR, a_block_size);
+
+            get_ticks(end_tick);
+            out.acc_loadxr += ticks2sec(end_tick,start_tick2);
+
+            get_ticks(start_tick2);
+
+            replace_nans(&ar_nan_idxs[0],a_block_size, backupAR, n, r);
+            replace_with_zeros(al_nan_idxs, backupAR,  n, r, a_block_size);
 
+            copy_vec(backupAR, AR, n *  r * a_block_size);
+
             get_ticks(end_tick);
-            out.acc_pre += ticks2sec(end_tick-start_tick2, cpu_freq);
+            //out.acc_other += ticks2sec(end_tick,start_tick2);
 
-            get_ticks(start_tick2);
+
+
+            get_ticks(start_tick2);
+
             //! Ay_bot = AR'*Y
             cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
                         r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
                         0.0, Ay_bot, r * a_block_size);
 
             get_ticks(end_tick);
-            out.firstloop += ticks2sec(end_tick - start_tick2, cpu_freq);
+            out.acc_gemm += ticks2sec(end_tick,start_tick2);
+
+            int aL_idx =  0*l * n;
+            int aR_idx = 0* r * n * a_block_size;
 
-            //#pragma omp parallel default(shared)
-            {
-                for (int jj = 0; jj < y_block_size; jj++)
-                {
-                    int thread_id = 0 * omp_get_thread_num();
-                    int aL_idx = thread_id * l * n;
-                    int aR_idx = thread_id * r * n * a_block_size;
+            get_ticks(start_tick3);
+            for (int jj = 0; jj < y_block_size; jj++)
+            {
+
+                //int thread_id = 0 * omp_get_thread_num();//so far singel thread version only
 
-                    if (y_iters < 40 &&
-                                  (y_block_size < 3 ||
-                                                  (jj%(y_block_size/3)) == 0) && !params.ForceCheck)
-                    {
-                        cout << AIOfile.io_overhead << flush;
-                        AIOfile.io_overhead = "*";
-                    }
+
+                get_ticks(start_tick2);
+
+                copy_vec(backupAL, &AL[aL_idx], n * l);//try to remove!
 
-                    copy_vec(backupAL, &AL[aL_idx], n * l);
+                replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
+
+
+                get_ticks(end_tick);//2%
+                out.acc_other += ticks2sec(end_tick,start_tick2);
+
+                 get_ticks(start_tick2);
 
-                    replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
-                    //! Generate Stl
-                    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
-                                l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
+                //! Generate Stl
+                cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
+                            l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
+
+                get_ticks(end_tick);
+                out.acc_stl += ticks2sec(end_tick,start_tick2);
+
+
+                get_ticks(start_tick2);
+
+                copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);//!10%//try to remove!
+
+
+                replace_with_zeros(&y_nan_idxs[jj], backupAR,  n, r, a_block_size);
 
-                    copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);
-                    replace_with_zeros(&y_nan_idxs[jj], &AR[aR_idx],
-                                       n, r, a_block_size);
+
+
+                get_ticks(end_tick);
+                out.acc_other += ticks2sec(end_tick,start_tick2);
+
+                get_ticks(start_tick2);
 
-                    get_ticks(start_tick2);
-                    //! Generate Str
-                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                                l, r * a_block_size, n, 1.0, &AL[aL_idx],
-                                n, &AR[aR_idx], n, 0.0, Str, l);
-                    get_ticks(end_tick);
-                    out.acc_RTL_QLY += ticks2sec(end_tick - start_tick2,
-                                                 cpu_freq);
+                //! Generate Str
+                cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                            l, r * a_block_size, n, 1.0, &AL[aL_idx],
+                            n, &AR[aR_idx], n, 0.0, Str, l);//!45
 
+                get_ticks(end_tick);
+                out.acc_str += ticks2sec(end_tick,start_tick2);
+
 
-                    //type_precision Sbr[r *  r * a_block_size];
-                    //type_precision Ay[p * a_block_size];
 
+                //type_precision Sbr[r *  r * a_block_size*1];
+                //type_precision Ay[p * a_block_size*1];
+
+                blas_set_num_threads(1);
+                omp_set_num_threads(max_threads);
+
+                #pragma omp parallel default(shared)
+                {
 
-                    //#pragma omp for nowait  schedule(dynamic)
-                    for (int ii= 0; ii < a_block_size; ii++)
-                    {
-                        get_ticks(start_tick2);
-                        //! Generate Sbr
-                        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                                    r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
-                                    &AR[aR_idx + ii * r * n], n, 0.0,
-                                    &Sbr[ii * r * r], r);
-                        get_ticks(end_tick);
-                        out.acc_gemm += ticks2sec(end_tick - start_tick2,
-                                                  cpu_freq);
+                #pragma omp for nowait
+                for (int ii= 0; ii < a_block_size; ii++)
+                {
+                    //cout << omp_get_thread_num() << endl << flush;
 
-                        get_ticks(start_tick2);
 
-                        copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
-                        copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
-                                 &Ay[l+ii*p], r);
+                    get_ticks(start_tick2);
 
+                    //! Generate Sbr
+                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+                                r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
+                                &AR[aR_idx + ii * r * n], n, 0.0,
+                                &Sbr[ii * r * r], r);
+
 
-                        //type_precision* B = Ay;
-                        //type_precision S[p * p];
+                    get_ticks(end_tick);
+                    out.acc_sbr += ticks2sec(end_tick,start_tick2 );
+
+                    get_ticks(start_tick2);
 
+                    copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
+                    copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
+                             &Ay[l+ii*p], r);
+
+
+                    type_precision S2[p * p];
 
-                        //! Rebuild S
-                        build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
-                        // matlab_print_matrix("S", p, p, S);
+                    //! Rebuild S
+                    build_S(S2, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
+                    // matlab_print_matrix("S", p, p, S);
 
-                        get_ticks(end_tick);
-                        out.acc_b += ticks2sec(end_tick - start_tick2,
-                                               cpu_freq);
+
+                    get_ticks(end_tick);//5%
+                    out.acc_other += ticks2sec(end_tick,start_tick2);
 
-                        get_ticks(start_tick2);
+                    get_ticks(start_tick2);
 
-                        //! b = S\Ay
-                        info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S, p,
-                                             &Ay[ii*p], p);
 
-                        get_ticks(end_tick);
-                        out.acc_loadxr += ticks2sec(end_tick - start_tick2,
-                                                    cpu_freq);
-                        assert(info == 0, "POSV");
+                    //! b = S\Ay
+                    info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S2, p,
+                                         &Ay[ii*p], p);
+
+                    myassert(info == 0, "S\\Ay");
+
 
-                        if (params.ForceCheck)
+                    get_ticks(end_tick);
+                    out.acc_solve += ticks2sec(end_tick,start_tick2 );
+
+
+                    if (params.ForceCheck)
+                    {
+                        #pragma omp critical
                         {
-                            #pragma omp critical
-                            {
-                                check_result(AL, &AR[aR_idx+ii*r*n], n, p,
-                                             1, r, &Y[jj*n], &Ay[ii*p]);
-                            }
+                            check_result(AL, &AR[aR_idx+ii*r*n], n, p,
+                                         1, r, &Y[jj*n], &Ay[ii*p]);
                         }
                     }
+                }
+                }
+
+                /************statistics**************/
+//                type_precision* T;
+//                type_precision* R2;
+//                type_precision* P;
+
+                //hpc_statistics(n,A,a_block_size,Y,y_block_size,B,p,T,R2,P);
+
+                /**************************/
+
+                blas_set_num_threads(max_threads);
+
+                get_ticks(start_tick2);
 
-                    AIOfile.write_B(Ay, p, a_block_size);
-                }
-            }
+                AIOfile.write_B(B, p, a_block_size);
+
+                get_ticks(end_tick);
+                out.acc_storeb += ticks2sec(end_tick,start_tick2);
+
+
+
+
+
+            }
+
+            get_ticks(end_tick);
+            out.acc_real_innerloops += ticks2sec(end_tick ,start_tick3);
+
+
         }
          AIOfile.reset_AR();
     }
 
-    get_ticks(end_tick);
-    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
-    // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
-    out.gflops = y_amount * (gemm_flops(l, n, 1, 0) +
-                             a_amount * (gemm_flops(r, n, 1, 0) +
-                                         gemm_flops(l, n, l, 0) +
-                                         gemm_flops(l, n, r, 0) +
-                                         gemm_flops(r, n, r, 0) +
+    get_ticks(end_tick);
+
+    out.duration = ticks2sec(end_tick ,start_tick);
+
+
+    out.gflops = y_iters * (gemm_flops(l, n, params.tb*1, 0) +
+                             a_iters * ( gemm_flops(params.mb*r, n, params.tb*1, 0) +
+                             params.tb *(  gemm_flops(l, 1*n, l, 0) + gemm_flops(l, n, params.mb *r, 0) +
+                                    params.mb * (
+                                         gemm_flops(r, 1*n, r, 0) +
                                          (p * p * p / 3.0) /
-                                         1000.0/1000.0/1000.0));
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/genabel -r 1763


More information about the Genabel-commits mailing list