[Genabel-commits] r1763 - in pkg/OmicABELnoMM: . src tests
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Jul 15 10:17:30 CEST 2014
Author: afrank
Date: 2014-07-15 10:17:29 +0200 (Tue, 15 Jul 2014)
New Revision: 1763
Added:
pkg/OmicABELnoMM/tests/Makefile.am
Modified:
pkg/OmicABELnoMM/Makefile.am
pkg/OmicABELnoMM/configure.ac
pkg/OmicABELnoMM/src/AIOwrapper.cpp
pkg/OmicABELnoMM/src/AIOwrapper.h
pkg/OmicABELnoMM/src/Algorithm.cpp
pkg/OmicABELnoMM/src/Algorithm.h
pkg/OmicABELnoMM/src/Definitions.h
pkg/OmicABELnoMM/src/Utility.cpp
pkg/OmicABELnoMM/src/Utility.h
pkg/OmicABELnoMM/src/main.cpp
pkg/OmicABELnoMM/tests/Makefile
pkg/OmicABELnoMM/tests/test.cpp
Log:
Added R2, T-stat, Pval functions, but inactive atm. Added exclusion list support to remove individuals. Fixed propagation of missing values. Performance Improvements.
Modified: pkg/OmicABELnoMM/Makefile.am
===================================================================
--- pkg/OmicABELnoMM/Makefile.am 2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/Makefile.am 2014-07-15 08:17:29 UTC (rev 1763)
@@ -2,7 +2,7 @@
oanomm_headers = src/Definitions.h src/AIOwrapper.h src/Algorithm.h \
src/Utility.h
-oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
+oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
src/main.cpp
omicabelnomm_SOURCES = $(oanomm_headers) $(oanomm_cpp)
Modified: pkg/OmicABELnoMM/configure.ac
===================================================================
--- pkg/OmicABELnoMM/configure.ac 2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/configure.ac 2014-07-15 08:17:29 UTC (rev 1763)
@@ -17,11 +17,12 @@
# Set some default compile flags
if test -z "$CXXFLAGS"; then
# User did not set CXXFLAGS, so we can put in our own defaults
- CXXFLAGS="-g -O2"
+ CXXFLAGS="-O3"
fi
if test -z "$CPPFLAGS"; then
# User did not set CPPFLAGS, so we can put in our own defaults
CPPFLAGS="-Wall -g -pedantic -Wunused-result -Wmaybe-uninitialized -Wformat"
+ #CPPFLAGS="-Wall"
fi
# If CXXFLAGS/CPPFLAGS are already set AC_PROG_CXX will not overwrite them
# with its own defaults
@@ -45,13 +46,18 @@
AC_MSG_ERROR([Unable to find a Lapack library])
])
+#Boost
+
+
# Check for openMP. If found the OPENMP_CXXFLAGS is set automatically
AC_OPENMP
AC_SUBST(AM_CXXFLAGS, "$OPENMP_CFLAGS")
+
+
# Checks for header files.
AC_CHECK_HEADERS([limits.h stdlib.h string.h sys/time.h unistd.h])
Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-07-15 08:17:29 UTC (rev 1763)
@@ -28,16 +28,21 @@
pthread_barrier_init(&(FHandler.finalize_barrier),NULL,2);
- Fhandler->fakefiles = params.use_fake_files;
+ Fhandler->fakefiles = params.use_fake_files;
+
+
+
+
if(!Fhandler->fakefiles)
{
Fhandler->fnameAL = params.fnameAL;
Fhandler->fnameAR = params.fnameAR;
Fhandler->fnameY = params.fnameY;
- Fhandler->fnameOutB = params.fnameOutB;
+ Fhandler->fnameOutB = params.fnameOutB;
+
Yfvi = load_databel_fvi( (Fhandler->fnameY+".fvi").c_str() );
ALfvi = load_databel_fvi( (Fhandler->fnameAL+".fvi").c_str() );
ARfvi = load_databel_fvi( (Fhandler->fnameAR+".fvi").c_str() );
@@ -46,7 +51,7 @@
params.t = Yfvi->fvi_header.numVariables;
params.l = ALfvi->fvi_header.numVariables;
- int opt_tb = 1000;
+ int opt_tb = 1000;
int opt_mb = 1000;
params.mb = min(params.m, opt_tb);
@@ -56,7 +61,22 @@
else
{
- }
+ }
+
+ //params.fname_excludelist = "exclfile.txt";
+ int excl_count = 0;
+ Fhandler->excl_List = new list< pair<int,int> >();
+
+ if(params.fname_excludelist.size()==0)
+ {
+ (Fhandler->excl_List)->push_back( make_pair(0,params.n) );
+ }
+ else
+ {
+ read_excludeList( Fhandler->excl_List,excl_count,params.n,params.fname_excludelist);
+ }
+
+ params.n -= excl_count;
params.p = params.l + params.r;
@@ -106,7 +126,9 @@
pthread_cond_destroy(&(Fhandler->condition_more));
pthread_mutex_destroy(&(Fhandler->m_read));
- pthread_cond_destroy(&(Fhandler->condition_read));
+ pthread_cond_destroy(&(Fhandler->condition_read));
+
+ delete Fhandler->excl_List;
}
@@ -186,12 +208,15 @@
}
//cout << "\nEnd preping files\n" << flush;
- }
+ }
+
+ //pthread_barrier_wait(&(Fhandler->finalize_barrier));//for testing only
Fhandler->not_done = true;
- Fhandler->reset_wait = false;
+ Fhandler->reset_wait = false;
+
while(Fhandler->not_done)
{
@@ -226,9 +251,30 @@
Fhandler->seed += 75;
}
else
- {
- size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
- result++;
+ {
+ list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+
+ int chunk_size_buff;
+ int buff_pos=0;
+ int file_pos;
+
+ for(int i = 0; i < tmp_y_blockSize; i++)
+ {
+ for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+ {
+ file_pos = Fhandler->n*i + it->first;
+ fseek ( fp_Y , file_pos*sizeof(type_precision) , SEEK_SET );
+ chunk_size_buff = it->second;
+
+ size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Y); result++;
+ buff_pos += chunk_size_buff;
+
+
+ }
+ }
+// size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
+// result++;
if(Fhandler->y_to_readSize <= 0)
{
fseek ( fp_Y , 0 , SEEK_SET );
@@ -267,17 +313,38 @@
if(Fhandler->fakefiles)
{
fseek ( fp_Ar , 0 , SEEK_SET );
- size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
- result++;
+ size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar); result++;
re_random_vec(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
re_random_vec_nan(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
}
else
- {
- size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
- result++;
+ {
+
+ list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+ int chunk_size_buff;
+ int buff_pos=0;
+ int file_pos;
+
+ for(int i = 0; i < tmp_ar_blockSize*Fhandler->r; i++)
+ {
+ for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+ {
+ file_pos = Fhandler->n*i + it->first;
+ fseek ( fp_Ar , file_pos*sizeof(type_precision) , SEEK_SET );
+
+ chunk_size_buff = it->second;
+ size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Ar); result++;
+ buff_pos += chunk_size_buff;
+
+
+ }
+ }
+
+// size_t result = fread(tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
+// result++;
if (Fhandler->Ar_to_readSize <= 0)
{
fseek ( fp_Ar , 0 , SEEK_SET );
@@ -307,8 +374,8 @@
if(Fhandler->fakefiles)
{
fseek ( fp_B , 0 , SEEK_SET );
- }
- fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
+ }
+ fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
Fhandler->b_empty_buffers.push(tobeWritten);
@@ -356,7 +423,8 @@
//cout << "k" << flush;
//barrier
pthread_barrier_wait(&(Fhandler->finalize_barrier));
-
+
+ {
type_buffElement* tmp;
while(!Fhandler->full_buffers.empty())
{
@@ -404,6 +472,7 @@
Fhandler->b_empty_buffers.pop();
delete []tmp->buff;
delete tmp;
+ }
}
@@ -544,13 +613,9 @@
void AIOwrapper::write_B(type_precision* B, int p, int blockSize)
{
- //int status;
- //int createstatus = 0;
- // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
while(Fhandler->b_empty_buffers.empty())
{
-
pthread_mutex_lock(&(Fhandler->m_more));
pthread_cond_signal( &(Fhandler->condition_more ));
pthread_mutex_unlock(&(Fhandler->m_more));
@@ -568,7 +633,7 @@
-
+ //cout << Fhandler->b_empty_buffers.size() << flush;
Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
Fhandler->b_empty_buffers.pop();
@@ -582,12 +647,9 @@
- // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
-
pthread_mutex_unlock(&(Fhandler->m_buff_upd));
-
pthread_mutex_lock(&(Fhandler->m_more));
pthread_cond_signal( &(Fhandler->condition_more ));
pthread_mutex_unlock(&(Fhandler->m_more));
@@ -668,8 +730,12 @@
// {
// (tmp->buff)[i] = 0;
// }
- Fhandler->b_empty_buffers.push(tmp);
+ Fhandler->b_empty_buffers.push(tmp);
+
+// Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
+// Fhandler->b_empty_buffers.pop();
+
}
}
@@ -819,17 +885,38 @@
(*AL) = Fhandler->AL;
}
else
- {
+ {
FILE *fp;
fp = fopen((Fhandler->fnameAL+".fvd").c_str(), "rb");
if(fp == 0)
{
cout << "Error Reading File " << Fhandler->fnameAL << endl;
exit(1);
- }
+ }
+
+ list< pair<int,int> >* excl_List = Fhandler->excl_List;
+
+ int chunk_size_buff;
+ int buff_pos=0;
+ int file_pos;
+
+ for (int i=0; i < Fhandler->l; i++)
+ {
+ for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
+ {
+ file_pos = i*Fhandler->n+ it->first;
+ fseek ( fp , file_pos*sizeof(type_precision) , SEEK_SET );
+ chunk_size_buff = it->second;
+
+ size_t result = fread (&(Fhandler->AL[buff_pos]),sizeof(type_precision),chunk_size_buff,fp); result++;
+ buff_pos += chunk_size_buff;
+ }
+ }
- size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
- result++;
+
+// size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
+//
+// result++;
fclose(fp);
}
@@ -862,6 +949,111 @@
{
delete []Fhandler->AL;
}
+
+
+void AIOwrapper::read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList)
+{
+
+ ifstream fp_exL(fname_excludeList.c_str());
+ if(fp_exL == 0)
+ {
+ cout << "Error reading exclude list file."<< endl;
+ exit(1);
+ }
+
+
+ string line;
+ excl_count = 0;
+ bool early_EOF;
+ int first,second, prev_second;
+
+ cout << "Excluding Ids: \n";
+
+ std::getline(fp_exL, line);
+ std::istringstream iss(line);
+ iss >> first;
+ early_EOF = iss.eof();
+ iss >> prev_second;
+
+ if(prev_second < first || early_EOF)
+ prev_second = first;
+
+ second = prev_second;
+
+
+ if(first > max_excl)
+ {
+ excl->push_back( make_pair(0,max_excl) );
+ cout << "\nNothing to Exclude!\n";
+ }
+ else
+ {
+ if(second > max_excl)
+ {
+ excl_count += max_excl-first+1;
+ excl->push_back( make_pair(0,first-1) );
+ cout << first << "-" << second << ", ";
+ }
+ else
+ {
+
+ cout << first << "-" << second << ", ";
+ excl->push_back( make_pair(0,first - 1) );
+ excl_count += second-(first-1);
+
+ while (std::getline(fp_exL, line) && second < max_excl )
+ {
+ std::istringstream iss(line);
+
+ iss >> first;
+ early_EOF = iss.eof();
+ iss >> second;
+ if(prev_second >= first)
+ {
+ cout << "\nPlease give an ordered Exlusion List!\n";
+ cout << "?? " << prev_second << "\n" << first << " ??"<< endl;
+ exit( 1 );
+ }
+
+
+ if(second < first || early_EOF )
+ second = first;
+
+
+ if(second > max_excl)
+ excl_count += max_excl-first+1;
+ else if(first < max_excl)
+ excl_count += second-(first-1);
+
+
+
+ cout << first << "-" << second << ", ";
+
+ excl->push_back( make_pair(prev_second,first - prev_second - 1) );
+
+
+ prev_second = second;
+
+
+ }
+ }
+ }
+
+
+
+
+ if(excl_count >= max_excl)
+ {
+ cout << "\nExclusion List excluded all data!\n";
+ cout << "Total Ids: " << max_excl << "\nExcluded Ids: " << excl_count << endl;
+ exit( 1 );
+ }
+ cout << "Excluded: " << excl_count << " Using: " << max_excl-excl_count<< endl;
+
+
+
+}
+
void AIOwrapper::free_databel_fvi( struct databel_fvi **fvi )
{
Modified: pkg/OmicABELnoMM/src/AIOwrapper.h
===================================================================
--- pkg/OmicABELnoMM/src/AIOwrapper.h 2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/AIOwrapper.h 2014-07-15 08:17:29 UTC (rev 1763)
@@ -2,7 +2,9 @@
#define AIOWRAPPER_H
#include "Definitions.h"
-#include "Utility.h"
+#include "Utility.h"
+#include <fstream>
+#include <sstream>
typedef struct BufferElement type_buffElement;
@@ -20,9 +22,15 @@
{
string fnameAL;
string fnameAR;
- string fnameY;
- string fnameOutB;
+ string fnameY;
+
+ string fnameOutB;
+
+
+ list< pair<int,int> >* excl_List;
+
+
bool doublefileType;
bool fakefiles;
@@ -138,7 +146,9 @@
protected:
- private:
+ private:
+
+ void read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList);
void prepare_AR( int desired_blockSize, int n, int totalR, int columnsR);
@@ -163,7 +173,7 @@
void * fgls_malloc_impl( const char* file, long line, size_t size );
- type_fileh FHandler;
+ public: type_fileh FHandler;
type_fileh* Fhandler;
FILE* fp_Ar;
Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
===================================================================
--- pkg/OmicABELnoMM/src/Algorithm.cpp 2014-07-08 13:16:45 UTC (rev 1762)
+++ pkg/OmicABELnoMM/src/Algorithm.cpp 2014-07-15 08:17:29 UTC (rev 1763)
@@ -1,5 +1,7 @@
-#include "Algorithm.h"
+#include "Algorithm.h"
+
+
Algorithm::Algorithm()
{
// ctor
@@ -51,41 +53,39 @@
}
-void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* top,
- type_precision* bot, int dim1_b,
- int dim2_b, int dim1_b_bot)
+void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* bsource, int a_amount, int y_amount, int p)
{
- // memcpy are faster version of the fors
- int i, k, w, top_idx, bot_idx;
- int size;
- top_idx = 0;
- bot_idx = 0;
- for (k = 0; k < dim2_b; k++)
- {
- size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
- memcpy( (type_precision*)&bfinal[k * dim1_b],
- (type_precision*)&top[top_idx],
- size * sizeof(type_precision) );
-// for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
-// {
-// bfinal[i] = top[top_idx];
-// top_idx++;
-// }
- top_idx += size;
- i = k * dim1_b + size;
- w = i;
-
- size = w + dim1_b_bot - w;
- memcpy( (type_precision*)&bfinal[w],
- (type_precision*)&bot[bot_idx],
- size * sizeof(type_precision) );
-// for (i = w; i < w+dim1_b_bot; i++)
-// {
-// bfinal[i] = bot[bot_idx];
-// bot_idx++;
-// }
- bot_idx += size;
- }
+// // memcpy are faster version of the fors
+// int i, k, w, top_idx, bot_idx;
+// int size;
+// top_idx = 0;
+// bot_idx = 0;
+// for (k = 0; k < dim2_b; k++)
+// {
+// size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
+// memcpy( (type_precision*)&bfinal[k * dim1_b],
+// (type_precision*)&top[top_idx],
+// size * sizeof(type_precision) );
+//// for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
+//// {
+//// bfinal[i] = top[top_idx];
+//// top_idx++;
+//// }
+// top_idx += size;
+// i = k * dim1_b + size;
+// w = i;
+//
+// size = w + dim1_b_bot - w;
+// memcpy( (type_precision*)&bfinal[w],
+// (type_precision*)&bot[bot_idx],
+// size * sizeof(type_precision) );
+//// for (i = w; i < w+dim1_b_bot; i++)
+//// {
+//// bfinal[i] = bot[bot_idx];
+//// bot_idx++;
+//// }
+// bot_idx += size;
+// }
}
@@ -249,12 +249,13 @@
lapack_int info = LAPACKE_sgels(STORAGE_TYPE, 'N', rowsA, colsA, rhs, A,
rowsA, ynew, rowsA);
- assert(info == 0, "Error Check");
+ myassert(info == 0, "Error Check");
int index = 0;
int index_new = 0;
- for (i = 0; i < rhs; i++)
+ for (i = 0; i < rhs; i++)
+
{
copy_vec(&ynew[index], &new_sol[index_new], colsA);
index += rowsA;
@@ -305,13 +306,14 @@
srand(time(NULL));
- blas_set_num_threads(max_threads);
+ blas_set_num_threads(max_threads);
+ omp_set_num_threads(max_threads);
//type_precision *Ytemp;
lapack_int info, n, lda, l, r, p;
- cputime_type start_tick, start_tick2, end_tick;
+ cputime_type start_tick, start_tick2, start_tick3, end_tick;
AIOfile.initialize(params);
@@ -333,54 +335,70 @@
for (int j = 0; j < y_iters && !params.ForceCheck; j++)
{
- if (y_iters >= 40 && (j%(y_iters/40)) == 0)
+ if (!params.ForceCheck && y_iters >= 10 && (j%(y_iters/10)) == 0 )
{
cout << "*" << flush;
}
for (int i = 0; i < a_iters; i++)
{
- for (int jj = 0; jj < y_block_size; jj++)
- {
- if (y_iters < 40 &&
- (y_block_size < 3 || (jj%(y_block_size/3)) == 0)
- )
+
+ if ( !params.ForceCheck && y_iters < 10 &&
+ ( (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
{
cout << "*" << flush;
}
- }
+
}
}
if(!params.ForceCheck)
cout << endl;
-
+ //add memalign
//type_precision Stl[l*l];
//type_precision Str[l*r*a_block_size];
- type_precision* Stl = new type_precision[l*l];
- type_precision* Str = new type_precision[l*r*a_block_size];
+ type_precision* Stl = new type_precision[l*l*1];
+ type_precision* Str = new type_precision[l*r*a_block_size*1];
type_precision* Sbr = new type_precision[r * r * a_block_size];
- type_precision* Ay = new type_precision[p * a_block_size];
+ type_precision* Ay = new type_precision[p * a_block_size];
+ //type_precision* B_resorted = new type_precision[p * a_block_size*y_block_size];
- type_precision* S = new type_precision[p * p];
+ type_precision* S = new type_precision[p * p];
+
type_precision* Ay_top = new type_precision[l * y_amount];
type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
- list<long int>* y_nan_idxs = new list<long int>[y_block_size];
+ type_precision* y_residual = new type_precision[n * y_block_size ];
+ type_precision* y_res_norms = new type_precision[a_block_size];
+
+ list<long int>* al_nan_idxs = new list<long int>[1];
+ list<long int>* y_nan_idxs = new list<long int>[y_block_size];
+ list<long int>* ar_nan_idxs = new list<long int>[a_block_size];
+
+ type_precision* A = new type_precision[n * p * 1];
type_precision* AR = new type_precision[n * r * a_block_size * 1];
- type_precision* AL = new type_precision[n * l * 1];
+// type_precision* AL = new type_precision[n * l * 1];
+ type_precision* AL = A;
+
+ type_precision* B = Ay;
+
type_precision* backupAR; // = new type_precision[n*r*a_block_size];
type_precision* backupAL; // = new type_precision[n*l];
AIOfile.load_AL(&backupAL);
- //int total_al_nans = replace_nans(0, backupAL, n, l);
- replace_nans(0, backupAL, n, l);
+
+ //pthread_barrier_wait(&(AIOfile.Fhandler->finalize_barrier));
+
+ replace_nans(al_nan_idxs,1, backupAL, n, l);
+ al_nan_idxs->push_back(1);
+
+ //LAPACKE_dgesdd()
copy_vec(backupAL, AL, n*l);
@@ -394,176 +412,369 @@
for (int j = 0; j < y_iters; j++)
{
- if (y_iters >= 40 && (j%(y_iters/40)) == 0 && !params.ForceCheck)
+ if (!params.ForceCheck && y_iters >= 10 && (j%(y_iters/10)) == 0 && !params.ForceCheck)
{
cout << AIOfile.io_overhead << flush;
AIOfile.io_overhead = "*";
}
+
+ get_ticks(start_tick2);
- AIOfile.load_Yblock(&Y, y_block_size);
+ AIOfile.load_Yblock(&Y, y_block_size);
+
+ get_ticks(end_tick);
+ out.acc_loady += ticks2sec(end_tick,start_tick2);
+ get_ticks(start_tick2);
+ replace_nans(&y_nan_idxs[0],y_block_size, Y, n,1);
+ //out.acc_other += ticks2sec(end_tick,start_tick2);
- //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
- replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
+ get_ticks(start_tick2);
-
//! Ay_top = AL'*Y
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
- &Ay_top[j * l * y_block_size], l);
+ &Ay_top[j * l * y_block_size], l);
+
+ get_ticks(end_tick);
+ out.acc_gemm += ticks2sec(end_tick,start_tick2);
for (int i = 0; i < a_iters; i++)
- {
- AIOfile.load_ARblock(&backupAR, a_block_size);
- //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
- replace_nans(0, backupAR, n, a_block_size * r);
- copy_vec(backupAR, AR, n * r * a_block_size);
+ {
+
+ if (!params.ForceCheck && y_iters < 10 &&
+ ( (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
+ {
+ cout << "*" << flush;
+ }
+
+ get_ticks(start_tick2);
- get_ticks(start_tick2);
+ AIOfile.load_ARblock(&backupAR, a_block_size);
+
+ get_ticks(end_tick);
+ out.acc_loadxr += ticks2sec(end_tick,start_tick2);
+
+ get_ticks(start_tick2);
+
+ replace_nans(&ar_nan_idxs[0],a_block_size, backupAR, n, r);
+ replace_with_zeros(al_nan_idxs, backupAR, n, r, a_block_size);
+ copy_vec(backupAR, AR, n * r * a_block_size);
+
get_ticks(end_tick);
- out.acc_pre += ticks2sec(end_tick-start_tick2, cpu_freq);
+ //out.acc_other += ticks2sec(end_tick,start_tick2);
- get_ticks(start_tick2);
+
+
+ get_ticks(start_tick2);
+
//! Ay_bot = AR'*Y
cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
0.0, Ay_bot, r * a_block_size);
get_ticks(end_tick);
- out.firstloop += ticks2sec(end_tick - start_tick2, cpu_freq);
+ out.acc_gemm += ticks2sec(end_tick,start_tick2);
+
+ int aL_idx = 0*l * n;
+ int aR_idx = 0* r * n * a_block_size;
- //#pragma omp parallel default(shared)
- {
- for (int jj = 0; jj < y_block_size; jj++)
- {
- int thread_id = 0 * omp_get_thread_num();
- int aL_idx = thread_id * l * n;
- int aR_idx = thread_id * r * n * a_block_size;
+ get_ticks(start_tick3);
+ for (int jj = 0; jj < y_block_size; jj++)
+ {
+
+ //int thread_id = 0 * omp_get_thread_num();//so far singel thread version only
- if (y_iters < 40 &&
- (y_block_size < 3 ||
- (jj%(y_block_size/3)) == 0) && !params.ForceCheck)
- {
- cout << AIOfile.io_overhead << flush;
- AIOfile.io_overhead = "*";
- }
+
+ get_ticks(start_tick2);
+
+ copy_vec(backupAL, &AL[aL_idx], n * l);//try to remove!
- copy_vec(backupAL, &AL[aL_idx], n * l);
+ replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
+
+
+ get_ticks(end_tick);//2%
+ out.acc_other += ticks2sec(end_tick,start_tick2);
+
+ get_ticks(start_tick2);
- replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
- //! Generate Stl
- cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
- l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
+ //! Generate Stl
+ cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
+ l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
+
+ get_ticks(end_tick);
+ out.acc_stl += ticks2sec(end_tick,start_tick2);
+
+
+ get_ticks(start_tick2);
+
+ copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);//!10%//try to remove!
+
+
+ replace_with_zeros(&y_nan_idxs[jj], backupAR, n, r, a_block_size);
- copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);
- replace_with_zeros(&y_nan_idxs[jj], &AR[aR_idx],
- n, r, a_block_size);
+
+
+ get_ticks(end_tick);
+ out.acc_other += ticks2sec(end_tick,start_tick2);
+
+ get_ticks(start_tick2);
- get_ticks(start_tick2);
- //! Generate Str
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- l, r * a_block_size, n, 1.0, &AL[aL_idx],
- n, &AR[aR_idx], n, 0.0, Str, l);
- get_ticks(end_tick);
- out.acc_RTL_QLY += ticks2sec(end_tick - start_tick2,
- cpu_freq);
+ //! Generate Str
+ cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+ l, r * a_block_size, n, 1.0, &AL[aL_idx],
+ n, &AR[aR_idx], n, 0.0, Str, l);//!45
+ get_ticks(end_tick);
+ out.acc_str += ticks2sec(end_tick,start_tick2);
+
- //type_precision Sbr[r * r * a_block_size];
- //type_precision Ay[p * a_block_size];
+ //type_precision Sbr[r * r * a_block_size*1];
+ //type_precision Ay[p * a_block_size*1];
+
+ blas_set_num_threads(1);
+ omp_set_num_threads(max_threads);
+
+ #pragma omp parallel default(shared)
+ {
- //#pragma omp for nowait schedule(dynamic)
- for (int ii= 0; ii < a_block_size; ii++)
- {
- get_ticks(start_tick2);
- //! Generate Sbr
- cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
- r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
- &AR[aR_idx + ii * r * n], n, 0.0,
- &Sbr[ii * r * r], r);
- get_ticks(end_tick);
- out.acc_gemm += ticks2sec(end_tick - start_tick2,
- cpu_freq);
+ #pragma omp for nowait
+ for (int ii= 0; ii < a_block_size; ii++)
+ {
+ //cout << omp_get_thread_num() << endl << flush;
- get_ticks(start_tick2);
- copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
- copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
- &Ay[l+ii*p], r);
+ get_ticks(start_tick2);
+ //! Generate Sbr
+ cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
+ r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
+ &AR[aR_idx + ii * r * n], n, 0.0,
+ &Sbr[ii * r * r], r);
+
- //type_precision* B = Ay;
- //type_precision S[p * p];
+ get_ticks(end_tick);
+ out.acc_sbr += ticks2sec(end_tick,start_tick2 );
+
+ get_ticks(start_tick2);
+ copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
+ copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
+ &Ay[l+ii*p], r);
+
+
+ type_precision S2[p * p];
- //! Rebuild S
- build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
- // matlab_print_matrix("S", p, p, S);
+ //! Rebuild S
+ build_S(S2, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
+ // matlab_print_matrix("S", p, p, S);
- get_ticks(end_tick);
- out.acc_b += ticks2sec(end_tick - start_tick2,
- cpu_freq);
+
+ get_ticks(end_tick);//5%
+ out.acc_other += ticks2sec(end_tick,start_tick2);
- get_ticks(start_tick2);
+ get_ticks(start_tick2);
- //! b = S\Ay
- info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S, p,
- &Ay[ii*p], p);
- get_ticks(end_tick);
- out.acc_loadxr += ticks2sec(end_tick - start_tick2,
- cpu_freq);
- assert(info == 0, "POSV");
+ //! b = S\Ay
+ info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S2, p,
+ &Ay[ii*p], p);
+
+ myassert(info == 0, "S\\Ay");
+
- if (params.ForceCheck)
+ get_ticks(end_tick);
+ out.acc_solve += ticks2sec(end_tick,start_tick2 );
+
+
+ if (params.ForceCheck)
+ {
+ #pragma omp critical
{
- #pragma omp critical
- {
- check_result(AL, &AR[aR_idx+ii*r*n], n, p,
- 1, r, &Y[jj*n], &Ay[ii*p]);
- }
+ check_result(AL, &AR[aR_idx+ii*r*n], n, p,
+ 1, r, &Y[jj*n], &Ay[ii*p]);
}
}
+ }
+ }
+
+ /************statistics**************/
+// type_precision* T;
+// type_precision* R2;
+// type_precision* P;
+
+ //hpc_statistics(n,A,a_block_size,Y,y_block_size,B,p,T,R2,P);
+
+ /**************************/
+
+ blas_set_num_threads(max_threads);
+
+ get_ticks(start_tick2);
- AIOfile.write_B(Ay, p, a_block_size);
- }
- }
+ AIOfile.write_B(B, p, a_block_size);
+
+ get_ticks(end_tick);
+ out.acc_storeb += ticks2sec(end_tick,start_tick2);
+
+
+
+
+
+ }
+
+ get_ticks(end_tick);
+ out.acc_real_innerloops += ticks2sec(end_tick ,start_tick3);
+
+
}
AIOfile.reset_AR();
}
- get_ticks(end_tick);
- out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
- // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
- out.gflops = y_amount * (gemm_flops(l, n, 1, 0) +
- a_amount * (gemm_flops(r, n, 1, 0) +
- gemm_flops(l, n, l, 0) +
- gemm_flops(l, n, r, 0) +
- gemm_flops(r, n, r, 0) +
+ get_ticks(end_tick);
+
+ out.duration = ticks2sec(end_tick ,start_tick);
+
+
+ out.gflops = y_iters * (gemm_flops(l, n, params.tb*1, 0) +
+ a_iters * ( gemm_flops(params.mb*r, n, params.tb*1, 0) +
+ params.tb *( gemm_flops(l, 1*n, l, 0) + gemm_flops(l, n, params.mb *r, 0) +
+ params.mb * (
+ gemm_flops(r, 1*n, r, 0) +
(p * p * p / 3.0) /
- 1000.0/1000.0/1000.0));
[TRUNCATED]
To get the complete diff run:
svnlook diff /svnroot/genabel -r 1763
More information about the Genabel-commits
mailing list