[GenABEL-dev] [Genabel-commits] r1763 - in pkg/OmicABELnoMM: . src tests
L.C. Karssen
lennart at karssen.org
Tue Jul 22 18:42:09 CEST 2014
Thanks Alvaro!
I ran only a quick check, and I noticed that after an 'autoreconf -i'
./configure, make and make check all work. Great!
However, running make distclean doesn't work. I'll commit a fix for that
later.
Now that the Makefile.am files are working fine, I'll delete the
tests/Makefile as well.
Thanks again,
Lennart.
On 15-07-14 10:17, noreply at r-forge.r-project.org wrote:
> Author: afrank
> Date: 2014-07-15 10:17:29 +0200 (Tue, 15 Jul 2014)
> New Revision: 1763
>
> Added:
> pkg/OmicABELnoMM/tests/Makefile.am
> Modified:
> pkg/OmicABELnoMM/Makefile.am
> pkg/OmicABELnoMM/configure.ac
> pkg/OmicABELnoMM/src/AIOwrapper.cpp
> pkg/OmicABELnoMM/src/AIOwrapper.h
> pkg/OmicABELnoMM/src/Algorithm.cpp
> pkg/OmicABELnoMM/src/Algorithm.h
> pkg/OmicABELnoMM/src/Definitions.h
> pkg/OmicABELnoMM/src/Utility.cpp
> pkg/OmicABELnoMM/src/Utility.h
> pkg/OmicABELnoMM/src/main.cpp
> pkg/OmicABELnoMM/tests/Makefile
> pkg/OmicABELnoMM/tests/test.cpp
> Log:
> Added R2, T-stat, Pval functions, but inactive atm. Added exclusion list support to remove individuals. Fixed propagation of missing values. Performance Improvements.
>
> Modified: pkg/OmicABELnoMM/Makefile.am
> ===================================================================
> --- pkg/OmicABELnoMM/Makefile.am 2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/Makefile.am 2014-07-15 08:17:29 UTC (rev 1763)
> @@ -2,7 +2,7 @@
>
> oanomm_headers = src/Definitions.h src/AIOwrapper.h src/Algorithm.h \
> src/Utility.h
> -oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
> +oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
> src/main.cpp
>
> omicabelnomm_SOURCES = $(oanomm_headers) $(oanomm_cpp)
>
> Modified: pkg/OmicABELnoMM/configure.ac
> ===================================================================
> --- pkg/OmicABELnoMM/configure.ac 2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/configure.ac 2014-07-15 08:17:29 UTC (rev 1763)
> @@ -17,11 +17,12 @@
> # Set some default compile flags
> if test -z "$CXXFLAGS"; then
> # User did not set CXXFLAGS, so we can put in our own defaults
> - CXXFLAGS="-g -O2"
> + CXXFLAGS="-O3"
> fi
> if test -z "$CPPFLAGS"; then
> # User did not set CPPFLAGS, so we can put in our own defaults
> CPPFLAGS="-Wall -g -pedantic -Wunused-result -Wmaybe-uninitialized -Wformat"
> + #CPPFLAGS="-Wall"
> fi
> # If CXXFLAGS/CPPFLAGS are already set AC_PROG_CXX will not overwrite them
> # with its own defaults
> @@ -45,13 +46,18 @@
> AC_MSG_ERROR([Unable to find a Lapack library])
> ])
>
> +#Boost
>
> +
> +
> # Check for openMP. If found the OPENMP_CXXFLAGS is set automatically
> AC_OPENMP
> AC_SUBST(AM_CXXFLAGS, "$OPENMP_CFLAGS")
>
>
>
> +
> +
> # Checks for header files.
> AC_CHECK_HEADERS([limits.h stdlib.h string.h sys/time.h unistd.h])
>
>
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.cpp 2014-07-15 08:17:29 UTC (rev 1763)
> @@ -28,16 +28,21 @@
> pthread_barrier_init(&(FHandler.finalize_barrier),NULL,2);
>
>
> - Fhandler->fakefiles = params.use_fake_files;
> + Fhandler->fakefiles = params.use_fake_files;
> +
> +
> +
>
>
> +
> if(!Fhandler->fakefiles)
> {
> Fhandler->fnameAL = params.fnameAL;
> Fhandler->fnameAR = params.fnameAR;
> Fhandler->fnameY = params.fnameY;
> - Fhandler->fnameOutB = params.fnameOutB;
> + Fhandler->fnameOutB = params.fnameOutB;
>
> +
> Yfvi = load_databel_fvi( (Fhandler->fnameY+".fvi").c_str() );
> ALfvi = load_databel_fvi( (Fhandler->fnameAL+".fvi").c_str() );
> ARfvi = load_databel_fvi( (Fhandler->fnameAR+".fvi").c_str() );
> @@ -46,7 +51,7 @@
> params.t = Yfvi->fvi_header.numVariables;
> params.l = ALfvi->fvi_header.numVariables;
>
> - int opt_tb = 1000;
> + int opt_tb = 1000;
> int opt_mb = 1000;
>
> params.mb = min(params.m, opt_tb);
> @@ -56,7 +61,22 @@
> else
> {
>
> - }
> + }
> +
> + //params.fname_excludelist = "exclfile.txt";
> + int excl_count = 0;
> + Fhandler->excl_List = new list< pair<int,int> >();
> +
> + if(params.fname_excludelist.size()==0)
> + {
> + (Fhandler->excl_List)->push_back( make_pair(0,params.n) );
> + }
> + else
> + {
> + read_excludeList( Fhandler->excl_List,excl_count,params.n,params.fname_excludelist);
> + }
> +
> + params.n -= excl_count;
>
> params.p = params.l + params.r;
>
> @@ -106,7 +126,9 @@
> pthread_cond_destroy(&(Fhandler->condition_more));
>
> pthread_mutex_destroy(&(Fhandler->m_read));
> - pthread_cond_destroy(&(Fhandler->condition_read));
> + pthread_cond_destroy(&(Fhandler->condition_read));
> +
> + delete Fhandler->excl_List;
>
>
> }
> @@ -186,12 +208,15 @@
> }
> //cout << "\nEnd preping files\n" << flush;
>
> - }
> + }
> +
> + //pthread_barrier_wait(&(Fhandler->finalize_barrier));//for testing only
>
>
> Fhandler->not_done = true;
> - Fhandler->reset_wait = false;
> + Fhandler->reset_wait = false;
>
> +
> while(Fhandler->not_done)
> {
>
> @@ -226,9 +251,30 @@
> Fhandler->seed += 75;
> }
> else
> - {
> - size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
> - result++;
> + {
> + list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> +
> + int chunk_size_buff;
> + int buff_pos=0;
> + int file_pos;
> +
> + for(int i = 0; i < tmp_y_blockSize; i++)
> + {
> + for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> + {
> + file_pos = Fhandler->n*i + it->first;
> + fseek ( fp_Y , file_pos*sizeof(type_precision) , SEEK_SET );
> + chunk_size_buff = it->second;
> +
> + size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Y); result++;
> + buff_pos += chunk_size_buff;
> +
> +
> + }
> + }
> +// size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
> +// result++;
> if(Fhandler->y_to_readSize <= 0)
> {
> fseek ( fp_Y , 0 , SEEK_SET );
> @@ -267,17 +313,38 @@
> if(Fhandler->fakefiles)
> {
> fseek ( fp_Ar , 0 , SEEK_SET );
> - size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> - result++;
> + size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar); result++;
>
> re_random_vec(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
> re_random_vec_nan(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
>
> }
> else
> - {
> - size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> - result++;
> + {
> +
> + list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> + int chunk_size_buff;
> + int buff_pos=0;
> + int file_pos;
> +
> + for(int i = 0; i < tmp_ar_blockSize*Fhandler->r; i++)
> + {
> + for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> + {
> + file_pos = Fhandler->n*i + it->first;
> + fseek ( fp_Ar , file_pos*sizeof(type_precision) , SEEK_SET );
> +
> + chunk_size_buff = it->second;
> + size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Ar); result++;
> + buff_pos += chunk_size_buff;
> +
> +
> + }
> + }
> +
> +// size_t result = fread(tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> +// result++;
> if (Fhandler->Ar_to_readSize <= 0)
> {
> fseek ( fp_Ar , 0 , SEEK_SET );
> @@ -307,8 +374,8 @@
> if(Fhandler->fakefiles)
> {
> fseek ( fp_B , 0 , SEEK_SET );
> - }
> - fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
> + }
> + fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
>
>
> Fhandler->b_empty_buffers.push(tobeWritten);
> @@ -356,7 +423,8 @@
> //cout << "k" << flush;
> //barrier
> pthread_barrier_wait(&(Fhandler->finalize_barrier));
> -
> +
> + {
> type_buffElement* tmp;
> while(!Fhandler->full_buffers.empty())
> {
> @@ -404,6 +472,7 @@
> Fhandler->b_empty_buffers.pop();
> delete []tmp->buff;
> delete tmp;
> + }
> }
>
>
> @@ -544,13 +613,9 @@
>
> void AIOwrapper::write_B(type_precision* B, int p, int blockSize)
> {
> - //int status;
> - //int createstatus = 0;
> - // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
>
> while(Fhandler->b_empty_buffers.empty())
> {
> -
> pthread_mutex_lock(&(Fhandler->m_more));
> pthread_cond_signal( &(Fhandler->condition_more ));
> pthread_mutex_unlock(&(Fhandler->m_more));
> @@ -568,7 +633,7 @@
>
>
>
> -
> + //cout << Fhandler->b_empty_buffers.size() << flush;
> Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
> Fhandler->b_empty_buffers.pop();
>
> @@ -582,12 +647,9 @@
>
>
>
> - // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
> -
> pthread_mutex_unlock(&(Fhandler->m_buff_upd));
>
>
> -
> pthread_mutex_lock(&(Fhandler->m_more));
> pthread_cond_signal( &(Fhandler->condition_more ));
> pthread_mutex_unlock(&(Fhandler->m_more));
> @@ -668,8 +730,12 @@
> // {
> // (tmp->buff)[i] = 0;
> // }
> - Fhandler->b_empty_buffers.push(tmp);
> + Fhandler->b_empty_buffers.push(tmp);
> +
> +// Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
> +// Fhandler->b_empty_buffers.pop();
>
> +
> }
>
> }
> @@ -819,17 +885,38 @@
> (*AL) = Fhandler->AL;
> }
> else
> - {
> + {
> FILE *fp;
> fp = fopen((Fhandler->fnameAL+".fvd").c_str(), "rb");
> if(fp == 0)
> {
> cout << "Error Reading File " << Fhandler->fnameAL << endl;
> exit(1);
> - }
> + }
> +
> + list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> + int chunk_size_buff;
> + int buff_pos=0;
> + int file_pos;
> +
> + for (int i=0; i < Fhandler->l; i++)
> + {
> + for (list< pair<int,int> >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> + {
> + file_pos = i*Fhandler->n+ it->first;
> + fseek ( fp , file_pos*sizeof(type_precision) , SEEK_SET );
> + chunk_size_buff = it->second;
> +
> + size_t result = fread (&(Fhandler->AL[buff_pos]),sizeof(type_precision),chunk_size_buff,fp); result++;
> + buff_pos += chunk_size_buff;
> + }
> + }
>
> - size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
> - result++;
> +
> +// size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
> +//
> +// result++;
> fclose(fp);
> }
>
> @@ -862,6 +949,111 @@
> {
> delete []Fhandler->AL;
> }
> +
> +
> +void AIOwrapper::read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList)
> +{
> +
> + ifstream fp_exL(fname_excludeList.c_str());
> + if(fp_exL == 0)
> + {
> + cout << "Error reading exclude list file."<< endl;
> + exit(1);
> + }
> +
> +
> + string line;
> + excl_count = 0;
> + bool early_EOF;
> + int first,second, prev_second;
> +
> + cout << "Excluding Ids: \n";
> +
> + std::getline(fp_exL, line);
> + std::istringstream iss(line);
> + iss >> first;
> + early_EOF = iss.eof();
> + iss >> prev_second;
> +
> + if(prev_second < first || early_EOF)
> + prev_second = first;
> +
> + second = prev_second;
> +
> +
> + if(first > max_excl)
> + {
> + excl->push_back( make_pair(0,max_excl) );
> + cout << "\nNothing to Exclude!\n";
> + }
> + else
> + {
> + if(second > max_excl)
> + {
> + excl_count += max_excl-first+1;
> + excl->push_back( make_pair(0,first-1) );
> + cout << first << "-" << second << ", ";
> + }
> + else
> + {
> +
> + cout << first << "-" << second << ", ";
> + excl->push_back( make_pair(0,first - 1) );
> + excl_count += second-(first-1);
> +
> + while (std::getline(fp_exL, line) && second < max_excl )
> + {
> + std::istringstream iss(line);
> +
> + iss >> first;
> + early_EOF = iss.eof();
> + iss >> second;
> + if(prev_second >= first)
> + {
> + cout << "\nPlease give an ordered Exlusion List!\n";
> + cout << "?? " << prev_second << "\n" << first << " ??"<< endl;
> + exit( 1 );
> + }
> +
> +
> + if(second < first || early_EOF )
> + second = first;
> +
> +
> + if(second > max_excl)
> + excl_count += max_excl-first+1;
> + else if(first < max_excl)
> + excl_count += second-(first-1);
> +
> +
> +
> + cout << first << "-" << second << ", ";
> +
> + excl->push_back( make_pair(prev_second,first - prev_second - 1) );
> +
> +
> + prev_second = second;
> +
> +
> + }
> + }
> + }
> +
> +
> +
> +
> + if(excl_count >= max_excl)
> + {
> + cout << "\nExclusion List excluded all data!\n";
> + cout << "Total Ids: " << max_excl << "\nExcluded Ids: " << excl_count << endl;
> + exit( 1 );
> + }
> + cout << "Excluded: " << excl_count << " Using: " << max_excl-excl_count<< endl;
> +
> +
> +
> +}
> +
>
> void AIOwrapper::free_databel_fvi( struct databel_fvi **fvi )
> {
>
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.h
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.h 2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.h 2014-07-15 08:17:29 UTC (rev 1763)
> @@ -2,7 +2,9 @@
> #define AIOWRAPPER_H
>
> #include "Definitions.h"
> -#include "Utility.h"
> +#include "Utility.h"
> +#include <fstream>
> +#include <sstream>
>
> typedef struct BufferElement type_buffElement;
>
> @@ -20,9 +22,15 @@
> {
> string fnameAL;
> string fnameAR;
> - string fnameY;
> - string fnameOutB;
> + string fnameY;
> +
>
> + string fnameOutB;
> +
> +
> + list< pair<int,int> >* excl_List;
> +
> +
> bool doublefileType;
> bool fakefiles;
>
> @@ -138,7 +146,9 @@
>
> protected:
>
> - private:
> + private:
> +
> + void read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList);
>
>
> void prepare_AR( int desired_blockSize, int n, int totalR, int columnsR);
> @@ -163,7 +173,7 @@
>
> void * fgls_malloc_impl( const char* file, long line, size_t size );
>
> - type_fileh FHandler;
> + public: type_fileh FHandler;
> type_fileh* Fhandler;
>
> FILE* fp_Ar;
>
> Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/Algorithm.cpp 2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/Algorithm.cpp 2014-07-15 08:17:29 UTC (rev 1763)
> @@ -1,5 +1,7 @@
> -#include "Algorithm.h"
> +#include "Algorithm.h"
> +
>
> +
> Algorithm::Algorithm()
> {
> // ctor
> @@ -51,41 +53,39 @@
> }
>
>
> -void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* top,
> - type_precision* bot, int dim1_b,
> - int dim2_b, int dim1_b_bot)
> +void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* bsource, int a_amount, int y_amount, int p)
> {
> - // memcpy are faster version of the fors
> - int i, k, w, top_idx, bot_idx;
> - int size;
> - top_idx = 0;
> - bot_idx = 0;
> - for (k = 0; k < dim2_b; k++)
> - {
> - size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
> - memcpy( (type_precision*)&bfinal[k * dim1_b],
> - (type_precision*)&top[top_idx],
> - size * sizeof(type_precision) );
> -// for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
> -// {
> -// bfinal[i] = top[top_idx];
> -// top_idx++;
> -// }
> - top_idx += size;
> - i = k * dim1_b + size;
> - w = i;
> -
> - size = w + dim1_b_bot - w;
> - memcpy( (type_precision*)&bfinal[w],
> - (type_precision*)&bot[bot_idx],
> - size * sizeof(type_precision) );
> -// for (i = w; i < w+dim1_b_bot; i++)
> -// {
> -// bfinal[i] = bot[bot_idx];
> -// bot_idx++;
> -// }
> - bot_idx += size;
> - }
> +// // memcpy are faster version of the fors
> +// int i, k, w, top_idx, bot_idx;
> +// int size;
> +// top_idx = 0;
> +// bot_idx = 0;
> +// for (k = 0; k < dim2_b; k++)
> +// {
> +// size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
> +// memcpy( (type_precision*)&bfinal[k * dim1_b],
> +// (type_precision*)&top[top_idx],
> +// size * sizeof(type_precision) );
> +//// for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
> +//// {
> +//// bfinal[i] = top[top_idx];
> +//// top_idx++;
> +//// }
> +// top_idx += size;
> +// i = k * dim1_b + size;
> +// w = i;
> +//
> +// size = w + dim1_b_bot - w;
> +// memcpy( (type_precision*)&bfinal[w],
> +// (type_precision*)&bot[bot_idx],
> +// size * sizeof(type_precision) );
> +//// for (i = w; i < w+dim1_b_bot; i++)
> +//// {
> +//// bfinal[i] = bot[bot_idx];
> +//// bot_idx++;
> +//// }
> +// bot_idx += size;
> +// }
> }
>
>
> @@ -249,12 +249,13 @@
>
> lapack_int info = LAPACKE_sgels(STORAGE_TYPE, 'N', rowsA, colsA, rhs, A,
> rowsA, ynew, rowsA);
> - assert(info == 0, "Error Check");
> + myassert(info == 0, "Error Check");
>
>
> int index = 0;
> int index_new = 0;
> - for (i = 0; i < rhs; i++)
> + for (i = 0; i < rhs; i++)
> +
> {
> copy_vec(&ynew[index], &new_sol[index_new], colsA);
> index += rowsA;
> @@ -305,13 +306,14 @@
>
> srand(time(NULL));
>
> - blas_set_num_threads(max_threads);
> + blas_set_num_threads(max_threads);
> + omp_set_num_threads(max_threads);
>
>
> //type_precision *Ytemp;
> lapack_int info, n, lda, l, r, p;
>
> - cputime_type start_tick, start_tick2, end_tick;
> + cputime_type start_tick, start_tick2, start_tick3, end_tick;
>
> AIOfile.initialize(params);
>
> @@ -333,54 +335,70 @@
>
> for (int j = 0; j < y_iters && !params.ForceCheck; j++)
> {
> - if (y_iters >= 40 && (j%(y_iters/40)) == 0)
> + if (!params.ForceCheck && y_iters >= 10 && (j%(y_iters/10)) == 0 )
> {
> cout << "*" << flush;
> }
>
> for (int i = 0; i < a_iters; i++)
> {
> - for (int jj = 0; jj < y_block_size; jj++)
> - {
> - if (y_iters < 40 &&
> - (y_block_size < 3 || (jj%(y_block_size/3)) == 0)
> - )
> +
> + if ( !params.ForceCheck && y_iters < 10 &&
> + ( (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
> {
> cout << "*" << flush;
> }
> - }
> +
> }
> }
>
> if(!params.ForceCheck)
> cout << endl;
>
> -
> + //add memalign
>
> //type_precision Stl[l*l];
> //type_precision Str[l*r*a_block_size];
> - type_precision* Stl = new type_precision[l*l];
> - type_precision* Str = new type_precision[l*r*a_block_size];
> + type_precision* Stl = new type_precision[l*l*1];
> + type_precision* Str = new type_precision[l*r*a_block_size*1];
>
> type_precision* Sbr = new type_precision[r * r * a_block_size];
> - type_precision* Ay = new type_precision[p * a_block_size];
> + type_precision* Ay = new type_precision[p * a_block_size];
> + //type_precision* B_resorted = new type_precision[p * a_block_size*y_block_size];
>
> - type_precision* S = new type_precision[p * p];
> + type_precision* S = new type_precision[p * p];
>
> +
> type_precision* Ay_top = new type_precision[l * y_amount];
> type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
>
> - list<long int>* y_nan_idxs = new list<long int>[y_block_size];
> + type_precision* y_residual = new type_precision[n * y_block_size ];
> + type_precision* y_res_norms = new type_precision[a_block_size];
> +
> + list<long int>* al_nan_idxs = new list<long int>[1];
> + list<long int>* y_nan_idxs = new list<long int>[y_block_size];
> + list<long int>* ar_nan_idxs = new list<long int>[a_block_size];
>
> +
> + type_precision* A = new type_precision[n * p * 1];
> type_precision* AR = new type_precision[n * r * a_block_size * 1];
> - type_precision* AL = new type_precision[n * l * 1];
> +// type_precision* AL = new type_precision[n * l * 1];
> + type_precision* AL = A;
> +
> + type_precision* B = Ay;
> +
> type_precision* backupAR; // = new type_precision[n*r*a_block_size];
> type_precision* backupAL; // = new type_precision[n*l];
>
>
> AIOfile.load_AL(&backupAL);
> - //int total_al_nans = replace_nans(0, backupAL, n, l);
> - replace_nans(0, backupAL, n, l);
> +
> + //pthread_barrier_wait(&(AIOfile.Fhandler->finalize_barrier));
> +
> + replace_nans(al_nan_idxs,1, backupAL, n, l);
> + al_nan_idxs->push_back(1);
> +
> + //LAPACKE_dgesdd()
>
> copy_vec(backupAL, AL, n*l);
>
> @@ -394,176 +412,369 @@
>
> for (int j = 0; j < y_iters; j++)
> {
> - if (y_iters >= 40 && (j%(y_iters/40)) == 0 && !params.ForceCheck)
> + if (!params.ForceCheck && y_iters >= 10 && (j%(y_iters/10)) == 0 && !params.ForceCheck)
> {
> cout << AIOfile.io_overhead << flush;
> AIOfile.io_overhead = "*";
> }
> +
> + get_ticks(start_tick2);
>
> - AIOfile.load_Yblock(&Y, y_block_size);
> + AIOfile.load_Yblock(&Y, y_block_size);
> +
> + get_ticks(end_tick);
> + out.acc_loady += ticks2sec(end_tick,start_tick2);
>
> + get_ticks(start_tick2);
> + replace_nans(&y_nan_idxs[0],y_block_size, Y, n,1);
> + //out.acc_other += ticks2sec(end_tick,start_tick2);
>
>
> - //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> - replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> + get_ticks(start_tick2);
>
> -
> //! Ay_top = AL'*Y
> cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> - &Ay_top[j * l * y_block_size], l);
> + &Ay_top[j * l * y_block_size], l);
> +
> + get_ticks(end_tick);
> + out.acc_gemm += ticks2sec(end_tick,start_tick2);
>
>
> for (int i = 0; i < a_iters; i++)
> - {
> - AIOfile.load_ARblock(&backupAR, a_block_size);
> - //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> - replace_nans(0, backupAR, n, a_block_size * r);
> - copy_vec(backupAR, AR, n * r * a_block_size);
> + {
> +
> + if (!params.ForceCheck && y_iters < 10 &&
> + ( (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
> + {
> + cout << "*" << flush;
> + }
> +
> + get_ticks(start_tick2);
>
> - get_ticks(start_tick2);
> + AIOfile.load_ARblock(&backupAR, a_block_size);
> +
> + get_ticks(end_tick);
> + out.acc_loadxr += ticks2sec(end_tick,start_tick2);
> +
> + get_ticks(start_tick2);
> +
> + replace_nans(&ar_nan_idxs[0],a_block_size, backupAR, n, r);
> + replace_with_zeros(al_nan_idxs, backupAR, n, r, a_block_size);
>
> + copy_vec(backupAR, AR, n * r * a_block_size);
> +
> get_ticks(end_tick);
> - out.acc_pre += ticks2sec(end_tick-start_tick2, cpu_freq);
> + //out.acc_other += ticks2sec(end_tick,start_tick2);
>
> - get_ticks(start_tick2);
> +
> +
> + get_ticks(start_tick2);
> +
> //! Ay_bot = AR'*Y
> cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
> 0.0, Ay_bot, r * a_block_size);
>
> get_ticks(end_tick);
> - out.firstloop += ticks2sec(end_tick - start_tick2, cpu_freq);
> + out.acc_gemm += ticks2sec(end_tick,start_tick2);
> +
> + int aL_idx = 0*l * n;
> + int aR_idx = 0* r * n * a_block_size;
>
> - //#pragma omp parallel default(shared)
> - {
> - for (int jj = 0; jj < y_block_size; jj++)
> - {
> - int thread_id = 0 * omp_get_thread_num();
> - int aL_idx = thread_id * l * n;
> - int aR_idx = thread_id * r * n * a_block_size;
> + get_ticks(start_tick3);
> + for (int jj = 0; jj < y_block_size; jj++)
> + {
> +
> + //int thread_id = 0 * omp_get_thread_num();//so far singel thread version only
>
> - if (y_iters < 40 &&
> - (y_block_size < 3 ||
> - (jj%(y_block_size/3)) == 0) && !params.ForceCheck)
> - {
> - cout << AIOfile.io_overhead << flush;
> - AIOfile.io_overhead = "*";
> - }
> +
> + get_ticks(start_tick2);
> +
> + copy_vec(backupAL, &AL[aL_idx], n * l);//try to remove!
>
> - copy_vec(backupAL, &AL[aL_idx], n * l);
> + replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
> +
> +
> + get_ticks(end_tick);//2%
> + out.acc_other += ticks2sec(end_tick,start_tick2);
> +
> + get_ticks(start_tick2);
>
> - replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
> - //! Generate Stl
> - cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> - l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
> + //! Generate Stl
> + cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> + l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
> +
> + get_ticks(end_tick);
> + out.acc_stl += ticks2sec(end_tick,start_tick2);
> +
> +
> + get_ticks(start_tick2);
> +
> + copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);//!10%//try to remove!
> +
> +
> + replace_with_zeros(&y_nan_idxs[jj], backupAR, n, r, a_block_size);
>
> - copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);
> - replace_with_zeros(&y_nan_idxs[jj], &AR[aR_idx],
> - n, r, a_block_size);
> +
> +
> + get_ticks(end_tick);
> + out.acc_other += ticks2sec(end_tick,start_tick2);
> +
> + get_ticks(start_tick2);
>
> - get_ticks(start_tick2);
> - //! Generate Str
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - l, r * a_block_size, n, 1.0, &AL[aL_idx],
> - n, &AR[aR_idx], n, 0.0, Str, l);
> - get_ticks(end_tick);
> - out.acc_RTL_QLY += ticks2sec(end_tick - start_tick2,
> - cpu_freq);
> + //! Generate Str
> + cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> + l, r * a_block_size, n, 1.0, &AL[aL_idx],
> + n, &AR[aR_idx], n, 0.0, Str, l);//!45
>
> + get_ticks(end_tick);
> + out.acc_str += ticks2sec(end_tick,start_tick2);
> +
>
> - //type_precision Sbr[r * r * a_block_size];
> - //type_precision Ay[p * a_block_size];
>
> + //type_precision Sbr[r * r * a_block_size*1];
> + //type_precision Ay[p * a_block_size*1];
> +
> + blas_set_num_threads(1);
> + omp_set_num_threads(max_threads);
> +
> + #pragma omp parallel default(shared)
> + {
>
> - //#pragma omp for nowait schedule(dynamic)
> - for (int ii= 0; ii < a_block_size; ii++)
> - {
> - get_ticks(start_tick2);
> - //! Generate Sbr
> - cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> - r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
> - &AR[aR_idx + ii * r * n], n, 0.0,
> - &Sbr[ii * r * r], r);
> - get_ticks(end_tick);
> - out.acc_gemm += ticks2sec(end_tick - start_tick2,
> - cpu_freq);
> + #pragma omp for nowait
> + for (int ii= 0; ii < a_block_size; ii++)
> + {
> + //cout << omp_get_thread_num() << endl << flush;
>
> - get_ticks(start_tick2);
>
> - copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
> - copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
> - &Ay[l+ii*p], r);
> + get_ticks(start_tick2);
>
> + //! Generate Sbr
> + cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> + r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
> + &AR[aR_idx + ii * r * n], n, 0.0,
> + &Sbr[ii * r * r], r);
> +
>
> - //type_precision* B = Ay;
> - //type_precision S[p * p];
> + get_ticks(end_tick);
> + out.acc_sbr += ticks2sec(end_tick,start_tick2 );
> +
> + get_ticks(start_tick2);
>
> + copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
> + copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
> + &Ay[l+ii*p], r);
> +
> +
> + type_precision S2[p * p];
>
> - //! Rebuild S
> - build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> - // matlab_print_matrix("S", p, p, S);
> + //! Rebuild S
> + build_S(S2, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> + // matlab_print_matrix("S", p, p, S);
>
> - get_ticks(end_tick);
> - out.acc_b += ticks2sec(end_tick - start_tick2,
> - cpu_freq);
> +
> + get_ticks(end_tick);//5%
> + out.acc_other += ticks2sec(end_tick,start_tick2);
>
> - get_ticks(start_tick2);
> + get_ticks(start_tick2);
>
> - //! b = S\Ay
> - info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S, p,
> - &Ay[ii*p], p);
>
> - get_ticks(end_tick);
> - out.acc_loadxr += ticks2sec(end_tick - start_tick2,
> - cpu_freq);
> - assert(info == 0, "POSV");
> + //! b = S\Ay
> + info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S2, p,
> + &Ay[ii*p], p);
> +
> + myassert(info == 0, "S\\Ay");
> +
>
> - if (params.ForceCheck)
> + get_ticks(end_tick);
> + out.acc_solve += ticks2sec(end_tick,start_tick2 );
> +
> +
> + if (params.ForceCheck)
> + {
> + #pragma omp critical
> {
> - #pragma omp critical
> - {
> - check_result(AL, &AR[aR_idx+ii*r*n], n, p,
> - 1, r, &Y[jj*n], &Ay[ii*p]);
> - }
> + check_result(AL, &AR[aR_idx+ii*r*n], n, p,
> + 1, r, &Y[jj*n], &Ay[ii*p]);
> }
> }
> + }
> + }
> +
> + /************statistics**************/
> +// type_precision* T;
> +// type_precision* R2;
> +// type_precision* P;
> +
> + //hpc_statistics(n,A,a_block_size,Y,y_block_size,B,p,T,R2,P);
> +
> + /**************************/
> +
> + blas_set_num_threads(max_threads);
> +
> + get_ticks(start_tick2);
>
> - AIOfile.write_B(Ay, p, a_block_size);
> - }
> - }
> + AIOfile.write_B(B, p, a_block_size);
> +
> + get_ticks(end_tick);
> + out.acc_storeb += ticks2sec(end_tick,start_tick2);
> +
> +
> +
> +
> +
> + }
> +
> + get_ticks(end_tick);
> + out.acc_real_innerloops += ticks2sec(end_tick ,start_tick3);
> +
> +
> }
> AIOfile.reset_AR();
> }
>
> - get_ticks(end_tick);
> - out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> - // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
> - out.gflops = y_amount * (gemm_flops(l, n, 1, 0) +
> - a_amount * (gemm_flops(r, n, 1, 0) +
> - gemm_flops(l, n, l, 0) +
> - gemm_flops(l, n, r, 0) +
> - gemm_flops(r, n, r, 0) +
> + get_ticks(end_tick);
> +
> + out.duration = ticks2sec(end_tick ,start_tick);
> +
> +
> + out.gflops = y_iters * (gemm_flops(l, n, params.tb*1, 0) +
> + a_iters * ( gemm_flops(params.mb*r, n, params.tb*1, 0) +
> + params.tb *( gemm_flops(l, 1*n, l, 0) + gemm_flops(l, n, params.mb *r, 0) +
> + params.mb * (
> + gemm_flops(r, 1*n, r, 0) +
> (p * p * p / 3.0) /
> - 1000.0/1000.0/1000.0));
> [TRUNCATED]
>
> To get the complete diff run:
> svnlook diff /svnroot/genabel -r 1763
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
>
--
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
L.C. Karssen
Utrecht
The Netherlands
lennart at karssen.org
http://blog.karssen.org
GPG key ID: A88F554A
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 213 bytes
Desc: OpenPGP digital signature
URL: <http://lists.r-forge.r-project.org/pipermail/genabel-devel/attachments/20140722/34cc1d63/attachment-0001.sig>
More information about the genabel-devel
mailing list