[GenABEL-dev] [Genabel-commits] r1763 - in pkg/OmicABELnoMM: .	src tests
    L.C. Karssen 
    lennart at karssen.org
       
    Tue Jul 22 18:42:09 CEST 2014
    
    
  
Thanks Alvaro!
I ran only a quick check, and I noticed that after an 'autoreconf -i'
./configure, make and make check all work. Great!
However, running make distclean doesn't work. I'll commit a fix for that
later.
Now that the Makefile.am files are working fine, I'll delete the
tests/Makefile as well.
Thanks again,
Lennart.
On 15-07-14 10:17, noreply at r-forge.r-project.org wrote:
> Author: afrank
> Date: 2014-07-15 10:17:29 +0200 (Tue, 15 Jul 2014)
> New Revision: 1763
> 
> Added:
>    pkg/OmicABELnoMM/tests/Makefile.am
> Modified:
>    pkg/OmicABELnoMM/Makefile.am
>    pkg/OmicABELnoMM/configure.ac
>    pkg/OmicABELnoMM/src/AIOwrapper.cpp
>    pkg/OmicABELnoMM/src/AIOwrapper.h
>    pkg/OmicABELnoMM/src/Algorithm.cpp
>    pkg/OmicABELnoMM/src/Algorithm.h
>    pkg/OmicABELnoMM/src/Definitions.h
>    pkg/OmicABELnoMM/src/Utility.cpp
>    pkg/OmicABELnoMM/src/Utility.h
>    pkg/OmicABELnoMM/src/main.cpp
>    pkg/OmicABELnoMM/tests/Makefile
>    pkg/OmicABELnoMM/tests/test.cpp
> Log:
> Added R2, T-stat, Pval functions, but inactive atm. Added exclusion list support to remove individuals. Fixed propagation of missing values. Performance Improvements.
> 
> Modified: pkg/OmicABELnoMM/Makefile.am
> ===================================================================
> --- pkg/OmicABELnoMM/Makefile.am	2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/Makefile.am	2014-07-15 08:17:29 UTC (rev 1763)
> @@ -2,7 +2,7 @@
>  
>  oanomm_headers = src/Definitions.h src/AIOwrapper.h src/Algorithm.h	\
>   src/Utility.h
> -oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp	\
> +oanomm_cpp = src/AIOwrapper.cpp src/Algorithm.cpp src/Utility.cpp \
>   src/main.cpp
>  
>  omicabelnomm_SOURCES = $(oanomm_headers) $(oanomm_cpp)
> 
> Modified: pkg/OmicABELnoMM/configure.ac
> ===================================================================
> --- pkg/OmicABELnoMM/configure.ac	2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/configure.ac	2014-07-15 08:17:29 UTC (rev 1763)
> @@ -17,11 +17,12 @@
>  # Set some default compile flags
>  if test -z "$CXXFLAGS"; then
>     # User did not set CXXFLAGS, so we can put in our own defaults
> -    CXXFLAGS="-g -O2"
> +    CXXFLAGS="-O3"
>  fi
>  if test -z "$CPPFLAGS"; then
>     # User did not set CPPFLAGS, so we can put in our own defaults
>      CPPFLAGS="-Wall -g -pedantic -Wunused-result -Wmaybe-uninitialized -Wformat"
> +    #CPPFLAGS="-Wall"
>  fi
>  # If CXXFLAGS/CPPFLAGS are already set AC_PROG_CXX will not overwrite them
>  # with its own defaults
> @@ -45,13 +46,18 @@
>     AC_MSG_ERROR([Unable to find a Lapack library])
>  ])
>  
> +#Boost
>  
> +
> +
>  # Check for openMP. If found the OPENMP_CXXFLAGS is set automatically
>  AC_OPENMP
>  AC_SUBST(AM_CXXFLAGS, "$OPENMP_CFLAGS")
>  
>  
>  
> +
> +
>  # Checks for header files.
>  AC_CHECK_HEADERS([limits.h stdlib.h string.h sys/time.h unistd.h])
>  
> 
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.cpp	2014-07-15 08:17:29 UTC (rev 1763)
> @@ -28,16 +28,21 @@
>      pthread_barrier_init(&(FHandler.finalize_barrier),NULL,2);
>  
>  
> -    Fhandler->fakefiles = params.use_fake_files;
> +    Fhandler->fakefiles = params.use_fake_files;
> +
> +
> +
>  
>  
> +
>      if(!Fhandler->fakefiles)
>      {
>          Fhandler->fnameAL = params.fnameAL;
>          Fhandler->fnameAR = params.fnameAR;
>          Fhandler->fnameY = params.fnameY;
> -        Fhandler->fnameOutB = params.fnameOutB;
> +        Fhandler->fnameOutB = params.fnameOutB;
>  
> +
>          Yfvi  = load_databel_fvi( (Fhandler->fnameY+".fvi").c_str() );
>          ALfvi = load_databel_fvi( (Fhandler->fnameAL+".fvi").c_str() );
>          ARfvi = load_databel_fvi( (Fhandler->fnameAR+".fvi").c_str() );
> @@ -46,7 +51,7 @@
>          params.t = Yfvi->fvi_header.numVariables;
>          params.l = ALfvi->fvi_header.numVariables;
>  
> -	int opt_tb = 1000;
> +        int opt_tb = 1000;
>          int opt_mb = 1000;
>  
>          params.mb = min(params.m, opt_tb);
> @@ -56,7 +61,22 @@
>      else
>      {
>  
> -    }
> +    }
> +
> +    //params.fname_excludelist = "exclfile.txt";
> +    int excl_count = 0;
> +    Fhandler->excl_List = new list< pair<int,int> >();
> +
> +    if(params.fname_excludelist.size()==0)
> +    {
> +        (Fhandler->excl_List)->push_back( make_pair(0,params.n) );
> +    }
> +    else
> +    {
> +        read_excludeList( Fhandler->excl_List,excl_count,params.n,params.fname_excludelist);
> +    }
> +
> +    params.n -= excl_count;
>  
>      params.p = params.l + params.r;
>  
> @@ -106,7 +126,9 @@
>      pthread_cond_destroy(&(Fhandler->condition_more));
>  
>      pthread_mutex_destroy(&(Fhandler->m_read));
> -    pthread_cond_destroy(&(Fhandler->condition_read));
> +    pthread_cond_destroy(&(Fhandler->condition_read));
> +
> +    delete Fhandler->excl_List;
>  
>  
>  }
> @@ -186,12 +208,15 @@
>          }
>          //cout << "\nEnd preping files\n" << flush;
>  
> -    }
> +    }
> +
> +    //pthread_barrier_wait(&(Fhandler->finalize_barrier));//for testing only
>  
>  
>      Fhandler->not_done = true;
> -    Fhandler->reset_wait = false;
> +    Fhandler->reset_wait = false;
>  
> +
>      while(Fhandler->not_done)
>      {
>  
> @@ -226,9 +251,30 @@
>                  Fhandler->seed += 75;
>              }
>              else
> -            {
> -                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
> -                result++;
> +            {
> +                list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> +
> +                int chunk_size_buff;
> +                int buff_pos=0;
> +                int file_pos;
> +
> +                for(int i = 0; i < tmp_y_blockSize; i++)
> +                {
> +                    for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> +                    {
> +                        file_pos = Fhandler->n*i + it->first;
> +                        fseek ( fp_Y , file_pos*sizeof(type_precision) , SEEK_SET );
> +                        chunk_size_buff = it->second;
> +
> +                        size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Y); result++;
> +                        buff_pos += chunk_size_buff;
> +
> +
> +                    }
> +                }
> +//                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Y);
> +//                result++;
>                  if(Fhandler->y_to_readSize <= 0)
>                  {
>                      fseek ( fp_Y , 0 , SEEK_SET );
> @@ -267,17 +313,38 @@
>              if(Fhandler->fakefiles)
>              {
>                  fseek ( fp_Ar , 0 , SEEK_SET );
> -                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> -                result++;
> +                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar); result++;
>  
>                  re_random_vec(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
>                  re_random_vec_nan(tobeFilled->buff , Fhandler->n * tmp_ar_blockSize*Fhandler->r );
>  
>              }
>              else
> -            {
> -                size_t result = fread (tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> -                result++;
> +            {
> +
> +                list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> +                int chunk_size_buff;
> +                int buff_pos=0;
> +                int file_pos;
> +
> +                for(int i = 0; i < tmp_ar_blockSize*Fhandler->r; i++)
> +                {
> +                    for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> +                    {
> +                        file_pos = Fhandler->n*i + it->first;
> +                        fseek ( fp_Ar , file_pos*sizeof(type_precision) , SEEK_SET );
> +
> +                        chunk_size_buff = it->second;
> +                        size_t result = fread (&tobeFilled->buff[buff_pos],sizeof(type_precision),chunk_size_buff,fp_Ar); result++;
> +                        buff_pos += chunk_size_buff;
> +
> +
> +                    }
> +                }
> +
> +//                size_t result = fread(tobeFilled->buff,sizeof(type_precision),size_buff,fp_Ar);
> +//                result++;
>                  if (Fhandler->Ar_to_readSize <= 0)
>                  {
>                      fseek ( fp_Ar , 0 , SEEK_SET );
> @@ -307,8 +374,8 @@
>              if(Fhandler->fakefiles)
>              {
>                  fseek ( fp_B , 0 , SEEK_SET );
> -	  }
> -		fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
> +            }
> +            fwrite (tobeWritten->buff,sizeof(type_precision),size,fp_B);
>  
>  
>              Fhandler->b_empty_buffers.push(tobeWritten);
> @@ -356,7 +423,8 @@
>      //cout << "k" << flush;
>      //barrier
>      pthread_barrier_wait(&(Fhandler->finalize_barrier));
> -
> +
> +    {
>      type_buffElement* tmp;
>      while(!Fhandler->full_buffers.empty())
>      {
> @@ -404,6 +472,7 @@
>         Fhandler->b_empty_buffers.pop();
>         delete []tmp->buff;
>         delete tmp;
> +    }
>      }
>  
>  
> @@ -544,13 +613,9 @@
>  
>  void AIOwrapper::write_B(type_precision* B, int p, int blockSize)
>  {
> -     //int status;
> -    //int createstatus = 0;
> -   // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
>  
>      while(Fhandler->b_empty_buffers.empty())
>      {
> -
>          pthread_mutex_lock(&(Fhandler->m_more));
>          pthread_cond_signal( &(Fhandler->condition_more ));
>          pthread_mutex_unlock(&(Fhandler->m_more));
> @@ -568,7 +633,7 @@
>  
>  
>  
> -
> +        //cout << Fhandler->b_empty_buffers.size() << flush;
>          Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
>          Fhandler->b_empty_buffers.pop();
>  
> @@ -582,12 +647,9 @@
>  
>  
>  
> -    // cout << " b: full" << Fhandler->b_full_buffers.size() << "; epty" << Fhandler->b_empty_buffers.size() << endl;
> -
>      pthread_mutex_unlock(&(Fhandler->m_buff_upd));
>  
>  
> -
>      pthread_mutex_lock(&(Fhandler->m_more));
>      pthread_cond_signal( &(Fhandler->condition_more ));
>      pthread_mutex_unlock(&(Fhandler->m_more));
> @@ -668,8 +730,12 @@
>  //        {
>  //            (tmp->buff)[i] = 0;
>  //        }
> -        Fhandler->b_empty_buffers.push(tmp);
> +        Fhandler->b_empty_buffers.push(tmp);
> +
> +//        Fhandler->currentWriteBuff = Fhandler->b_empty_buffers.front();
> +//        Fhandler->b_empty_buffers.pop();
>  
> +
>      }
>  
>  }
> @@ -819,17 +885,38 @@
>          (*AL) = Fhandler->AL;
>      }
>      else
> -    {
> +    {
>          FILE *fp;
>          fp = fopen((Fhandler->fnameAL+".fvd").c_str(), "rb");
>          if(fp == 0)
>          {
>              cout << "Error Reading File " << Fhandler->fnameAL << endl;
>              exit(1);
> -        }
> +        }
> +
> +        list< pair<int,int> >* excl_List = Fhandler->excl_List;
> +
> +        int chunk_size_buff;
> +        int buff_pos=0;
> +        int file_pos;
> +
> +        for (int i=0; i < Fhandler->l; i++)
> +        {
> +            for (list<  pair<int,int>  >::iterator it=excl_List->begin(); it != excl_List->end(); ++it)
> +            {
> +                file_pos = i*Fhandler->n+ it->first;
> +                fseek ( fp , file_pos*sizeof(type_precision) , SEEK_SET );
> +                chunk_size_buff = it->second;
> +
> +                size_t result = fread (&(Fhandler->AL[buff_pos]),sizeof(type_precision),chunk_size_buff,fp); result++;
> +                buff_pos += chunk_size_buff;
> +            }
> +        }
>  
> -        size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
> -        result++;
> +
> +//        size_t result = fread (Fhandler->AL,sizeof(type_precision),Fhandler->l*Fhandler->n,fp);
> +//
> +//        result++;
>          fclose(fp);
>      }
>  
> @@ -862,6 +949,111 @@
>  {
>      delete []Fhandler->AL;
>  }
> +
> +
> +void AIOwrapper::read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList)
> +{
> +
> +    ifstream fp_exL(fname_excludeList.c_str());
> +    if(fp_exL == 0)
> +    {
> +        cout << "Error reading exclude list file."<< endl;
> +        exit(1);
> +    }
> +
> +
> +    string line;
> +    excl_count = 0;
> +    bool early_EOF;
> +    int first,second, prev_second;
> +
> +     cout << "Excluding Ids: \n";
> +
> +    std::getline(fp_exL, line);
> +    std::istringstream iss(line);
> +    iss >> first;
> +    early_EOF = iss.eof();
> +    iss >> prev_second;
> +
> +    if(prev_second < first || early_EOF)
> +            prev_second = first;
> +
> +    second = prev_second;
> +
> +
> +    if(first > max_excl)
> +    {
> +        excl->push_back( make_pair(0,max_excl) );
> +        cout << "\nNothing to Exclude!\n";
> +    }
> +    else
> +    {
> +        if(second > max_excl)
> +        {
> +                excl_count += max_excl-first+1;
> +                excl->push_back( make_pair(0,first-1) );
> +                cout << first << "-" << second << ", ";
> +        }
> +        else
> +        {
> +
> +            cout << first << "-" << second << ", ";
> +            excl->push_back( make_pair(0,first - 1) );
> +            excl_count += second-(first-1);
> +
> +            while (std::getline(fp_exL, line) && second < max_excl )
> +            {
> +                std::istringstream iss(line);
> +
> +                iss >> first;
> +                early_EOF = iss.eof();
> +                iss >> second;
> +                if(prev_second >= first)
> +                {
> +                    cout << "\nPlease give an ordered Exlusion List!\n";
> +                    cout << "?? " << prev_second << "\n" << first << " ??"<< endl;
> +                    exit( 1 );
> +                }
> +
> +
> +                if(second < first || early_EOF )
> +                    second = first;
> +
> +
> +                if(second > max_excl)
> +                    excl_count += max_excl-first+1;
> +                else if(first < max_excl)
> +                    excl_count += second-(first-1);
> +
> +
> +
> +                cout << first << "-" << second << ", ";
> +
> +                excl->push_back( make_pair(prev_second,first - prev_second - 1) );
> +
> +
> +                prev_second = second;
> +
> +
> +            }
> +        }
> +    }
> +
> +
> +
> +
> +    if(excl_count >= max_excl)
> +    {
> +		cout << "\nExclusion List excluded all data!\n";
> +		cout << "Total Ids: " << max_excl << "\nExcluded Ids: " << excl_count << endl;
> +		exit( 1 );
> +	}
> +	 cout << "Excluded: "  << excl_count << " Using: " << max_excl-excl_count<< endl;
> +
> +
> +
> +}
> +
>  
>  void AIOwrapper::free_databel_fvi( struct databel_fvi **fvi )
>  {
> 
> Modified: pkg/OmicABELnoMM/src/AIOwrapper.h
> ===================================================================
> --- pkg/OmicABELnoMM/src/AIOwrapper.h	2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/AIOwrapper.h	2014-07-15 08:17:29 UTC (rev 1763)
> @@ -2,7 +2,9 @@
>  #define AIOWRAPPER_H
>  
>  #include "Definitions.h"
> -#include "Utility.h"
> +#include "Utility.h"
> +#include <fstream>
> +#include <sstream>
>  
>  typedef struct BufferElement type_buffElement;
>  
> @@ -20,9 +22,15 @@
>  {
>      string fnameAL;
>      string fnameAR;
> -    string fnameY;
> -    string fnameOutB;
> +    string fnameY;
> +
>  
> +    string fnameOutB;
> +
> +
> +    list< pair<int,int> >* excl_List;
> +
> +
>      bool doublefileType;
>      bool fakefiles;
>  
> @@ -138,7 +146,9 @@
>  
>      protected:
>  
> -    private:
> +    private:
> +
> +        void read_excludeList(list< pair<int,int> >* excl, int &excl_count, int max_excl, string fname_excludeList);
>  
>  
>          void prepare_AR( int desired_blockSize, int n, int totalR, int columnsR);
> @@ -163,7 +173,7 @@
>  
>          void * fgls_malloc_impl( const char* file, long line, size_t size );
>  
> -        type_fileh FHandler;
> +            public: type_fileh FHandler;
>          type_fileh* Fhandler;
>  
>          FILE* fp_Ar;
> 
> Modified: pkg/OmicABELnoMM/src/Algorithm.cpp
> ===================================================================
> --- pkg/OmicABELnoMM/src/Algorithm.cpp	2014-07-08 13:16:45 UTC (rev 1762)
> +++ pkg/OmicABELnoMM/src/Algorithm.cpp	2014-07-15 08:17:29 UTC (rev 1763)
> @@ -1,5 +1,7 @@
> -#include "Algorithm.h"
> +#include "Algorithm.h"
> +
>  
> +
>  Algorithm::Algorithm()
>  {
>      // ctor
> @@ -51,41 +53,39 @@
>  }
>  
>  
> -void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* top,
> -                               type_precision* bot, int dim1_b,
> -                               int dim2_b, int dim1_b_bot)
> +void Algorithm::prepare_Bfinal(type_precision* bfinal, type_precision* bsource, int a_amount, int y_amount, int p)
>  {
> -    // memcpy are faster version of the fors
> -    int i, k, w, top_idx, bot_idx;
> -    int size;
> -    top_idx = 0;
> -    bot_idx = 0;
> -    for (k = 0; k < dim2_b; k++)
> -    {
> -        size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
> -        memcpy( (type_precision*)&bfinal[k * dim1_b],
> -                (type_precision*)&top[top_idx],
> -                size * sizeof(type_precision) );
> -//        for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
> -//        {
> -//            bfinal[i] = top[top_idx];
> -//            top_idx++;
> -//        }
> -        top_idx += size;
> -        i = k * dim1_b + size;
> -        w = i;
> -
> -        size = w + dim1_b_bot - w;
> -        memcpy( (type_precision*)&bfinal[w],
> -                (type_precision*)&bot[bot_idx],
> -                size * sizeof(type_precision) );
> -//        for (i = w; i < w+dim1_b_bot; i++)
> -//        {
> -//            bfinal[i] = bot[bot_idx];
> -//            bot_idx++;
> -//        }
> -        bot_idx += size;
> -    }
> +//    // memcpy are faster version of the fors
> +//    int i, k, w, top_idx, bot_idx;
> +//    int size;
> +//    top_idx = 0;
> +//    bot_idx = 0;
> +//    for (k = 0; k < dim2_b; k++)
> +//    {
> +//        size = k * dim1_b + (dim1_b - dim1_b_bot) - (k * dim1_b);
> +//        memcpy( (type_precision*)&bfinal[k * dim1_b],
> +//                (type_precision*)&top[top_idx],
> +//                size * sizeof(type_precision) );
> +////        for (i = k*dim1_b; i < k*dim1_b+(dim1_b-dim1_b_bot); i++)
> +////        {
> +////            bfinal[i] = top[top_idx];
> +////            top_idx++;
> +////        }
> +//        top_idx += size;
> +//        i = k * dim1_b + size;
> +//        w = i;
> +//
> +//        size = w + dim1_b_bot - w;
> +//        memcpy( (type_precision*)&bfinal[w],
> +//                (type_precision*)&bot[bot_idx],
> +//                size * sizeof(type_precision) );
> +////        for (i = w; i < w+dim1_b_bot; i++)
> +////        {
> +////            bfinal[i] = bot[bot_idx];
> +////            bot_idx++;
> +////        }
> +//        bot_idx += size;
> +//    }
>  }
>  
>  
> @@ -249,12 +249,13 @@
>  
>      lapack_int info = LAPACKE_sgels(STORAGE_TYPE, 'N', rowsA, colsA, rhs, A,
>                                      rowsA, ynew, rowsA);
> -    assert(info == 0, "Error Check");
> +    myassert(info == 0, "Error Check");
>  
>  
>      int index = 0;
>      int index_new = 0;
> -    for (i = 0; i < rhs; i++)
> +    for (i = 0; i < rhs; i++)
> +
>      {
>           copy_vec(&ynew[index], &new_sol[index_new], colsA);
>           index += rowsA;
> @@ -305,13 +306,14 @@
>  
>      srand(time(NULL));
>  
> -    blas_set_num_threads(max_threads);
> +    blas_set_num_threads(max_threads);
> +    omp_set_num_threads(max_threads);
>  
>  
>      //type_precision *Ytemp;
>      lapack_int info, n, lda, l, r, p;
>  
> -    cputime_type start_tick, start_tick2, end_tick;
> +    cputime_type start_tick, start_tick2, start_tick3, end_tick;
>  
>      AIOfile.initialize(params);
>  
> @@ -333,54 +335,70 @@
>  
>      for (int j = 0; j < y_iters && !params.ForceCheck; j++)
>      {
> -        if (y_iters >= 40 && (j%(y_iters/40)) == 0)
> +        if (!params.ForceCheck &&  y_iters >= 10 && (j%(y_iters/10)) == 0 )
>          {
>              cout << "*" << flush;
>          }
>  
>          for (int i = 0; i < a_iters; i++)
>          {
> -            for (int jj = 0; jj < y_block_size; jj++)
> -            {
> -                if (y_iters < 40 &&
> -                    (y_block_size < 3 || (jj%(y_block_size/3)) == 0)
> -                    )
> +
> +                if ( !params.ForceCheck &&  y_iters < 10 &&
> +                (  (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
>                  {
>                      cout << "*" << flush;
>                  }
> -            }
> +
>          }
>      }
>  
>      if(!params.ForceCheck)
>          cout << endl;
>  
> -
> +    //add memalign
>  
>      //type_precision Stl[l*l];
>      //type_precision Str[l*r*a_block_size];
> -    type_precision* Stl = new type_precision[l*l];
> -    type_precision* Str = new type_precision[l*r*a_block_size];
> +    type_precision* Stl = new type_precision[l*l*1];
> +    type_precision* Str = new type_precision[l*r*a_block_size*1];
>  
>      type_precision* Sbr = new type_precision[r *  r * a_block_size];
> -    type_precision* Ay = new type_precision[p * a_block_size];
> +    type_precision* Ay = new type_precision[p * a_block_size];
> +    //type_precision* B_resorted = new type_precision[p * a_block_size*y_block_size];
>  
> -    type_precision* S = new type_precision[p * p];
> +    type_precision* S = new type_precision[p * p];
>  
> +
>      type_precision* Ay_top = new type_precision[l * y_amount];
>      type_precision* Ay_bot = new type_precision[y_block_size * a_block_size * r];
>  
> -    list<long int>* y_nan_idxs = new list<long int>[y_block_size];
> +    type_precision* y_residual = new type_precision[n * y_block_size ];
> +    type_precision* y_res_norms = new type_precision[a_block_size];
> +
> +    list<long int>* al_nan_idxs = new list<long int>[1];
> +    list<long int>* y_nan_idxs = new list<long int>[y_block_size];
> +    list<long int>* ar_nan_idxs = new list<long int>[a_block_size];
>  
> +
> +    type_precision* A = new type_precision[n * p * 1];
>      type_precision* AR = new type_precision[n * r * a_block_size * 1];
> -    type_precision* AL = new type_precision[n * l * 1];
> +//  type_precision* AL = new type_precision[n * l * 1];
> +    type_precision* AL = A;
> +
> +    type_precision* B = Ay;
> +
>      type_precision* backupAR;  // = new type_precision[n*r*a_block_size];
>      type_precision* backupAL;  // = new type_precision[n*l];
>  
>  
>      AIOfile.load_AL(&backupAL);
> -    //int total_al_nans = replace_nans(0, backupAL, n, l);
> -    replace_nans(0, backupAL, n, l);
> +
> +    //pthread_barrier_wait(&(AIOfile.Fhandler->finalize_barrier));
> +
> +    replace_nans(al_nan_idxs,1, backupAL, n, l);
> +    al_nan_idxs->push_back(1);
> +
> +    //LAPACKE_dgesdd()
>  
>      copy_vec(backupAL, AL, n*l);
>  
> @@ -394,176 +412,369 @@
>  
>      for (int j = 0; j < y_iters; j++)
>      {
> -        if (y_iters >= 40 && (j%(y_iters/40)) == 0 && !params.ForceCheck)
> +        if (!params.ForceCheck &&  y_iters >= 10 && (j%(y_iters/10)) == 0 && !params.ForceCheck)
>          {
>              cout << AIOfile.io_overhead << flush;
>              AIOfile.io_overhead = "*";
>          }
> +
> +        get_ticks(start_tick2);
>  
> -        AIOfile.load_Yblock(&Y, y_block_size);
> +        AIOfile.load_Yblock(&Y, y_block_size);
> +
> +        get_ticks(end_tick);
> +        out.acc_loady += ticks2sec(end_tick,start_tick2);
>  
> +        get_ticks(start_tick2);
> +        replace_nans(&y_nan_idxs[0],y_block_size, Y, n,1);
> +        //out.acc_other += ticks2sec(end_tick,start_tick2);
>  
>  
> -        //int total_y_nans = replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> -        replace_nans(&y_nan_idxs[0], Y, n, y_block_size);
> +        get_ticks(start_tick2);
>  
> -
>          //! Ay_top = AL'*Y
>          cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>                      l, y_block_size, n, 1.0, AL, n, Y, n, 0.0,
> -                    &Ay_top[j * l * y_block_size], l);
> +                    &Ay_top[j * l * y_block_size], l);
> +
> +        get_ticks(end_tick);
> +        out.acc_gemm += ticks2sec(end_tick,start_tick2);
>  
>  
>          for (int i = 0; i < a_iters; i++)
> -        {
> -            AIOfile.load_ARblock(&backupAR, a_block_size);
> -            //int total_ar_nans = replace_nans(0, backupAR, n, a_block_size * r);
> -            replace_nans(0, backupAR, n, a_block_size * r);
> -            copy_vec(backupAR, AR, n *  r * a_block_size);
> +        {
> +
> +            if (!params.ForceCheck && y_iters < 10 &&
> +                (  (a_iters >= 10 && (i%(a_iters/(10/y_iters))) == 0) || (a_iters < (10/y_iters)) ))
> +            {
> +                cout << "*" << flush;
> +            }
> +
> +            get_ticks(start_tick2);
>  
> -            get_ticks(start_tick2);
> +            AIOfile.load_ARblock(&backupAR, a_block_size);
> +
> +            get_ticks(end_tick);
> +            out.acc_loadxr += ticks2sec(end_tick,start_tick2);
> +
> +            get_ticks(start_tick2);
> +
> +            replace_nans(&ar_nan_idxs[0],a_block_size, backupAR, n, r);
> +            replace_with_zeros(al_nan_idxs, backupAR,  n, r, a_block_size);
>  
> +            copy_vec(backupAR, AR, n *  r * a_block_size);
> +
>              get_ticks(end_tick);
> -            out.acc_pre += ticks2sec(end_tick-start_tick2, cpu_freq);
> +            //out.acc_other += ticks2sec(end_tick,start_tick2);
>  
> -            get_ticks(start_tick2);
> +
> +
> +            get_ticks(start_tick2);
> +
>              //! Ay_bot = AR'*Y
>              cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
>                          r * a_block_size, y_block_size, n, 1.0, AR, n, Y, n,
>                          0.0, Ay_bot, r * a_block_size);
>  
>              get_ticks(end_tick);
> -            out.firstloop += ticks2sec(end_tick - start_tick2, cpu_freq);
> +            out.acc_gemm += ticks2sec(end_tick,start_tick2);
> +
> +            int aL_idx =  0*l * n;
> +            int aR_idx = 0* r * n * a_block_size;
>  
> -            //#pragma omp parallel default(shared)
> -            {
> -                for (int jj = 0; jj < y_block_size; jj++)
> -                {
> -                    int thread_id = 0 * omp_get_thread_num();
> -                    int aL_idx = thread_id * l * n;
> -                    int aR_idx = thread_id * r * n * a_block_size;
> +            get_ticks(start_tick3);
> +            for (int jj = 0; jj < y_block_size; jj++)
> +            {
> +
> +                //int thread_id = 0 * omp_get_thread_num();//so far singel thread version only
>  
> -                    if (y_iters < 40 &&
> -                                  (y_block_size < 3 ||
> -                                                  (jj%(y_block_size/3)) == 0) && !params.ForceCheck)
> -                    {
> -                        cout << AIOfile.io_overhead << flush;
> -                        AIOfile.io_overhead = "*";
> -                    }
> +
> +                get_ticks(start_tick2);
> +
> +                copy_vec(backupAL, &AL[aL_idx], n * l);//try to remove!
>  
> -                    copy_vec(backupAL, &AL[aL_idx], n * l);
> +                replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
> +
> +
> +                get_ticks(end_tick);//2%
> +                out.acc_other += ticks2sec(end_tick,start_tick2);
> +
> +                 get_ticks(start_tick2);
>  
> -                    replace_with_zeros(&y_nan_idxs[jj], &AL[aL_idx], n, l, 1);
> -                    //! Generate Stl
> -                    cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> -                                l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
> +                //! Generate Stl
> +                cblas_ssyrk(CblasColMajor, CblasUpper, CblasTrans,
> +                            l, n, 1.0, &AL[aL_idx], lda, 0.0, Stl, l);
> +
> +                get_ticks(end_tick);
> +                out.acc_stl += ticks2sec(end_tick,start_tick2);
> +
> +
> +                get_ticks(start_tick2);
> +
> +                copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);//!10%//try to remove!
> +
> +
> +                replace_with_zeros(&y_nan_idxs[jj], backupAR,  n, r, a_block_size);
>  
> -                    copy_vec(backupAR,&AR[aR_idx], n*r*a_block_size);
> -                    replace_with_zeros(&y_nan_idxs[jj], &AR[aR_idx],
> -                                       n, r, a_block_size);
> +
> +
> +                get_ticks(end_tick);
> +                out.acc_other += ticks2sec(end_tick,start_tick2);
> +
> +                get_ticks(start_tick2);
>  
> -                    get_ticks(start_tick2);
> -                    //! Generate Str
> -                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                                l, r * a_block_size, n, 1.0, &AL[aL_idx],
> -                                n, &AR[aR_idx], n, 0.0, Str, l);
> -                    get_ticks(end_tick);
> -                    out.acc_RTL_QLY += ticks2sec(end_tick - start_tick2,
> -                                                 cpu_freq);
> +                //! Generate Str
> +                cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +                            l, r * a_block_size, n, 1.0, &AL[aL_idx],
> +                            n, &AR[aR_idx], n, 0.0, Str, l);//!45
>  
> +                get_ticks(end_tick);
> +                out.acc_str += ticks2sec(end_tick,start_tick2);
> +
>  
> -                    //type_precision Sbr[r *  r * a_block_size];
> -                    //type_precision Ay[p * a_block_size];
>  
> +                //type_precision Sbr[r *  r * a_block_size*1];
> +                //type_precision Ay[p * a_block_size*1];
> +
> +                blas_set_num_threads(1);
> +                omp_set_num_threads(max_threads);
> +
> +                #pragma omp parallel default(shared)
> +                {
>  
> -                    //#pragma omp for nowait  schedule(dynamic)
> -                    for (int ii= 0; ii < a_block_size; ii++)
> -                    {
> -                        get_ticks(start_tick2);
> -                        //! Generate Sbr
> -                        cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> -                                    r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
> -                                    &AR[aR_idx + ii * r * n], n, 0.0,
> -                                    &Sbr[ii * r * r], r);
> -                        get_ticks(end_tick);
> -                        out.acc_gemm += ticks2sec(end_tick - start_tick2,
> -                                                  cpu_freq);
> +                #pragma omp for nowait
> +                for (int ii= 0; ii < a_block_size; ii++)
> +                {
> +                    //cout << omp_get_thread_num() << endl << flush;
>  
> -                        get_ticks(start_tick2);
>  
> -                        copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
> -                        copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
> -                                 &Ay[l+ii*p], r);
> +                    get_ticks(start_tick2);
>  
> +                    //! Generate Sbr
> +                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
> +                                r, r, n, 1.0, &AR[aR_idx+ii*r*n], n,
> +                                &AR[aR_idx + ii * r * n], n, 0.0,
> +                                &Sbr[ii * r * r], r);
> +
>  
> -                        //type_precision* B = Ay;
> -                        //type_precision S[p * p];
> +                    get_ticks(end_tick);
> +                    out.acc_sbr += ticks2sec(end_tick,start_tick2 );
> +
> +                    get_ticks(start_tick2);
>  
> +                    copy_vec(&Ay_top[j*l*y_block_size+jj*l], &Ay[ii*p], l);
> +                    copy_vec(&Ay_bot[ii*r + jj*r*a_block_size],
> +                             &Ay[l+ii*p], r);
> +
> +
> +                    type_precision S2[p * p];
>  
> -                        //! Rebuild S
> -                        build_S(S, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> -                        // matlab_print_matrix("S", p, p, S);
> +                    //! Rebuild S
> +                    build_S(S2, Stl, &Str[ii*r*l], &Sbr[ii*r*r], l, r);
> +                    // matlab_print_matrix("S", p, p, S);
>  
> -                        get_ticks(end_tick);
> -                        out.acc_b += ticks2sec(end_tick - start_tick2,
> -                                               cpu_freq);
> +
> +                    get_ticks(end_tick);//5%
> +                    out.acc_other += ticks2sec(end_tick,start_tick2);
>  
> -                        get_ticks(start_tick2);
> +                    get_ticks(start_tick2);
>  
> -                        //! b = S\Ay
> -                        info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S, p,
> -                                             &Ay[ii*p], p);
>  
> -                        get_ticks(end_tick);
> -                        out.acc_loadxr += ticks2sec(end_tick - start_tick2,
> -                                                    cpu_freq);
> -                        assert(info == 0, "POSV");
> +                    //! b = S\Ay
> +                    info = LAPACKE_sposv(STORAGE_TYPE, 'U', p, 1, S2, p,
> +                                         &Ay[ii*p], p);
> +
> +                    myassert(info == 0, "S\\Ay");
> +
>  
> -                        if (params.ForceCheck)
> +                    get_ticks(end_tick);
> +                    out.acc_solve += ticks2sec(end_tick,start_tick2 );
> +
> +
> +                    if (params.ForceCheck)
> +                    {
> +                        #pragma omp critical
>                          {
> -                            #pragma omp critical
> -                            {
> -                                check_result(AL, &AR[aR_idx+ii*r*n], n, p,
> -                                             1, r, &Y[jj*n], &Ay[ii*p]);
> -                            }
> +                            check_result(AL, &AR[aR_idx+ii*r*n], n, p,
> +                                         1, r, &Y[jj*n], &Ay[ii*p]);
>                          }
>                      }
> +                }
> +                }
> +
> +                /************statistics**************/
> +//                type_precision* T;
> +//                type_precision* R2;
> +//                type_precision* P;
> +
> +                //hpc_statistics(n,A,a_block_size,Y,y_block_size,B,p,T,R2,P);
> +
> +                /**************************/
> +
> +                blas_set_num_threads(max_threads);
> +
> +                get_ticks(start_tick2);
>  
> -                    AIOfile.write_B(Ay, p, a_block_size);
> -                }
> -            }
> +                AIOfile.write_B(B, p, a_block_size);
> +
> +                get_ticks(end_tick);
> +                out.acc_storeb += ticks2sec(end_tick,start_tick2);
> +
> +
> +
> +
> +
> +            }
> +
> +            get_ticks(end_tick);
> +            out.acc_real_innerloops += ticks2sec(end_tick ,start_tick3);
> +
> +
>          }
>           AIOfile.reset_AR();
>      }
>  
> -    get_ticks(end_tick);
> -    out.duration = ticks2sec(end_tick - start_tick, cpu_freq);
> -    // out.gflops = a_amount/1000.0*(n*l*r+n*r*r+y_amount*(n*r+p*p*p)))/1000.0/1000.0;
> -    out.gflops = y_amount * (gemm_flops(l, n, 1, 0) +
> -                             a_amount * (gemm_flops(r, n, 1, 0) +
> -                                         gemm_flops(l, n, l, 0) +
> -                                         gemm_flops(l, n, r, 0) +
> -                                         gemm_flops(r, n, r, 0) +
> +    get_ticks(end_tick);
> +
> +    out.duration = ticks2sec(end_tick ,start_tick);
> +
> +
> +    out.gflops = y_iters * (gemm_flops(l, n, params.tb*1, 0) +
> +                             a_iters * ( gemm_flops(params.mb*r, n, params.tb*1, 0) +
> +                             params.tb *(  gemm_flops(l, 1*n, l, 0) + gemm_flops(l, n, params.mb *r, 0) +
> +                                    params.mb * (
> +                                         gemm_flops(r, 1*n, r, 0) +
>                                           (p * p * p / 3.0) /
> -                                         1000.0/1000.0/1000.0));
> [TRUNCATED]
> 
> To get the complete diff run:
>     svnlook diff /svnroot/genabel -r 1763
> _______________________________________________
> Genabel-commits mailing list
> Genabel-commits at lists.r-forge.r-project.org
> https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/genabel-commits
> 
-- 
*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*
L.C. Karssen
Utrecht
The Netherlands
lennart at karssen.org
http://blog.karssen.org
GPG key ID: A88F554A
-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 213 bytes
Desc: OpenPGP digital signature
URL: <http://lists.r-forge.r-project.org/pipermail/genabel-devel/attachments/20140722/34cc1d63/attachment-0001.sig>
    
    
More information about the genabel-devel
mailing list