[Rcpp-commits] r4072 - in pkg/Rcpp: . inst inst/include/Rcpp/hash inst/include/Rcpp/sugar/functions inst/include/Rcpp/vector

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Dec 4 16:16:02 CET 2012


Author: romain
Date: 2012-12-04 16:16:01 +0100 (Tue, 04 Dec 2012)
New Revision: 4072

Modified:
   pkg/Rcpp/ChangeLog
   pkg/Rcpp/inst/NEWS.Rd
   pkg/Rcpp/inst/include/Rcpp/hash/hash.h
   pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h
   pkg/Rcpp/inst/include/Rcpp/vector/Vector.h
Log:
more efficient unique, based on IndexHash

Modified: pkg/Rcpp/ChangeLog
===================================================================
--- pkg/Rcpp/ChangeLog	2012-12-04 13:55:49 UTC (rev 4071)
+++ pkg/Rcpp/ChangeLog	2012-12-04 15:16:01 UTC (rev 4072)
@@ -3,6 +3,8 @@
         * include/Rcpp/hash/hash.h: new implementation of IndexHash, based on
 	Simon's fastmatch package
         * include/Rcpp/sugar/functions/match.h: using new IndexHash
+        * include/Rcpp/vector/Vector.h: more efficiently create Vector from 
+        sugar expressions that are already vectors, i.e. grab the SEXP
         
 2012-12-03  Dirk Eddelbuettel  <edd at debian.org>
 

Modified: pkg/Rcpp/inst/NEWS.Rd
===================================================================
--- pkg/Rcpp/inst/NEWS.Rd	2012-12-04 13:55:49 UTC (rev 4071)
+++ pkg/Rcpp/inst/NEWS.Rd	2012-12-04 15:16:01 UTC (rev 4072)
@@ -12,7 +12,14 @@
       patch by Yan Zhou
       \item New class \code{Rcpp::String} to facilitate working with a single
       element of a character vector
+      \item utility class sugar::IndexHash inspired from Simon Urbanek's fastmatch
+      package
     }
+    \item Changes in Rcpp sugar:
+    \itemize{
+        \item More efficient version of \code{match} based on \code{IndexHash}
+        \item More efficient version of \code{unique} based on \code{IndexHash}
+    }
   }
 }
 

Modified: pkg/Rcpp/inst/include/Rcpp/hash/hash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 13:55:49 UTC (rev 4071)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 15:16:01 UTC (rev 4072)
@@ -37,7 +37,7 @@
         typedef typename traits::storage_type<RTYPE>::type STORAGE ;
         typedef Vector<RTYPE> VECTOR ;
               
-        IndexHash( SEXP table ) : m(2), k(1), src( (STORAGE*)dataptr(table) ), data() {
+        IndexHash( SEXP table ) : m(2), k(1), src( (STORAGE*)dataptr(table) ), data(), size_(0) {
             int n =  Rf_length(table) ;
             int desired = n*2 ;
             while( m < desired ){ m *= 2 ; k++ ; }
@@ -46,23 +46,39 @@
         }
         
         template <typename T>
-        inline SEXP lookup(const T& vec){
+        inline SEXP lookup(const T& vec) const {
             return lookup__impl(vec, vec.size() ) ;
         }
         
         // use the pointers for actual (non sugar expression vectors)
-        inline SEXP lookup(const VECTOR& vec){
+        inline SEXP lookup(const VECTOR& vec) const {
             return lookup__impl(vec.begin(), vec.size() ) ;
         }
         
+        bool contains(STORAGE val) const { // true when val is present in the hashed table
+            return get_index(val) != NA_INTEGER ; // assumes get_index yields NA_INTEGER for a missing value (its tail is outside this hunk) - TODO confirm
+        }
         
+        inline int size() const { // number of occupied slots in data
+            return size_ ; // size_ is bumped each time an empty slot gets filled
+        }
+        
+        inline Vector<RTYPE> keys() const{ // collects the value behind every occupied slot into a new vector
+            Vector<RTYPE> res = no_init(size_) ; // one output element per occupied slot
+            for( int i=0, j=0; j<size_; i++){ // i walks the slots, j counts the keys written so far
+                if( data[i] ) res[j++] = src[data[i] - 1] ; // data holds 1-based indices into src; 0 marks an empty slot
+            }
+            return res ;
+        }
+        
     private:
         int m, k ;
         STORAGE* src ;
         std::vector<int> data ;
+        int size_ ;
         
         template <typename T>
-        SEXP lookup__impl(const T& vec, int n){
+        SEXP lookup__impl(const T& vec, int n) const {
             SEXP res = Rf_allocVector(INTSXP, n) ;
             int *v = INTEGER(res) ;
             for( int i=0; i<n; i++) v[i] = get_index( vec[i] ) ;    
@@ -76,12 +92,14 @@
               addr++;
               if (addr == m) addr = 0;
             }
-            if (!data[addr])
+            if (!data[addr]){
               data[addr] = i ;
+              size_++ ;
+            }
         }
         
         /* NOTE: we are returning a 1-based index ! */
-        int get_index(STORAGE value){
+        int get_index(STORAGE value) const {
             int addr = get_addr(value) ;
             while (data[addr]) {
               if (src[data[addr] - 1] == value)
@@ -93,15 +111,15 @@
         }
         
         // defined below
-        int get_addr(STORAGE value) ;
+        int get_addr(STORAGE value) const ;
     } ;
         
     template <>
-    inline int IndexHash<INTSXP>::get_addr(int value){
+    inline int IndexHash<INTSXP>::get_addr(int value) const {
         return RCPP_HASH(value) ;
     }
     template <>
-    inline int IndexHash<REALSXP>::get_addr(double val){
+    inline int IndexHash<REALSXP>::get_addr(double val) const {
       int addr;
       union dint_u {
           double d;
@@ -118,7 +136,7 @@
     }
     
     template <>
-    inline int IndexHash<STRSXP>::get_addr(SEXP value){
+    inline int IndexHash<STRSXP>::get_addr(SEXP value) const {
         intptr_t val = (intptr_t) value;
         int addr;
         #if (defined _LP64) || (defined __LP64__) || (defined WIN64)

Modified: pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h	2012-12-04 13:55:49 UTC (rev 4071)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h	2012-12-04 15:16:01 UTC (rev 4072)
@@ -25,71 +25,6 @@
 namespace Rcpp{
 namespace sugar{
 
-template <int RTYPE, typename T>
-class Unique {
-public:
-    typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-    
-    Unique( const T& vec ) : set( vec.begin(), vec.end() ) {}
-    
-    Vector<RTYPE> get( ) {
-        return Vector<RTYPE>( set.begin(), set.end() ) ;
-    }
-    Vector<RTYPE> get_sorted( ) {
-        return Vector<RTYPE>( set.begin(), set.end() ).sort() ;
-    }
-    
-private:
-    
-    RCPP_UNORDERED_SET<STORAGE> set ;
-    
-} ;
-   
-// for a character expression
-template <typename T>
-class Unique<STRSXP,T> {
-public:
-    Unique( const T& vec ) : set() {
-        std::string buffer ;
-        int n = vec.size() ;
-        for( int i=0; i<n; i++){
-            buffer = vec[i] ;
-            set.insert( buffer ) ;
-        }
-    }
-    
-    CharacterVector get( ) {
-        return CharacterVector( set.begin(), set.end() ) ;
-    }
-    CharacterVector get_sorted( ) {
-        return CharacterVector( set.begin(), set.end() ).sort() ;
-    }
-    
-private:
-    
-    RCPP_UNORDERED_SET<std::string> set ;
-   
-} ;
-
-// for a character vector
-template <>
-class Unique<STRSXP,CharacterVector> {
-public:
-    Unique( const CharacterVector& vec ) : set( vec.begin(), vec.end() ) {}
-    
-    CharacterVector get( ) {
-        return CharacterVector( set.begin(), set.end() ) ;
-    }
-    
-    CharacterVector get_sorted( ) {
-        return CharacterVector( set.begin(), set.end() ).sort() ;
-    }
-private:
-    
-    RCPP_UNORDERED_SET<SEXP> set ;
-   
-} ;
-
 template <typename SET, typename STORAGE>
 class InSet {
 public:
@@ -126,11 +61,13 @@
 
 template <int RTYPE, bool NA, typename T>
 inline Vector<RTYPE> unique( const VectorBase<RTYPE,NA,T>& t ){
-	return sugar::Unique<RTYPE,T>( t.get_ref() ).get() ;
+	Vector<RTYPE> vec(t) ;
+	sugar::IndexHash<RTYPE> hash(vec) ; 
+    return hash.keys() ;
 }
 template <int RTYPE, bool NA, typename T>
 inline Vector<RTYPE> sort_unique( const VectorBase<RTYPE,NA,T>& t ){
-	return sugar::Unique<RTYPE,T>( t.get_ref() ).get_sorted() ;
+	return unique<RTYPE,NA,T>( t ).sort() ;
 }
 
 template <int RTYPE, bool NA, typename T, bool RHS_NA, typename RHS_T>

Modified: pkg/Rcpp/inst/include/Rcpp/vector/Vector.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/vector/Vector.h	2012-12-04 13:55:49 UTC (rev 4071)
+++ pkg/Rcpp/inst/include/Rcpp/vector/Vector.h	2012-12-04 15:16:01 UTC (rev 4072)
@@ -134,10 +134,7 @@
     
     template <bool NA, typename VEC>
     Vector( const VectorBase<RTYPE,NA,VEC>& other ) : RObject() {
-    	RCPP_DEBUG_4( "Vector<%d>( VectorBase<%d,%d,%s> )", RTYPE, NA, RTYPE, DEMANGLE(VEC) ) ;
-    	int n = other.size() ;
-    	RObject::setSEXP( Rf_allocVector( RTYPE, n ) ) ;
-    	import_expression<VEC>( other.get_ref() , n ) ;
+    	import_sugar_expression( other, typename traits::same_type<Vector,VEC>::type() ) ;
     }
     
     // should eally onlu be used for LogicalVector. 
@@ -146,10 +143,25 @@
     	RObject::setSEXP( r_cast<RTYPE>( const_cast<sugar::SingleLogicalResult<NA,T>&>( obj ) .get_sexp() ) ) ;
     }
     
+private:
+	  
+    // importing a real sugar expression (VEC is not this Vector type): allocate n elements, then let import_expression fill them
+    template <bool NA, typename VEC>
+    inline void import_sugar_expression( const Rcpp::VectorBase<RTYPE,NA,VEC>& other, traits::false_type ){
+        RCPP_DEBUG_4( "Vector<%d>::import_sugar_expression( VectorBase<%d,%d,%s>, false_type )", RTYPE, NA, RTYPE, DEMANGLE(VEC) ) ;
+    	int n = other.size() ;
+    	RObject::setSEXP( Rf_allocVector( RTYPE, n ) ) ;
+    	import_expression<VEC>( other.get_ref() , n ) ;
+    }   
     
+    // importing a sugar expression that actually is a vector: reuse its SEXP instead of allocating and filling a new one
+    template <bool NA, typename VEC>
+    inline void import_sugar_expression( const Rcpp::VectorBase<RTYPE,NA,VEC>& other, traits::true_type ){
+        RCPP_DEBUG_4( "Vector<%d>::import_sugar_expression( VectorBase<%d,%d,%s>, true_type )", RTYPE, NA, RTYPE, DEMANGLE(VEC) ) ;
+    	RObject::setSEXP( other.get_ref() ) ;
+    }   
     
-private:
-	  
+    
     // TODO: do some dispatch when VEC == Vector so that we use std::copy
     template <typename T>
     inline void import_expression( const T& other, int n ){



More information about the Rcpp-commits mailing list