[Rcpp-commits] r4080 - in pkg/Rcpp/inst/include/Rcpp: hash sugar/functions

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Wed Dec 5 00:46:49 CET 2012


Author: romain
Date: 2012-12-05 00:46:49 +0100 (Wed, 05 Dec 2012)
New Revision: 4080

Added:
   pkg/Rcpp/inst/include/Rcpp/hash/IndexHash.h
   pkg/Rcpp/inst/include/Rcpp/hash/SelfHash.h
Modified:
   pkg/Rcpp/inst/include/Rcpp/hash/hash.h
   pkg/Rcpp/inst/include/Rcpp/sugar/functions/self_match.h
   pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h
Log:
fixing self_match

Added: pkg/Rcpp/inst/include/Rcpp/hash/IndexHash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/IndexHash.h	                        (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/IndexHash.h	2012-12-04 23:46:49 UTC (rev 4080)
@@ -0,0 +1,168 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// IndexHash.h: Rcpp R/C++ interface class library -- hashing utility, inspired 
+// from Simon's fastmatch package
+//
+// Copyright (C) 2010, 2011  Simon Urbanek
+// Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__INDEX_HASH_H
+#define RCPP__HASH__INDEX_HASH_H
+
+namespace Rcpp{
+    namespace sugar{ 
+    // RCPP_HASH(X): multiplies X (cast to unsigned int) by a constant and shifts the product right by (32 - k); `k` is not a macro argument, it must be in scope where the macro is expanded
+    #ifndef RCPP_HASH    
+    #define RCPP_HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - k))
+    #endif
+    // IndexHash<RTYPE>: open-addressing hash table over the elements of an R vector; each slot of `data` holds a 1-based position into the vector, 0 meaning empty
+    template <int RTYPE>
+    class IndexHash {
+    public:
+        typedef typename traits::storage_type<RTYPE>::type STORAGE ;
+        typedef Vector<RTYPE> VECTOR ;
+        // sizes the table only (nothing is inserted): m becomes a power of two with m >= 2 and m >= 2*n, and k ends up as log2(m)
+        IndexHash( SEXP table ) : n(Rf_length(table)), m(2), k(1), src( (STORAGE*)dataptr(table) ), data(), size_(0) {
+            int desired = n*2 ;
+            while( m < desired ){ m *= 2 ; k++ ; }
+            data.resize( m ) ; // new slots are zero, i.e. empty
+        }
+        // inserts every element of the source vector, in order; returns *this
+        inline IndexHash& fill(){
+            for( int i=0; i<n; i++) add_value(i) ;
+            return *this ;
+        }
+        // inserts every element and returns a logical vector that is true where an equal value had already been inserted by an earlier element
+        inline LogicalVector fill_and_get_duplicated() { 
+            LogicalVector result = no_init(n) ;
+            int* res = LOGICAL(result) ;
+            for( int i=0; i<n; i++) res[i] = ! add_value(i) ;
+            return result ;
+        }
+        // for each element of vec: the 1-based position stored for an equal table value, or NA_INTEGER when there is none
+        template <typename T>
+        inline SEXP lookup(const T& vec) const {
+            return lookup__impl(vec, vec.size() ) ;
+        }
+        
+        // use the pointers for actual (non sugar expression vectors)
+        inline SEXP lookup(const VECTOR& vec) const {
+            return lookup__impl(vec.begin(), vec.size() ) ;
+        }
+        // true when val compares equal to a value that has been inserted
+        inline bool contains(STORAGE val) const {
+            return get_index(val) != NA_INTEGER ;    
+        }
+        // number of occupied slots, i.e. of values inserted so far that did not match an earlier one
+        inline int size() const {
+            return size_ ;
+        }
+        
+        // the inserted values, in the order of their slots in `data` (hash-table order, not input order)
+        inline Vector<RTYPE> keys() const{
+            Vector<RTYPE> res = no_init(size_) ;
+            for( int i=0, j=0; j<size_; i++){
+                if( data[i] ) res[j++] = src[data[i]-1] ; // data[i] is a 1-based position into src
+            }
+            return res ;
+        }
+        // n: length of the table; m: number of slots; k: log2(m), read by RCPP_HASH; src: pointer obtained from dataptr(table), no copy is made here
+        int n, m, k ;
+        STORAGE* src ;
+        std::vector<int> data ;
+        int size_ ;
+        // shared implementation of lookup(): vec only needs operator[]; NOTE(review): the allocated result is returned without PROTECT
+        template <typename T>
+        SEXP lookup__impl(const T& vec, int n) const {
+            SEXP res = Rf_allocVector(INTSXP, n) ;
+            int *v = INTEGER(res) ;
+            for( int i=0; i<n; i++) v[i] = get_index( vec[i] ) ;    
+            return res ;
+        }
+        // inserts src[i] (i is 0-based); returns true if an empty slot was filled, false if an equal value was already stored
+        bool add_value(int i){
+            RCPP_DEBUG_2( "%s::add_value(%d)", DEMANGLE(IndexHash), i )
+            STORAGE val = src[i++] ; // from here on i is the 1-based position
+            int addr = get_addr(val) ;
+            while (data[addr] && src[data[addr] - 1] != val) { // linear probing: step past slots holding a different value
+              addr++;
+              if (addr == m) addr = 0; // wrap around to the first slot
+            }
+            if (!data[addr]){
+              data[addr] = i ;
+              size_++ ;
+              return true ;
+            }
+            return false;
+        }
+        // NOTE(review): add_value/get_index compare with != and ==; for doubles a NaN never compares equal, so NA/NaN values presumably never match a stored entry -- confirm this is intended
+        /* NOTE: we are returning a 1-based index ! */
+        int get_index(STORAGE value) const {
+            int addr = get_addr(value) ;
+            while (data[addr]) { // probing stops at the first empty slot
+              if (src[data[addr] - 1] == value)
+                return data[addr];
+              addr++;
+              if (addr == m) addr = 0;
+            }
+            return NA_INTEGER;
+        }
+        
+        // starting slot for a value; specialized below for INTSXP, REALSXP and STRSXP
+        int get_addr(STORAGE value) const ;
+    } ;
+    // integers are hashed directly on their value
+    template <>
+    inline int IndexHash<INTSXP>::get_addr(int value) const {
+        return RCPP_HASH(value) ;
+    }
+    template <>
+    inline int IndexHash<REALSXP>::get_addr(double val) const { // hashes the sum of the two unsigned int words that overlay the double
+      int addr;
+      union dint_u {
+          double d;
+          unsigned int u[2]; // assumes a double is exactly two unsigned ints wide -- TODO confirm
+        };
+      union dint_u val_u;
+      /* double is a bit tricky - we have to normalize 0.0, NA and NaN */
+      if (val == 0.0) val = 0.0; // -0.0 == 0.0 is true, so a negative zero is replaced by +0.0
+      if (R_IsNA(val)) val = NA_REAL; // every NA is replaced by NA_REAL
+      else if (R_IsNaN(val)) val = R_NaN; // every other NaN is replaced by R_NaN
+      val_u.d = val;
+      addr = RCPP_HASH(val_u.u[0] + val_u.u[1]);
+      return addr ;
+    }
+    // strings are hashed on the SEXP pointer value, not on the characters; presumably relies on equal strings sharing one SEXP -- confirm
+    template <>
+    inline int IndexHash<STRSXP>::get_addr(SEXP value) const {
+        intptr_t val = (intptr_t) value;
+        int addr;
+        #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+          addr = RCPP_HASH((val & 0xffffffff) ^ (val >> 32)); // combine the low 32 bits with the bits above them
+        #else
+          addr = RCPP_HASH(val);
+        #endif
+        return addr ;
+    }
+
+    
+} // sugar
+} // Rcpp
+
+#endif
+

Added: pkg/Rcpp/inst/include/Rcpp/hash/SelfHash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/SelfHash.h	                        (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/SelfHash.h	2012-12-04 23:46:49 UTC (rev 4080)
@@ -0,0 +1,129 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// SelfHash.h: Rcpp R/C++ interface class library -- hashing utility, inspired 
+// from Simon's fastmatch package
+//
+// Copyright (C) 2010, 2011  Simon Urbanek
+// Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__SELF_HASH_H
+#define RCPP__HASH__SELF_HASH_H
+
+namespace Rcpp{
+namespace sugar{ 
+
+    // SelfHash<RTYPE>: hash table over an R vector that also records, per slot, the rank (1, 2, 3, ...) in which its value was first inserted
+    template <int RTYPE>
+    class SelfHash {
+    public:
+        typedef typename traits::storage_type<RTYPE>::type STORAGE ;
+        typedef Vector<RTYPE> VECTOR ;
+        // sizes both tables only (nothing is inserted): m becomes a power of two with m >= 2 and m >= 2*n, and k ends up as log2(m)
+        SelfHash( SEXP table ) : n(Rf_length(table)), m(2), k(1), 
+            src( (STORAGE*)dataptr(table) ), data(), indices(), size_(0) 
+        {
+            int desired = n*2 ;
+            while( m < desired ){ m *= 2 ; k++ ; }
+            data.resize( m ) ;
+            indices.resize( m ) ;
+        }
+        // inserts every element in order; result[i] is the rank of element i's value among the distinct values (first new value -> 1, next new value -> 2, ...)
+        inline IntegerVector fill_and_self_match(){
+            IntegerVector result = no_init(n) ;
+            int* res = INTEGER(result) ;
+            for( int i=0; i<n; i++) res[i] = add_value_get_index(i) ;
+            return result ;    
+        }                       
+        // number of occupied slots, i.e. of values inserted so far that did not match an earlier one
+        inline int size() const {
+            return size_ ;
+        }
+        // data[addr]: 0 when the slot is empty, otherwise the 1-based position in src of the value stored there; indices[addr]: value of size_ right after that slot was filled
+        int n, m, k ;
+        STORAGE* src ;
+        std::vector<int> data ;
+        std::vector<int> indices ;
+        int size_ ;
+        // inserts src[i] (i is 0-based) if no equal value is stored yet, then returns the rank recorded for that value's slot
+        int add_value_get_index(int i){
+            STORAGE val = src[i++] ; // from here on i is the 1-based position
+            int addr = get_addr(val) ;
+            while (data[addr] && src[data[addr] - 1] != val) { // linear probing: step past slots holding a different value
+              addr++;
+              if (addr == m) addr = 0; // wrap around to the first slot
+            }
+            if (!data[addr]) {
+                data[addr] = i ;
+                indices[addr] = ++size_ ;
+            }
+            return indices[addr] ;
+        }
+        // unlike add_value_get_index, this returns the position kept in data, not the rank kept in indices
+        /* NOTE: we are returning a 1-based index ! */
+        int get_index(STORAGE value) const {
+            int addr = get_addr(value) ;
+            while (data[addr]) { // probing stops at the first empty slot
+              if (src[data[addr] - 1] == value)
+                return data[addr];
+              addr++;
+              if (addr == m) addr = 0;
+            }
+            return NA_INTEGER;
+        }
+        
+        // starting slot for a value; specialized below for INTSXP, REALSXP and STRSXP
+        int get_addr(STORAGE value) const ;
+    } ;
+    // integers are hashed directly on their value
+    template <>
+    inline int SelfHash<INTSXP>::get_addr(int value) const {
+        return RCPP_HASH(value) ;
+    }
+    template <>
+    inline int SelfHash<REALSXP>::get_addr(double val) const { // hashes the sum of the two unsigned int words that overlay the double
+      int addr;
+      union dint_u {
+          double d;
+          unsigned int u[2]; // assumes a double is exactly two unsigned ints wide -- TODO confirm
+        };
+      union dint_u val_u;
+      /* double is a bit tricky - we have to normalize 0.0, NA and NaN */
+      if (val == 0.0) val = 0.0; // -0.0 == 0.0 is true, so a negative zero is replaced by +0.0
+      if (R_IsNA(val)) val = NA_REAL; // every NA is replaced by NA_REAL
+      else if (R_IsNaN(val)) val = R_NaN; // every other NaN is replaced by R_NaN
+      val_u.d = val;            
+      addr = RCPP_HASH(val_u.u[0] + val_u.u[1]);
+      return addr ;
+    }
+    // strings are hashed on the SEXP pointer value, not on the characters; presumably relies on equal strings sharing one SEXP -- confirm
+    template <>
+    inline int SelfHash<STRSXP>::get_addr(SEXP value) const {
+        intptr_t val = (intptr_t) value;
+        int addr;
+        #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+          addr = RCPP_HASH((val & 0xffffffff) ^ (val >> 32)); // combine the low 32 bits with the bits above them
+        #else
+          addr = RCPP_HASH(val);
+        #endif
+        return addr ;
+    }
+
+} // sugar
+} // Rcpp
+
+#endif

Modified: pkg/Rcpp/inst/include/Rcpp/hash/hash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 21:02:39 UTC (rev 4079)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 23:46:49 UTC (rev 4080)
@@ -1,9 +1,7 @@
 // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
 //
-// hash.h: Rcpp R/C++ interface class library -- hashing utility, inspired 
-// from Simon's fastmatch package
+// hash.h: Rcpp R/C++ interface class library -- hashing
 //
-// Copyright (C) 2010, 2011  Simon Urbanek
 // Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
 //
 // This file is part of Rcpp.
@@ -24,166 +22,8 @@
 #ifndef RCPP__HASH__HASH_H
 #define RCPP__HASH__HASH_H
 
-namespace Rcpp{
-    namespace sugar{ 
-      
-    #ifndef RCPP_HASH    
-    #define RCPP_HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - k))
-    #endif
-    
-    template <int RTYPE>
-    class IndexHash {
-    public:
-        typedef typename traits::storage_type<RTYPE>::type STORAGE ;
-        typedef Vector<RTYPE> VECTOR ;
-              
-        IndexHash( SEXP table ) : n(Rf_length(table)), m(2), k(1), src( (STORAGE*)dataptr(table) ), data(), size_(0) {
-            int desired = n*2 ;
-            while( m < desired ){ m *= 2 ; k++ ; }
-            data.resize( m ) ;
-        }
-        
-        inline IndexHash& fill(){
-            for( int i=0; i<n; i++) add_value(i) ;
-            return *this ;
-        }
-        
-        inline LogicalVector fill_and_get_duplicated() { 
-            LogicalVector result = no_init(n) ;
-            int* res = LOGICAL(result) ;
-            for( int i=0; i<n; i++) res[i] = ! add_value(i) ;
-            return result ;
-        }
-        
-        inline IntegerVector fill_and_self_match(){
-            IntegerVector result = no_init(n) ;
-            int* res = INTEGER(result) ;
-            for( int i=0; i<n; i++) res[i] = add_value_get_index(i) ;
-            return result ;    
-        }
-    
-        
-        template <typename T>
-        inline SEXP lookup(const T& vec) const {
-            return lookup__impl(vec, vec.size() ) ;
-        }
-        
-        // use the pointers for actual (non sugar expression vectors)
-        inline SEXP lookup(const VECTOR& vec) const {
-            return lookup__impl(vec.begin(), vec.size() ) ;
-        }
-        
-        inline bool contains(STORAGE val) const {
-            return get_index(val) != NA_INTEGER ;    
-        }
-        
-        inline int size() const {
-            return size_ ;
-        }
-        
-        inline Vector<RTYPE> keys() const{
-            Vector<RTYPE> res = no_init(size_) ;
-            for( int i=0, j=0; j<size_; i++){
-                if( data[i] ) res[j++] = src[data[i]] ;
-            }
-            return res ;
-        }
-        
-        int n, m, k ;
-        STORAGE* src ;
-        std::vector<int> data ;
-        int size_ ;
-        
-        template <typename T>
-        SEXP lookup__impl(const T& vec, int n) const {
-            SEXP res = Rf_allocVector(INTSXP, n) ;
-            int *v = INTEGER(res) ;
-            for( int i=0; i<n; i++) v[i] = get_index( vec[i] ) ;    
-            return res ;
-        }
-        
-        bool add_value(int i){
-            STORAGE val = src[i++] ;
-            int addr = get_addr(val) ;
-            while (data[addr] && src[data[addr] - 1] != val) {
-              addr++;
-              if (addr == m) addr = 0;
-            }
-            if (!data[addr]){
-              data[addr] = i ;
-              size_++ ;
-              return true ;
-            }
-            return false;
-        }
-        
-        int add_value_get_index(int i){
-            STORAGE val = src[i++] ;
-            int addr = get_addr(val) ;
-            while (data[addr] && src[data[addr] - 1] != val) {
-              addr++;
-              if (addr == m) addr = 0;
-            }
-            if (!data[addr]){
-              data[addr] = i ;
-              size_++ ;
-              return i ;
-            }
-            return data[addr] ;
-        }
-        
-        /* NOTE: we are returning a 1-based index ! */
-        int get_index(STORAGE value) const {
-            int addr = get_addr(value) ;
-            while (data[addr]) {
-              if (src[data[addr] - 1] == value)
-                return data[addr];
-              addr++;
-              if (addr == m) addr = 0;
-            }
-            return NA_INTEGER;
-        }
-        
-        // defined below
-        int get_addr(STORAGE value) const ;
-    } ;
-        
-    template <>
-    inline int IndexHash<INTSXP>::get_addr(int value) const {
-        return RCPP_HASH(value) ;
-    }
-    template <>
-    inline int IndexHash<REALSXP>::get_addr(double val) const {
-      int addr;
-      union dint_u {
-          double d;
-          unsigned int u[2];
-        };
-      union dint_u val_u;
-      /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
-      if (val == 0.0) val = 0.0;
-      if (R_IsNA(val)) val = NA_REAL;
-      else if (R_IsNaN(val)) val = R_NaN;
-      val_u.d = val;
-      addr = RCPP_HASH(val_u.u[0] + val_u.u[1]);
-      return addr ;
-    }
-    
-    template <>
-    inline int IndexHash<STRSXP>::get_addr(SEXP value) const {
-        intptr_t val = (intptr_t) value;
-        int addr;
-        #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
-          addr = RCPP_HASH((val & 0xffffffff) ^ (val >> 32));
-        #else
-          addr = RCPP_HASH(val);
-        #endif
-        return addr ;
-    }
+#include <Rcpp/hash/IndexHash.h>
+#include <Rcpp/hash/SelfHash.h>
 
-    
-} // sugar
-} // Rcpp
-
 #endif
 

Modified: pkg/Rcpp/inst/include/Rcpp/sugar/functions/self_match.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/functions/self_match.h	2012-12-04 21:02:39 UTC (rev 4079)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/functions/self_match.h	2012-12-04 23:46:49 UTC (rev 4080)
@@ -23,11 +23,52 @@
 #define Rcpp__sugar__self_match_h
           
 namespace Rcpp{
+namespace sugar{
 
+template <typename HASH, typename STORAGE>
+class SelfInserter { // functor: maps each value to the 1-based order in which it was first seen, recording it in a map owned by the caller
+public:
+    SelfInserter( HASH& hash_ ) : hash(hash_), index(0) {}
+    // returns the number already stored for value, or stores and returns the next unused number when value is new
+    inline int operator()( STORAGE value ){
+        typename HASH::iterator it = hash.find( value ) ;
+        if( it == hash.end() ){
+            hash.insert( std::make_pair(value, ++index) ) ;
+            return index ; 
+        } else {
+            return it->second ;
+        }   
+    }
+    // hash is held by reference (not owned); index is the last number handed out, 0 before the first insertion
+private:
+    HASH& hash ;
+    int index;
+} ; 
+// SelfMatch: computes, at construction, one integer per element of table by running a SelfInserter over it; convertible to IntegerVector
+template <int RTYPE, typename TABLE_T>        
+class SelfMatch {
+public:
+    typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
+    // table must provide size(), begin() and end()
+    SelfMatch( const TABLE_T& table ): hash(), result(table.size()) {
+        std::transform( table.begin(), table.end(), result.begin(), Inserter(hash) ) ;
+    }
+    // hands back the vector filled by the constructor
+    inline operator IntegerVector() const { return result ; }
+    
+private:
+    typedef RCPP_UNORDERED_MAP<STORAGE, int> HASH ;
+    typedef SelfInserter<HASH,STORAGE> Inserter ;
+    HASH hash ; 
+    IntegerVector result ;
+}; 
+    
+} // sugar
+
 template <int RTYPE, bool NA, typename T>
 inline IntegerVector self_match( const VectorBase<RTYPE,NA,T>& x ){
     Vector<RTYPE> vec(x) ;
-    return sugar::IndexHash<RTYPE>(vec).fill_and_self_match() ;
+    return sugar::SelfHash<RTYPE>(vec).fill_and_self_match() ;
 }
 
 

Modified: pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h	2012-12-04 21:02:39 UTC (rev 4079)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/functions/unique.h	2012-12-04 23:46:49 UTC (rev 4080)
@@ -65,7 +65,8 @@
 inline Vector<RTYPE> unique( const VectorBase<RTYPE,NA,T>& t ){
 	Vector<RTYPE> vec(t) ;
 	sugar::IndexHash<RTYPE> hash(vec) ; 
-    return hash.keys() ;
+	hash.fill() ;
+	return hash.keys() ;
 }
 template <int RTYPE, bool NA, typename T>
 inline Vector<RTYPE> sort_unique( const VectorBase<RTYPE,NA,T>& t ){



More information about the Rcpp-commits mailing list