[Rcpp-commits] r4064 - in pkg/Rcpp: . inst/include/Rcpp inst/include/Rcpp/hash inst/include/Rcpp/sugar inst/include/Rcpp/sugar/functions src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Dec 4 00:04:11 CET 2012


Author: romain
Date: 2012-12-04 00:04:10 +0100 (Tue, 04 Dec 2012)
New Revision: 4064

Added:
   pkg/Rcpp/inst/include/Rcpp/hash/
   pkg/Rcpp/inst/include/Rcpp/hash/hash.h
   pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
   pkg/Rcpp/src/fastmatch.c
Modified:
   pkg/Rcpp/ChangeLog
   pkg/Rcpp/inst/include/Rcpp/barrier.h
   pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h
   pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h
   pkg/Rcpp/src/barrier.cpp
Log:
faster match, based on Simon's fastmatch

Modified: pkg/Rcpp/ChangeLog
===================================================================
--- pkg/Rcpp/ChangeLog	2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/ChangeLog	2012-12-03 23:04:10 UTC (rev 4064)
@@ -1,3 +1,11 @@
+2012-12-04 Romain Francois <romain at r-enthusiasts.com>
+
+        * include/Rcpp/hash/hash.h: new implementation of IndexHash, based on 
+        Simon's fastmatch package
+        * include/Rcpp/hash/hash_impl.h: low level implementation details
+        * include/Rcpp/sugar/functions/match.h: using new IndexHash
+        * src/fastmatch.c : largely inspired from Simon's fastmatch
+
 2012-12-03  JJ Allaire <jj at rstudio.org>
 
         * R/Attributes.R: added function to check whether R development
@@ -22,7 +30,7 @@
         * unitTests/cpp/String.cpp : unit tests for String
         * unitTests/runit.String.R: unit test for String
         * include/Rcpp/sugar/sets.h: support for String
-
+        
 2012-12-01  Dirk Eddelbuettel  <edd at debian.org>
 
 	* inst/include/RcppCommon.h: Applied patch by Yan Zhou to add support

Modified: pkg/Rcpp/inst/include/Rcpp/barrier.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/barrier.h	2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/barrier.h	2012-12-03 23:04:10 UTC (rev 4064)
@@ -32,5 +32,6 @@
 void set_vector_elt(SEXP, int, SEXP ) ;
 SEXP* get_vector_ptr(SEXP) ;
 const char* char_nocheck( SEXP ) ;
+void* dataptr(SEXP) ;
 
 #endif

Added: pkg/Rcpp/inst/include/Rcpp/hash/hash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash.h	                        (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,89 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// hash.h: Rcpp R/C++ interface class library -- hashing 
+//
+// Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__HASH_H
+#define RCPP__HASH__HASH_H
+
+#include <Rcpp/hash/hash_impl.h>
+
+namespace Rcpp{
+    namespace sugar{ 
+    template <typename T> void add_hash_value( hash_t *h, int i) ;
+    template <> inline void add_hash_value<int>( hash_t* h, int i ){ add_hash_int(h,i) ;}
+    template <> inline void add_hash_value<double>( hash_t* h, int i ){ add_hash_real(h,i) ;}
+    template <> inline void add_hash_value<SEXP>( hash_t* h, int i ){ add_hash_ptr(h,i) ;}
+    
+    template <typename T> int get_hash_value( hash_t *h, T val) ;
+    template <> inline int get_hash_value<int>( hash_t *h, int val){ return get_hash_int(h, val) ; }
+    template <> inline int get_hash_value<double>( hash_t *h, double val){ return get_hash_real(h, val); }
+    template <> inline int get_hash_value<SEXP>( hash_t *h, SEXP val){ return get_hash_ptr(h, val) ; }
+    
+    template <int RTYPE>
+    class IndexHash {
+    public:
+        typedef typename traits::storage_type<RTYPE>::type STORAGE ;
+        typedef Vector<RTYPE> VECTOR ;
+              
+        IndexHash( SEXP table ) : h(0) {
+            int n =  LENGTH(table) ;
+            h = new_hash( dataptr(table), n ) ;
+            for( int i=0; i<n; i++){
+               add_hash_value<STORAGE>( h, i) ;     
+            }    
+        }
+        ~IndexHash(){ 
+            if(h) {
+                free_hash(h);
+                h = 0 ;
+            }
+        }
+        
+        template <typename T>
+        SEXP lookup(const T& vec){
+            int n = vec.size() ;
+            SEXP res = Rf_allocVector(INTSXP, n) ;
+            int *v = INTEGER(res) ;
+            for( int i=0; i<n; i++){
+                v[i] = get_hash_value<STORAGE>( h, vec[i] ) ;    
+            }
+            return res ;
+        }
+        
+        SEXP lookup(const VECTOR& vec){
+            int n = vec.size() ;
+            SEXP res = Rf_allocVector(INTSXP, n) ;
+            int *v = INTEGER(res) ;
+            STORAGE* p_vec = vec.begin() ;
+            for( int i=0; i<n; i++){
+                v[i] = get_hash_value<STORAGE>( h, p_vec[i] ) ;    
+            }
+            return res ;
+        }
+        
+    private:    
+        hash_t* h ;
+    } ;
+    
+} // sugar
+} // Rcpp
+
+#endif
+

Added: pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h	                        (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h	2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,53 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// hash_impl.h: Rcpp R/C++ interface class library -- hashing 
+//
+// Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__HASH_IMPL_H
+#define RCPP__HASH__HASH_IMPL_H
+
+#ifdef __cplusplus 
+extern "C" {
+#endif    
+
+    typedef struct hash {
+      int m, k, els, type;
+      void *src;
+      SEXP parent;
+      struct hash *next;
+      int ix[1];
+    } hash_t;
+
+    hash_t *new_hash(void *src, int len) ;
+    void free_hash(hash_t *h) ;
+
+    void add_hash_int(hash_t *h, int i) ;
+    void add_hash_real(hash_t *h, int i) ;
+    void add_hash_ptr(hash_t *h, int i) ;
+     
+    int get_hash_int(hash_t *h, int val) ;
+    int get_hash_real(hash_t *h, double val) ;
+    int get_hash_ptr(hash_t *h, void *val_ptr) ;
+
+#ifdef __cplusplus 
+}
+#endif
+
+#endif
+

Modified: pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h	2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h	2012-12-03 23:04:10 UTC (rev 4064)
@@ -23,69 +23,12 @@
 #define Rcpp__sugar__match_h
           
 namespace Rcpp{
-namespace sugar{
 
-template <typename HASH, typename STORAGE>
-class HashIndexInserter {
-public:
-    HashIndexInserter( HASH& hash_ ) : hash(hash_), index(1){}
-    
-    inline void operator()( STORAGE value ){
-        hash.insert( std::make_pair(value, index++) ) ;
-    }
-    
-private:
-    HASH& hash ;
-    int index;
-} ; 
-template <typename HASH, typename STORAGE>
-class HashIndexFinder {
-public:
-    HashIndexFinder( HASH& hash_) : hash(hash_), end(hash.end()) {}
-    
-    inline int operator()( STORAGE value ){
-        typename HASH::const_iterator it = hash.find(value);
-        if( it == end ){
-            return NA_INTEGER ;    
-        } else {
-            return it->second ;    
-        }
-    }
-    
-private:
-    HASH& hash ;
-    typename HASH::const_iterator end ;
-} ;
-
-    
-// version for INTSXP, REALSXP, RAWSXP, CPLXSXP
-template <int RTYPE, typename TABLE_T>        
-class IndexHash {
-public:
-    typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-    
-    IndexHash( const TABLE_T& table ): hash() {
-        for_each( table.begin(), table.end(), Inserter(hash) ) ;
-    }
-    
-    template <typename T>
-    IntegerVector match( const T& obj ){
-        return IntegerVector( obj.begin(), obj.end(), Finder(hash) ) ;
-    }
- 
-private:
-    typedef RCPP_UNORDERED_MAP<STORAGE,int> HASH ;
-    typedef HashIndexInserter<HASH,STORAGE> Inserter ;
-    typedef HashIndexFinder<HASH,STORAGE> Finder ;
-    HASH hash ;    
-}; 
-    
-} // sugar
-
 template <int RTYPE, bool NA, typename T, bool RHS_NA, typename RHS_T>
-inline IntegerVector match( const VectorBase<RTYPE,NA,T>& x, const VectorBase<RTYPE,RHS_NA,RHS_T>& table ){
-    sugar::IndexHash<RTYPE,RHS_T> hash( table.get_ref() ) ;
-    return hash.match( x.get_ref() ) ;
+inline IntegerVector match( const VectorBase<RTYPE,NA,T>& x, const VectorBase<RTYPE,RHS_NA,RHS_T>& table_ ){
+    Vector<RTYPE> table = table_ ;
+    sugar::IndexHash<RTYPE> hash( table ) ;
+    return hash.lookup( x.get_ref() ) ;
 }
 
 } // Rcpp

Modified: pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h	2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h	2012-12-03 23:04:10 UTC (rev 4064)
@@ -26,8 +26,11 @@
 #include <Rcpp/sugar/tools/iterator.h>
 #include <Rcpp/sugar/block/block.h>
 
+#include <Rcpp/hash/hash.h>
+
 #include <Rcpp/sugar/operators/operators.h>
 #include <Rcpp/sugar/functions/functions.h>
 #include <Rcpp/sugar/matrix/matrix_functions.h>
 
+
 #endif

Modified: pkg/Rcpp/src/barrier.cpp
===================================================================
--- pkg/Rcpp/src/barrier.cpp	2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/src/barrier.cpp	2012-12-03 23:04:10 UTC (rev 4064)
@@ -44,7 +44,9 @@
     SET_VECTOR_ELT(x, i, value ) ;
 }
 SEXP* get_vector_ptr(SEXP x){ return VECTOR_PTR(x) ; }
+void* dataptr(SEXP x){ return DATAPTR(x); }
 
 // when we already know x is a CHARSXP
 const char* char_nocheck( SEXP x ){ return CHAR(x); }
 
+

Added: pkg/Rcpp/src/fastmatch.c
===================================================================
--- pkg/Rcpp/src/fastmatch.c	                        (rev 0)
+++ pkg/Rcpp/src/fastmatch.c	2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,199 @@
+/*
+ *  fastmatch: fast implementation of match() in R using semi-permanent hash tables
+ *
+ *  Copyright (C) 2010, 2011  Simon Urbanek
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ */
+
+/* for speed (should not really matter in this case as most time is spent in the hashing) */
+#define USE_RINTERNALS 1
+#include <Rinternals.h>
+
+/* for malloc/free since we handle our hash table memory separately from R */
+#include <stdlib.h>
+/* for hashing for pointers we need intptr_t */
+#include <stdint.h>
+
+#include <Rcpp/hash/hash_impl.h>
+
+/* create a new hash table with the given source and length.
+   we store only the index - values are picked from the source 
+   so you must make sure the source is still alive when used */
+hash_t *new_hash(void *src, int len) {
+  hash_t *h;
+  int m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */
+  while (m < desired) { m *= 2; k++; }
+  h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(int) * m));
+  if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(int) * (double) m / (1024.0 * 1024.0));
+  h->m = m;
+  h->k = k;
+  h->src = src;
+  return h;
+}
+
+/* free the hash table (and all chained hash tables as well) */
+void free_hash(hash_t *h) {
+  if (h->next) free_hash(h->next);
+  free(h);
+}
+
+/* pi-hash fn */
+#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
+
+/* add the integer value at index i (0-based!) to the hash */
+void add_hash_int(hash_t *h, int i) {
+  int *src = (int*) h->src;
+  int val = src[i++], addr;
+  addr = HASH(val);
+  while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+  if (!h->ix[addr])
+    h->ix[addr] = i;
+}
+
+/* to avoid aliasing rules issues use a union */
+union dint_u {
+  double d;
+  unsigned int u[2];
+};
+
+/* add the double value at index i (0-based!) to the hash */
+void add_hash_real(hash_t *h, int i) {
+  double *src = (double*) h->src;
+  union dint_u val;
+  int addr;
+  /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
+  val.d = (src[i] == 0.0) ? 0.0 : src[i];
+  if (R_IsNA(val.d)) val.d = NA_REAL;
+  else if (R_IsNaN(val.d)) val.d = R_NaN;
+  addr = HASH(val.u[0]+ val.u[1]);
+  while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+  if (!h->ix[addr])
+    h->ix[addr] = i + 1;
+}
+
+/* add the pointer value at index i (0-based!) to the hash */
+void add_hash_ptr(hash_t *h, int i) {
+  int addr;
+  void **src = (void**) h->src;
+  intptr_t val = (intptr_t) src[i++];
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+  addr = HASH(val);
+#endif
+  while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+  if (!h->ix[addr])
+    h->ix[addr] = i;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_int(hash_t *h, int val) {
+  int *src = (int*) h->src;
+  int addr;
+  addr = HASH(val);
+  while (h->ix[addr]) {
+    if (src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr ++;
+    if (addr == h->m) addr = 0;
+  }
+  return NA_INTEGER;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_real(hash_t *h, double val) {
+  double *src = (double*) h->src;
+  int addr;
+  union dint_u val_u;
+  /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
+  if (val == 0.0) val = 0.0;
+  if (R_IsNA(val)) val = NA_REAL;
+  else if (R_IsNaN(val)) val = R_NaN;
+  val_u.d = val;
+  addr = HASH(val_u.u[0] + val_u.u[1]);
+  while (h->ix[addr]) {
+    if (src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr++;
+    if (addr == h->m) addr = 0;
+  }
+  return NA_INTEGER;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_ptr(hash_t *h, void *val_ptr) {
+  void **src = (void **) h->src;
+  intptr_t val = (intptr_t) val_ptr;
+  int addr;
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+  addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+  addr = HASH(val);
+#endif               
+  while (h->ix[addr]) {
+    if ((intptr_t) src[h->ix[addr] - 1] == val)
+      return h->ix[addr];
+    addr ++;
+    if (addr == h->m) addr = 0;
+  }
+  return NA_INTEGER;
+}
+
+/* the only externally visible function to be called from R */
+SEXP simon_fmatch(SEXP x, SEXP y) {
+  SEXPTYPE type = TYPEOF(y) ;
+  
+  hash_t* h = new_hash(DATAPTR(y), LENGTH(y));
+  h->type = type;
+  h->parent = y;
+
+  int i, n = LENGTH(y);
+  if (type == INTSXP)
+	for(i = 0; i < n; i++)
+	  add_hash_int(h, i);
+  else if (type == REALSXP)
+	for(i = 0; i < n; i++)
+	  add_hash_real(h, i);
+  else
+	for(i = 0; i < n; i++)
+	  add_hash_ptr(h, i);
+    
+  n = LENGTH(x) ;
+    SEXP r = allocVector(INTSXP, n);
+    int *v = INTEGER(r);
+    if (type == INTSXP) {
+      int *k = INTEGER(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_int(h, k[i]);
+    } else if (type == REALSXP) {
+      double *k = REAL(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_real(h, k[i]);
+    } else {
+      SEXP *k = (SEXP*) DATAPTR(x);
+      for (i = 0; i < n; i++)
+	  v[i] = get_hash_ptr(h, k[i]);
+    }
+  
+    free_hash( h ) ;
+    return r;
+  
+}
+



More information about the Rcpp-commits mailing list