[Rcpp-commits] r4071 - in pkg/Rcpp: . inst/include/Rcpp/hash src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Tue Dec 4 14:55:49 CET 2012


Author: romain
Date: 2012-12-04 14:55:49 +0100 (Tue, 04 Dec 2012)
New Revision: 4071

Removed:
   pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
   pkg/Rcpp/src/fastmatch.c
Modified:
   pkg/Rcpp/ChangeLog
   pkg/Rcpp/inst/include/Rcpp/hash/hash.h
Log:
C++ify hash code from Simon

Modified: pkg/Rcpp/ChangeLog
===================================================================
--- pkg/Rcpp/ChangeLog	2012-12-04 03:22:05 UTC (rev 4070)
+++ pkg/Rcpp/ChangeLog	2012-12-04 13:55:49 UTC (rev 4071)
@@ -2,10 +2,8 @@
 
         * include/Rcpp/hash/hash.h: new implementation of IndexHash, based on
 	Simon's fastmatch package
-        * include/Rcpp/hash/hash_impl.h: low level implementation details
         * include/Rcpp/sugar/functions/match.h: using new IndexHash
-        * src/fastmatch.c : largely inspired from Simon's fastmatch
-
+        
 2012-12-03  Dirk Eddelbuettel  <edd at debian.org>
 
 	* inst/include/RcppCommon.h: Applied patch kindly contributed by Yan

Modified: pkg/Rcpp/inst/include/Rcpp/hash/hash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 03:22:05 UTC (rev 4070)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash.h	2012-12-04 13:55:49 UTC (rev 4071)
@@ -1,7 +1,9 @@
 // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
 //
-// hash.h: Rcpp R/C++ interface class library -- hashing 
+// hash.h: Rcpp R/C++ interface class library -- hashing utility, inspired 
+// from Simon's fastmatch package
 //
+// Copyright (C) 2010, 2011  Simon Urbanek
 // Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
 //
 // This file is part of Rcpp.
@@ -22,66 +24,112 @@
 #ifndef RCPP__HASH__HASH_H
 #define RCPP__HASH__HASH_H
 
-#include <Rcpp/hash/hash_impl.h>
-
 namespace Rcpp{
     namespace sugar{ 
-    template <typename T> void add_hash_value( hash_t *h, int i) ;
-    template <> inline void add_hash_value<int>( hash_t* h, int i ){ add_hash_int(h,i) ;}
-    template <> inline void add_hash_value<double>( hash_t* h, int i ){ add_hash_real(h,i) ;}
-    template <> inline void add_hash_value<SEXP>( hash_t* h, int i ){ add_hash_ptr(h,i) ;}
+      
+    #ifndef RCPP_HASH    
+    #define RCPP_HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - k))
+    #endif
     
-    template <typename T> int get_hash_value( hash_t *h, T val) ;
-    template <> inline int get_hash_value<int>( hash_t *h, int val){ return get_hash_int(h, val) ; }
-    template <> inline int get_hash_value<double>( hash_t *h, double val){ return get_hash_real(h, val); }
-    template <> inline int get_hash_value<SEXP>( hash_t *h, SEXP val){ return get_hash_ptr(h, val) ; }
-    
     template <int RTYPE>
     class IndexHash {
     public:
         typedef typename traits::storage_type<RTYPE>::type STORAGE ;
         typedef Vector<RTYPE> VECTOR ;
               
-        IndexHash( SEXP table ) : h(0) {
-            int n =  LENGTH(table) ;
-            h = new_hash( dataptr(table), n ) ;
-            for( int i=0; i<n; i++){
-               add_hash_value<STORAGE>( h, i) ;     
-            }    
+        IndexHash( SEXP table ) : m(2), k(1), src( (STORAGE*)dataptr(table) ), data() {
+            int n =  Rf_length(table) ;
+            int desired = n*2 ;
+            while( m < desired ){ m *= 2 ; k++ ; }
+            data.resize( m ) ;
+            for( int i=0; i<n; i++) add_value(i) ;    
         }
-        ~IndexHash(){ 
-            if(h) {
-                free_hash(h);
-                h = 0 ;
-            }
+        
+        template <typename T>
+        inline SEXP lookup(const T& vec){
+            return lookup__impl(vec, vec.size() ) ;
         }
         
+        // use the pointers for actual (non sugar expression) vectors
+        inline SEXP lookup(const VECTOR& vec){
+            return lookup__impl(vec.begin(), vec.size() ) ;
+        }
+        
+        
+    private:
+        int m, k ;
+        STORAGE* src ;
+        std::vector<int> data ;
+        
         template <typename T>
-        SEXP lookup(const T& vec){
-            int n = vec.size() ;
+        SEXP lookup__impl(const T& vec, int n){
             SEXP res = Rf_allocVector(INTSXP, n) ;
             int *v = INTEGER(res) ;
-            for( int i=0; i<n; i++){
-                v[i] = get_hash_value<STORAGE>( h, vec[i] ) ;    
-            }
+            for( int i=0; i<n; i++) v[i] = get_index( vec[i] ) ;    
             return res ;
         }
         
-        SEXP lookup(const VECTOR& vec){
-            int n = vec.size() ;
-            SEXP res = Rf_allocVector(INTSXP, n) ;
-            int *v = INTEGER(res) ;
-            STORAGE* p_vec = vec.begin() ;
-            for( int i=0; i<n; i++){
-                v[i] = get_hash_value<STORAGE>( h, p_vec[i] ) ;    
+        void add_value(int i){
+            STORAGE val = src[i++] ;
+            int addr = get_addr(val) ;
+            while (data[addr] && src[data[addr] - 1] != val) {
+              addr++;
+              if (addr == m) addr = 0;
             }
-            return res ;
+            if (!data[addr])
+              data[addr] = i ;
         }
         
-    private:    
-        hash_t* h ;
+        /* NOTE: we are returning a 1-based index ! */
+        int get_index(STORAGE value){
+            int addr = get_addr(value) ;
+            while (data[addr]) {
+              if (src[data[addr] - 1] == value)
+                return data[addr];
+              addr++;
+              if (addr == m) addr = 0;
+            }
+            return NA_INTEGER;
+        }
+        
+        // defined below
+        int get_addr(STORAGE value) ;
     } ;
+        
+    template <>
+    inline int IndexHash<INTSXP>::get_addr(int value){
+        return RCPP_HASH(value) ;
+    }
+    template <>
+    inline int IndexHash<REALSXP>::get_addr(double val){
+      int addr;
+      union dint_u {
+          double d;
+          unsigned int u[2];
+        };
+      union dint_u val_u;
+      /* double is a bit tricky - we have to normalize 0.0, NA and NaN */
+      if (val == 0.0) val = 0.0;
+      if (R_IsNA(val)) val = NA_REAL;
+      else if (R_IsNaN(val)) val = R_NaN;
+      val_u.d = val;
+      addr = RCPP_HASH(val_u.u[0] + val_u.u[1]);
+      return addr ;
+    }
     
+    template <>
+    inline int IndexHash<STRSXP>::get_addr(SEXP value){
+        intptr_t val = (intptr_t) value;
+        int addr;
+        #if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+          addr = RCPP_HASH((val & 0xffffffff) ^ (val >> 32));
+        #else
+          addr = RCPP_HASH(val);
+        #endif
+        return addr ;
+    }
+
+    
 } // sugar
 } // Rcpp
 

Deleted: pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h	2012-12-04 03:22:05 UTC (rev 4070)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h	2012-12-04 13:55:49 UTC (rev 4071)
@@ -1,51 +0,0 @@
-// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
-//
-// hash_impl.h: Rcpp R/C++ interface class library -- hashing 
-//
-// Copyright (C) 2012  Dirk Eddelbuettel and Romain Francois
-//
-// This file is part of Rcpp.
-//
-// Rcpp is free software: you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 2 of the License, or
-// (at your option) any later version.
-//
-// Rcpp is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
-
-#ifndef RCPP__HASH__HASH_IMPL_H
-#define RCPP__HASH__HASH_IMPL_H
-
-#ifdef __cplusplus 
-extern "C" {
-#endif    
-
-    typedef struct hash {
-      int m, k  ;
-      void *src;
-      int ix[1];
-    } hash_t;
-
-    hash_t *new_hash(void *src, int len) ;
-    void free_hash(hash_t *h) ;
-
-    void add_hash_int(hash_t *h, int i) ;
-    void add_hash_real(hash_t *h, int i) ;
-    void add_hash_ptr(hash_t *h, int i) ;
-     
-    int get_hash_int(hash_t *h, int val) ;
-    int get_hash_real(hash_t *h, double val) ;
-    int get_hash_ptr(hash_t *h, void *val_ptr) ;
-
-#ifdef __cplusplus 
-}
-#endif
-
-#endif
-

Deleted: pkg/Rcpp/src/fastmatch.c
===================================================================
--- pkg/Rcpp/src/fastmatch.c	2012-12-04 03:22:05 UTC (rev 4070)
+++ pkg/Rcpp/src/fastmatch.c	2012-12-04 13:55:49 UTC (rev 4071)
@@ -1,157 +0,0 @@
-/*
- *  fastmatch: fast implementation of match() in R using semi-permanent hash tables
- *
- *  Copyright (C) 2010, 2011  Simon Urbanek
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; version 2 of the License.
- *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
- */
-
-/* for speed (should not really matter in this case as most time is spent in the hashing) */
-#define USE_RINTERNALS 1
-#include <Rinternals.h>
-
-/* for malloc/free since we handle our hash table memory separately from R */
-#include <stdlib.h>
-/* for hashing for pointers we need intptr_t */
-#include <stdint.h>
-
-#include <Rcpp/hash/hash_impl.h>
-
-/* create a new hash table with the given source and length.
-   we store only the index - values are picked from the source 
-   so you must make sure the source is still alive when used */
-hash_t *new_hash(void *src, int len) {
-  hash_t *h;
-  int m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */
-  while (m < desired) { m *= 2; k++; }
-  h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(int) * m));
-  if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(int) * (double) m / (1024.0 * 1024.0));
-  h->m = m;
-  h->k = k;
-  h->src = src;
-  return h;
-}
-
-/* free the hash table (and all chained hash tables as well) */
-void free_hash(hash_t *h) {
-  free(h);
-}
-
-/* pi-hash fn */
-#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
-
-/* add the integer value at index i (0-based!) to the hash */
-void add_hash_int(hash_t *h, int i) {
-  int *src = (int*) h->src;
-  int val = src[i++], addr;
-  addr = HASH(val);
-  while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
-  if (!h->ix[addr])
-    h->ix[addr] = i;
-}
-
-/* to avoid aliasing rules issues use a union */
-union dint_u {
-  double d;
-  unsigned int u[2];
-};
-
-/* add the double value at index i (0-based!) to the hash */
-void add_hash_real(hash_t *h, int i) {
-  double *src = (double*) h->src;
-  union dint_u val;
-  int addr;
-  /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
-  val.d = (src[i] == 0.0) ? 0.0 : src[i];
-  if (R_IsNA(val.d)) val.d = NA_REAL;
-  else if (R_IsNaN(val.d)) val.d = R_NaN;
-  addr = HASH(val.u[0]+ val.u[1]);
-  while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
-  if (!h->ix[addr])
-    h->ix[addr] = i + 1;
-}
-
-/* add the pointer value at index i (0-based!) to the hash */
-void add_hash_ptr(hash_t *h, int i) {
-  int addr;
-  void **src = (void**) h->src;
-  intptr_t val = (intptr_t) src[i++];
-#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
-  addr = HASH((val & 0xffffffff) ^ (val >> 32));
-#else
-  addr = HASH(val);
-#endif
-  while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
-  if (!h->ix[addr])
-    h->ix[addr] = i;
-}
-
-/* NOTE: we are returning a 1-based index ! */
-int get_hash_int(hash_t *h, int val) {
-  int *src = (int*) h->src;
-  int addr;
-  addr = HASH(val);
-  while (h->ix[addr]) {
-    if (src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr ++;
-    if (addr == h->m) addr = 0;
-  }
-  return NA_INTEGER;
-}
-
-/* NOTE: we are returning a 1-based index ! */
-int get_hash_real(hash_t *h, double val) {
-  double *src = (double*) h->src;
-  int addr;
-  union dint_u val_u;
-  /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
-  if (val == 0.0) val = 0.0;
-  if (R_IsNA(val)) val = NA_REAL;
-  else if (R_IsNaN(val)) val = R_NaN;
-  val_u.d = val;
-  addr = HASH(val_u.u[0] + val_u.u[1]);
-  while (h->ix[addr]) {
-    if (src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr++;
-    if (addr == h->m) addr = 0;
-  }
-  return NA_INTEGER;
-}
-
-/* NOTE: we are returning a 1-based index ! */
-int get_hash_ptr(hash_t *h, void *val_ptr) {
-  void **src = (void **) h->src;
-  intptr_t val = (intptr_t) val_ptr;
-  int addr;
-#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
-  addr = HASH((val & 0xffffffff) ^ (val >> 32));
-#else
-  addr = HASH(val);
-#endif               
-  while (h->ix[addr]) {
-    if ((intptr_t) src[h->ix[addr] - 1] == val)
-      return h->ix[addr];
-    addr ++;
-    if (addr == h->m) addr = 0;
-  }
-  return NA_INTEGER;
-}
-



More information about the Rcpp-commits mailing list