[Rcpp-commits] r4064 - in pkg/Rcpp: . inst/include/Rcpp inst/include/Rcpp/hash inst/include/Rcpp/sugar inst/include/Rcpp/sugar/functions src
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Tue Dec 4 00:04:11 CET 2012
Author: romain
Date: 2012-12-04 00:04:10 +0100 (Tue, 04 Dec 2012)
New Revision: 4064
Added:
pkg/Rcpp/inst/include/Rcpp/hash/
pkg/Rcpp/inst/include/Rcpp/hash/hash.h
pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
pkg/Rcpp/src/fastmatch.c
Modified:
pkg/Rcpp/ChangeLog
pkg/Rcpp/inst/include/Rcpp/barrier.h
pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h
pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h
pkg/Rcpp/src/barrier.cpp
Log:
faster match, based on Simon's fastmatch
Modified: pkg/Rcpp/ChangeLog
===================================================================
--- pkg/Rcpp/ChangeLog 2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/ChangeLog 2012-12-03 23:04:10 UTC (rev 4064)
@@ -1,3 +1,11 @@
+2012-12-04 Romain Francois <romain at r-enthusiasts.com>
+
+ * include/Rcpp/hash/hash.h: new implementation of IndexHash, based on
+ Simon's fastmatch package
+ * include/Rcpp/hash/hash_impl.h: low level implementation details
+ * include/Rcpp/sugar/functions/match.h: using new IndexHash
+ * src/fastmatch.c : largely inspired from Simon's fastmatch
+
2012-12-03 JJ Allaire <jj at rstudio.org>
* R/Attributes.R: added function to check whether R development
@@ -22,7 +30,7 @@
* unitTests/cpp/String.cpp : unit tests for String
* unitTests/runit.String.R: unit test for String
* include/Rcpp/sugar/sets.h: support for String
-
+
2012-12-01 Dirk Eddelbuettel <edd at debian.org>
* inst/include/RcppCommon.h: Applied patch by Yan Zhou to add support
Modified: pkg/Rcpp/inst/include/Rcpp/barrier.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/barrier.h 2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/barrier.h 2012-12-03 23:04:10 UTC (rev 4064)
@@ -32,5 +32,6 @@
void set_vector_elt(SEXP, int, SEXP ) ;
SEXP* get_vector_ptr(SEXP) ;
const char* char_nocheck( SEXP ) ;
+void* dataptr(SEXP) ;
#endif
Added: pkg/Rcpp/inst/include/Rcpp/hash/hash.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash.h (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash.h 2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,89 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// hash.h: Rcpp R/C++ interface class library -- hashing
+//
+// Copyright (C) 2012 Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__HASH_H
+#define RCPP__HASH__HASH_H
+
+#include <Rcpp/hash/hash_impl.h>
+
+namespace Rcpp{
+ namespace sugar{
+ template <typename T> void add_hash_value( hash_t *h, int i) ;
+ template <> inline void add_hash_value<int>( hash_t* h, int i ){ add_hash_int(h,i) ;}
+ template <> inline void add_hash_value<double>( hash_t* h, int i ){ add_hash_real(h,i) ;}
+ template <> inline void add_hash_value<SEXP>( hash_t* h, int i ){ add_hash_ptr(h,i) ;}
+
+ template <typename T> int get_hash_value( hash_t *h, T val) ;
+ template <> inline int get_hash_value<int>( hash_t *h, int val){ return get_hash_int(h, val) ; }
+ template <> inline int get_hash_value<double>( hash_t *h, double val){ return get_hash_real(h, val); }
+ template <> inline int get_hash_value<SEXP>( hash_t *h, SEXP val){ return get_hash_ptr(h, val) ; }
+
+ template <int RTYPE>
+ class IndexHash {
+ public:
+ typedef typename traits::storage_type<RTYPE>::type STORAGE ;
+ typedef Vector<RTYPE> VECTOR ;
+
+ IndexHash( SEXP table ) : h(0) {
+ int n = LENGTH(table) ;
+ h = new_hash( dataptr(table), n ) ;
+ for( int i=0; i<n; i++){
+ add_hash_value<STORAGE>( h, i) ;
+ }
+ }
+ ~IndexHash(){
+ if(h) {
+ free_hash(h);
+ h = 0 ;
+ }
+ }
+
+ template <typename T>
+ SEXP lookup(const T& vec){
+ int n = vec.size() ;
+ SEXP res = Rf_allocVector(INTSXP, n) ;
+ int *v = INTEGER(res) ;
+ for( int i=0; i<n; i++){
+ v[i] = get_hash_value<STORAGE>( h, vec[i] ) ;
+ }
+ return res ;
+ }
+
+ SEXP lookup(const VECTOR& vec){
+ int n = vec.size() ;
+ SEXP res = Rf_allocVector(INTSXP, n) ;
+ int *v = INTEGER(res) ;
+ STORAGE* p_vec = vec.begin() ;
+ for( int i=0; i<n; i++){
+ v[i] = get_hash_value<STORAGE>( h, p_vec[i] ) ;
+ }
+ return res ;
+ }
+
+ private:
+ hash_t* h ;
+ } ;
+
+} // sugar
+} // Rcpp
+
+#endif
+
Added: pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h (rev 0)
+++ pkg/Rcpp/inst/include/Rcpp/hash/hash_impl.h 2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,53 @@
+// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
+//
+// hash_impl.h: Rcpp R/C++ interface class library -- hashing
+//
+// Copyright (C) 2012 Dirk Eddelbuettel and Romain Francois
+//
+// This file is part of Rcpp.
+//
+// Rcpp is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// Rcpp is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with Rcpp. If not, see <http://www.gnu.org/licenses/>.
+
+#ifndef RCPP__HASH__HASH_IMPL_H
+#define RCPP__HASH__HASH_IMPL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+ typedef struct hash {
+ int m, k, els, type;
+ void *src;
+ SEXP parent;
+ struct hash *next;
+ int ix[1];
+ } hash_t;
+
+ hash_t *new_hash(void *src, int len) ;
+ void free_hash(hash_t *h) ;
+
+ void add_hash_int(hash_t *h, int i) ;
+ void add_hash_real(hash_t *h, int i) ;
+ void add_hash_ptr(hash_t *h, int i) ;
+
+ int get_hash_int(hash_t *h, int val) ;
+ int get_hash_real(hash_t *h, double val) ;
+ int get_hash_ptr(hash_t *h, void *val_ptr) ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
Modified: pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h 2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/functions/match.h 2012-12-03 23:04:10 UTC (rev 4064)
@@ -23,69 +23,12 @@
#define Rcpp__sugar__match_h
namespace Rcpp{
-namespace sugar{
-template <typename HASH, typename STORAGE>
-class HashIndexInserter {
-public:
- HashIndexInserter( HASH& hash_ ) : hash(hash_), index(1){}
-
- inline void operator()( STORAGE value ){
- hash.insert( std::make_pair(value, index++) ) ;
- }
-
-private:
- HASH& hash ;
- int index;
-} ;
-template <typename HASH, typename STORAGE>
-class HashIndexFinder {
-public:
- HashIndexFinder( HASH& hash_) : hash(hash_), end(hash.end()) {}
-
- inline int operator()( STORAGE value ){
- typename HASH::const_iterator it = hash.find(value);
- if( it == end ){
- return NA_INTEGER ;
- } else {
- return it->second ;
- }
- }
-
-private:
- HASH& hash ;
- typename HASH::const_iterator end ;
-} ;
-
-
-// version for INTSXP, REALSXP, RAWSXP, CPLXSXP
-template <int RTYPE, typename TABLE_T>
-class IndexHash {
-public:
- typedef typename Rcpp::traits::storage_type<RTYPE>::type STORAGE ;
-
- IndexHash( const TABLE_T& table ): hash() {
- for_each( table.begin(), table.end(), Inserter(hash) ) ;
- }
-
- template <typename T>
- IntegerVector match( const T& obj ){
- return IntegerVector( obj.begin(), obj.end(), Finder(hash) ) ;
- }
-
-private:
- typedef RCPP_UNORDERED_MAP<STORAGE,int> HASH ;
- typedef HashIndexInserter<HASH,STORAGE> Inserter ;
- typedef HashIndexFinder<HASH,STORAGE> Finder ;
- HASH hash ;
-};
-
-} // sugar
-
template <int RTYPE, bool NA, typename T, bool RHS_NA, typename RHS_T>
-inline IntegerVector match( const VectorBase<RTYPE,NA,T>& x, const VectorBase<RTYPE,RHS_NA,RHS_T>& table ){
- sugar::IndexHash<RTYPE,RHS_T> hash( table.get_ref() ) ;
- return hash.match( x.get_ref() ) ;
+inline IntegerVector match( const VectorBase<RTYPE,NA,T>& x, const VectorBase<RTYPE,RHS_NA,RHS_T>& table_ ){
+ Vector<RTYPE> table = table_ ;
+ sugar::IndexHash<RTYPE> hash( table ) ;
+ return hash.lookup( x.get_ref() ) ;
}
} // Rcpp
Modified: pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h
===================================================================
--- pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h 2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/inst/include/Rcpp/sugar/sugar.h 2012-12-03 23:04:10 UTC (rev 4064)
@@ -26,8 +26,11 @@
#include <Rcpp/sugar/tools/iterator.h>
#include <Rcpp/sugar/block/block.h>
+#include <Rcpp/hash/hash.h>
+
#include <Rcpp/sugar/operators/operators.h>
#include <Rcpp/sugar/functions/functions.h>
#include <Rcpp/sugar/matrix/matrix_functions.h>
+
#endif
Modified: pkg/Rcpp/src/barrier.cpp
===================================================================
--- pkg/Rcpp/src/barrier.cpp 2012-12-03 22:11:11 UTC (rev 4063)
+++ pkg/Rcpp/src/barrier.cpp 2012-12-03 23:04:10 UTC (rev 4064)
@@ -44,7 +44,9 @@
SET_VECTOR_ELT(x, i, value ) ;
}
SEXP* get_vector_ptr(SEXP x){ return VECTOR_PTR(x) ; }
+void* dataptr(SEXP x){ return DATAPTR(x); }
// when we already know x is a CHARSXP
const char* char_nocheck( SEXP x ){ return CHAR(x); }
+
Added: pkg/Rcpp/src/fastmatch.c
===================================================================
--- pkg/Rcpp/src/fastmatch.c (rev 0)
+++ pkg/Rcpp/src/fastmatch.c 2012-12-03 23:04:10 UTC (rev 4064)
@@ -0,0 +1,199 @@
+/*
+ * fastmatch: fast implementation of match() in R using semi-permanent hash tables
+ *
+ * Copyright (C) 2010, 2011 Simon Urbanek
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+/* for speed (should not really matter in this case as most time is spent in the hashing) */
+#define USE_RINTERNALS 1
+#include <Rinternals.h>
+
+/* for malloc/free since we handle our hash table memory separately from R */
+#include <stdlib.h>
+/* for hashing for pointers we need intptr_t */
+#include <stdint.h>
+
+#include <Rcpp/hash/hash_impl.h>
+
+/* create a new hash table with the given source and length.
+ we store only the index - values are picked from the source
+ so you must make sure the source is still alive when used */
+hash_t *new_hash(void *src, int len) {
+ hash_t *h;
+ int m = 2, k = 1, desired = len * 2; /* we want a maximal load of 50% */
+ while (m < desired) { m *= 2; k++; }
+ h = (hash_t*) calloc(1, sizeof(hash_t) + (sizeof(int) * m));
+ if (!h) Rf_error("unable to allocate %.2Mb for a hash table", (double) sizeof(int) * (double) m / (1024.0 * 1024.0));
+ h->m = m;
+ h->k = k;
+ h->src = src;
+ return h;
+}
+
+/* free the hash table (and all chained hash tables as well) */
+void free_hash(hash_t *h) {
+ if (h->next) free_hash(h->next);
+ free(h);
+}
+
+/* pi-hash fn */
+#define HASH(X) (3141592653U * ((unsigned int)(X)) >> (32 - h->k))
+
+/* add the integer value at index i (0-based!) to the hash */
+void add_hash_int(hash_t *h, int i) {
+ int *src = (int*) h->src;
+ int val = src[i++], addr;
+ addr = HASH(val);
+ while (h->ix[addr] && src[h->ix[addr] - 1] != val) {
+ addr++;
+ if (addr == h->m) addr = 0;
+ }
+ if (!h->ix[addr])
+ h->ix[addr] = i;
+}
+
+/* to avoid aliasing rules issues use a union */
+union dint_u {
+ double d;
+ unsigned int u[2];
+};
+
+/* add the double value at index i (0-based!) to the hash */
+void add_hash_real(hash_t *h, int i) {
+ double *src = (double*) h->src;
+ union dint_u val;
+ int addr;
+ /* double is a bit tricky - we nave to nomalize 0.0, NA and NaN */
+ val.d = (src[i] == 0.0) ? 0.0 : src[i];
+ if (R_IsNA(val.d)) val.d = NA_REAL;
+ else if (R_IsNaN(val.d)) val.d = R_NaN;
+ addr = HASH(val.u[0]+ val.u[1]);
+ while (h->ix[addr] && src[h->ix[addr] - 1] != val.d) {
+ addr++;
+ if (addr == h->m) addr = 0;
+ }
+ if (!h->ix[addr])
+ h->ix[addr] = i + 1;
+}
+
+/* add the pointer value at index i (0-based!) to the hash */
+void add_hash_ptr(hash_t *h, int i) {
+ int addr;
+ void **src = (void**) h->src;
+ intptr_t val = (intptr_t) src[i++];
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+ addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+ addr = HASH(val);
+#endif
+ while (h->ix[addr] && (intptr_t) src[h->ix[addr] - 1] != val) {
+ addr++;
+ if (addr == h->m) addr = 0;
+ }
+ if (!h->ix[addr])
+ h->ix[addr] = i;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_int(hash_t *h, int val) {
+ int *src = (int*) h->src;
+ int addr;
+ addr = HASH(val);
+ while (h->ix[addr]) {
+ if (src[h->ix[addr] - 1] == val)
+ return h->ix[addr];
+ addr ++;
+ if (addr == h->m) addr = 0;
+ }
+ return NA_INTEGER;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_real(hash_t *h, double val) {
+ double *src = (double*) h->src;
+ int addr;
+ union dint_u val_u;
+ /* double is a bit tricky - we nave to normalize 0.0, NA and NaN */
+ if (val == 0.0) val = 0.0;
+ if (R_IsNA(val)) val = NA_REAL;
+ else if (R_IsNaN(val)) val = R_NaN;
+ val_u.d = val;
+ addr = HASH(val_u.u[0] + val_u.u[1]);
+ while (h->ix[addr]) {
+ if (src[h->ix[addr] - 1] == val)
+ return h->ix[addr];
+ addr++;
+ if (addr == h->m) addr = 0;
+ }
+ return NA_INTEGER;
+}
+
+/* NOTE: we are returning a 1-based index ! */
+int get_hash_ptr(hash_t *h, void *val_ptr) {
+ void **src = (void **) h->src;
+ intptr_t val = (intptr_t) val_ptr;
+ int addr;
+#if (defined _LP64) || (defined __LP64__) || (defined WIN64)
+ addr = HASH((val & 0xffffffff) ^ (val >> 32));
+#else
+ addr = HASH(val);
+#endif
+ while (h->ix[addr]) {
+ if ((intptr_t) src[h->ix[addr] - 1] == val)
+ return h->ix[addr];
+ addr ++;
+ if (addr == h->m) addr = 0;
+ }
+ return NA_INTEGER;
+}
+
+/* the only externally visible function to be called from R */
+SEXP simon_fmatch(SEXP x, SEXP y) {
+ SEXPTYPE type = TYPEOF(y) ;
+
+ hash_t* h = new_hash(DATAPTR(y), LENGTH(y));
+ h->type = type;
+ h->parent = y;
+
+ int i, n = LENGTH(y);
+ if (type == INTSXP)
+ for(i = 0; i < n; i++)
+ add_hash_int(h, i);
+ else if (type == REALSXP)
+ for(i = 0; i < n; i++)
+ add_hash_real(h, i);
+ else
+ for(i = 0; i < n; i++)
+ add_hash_ptr(h, i);
+
+ n = LENGTH(x) ;
+ SEXP r = allocVector(INTSXP, n);
+ int *v = INTEGER(r);
+ if (type == INTSXP) {
+ int *k = INTEGER(x);
+ for (i = 0; i < n; i++)
+ v[i] = get_hash_int(h, k[i]);
+ } else if (type == REALSXP) {
+ double *k = REAL(x);
+ for (i = 0; i < n; i++)
+ v[i] = get_hash_real(h, k[i]);
+ } else {
+ SEXP *k = (SEXP*) DATAPTR(x);
+ for (i = 0; i < n; i++)
+ v[i] = get_hash_ptr(h, k[i]);
+ }
+
+ free_hash( h ) ;
+ return r;
+
+}
+
More information about the Rcpp-commits
mailing list