[Rcpp-devel] Regular Expressions

Dirk Eddelbuettel edd at debian.org
Sun Mar 3 03:21:26 CET 2013


Gabor,

Here is a completely new, second variant of the same example, this time
implemented using only inline with a custom plugin.  This should have what
you need.

Code first:
-----------------------------------------------------------------------------
edd at max:/tmp$ cat boostregex.R 

library(inline)

## C++ support code handed to cxxfunction() as its includes: one validation
## helper and two reformatting helpers built on Boost.Regex.
##
## NB every backslash meant for the regex engine is written four times here.
## R's string parser halves them when reading this literal, so the generated
## C++ source contains two, and the C++ compiler halves them again, leaving
## the single backslash the regex needs  :-/
##
##  - validate_card_format():         boost::regex_match() against four
##                                    4-digit groups, each of the first three
##                                    followed by a '-' or a ' '
##  - machine_readable_card_number(): boost::regex_replace() with the
##                                    sed-style format \1\2\3\4
##  - human_readable_card_number():   same, with the format \1-\2-\3-\4
inctxt <- '
#include <Rcpp.h>
#include <string>
#include <boost/regex.hpp>

bool validate_card_format(const std::string& s) {
   static const boost::regex e("(\\\\d{4}[- ]){3}\\\\d{4}");
   return boost::regex_match(s, e);
}

const boost::regex e("\\\\A(\\\\d{3,4})[- ]?(\\\\d{4})[- ]?(\\\\d{4})[- ]?(\\\\d{4})\\\\z");
const std::string machine_format("\\\\1\\\\2\\\\3\\\\4");
const std::string human_format("\\\\1-\\\\2-\\\\3-\\\\4");

std::string machine_readable_card_number(const std::string& s) {
   return boost::regex_replace(s, e, machine_format, boost::match_default | boost::format_sed);
}

std::string human_readable_card_number(const std::string& s) {
   return boost::regex_replace(s, e, human_format, boost::match_default | boost::format_sed);
}
'

## Body of the C++ function that cxxfunction() generates; `sx` is the single
## argument declared in the signature() passed to cxxfunction().
## It converts sx to a std::vector<std::string>, runs the three helpers from
## the include text on every element, and returns an Rcpp::DataFrame with the
## columns "input", "valid", "machine" and "human".
srctxt <- '
    std::vector<std::string> s = Rcpp::as<std::vector<std::string> >(sx);
    int n = s.size();

    std::vector<bool> valid(n);
    std::vector<std::string> machine(n);
    std::vector<std::string> human(n);

    for (int i=0; i<n; i++) {
        valid[i]  = validate_card_format(s[i]);
        machine[i] = machine_readable_card_number(s[i]);
        human[i] = human_readable_card_number(s[i]);
    }
    return Rcpp::DataFrame::create(Rcpp::Named("input") = s,
                                   Rcpp::Named("valid") = valid,
                                   Rcpp::Named("machine") = machine,
                                   Rcpp::Named("human") = human);
'

## Custom inline plugin derived from Rcpp's plugin maker.  The Boost.Regex
## header is placed ahead of the plugin's own includes, and the linker flags
## name the Rcpp library plus -lboost_regex.  The directories are the ones on
## the machine this example was run on.
plug <- Rcpp:::Rcpp.plugin.maker(
    include.before = "#include <boost/regex.hpp>",
    libs = paste(c("-L/usr/local/lib/R/site-library/Rcpp/lib -lRcpp",
                   "-Wl,-rpath,/usr/local/lib/R/site-library/Rcpp/lib",
                   "-L/usr/lib -lboost_regex -lm"),
                 collapse = " ")
)

## Make the plugin available to cxxfunction() under the name "boostDemo".
registerPlugin(name = "boostDemo", plugin = plug)

## Compile the helper code and the function body into an R-callable function;
## verbose = TRUE echoes the generated source and the compiler invocation.
regexDemo <- cxxfunction(
    sig      = signature(sx = "CharVec"),
    body     = srctxt,
    includes = inctxt,
    plugin   = "boostDemo",
    verbose  = TRUE
)

## Four sample inputs: no separators, spaces, dashes, and a short first group.
s <- c(
    "0000111122223333",
    "0000 1111 2222 3333",
    "0000-1111-2222-3333",
    "000-1111-2222-3333"
)
regexDemo(s)
edd at max:/tmp$ 
-----------------------------------------------------------------------------

Output in verbose mode:
-----------------------------------------------------------------------------
edd at max:/tmp$ 
edd at max:/tmp$ Rscript boostregex.R 
Loading required package: methods
 >> setting environment variables: 
PKG_LIBS = -L/usr/local/lib/R/site-library/Rcpp/lib -lRcpp -Wl,-rpath,/usr/local/lib/R/site-library/Rcpp/lib -L/usr/lib -lboost_regex -lm -L/usr/local/lib/R/site-library/Rcpp/lib -lRcpp -Wl,-rpath,/usr/local/lib/R/site-library/Rcpp/lib

 >> LinkingTo : Rcpp
CLINK_CPPFLAGS =  -I"/usr/local/lib/R/site-library/Rcpp/include" 

 >> Program source :

   1 : 
   2 : // includes from the plugin
   3 : #include <boost/regex.hpp>
   4 : #include <Rcpp.h>
   5 : 
   6 : 
   7 : #ifndef BEGIN_RCPP
   8 : #define BEGIN_RCPP
   9 : #endif
  10 : 
  11 : #ifndef END_RCPP
  12 : #define END_RCPP
  13 : #endif
  14 : 
  15 : using namespace Rcpp;
  16 : 
  17 : 
  18 : // user includes
  19 : 
  20 : #include <Rcpp.h>
  21 : #include <string>
  22 : #include <boost/regex.hpp>
  23 : 
  24 : bool validate_card_format(const std::string& s) {
  25 :    static const boost::regex e("(\\d{4}[- ]){3}\\d{4}");
  26 :    return boost::regex_match(s, e);
  27 : }
  28 : 
  29 : const boost::regex e("\\A(\\d{3,4})[- ]?(\\d{4})[- ]?(\\d{4})[- ]?(\\d{4})\\z");
  30 : const std::string machine_format("\\1\\2\\3\\4");
  31 : const std::string human_format("\\1-\\2-\\3-\\4");
  32 : 
  33 : std::string machine_readable_card_number(const std::string& s) {
  34 :    return boost::regex_replace(s, e, machine_format, boost::match_default | boost::format_sed);
  35 : }
  36 : 
  37 : std::string human_readable_card_number(const std::string& s) {
  38 :    return boost::regex_replace(s, e, human_format, boost::match_default | boost::format_sed);
  39 : }
  40 : 
  41 : 
  42 : // declarations
  43 : extern "C" {
  44 : SEXP file13316a634edf( SEXP sx) ;
  45 : }
  46 : 
  47 : // definition
  48 : 
  49 : SEXP file13316a634edf( SEXP sx ){
  50 : BEGIN_RCPP
  51 : 
  52 :     std::vector<std::string> s = Rcpp::as<std::vector<std::string> >(sx);
  53 :     int n = s.size();
  54 : 
  55 :     std::vector<bool> valid(n);
  56 :     std::vector<std::string> machine(n);
  57 :     std::vector<std::string> human(n);
  58 : 
  59 :     for (int i=0; i<n; i++) {
  60 :         valid[i]  = validate_card_format(s[i]);
  61 :         machine[i] = machine_readable_card_number(s[i]);
  62 :         human[i] = human_readable_card_number(s[i]);
  63 :     }
  64 :     return Rcpp::DataFrame::create(Rcpp::Named("input") = s,
  65 :                                    Rcpp::Named("valid") = valid,
  66 :                                    Rcpp::Named("machine") = machine,
  67 :                                    Rcpp::Named("human") = human);
  68 : 
  69 : END_RCPP
  70 : }
  71 : 
  72 : 
Compilation argument:
 /usr/lib/R/bin/R CMD SHLIB file13316a634edf.cpp 2> file13316a634edf.cpp.err.txt 
ccache g++-4.7 -I/usr/share/R/include -DNDEBUG   -I"/usr/local/lib/R/site-library/Rcpp/include"   -fpic  -g0 -O3 -Wall -pipe -Wno-variadic-macros -pedantic -c file13316a634edf.cpp -o file13316a634edf.o
g++-4.7 -shared -o file13316a634edf.so file13316a634edf.o -L/usr/local/lib/R/site-library/Rcpp/lib -lRcpp -Wl,-rpath,/usr/local/lib/R/site-library/Rcpp/lib -L/usr/lib -lboost_regex -lm -L/usr/local/lib/R/site-library/Rcpp/lib -lRcpp -Wl,-rpath,/usr/local/lib/R/site-library/Rcpp/lib -L/usr/lib/R/lib -lR
                input valid          machine               human
1    0000111122223333 FALSE 0000111122223333 0000-1111-2222-3333
2 0000 1111 2222 3333  TRUE 0000111122223333 0000-1111-2222-3333
3 0000-1111-2222-3333  TRUE 0000111122223333 0000-1111-2222-3333
4  000-1111-2222-3333 FALSE  000111122223333  000-1111-2222-3333
edd at max:/tmp$ 
-----------------------------------------------------------------------------

You should be able to adapt this on Windows; the hard-coded library paths in the plugin's `libs` argument are what will need changing. Keeping my fingers crossed...

Dirk

-- 
Dirk Eddelbuettel | edd at debian.org | http://dirk.eddelbuettel.com  


More information about the Rcpp-devel mailing list