[Rcpp-commits] r3669 - in pkg/RcppCNPy: . inst man src

Fri Jul 6 03:08:18 CEST 2012

Author: edd
Date: 2012-07-06 03:08:16 +0200 (Fri, 06 Jul 2012)
New Revision: 3669

Modified:
   pkg/RcppCNPy/ChangeLog
   pkg/RcppCNPy/DESCRIPTION
   pkg/RcppCNPy/inst/NEWS.Rd
   pkg/RcppCNPy/man/RcppCNPy-package.Rd
   pkg/RcppCNPy/src/cnpy.cpp
   pkg/RcppCNPy/src/cnpy.h
   pkg/RcppCNPy/src/cnpyMod.cpp
Log:
Version 0.0.2 with automatic transpose-on-read as well as automatic decompression of gzip'ed files on read


Modified: pkg/RcppCNPy/ChangeLog
===================================================================

--- pkg/RcppCNPy/ChangeLog	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/ChangeLog	2012-07-06 01:08:16 UTC (rev 3669)
@@ -1,3 +1,14 @@
+2012-07-05  Dirk Eddelbuettel  <edd at dexter>
+
+	* DESCRIPTION (Version): Version 0.0.2
+
+	* src/cnpyMod.cpp: Added transpose() method to transparently deal
+	with the Fortran-vs-C storage order difference between Python and R.
+	Also added support for reading vectors.
+
+	* src/cnpy.{cpp,h}: Added support for loading from .npy.gz files
+	which is automagically enabled if the filename ends in ".gz"
+
 2012-07-04  Dirk Eddelbuettel  <edd at debian.org>
 
 	* Initial version 0.0.1

Modified: pkg/RcppCNPy/DESCRIPTION
===================================================================
--- pkg/RcppCNPy/DESCRIPTION	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/DESCRIPTION	2012-07-06 01:08:16 UTC (rev 3669)
@@ -1,13 +1,14 @@
 Package: RcppCNPy
 Type: Package
 Title: Rcpp bindings for NumPy files
-Version: 0.0.1
-Date: 2012-07-04
+Version: 0.0.2
+Date: $Date$
 Author: Dirk Eddelbuettel
 Maintainer: Dirk Eddelbuettel <edd at debian.org>
 Description: This package provides access to the cnpy library by Carl Rogers
  which provides read and write facilities for files created with (or for) the
- NumPY extension for Python.
+ NumPy extension for Python.  Vectors and matrices of either numeric or
+ integer types can be read or written. Compressed files can be read as well.
 License: GPL (>= 2)
 LazyLoad: yes
 Depends: methods, Rcpp (>= 0.9.13)


Property changes on: pkg/RcppCNPy/DESCRIPTION
___________________________________________________________________
Added: svn:keywords
   + Date

Modified: pkg/RcppCNPy/inst/NEWS.Rd
===================================================================
--- pkg/RcppCNPy/inst/NEWS.Rd	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/inst/NEWS.Rd	2012-07-06 01:08:16 UTC (rev 3669)
@@ -2,6 +2,13 @@
 \title{News for Package \pkg{RcppCNPy}}
 \newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}}
 
+\section{Changes in version 0.0.2 (2012-07-05)}{
+  \itemize{
+    \item Added automatic use of transpose to automagically account for
+    Fortran-vs-C major storage defaults between Python and R.
+    \item Added support for reading gzip'ed files ending in ".npy.gz"
+  }
+}
 \section{Changes in version 0.0.1 (2012-07-04)}{
   \itemize{
     \item Initial version, as a straightforward Rcpp modules wrap around

Modified: pkg/RcppCNPy/man/RcppCNPy-package.Rd
===================================================================
--- pkg/RcppCNPy/man/RcppCNPy-package.Rd	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/man/RcppCNPy-package.Rd	2012-07-06 01:08:16 UTC (rev 3669)
@@ -15,8 +15,7 @@
   Support is currently still pretty limited to reading and writing of
   either vectors or matrices of either numeric or integer type.
 
-  \emph{Note that matrices must be transposed first to make up for
-    Fortran ordering.}
+  Files with \code{gzip} compression can be transparently read as well.
 }
 \usage{
   npyLoad(filename, type="numeric")

Modified: pkg/RcppCNPy/src/cnpy.cpp
===================================================================
--- pkg/RcppCNPy/src/cnpy.cpp	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/src/cnpy.cpp	2012-07-06 01:08:16 UTC (rev 3669)
@@ -133,6 +133,23 @@
     return arr;
 }
 
+// Reads the header and then the raw payload from an already-open gzip'ed .npy stream.
+cnpy::NpyArray gzload_the_npy_file(gzFile fp) {
+    unsigned int* shape;
+    unsigned int ndims, word_size;
+    cnpy::parse_npy_gzheader(fp,word_size,shape,ndims);
+    unsigned long size = 1;     // element count, i.e. the product of all dimensions
+    for(unsigned int i = 0;i < ndims;i++) size *= shape[i];
+
+    cnpy::NpyArray arr;
+    arr.word_size = word_size;
+    arr.shape = std::vector<unsigned int>(shape,shape+ndims);
+    delete[] shape;             // new[]-allocated by parse_npy_gzheader; arr.shape now holds a copy
+    arr.data = new char[size*word_size];
+    if (gzread(fp,arr.data,word_size*size) < 0) REprintf("cnpy::gzload_the_npy_file error");
+    return arr;
+}
+
 cnpy::npz_t cnpy::npz_load(std::string fname) {
     FILE* fp = fopen(fname.c_str(),"rb");
 
@@ -225,5 +242,51 @@
     return arr;
 }
 
+// Loads a gzip-compressed .npy file; if it cannot be opened the error is reported and arr is returned untouched.
+cnpy::NpyArray cnpy::npy_gzload(std::string fname) {
+    NpyArray arr = NpyArray();  // value-initialised; presumably leaves data null and shape empty -- confirm in cnpy.h
+    gzFile fp = gzopen(fname.c_str(), "rb");
+    if(!fp) REprintf("npy_gzload: Error! Unable to open file %s!\n",fname.c_str());
+    else arr = gzload_the_npy_file(fp);
+    if(fp) gzclose(fp);         // only close a handle that was actually opened
+    return arr;
+}
 
+// Parses the .npy header of an open gzip stream into word_size, shape (new[]-allocated, caller frees) and ndims.
+void cnpy::parse_npy_gzheader(gzFile fp, unsigned int& word_size, unsigned int*& shape, unsigned int& ndims) {
+    char buffer[256];
+    if (gzread(fp,buffer,sizeof(char)*11) != 11) REprintf("cnpy::parse_npy_gzheader read discrepancy");
+    const char* line = gzgets(fp, buffer,256);    // NULL on error, which std::string must never be given
+    std::string header = (line ? line : "");
+    Rassert(!header.empty() && header[header.size()-1] == '\n', "header ended improperly");
 
+    int loc1, loc2;
+
+    //fortran order
+    loc1 = header.find("fortran_order")+16;
+    bool fortran_order = (header.substr(loc1,4) == "True" ? true : false);   // compare exactly strlen("True") chars
+    Rassert(!fortran_order, "fortran_order error");
+
+    //shape
+    loc1 = header.find("(");
+    loc2 = header.find(")");
+    std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
+    if(str_shape[str_shape.size()-1] == ',') ndims = 1;
+    else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
+    shape = new unsigned int[ndims];
+    for(unsigned int i = 0;i < ndims;i++) {
+        loc1 = str_shape.find(",");
+        shape[i] = atoi(str_shape.substr(0,loc1).c_str());
+        str_shape = str_shape.substr(loc1+1);
+    }
+
+    //endian, word size, data type
+    loc1 = header.find("descr")+9;
+    bool littleEndian = (header[loc1] == '<' ? true : false);
+    Rassert(littleEndian, "littleEndian error");
+
+    // NOTE: the type character at header[loc1+1] is skipped without being validated
+    std::string str_ws = header.substr(loc1+2);
+    loc2 = str_ws.find("'");
+    word_size = atoi(str_ws.substr(0,loc2).c_str());
+}

Modified: pkg/RcppCNPy/src/cnpy.h
===================================================================
--- pkg/RcppCNPy/src/cnpy.h	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/src/cnpy.h	2012-07-06 01:08:16 UTC (rev 3669)
@@ -47,6 +47,8 @@
     npz_t npz_load(std::string fname);
     NpyArray npz_load(std::string fname, std::string varname);
     NpyArray npy_load(std::string fname);
+    NpyArray npy_gzload(std::string fname);
+    void parse_npy_gzheader(gzFile fp,unsigned int& word_size, unsigned int*& shape, unsigned int& ndims);
 
     template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) {
         //write in little endian

Modified: pkg/RcppCNPy/src/cnpyMod.cpp
===================================================================
--- pkg/RcppCNPy/src/cnpyMod.cpp	2012-07-05 02:50:19 UTC (rev 3668)
+++ pkg/RcppCNPy/src/cnpyMod.cpp	2012-07-06 01:08:16 UTC (rev 3669)
@@ -22,20 +22,67 @@
 #include <Rcpp.h>               // need to include the main Rcpp header file only
 #include "cnpy.h"               // (local copy of) header for cnpy library
 
+template <typename T>
+T transpose(const T & m) {      // transpose for IntegerMatrix / NumericMatrix, see array.c in R
+    // Element i of m's flat storage is written to offset (i / k) + (i % k) * n of the result.
+    int k = m.rows(), n = m.cols();
+    T z(n, k);                  // result has the two dimensions swapped
+    typename T::iterator src = m.begin(), dst = z.begin();
+    for (int col = 0; col < n; col++) {
+        for (int row = 0; row < k; row++, ++src) {
+            dst[col + row*n] = *src;    // indexed write: never forms an iterator beyond z.end()
+        }
+    }
+    return(z);
+}
+
+// cf stackoverflow.com/questions/874134 -- true iff `full` ends with `ending`
+bool hasEnding(std::string const &full, std::string const &ending) {
+    const std::string::size_type flen = full.length(), elen = ending.length();
+    if (elen > flen) {          // a suffix longer than the whole string can never match
+        return false;
+    }
+    return full.compare(flen - elen, elen, ending) == 0;
+}
+
 Rcpp::RObject npyLoad(const std::string & filename, const std::string & type) { 
-    cnpy::NpyArray arr = cnpy::npy_load(filename);
+    // Reads a .npy file (or a gzip'ed one, chosen by the ".gz" suffix) into an R vector or matrix.
+    cnpy::NpyArray arr;
+
+    if (hasEnding(filename, ".gz")) {
+        arr = cnpy::npy_gzload(filename);
+    } else {
+        arr = cnpy::npy_load(filename);
+    }
+
     std::vector<unsigned int> shape = arr.shape;
-    if (shape.size() != 2)  Rf_error("Wrong dimension");
     SEXP ret = R_NilValue;      		// allows us to assign either int or numeric 
-    if (type == "numeric") {
-        ret = Rcpp::NumericMatrix(shape[0], shape[1], reinterpret_cast<double*>(arr.data));
-    } else if (type == "integer") {
-        ret = Rcpp::IntegerMatrix(shape[0], shape[1], reinterpret_cast<int*>(arr.data));
+    if (shape.size() == 1) {
+        if (type == "numeric") {
+            double *p = reinterpret_cast<double*>(arr.data);
+            ret = Rcpp::NumericVector(p, p + shape[0]);
+        } else if (type == "integer") {
+            int *p = reinterpret_cast<int*>(arr.data);
+            ret = Rcpp::IntegerVector(p, p + shape[0]);
+        } else {
+            REprintf("Unsupported type in npyLoad");
+        } 
+    } else if (shape.size() == 2) {
+        if (type == "numeric") {
+            // invert dimension for creation, and then transpose to correct Fortran-vs-C storage
+            ret = transpose(Rcpp::NumericMatrix(shape[1], shape[0], reinterpret_cast<double*>(arr.data)));
+        } else if (type == "integer") {
+            // invert dimension for creation, and then transpose to correct Fortran-vs-C storage
+            ret = transpose(Rcpp::IntegerMatrix(shape[1], shape[0], reinterpret_cast<int*>(arr.data)));
+        } else {
+            REprintf("Unsupported type in npyLoad");
+        }
     } else {
         arr.destruct();
-        REprintf("Unsupported type in npyLoad");
+        Rf_error("Unsupported dimension in npyLoad");   // does not return, hence the buffer is freed first
     }
+    // any R object built above was filled by copying, so the raw buffer is released on every path
     arr.destruct();
     return ret;
 }
 
@@ -77,7 +124,7 @@
              &npyLoad,          		// function pointer to helper function defined above
              List::create( Named("filename"),   // function arguments including default value
                            Named("type") = "numeric"),
-             "read an npy file into a numeric or integer matrix");
+             "read an npy file into a numeric or integer vector or matrix");
 
     function("npySave",         		// name of the identifier at the R level
              &npySave,          		// function pointer to helper function defined above