From noreply at r-forge.r-project.org Wed Jan 1 00:18:45 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 1 Jan 2014 00:18:45 +0100 (CET) Subject: [Rprotobuf-commits] r682 - pkg/vignettes Message-ID: <20131231231845.91906186734@r-forge.r-project.org> Author: murray Date: 2014-01-01 00:18:45 +0100 (Wed, 01 Jan 2014) New Revision: 682 Modified: pkg/vignettes/RProtoBuf-intro.Rnw Log: Add necessary usepackage color Modified: pkg/vignettes/RProtoBuf-intro.Rnw =================================================================== --- pkg/vignettes/RProtoBuf-intro.Rnw 2013-12-31 21:51:25 UTC (rev 681) +++ pkg/vignettes/RProtoBuf-intro.Rnw 2013-12-31 23:18:45 UTC (rev 682) @@ -5,6 +5,7 @@ %\VignetteDepends{RProtoBuf} \usepackage{url} +\usepackage{color} \usepackage[colorlinks]{hyperref} \definecolor{link}{rgb}{0,0,0.3} \hypersetup{ From noreply at r-forge.r-project.org Wed Jan 1 07:02:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 1 Jan 2014 07:02:40 +0100 (CET) Subject: [Rprotobuf-commits] r683 - in pkg: . src Message-ID: <20140101060240.980D41868E8@r-forge.r-project.org> Author: murray Date: 2014-01-01 07:02:34 +0100 (Wed, 01 Jan 2014) New Revision: 683 Modified: pkg/ChangeLog pkg/src/mutators.cpp pkg/src/rprotobuf.h Log: Use R_xlen_t and check for long vectors (repeated_fields in protocol buffers seem to be limited to int indexing as with traditional R vectors). Resolves a number of type coercion warnings identified by Flexelint. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2013-12-31 23:18:45 UTC (rev 682) +++ pkg/ChangeLog 2014-01-01 06:02:34 UTC (rev 683) @@ -13,8 +13,14 @@ * src/wrapper_Descriptor.cpp (rprotobuf): Remove unused variable, rename another variable for clarity, and add some TODOs. * src/wrapper_FileDescriptor.cpp (rprotobuf): Idem. - * src/DescriptorPoolLookup.cpp (rprotobuf): Remove unreachable statement. 
+ * src/DescriptorPoolLookup.cpp (rprotobuf): Remove unreachable + statement. * src/extensions.cpp: Remove unused header. + * src/mutators.cpp (rprotobuf): Update code to check for Long + vectors and use the safer R_xlen_t type everywhere. Protocol + Buffers repeated fields seem to be limited to int size indices + as with normal R vectors. + * src/rprotobuf.h: Idem 2013-12-30 Murray Stokely Modified: pkg/src/mutators.cpp =================================================================== --- pkg/src/mutators.cpp 2013-12-31 23:18:45 UTC (rev 682) +++ pkg/src/mutators.cpp 2014-01-01 06:02:34 UTC (rev 683) @@ -104,124 +104,122 @@ return ret; } - // TODO(mstokely): not long vector clean. int index should be R_xlen_t - // Add test illustrating the problem by using size(repeated_field)<-bignum -int32 GET_int32(SEXP x, int index) { +int32 GET_int32(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case INTSXP: - return ((int32)INTEGER(x)[index]); + return ((int32)INTEGER(x)[vec_index]); case REALSXP: - return ((int32)REAL(x)[index]); + return ((int32)REAL(x)[vec_index]); case LGLSXP: - return ((int32)LOGICAL(x)[index]); + return ((int32)LOGICAL(x)[vec_index]); case RAWSXP: - return ((int32)RAW(x)[index]); + return ((int32)RAW(x)[vec_index]); case STRSXP: - return Int32FromString(CHAR(STRING_ELT(x, index))); + return Int32FromString(CHAR(STRING_ELT(x, vec_index))); default: Rcpp::stop("cannot cast SEXP to int32"); } return (int32)0; // -Wall, should not happen since we only call this when we know it works } -int64 GET_int64(SEXP x, int index) { +int64 GET_int64(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case INTSXP: - return ((int64)INTEGER(x)[index]); + return ((int64)INTEGER(x)[vec_index]); case REALSXP: - return ((int64)REAL(x)[index]); + return ((int64)REAL(x)[vec_index]); case LGLSXP: - return ((int64)LOGICAL(x)[index]); + return ((int64)LOGICAL(x)[vec_index]); case RAWSXP: - return ((int64)RAW(x)[index]); + return ((int64)RAW(x)[vec_index]); case STRSXP: - 
return Int64FromString(CHAR(STRING_ELT(x, index))); + return Int64FromString(CHAR(STRING_ELT(x, vec_index))); default: Rcpp::stop("cannot cast SEXP to int64"); } return (int64)0; // -Wall, should not happen since we only call this when we know it works } -uint32 GET_uint32(SEXP x, int index) { +uint32 GET_uint32(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case INTSXP: - return ((uint32)INTEGER(x)[index]); + return ((uint32)INTEGER(x)[vec_index]); case REALSXP: - return ((uint32)REAL(x)[index]); + return ((uint32)REAL(x)[vec_index]); case LGLSXP: - return ((uint32)LOGICAL(x)[index]); + return ((uint32)LOGICAL(x)[vec_index]); case RAWSXP: - return ((uint32)RAW(x)[index]); + return ((uint32)RAW(x)[vec_index]); case STRSXP: - return Int32FromString(CHAR(STRING_ELT(x, index))); + return Int32FromString(CHAR(STRING_ELT(x, vec_index))); default: Rcpp::stop("cannot cast SEXP to uint32"); } return (uint32)0; // -Wall, should not happen since we only call this when we know it works } -uint64 GET_uint64(SEXP x, int index) { +uint64 GET_uint64(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case INTSXP: - return ((uint64)INTEGER(x)[index]); + return ((uint64)INTEGER(x)[vec_index]); case REALSXP: - return ((uint64)REAL(x)[index]); + return ((uint64)REAL(x)[vec_index]); case LGLSXP: - return ((uint64)LOGICAL(x)[index]); + return ((uint64)LOGICAL(x)[vec_index]); case RAWSXP: - return ((uint64)RAW(x)[index]); + return ((uint64)RAW(x)[vec_index]); case STRSXP: - return Int64FromString(CHAR(STRING_ELT(x, index))); + return Int64FromString(CHAR(STRING_ELT(x, vec_index))); default: Rcpp::stop("cannot cast SEXP to uint64"); } return (uint64)0; // -Wall, should not happen since we only call this when we know it works } -bool GET_bool(SEXP x, int index) { +bool GET_bool(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case INTSXP: - if (INTEGER(x)[index] == R_NaInt) { + if (INTEGER(x)[vec_index] == R_NaInt) { Rcpp::stop("NA boolean values can not be stored in bool protocol 
buffer fields"); } - return ((bool)INTEGER(x)[index]); + return ((bool)INTEGER(x)[vec_index]); case REALSXP: - if (REAL(x)[index] == R_NaReal) { + if (REAL(x)[vec_index] == R_NaReal) { Rcpp::stop("NA boolean values can not be stored in bool protocol buffer fields"); } - return ((bool)REAL(x)[index]); + return ((bool)REAL(x)[vec_index]); case LGLSXP: - if (LOGICAL(x)[index] == NA_LOGICAL) { + if (LOGICAL(x)[vec_index] == NA_LOGICAL) { Rcpp::stop("NA boolean values can not be stored in bool protocol buffer fields"); } - return ((bool)LOGICAL(x)[index]); + return ((bool)LOGICAL(x)[vec_index]); case RAWSXP: - return ((bool)RAW(x)[index]); + return ((bool)RAW(x)[vec_index]); default: Rcpp::stop("cannot cast SEXP to bool"); } return (bool)0; // Unreachable. -Wall } -std::string GET_stdstring(SEXP x, int index) { +std::string GET_stdstring(SEXP x, R_xlen_t vec_index) { if (TYPEOF(x) == STRSXP) { - return (CHAR(STRING_ELT(x, index))); + return (CHAR(STRING_ELT(x, vec_index))); } return ""; // Unreachable. 
-Wall } -std::string GET_bytes(SEXP x, int index) { +std::string GET_bytes(SEXP x, R_xlen_t vec_index) { switch (TYPEOF(x)) { case RAWSXP: - if (index == 0) { + if (vec_index == 0) { return (std::string((const char*)RAW(x), (size_t)LENGTH(x))); } else { Rcpp::stop("cannot cast SEXP to bytes"); } case VECSXP: - if (TYPEOF(VECTOR_ELT(x, index)) == RAWSXP) { - return (std::string((const char*)RAW(VECTOR_ELT(x, index)), - (size_t)LENGTH(VECTOR_ELT(x, index)))); + if (TYPEOF(VECTOR_ELT(x, vec_index)) == RAWSXP) { + return (std::string((const char*)RAW(VECTOR_ELT(x, vec_index)), + (size_t)LENGTH(VECTOR_ELT(x, vec_index)))); } else { Rcpp::stop("cannot cast SEXP to bytes"); } @@ -234,16 +232,15 @@ /** * indicates if this is a list of messages * - * @param x a list (VECSXP) + * @param x a list (VECSXP), not a long vec * @return TRUE if all objects are instances of Message class */ Rboolean allAreMessages(SEXP x) { - if (TYPEOF(x) != VECSXP) return _FALSE_; - int n = LENGTH(x); + R_xlen_t n = LENGTH(x); // Caller verifies its not a long vec SEXP current; - for (int i = 0; i < n; i++) { + for (R_xlen_t i = 0; i < n; i++) { current = VECTOR_ELT(x, i); /* not an S4 object */ if (TYPEOF(current) != S4SXP) return _FALSE_; @@ -261,12 +258,11 @@ * @return TRUE if all objects are instances of RAWSXP */ Rboolean allAreRaws(SEXP x) { - if (TYPEOF(x) != VECSXP) return _FALSE_; - int n = LENGTH(x); + R_xlen_t n = LENGTH(x); SEXP current; - for (int i = 0; i < n; i++) { + for (R_xlen_t i = 0; i < n; i++) { current = VECTOR_ELT(x, i); /* not a RAWSXP */ if (TYPEOF(current) != RAWSXP) return _FALSE_; @@ -286,7 +282,7 @@ BEGIN_RCPP const GPB::EnumDescriptor* enum_desc = field_desc->enum_type(); // N.B. n undefined if TYPEOF(value) not a vector, but we catch that below. 
- int n = LENGTH(value); + R_xlen_t n = XLENGTH(value); switch (TYPEOF(value)) { // {{{ numbers @@ -693,7 +689,8 @@ */ void setRepeatedMessageField(GPB::Message* message, const Reflection* ref, - const GPB::FieldDescriptor* field_desc, SEXP value, int value_size) { + const GPB::FieldDescriptor* field_desc, SEXP value, + R_xlen_t value_size) { // The number of elements already in the repeated field. int field_size = ref->FieldSize(*message, field_desc); @@ -723,7 +720,7 @@ case RAWSXP: case STRSXP: // For int32, we support chars. { - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedInt32(message, field_desc, i, GET_int32(value, i)); @@ -757,8 +754,7 @@ case RAWSXP: case STRSXP: // For int64, we support chars. { - int i = 0; - + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedInt64(message, field_desc, i, GET_int64(value, i)); @@ -790,7 +786,7 @@ case RAWSXP: case STRSXP: // For int32, we support chars. { - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedUInt32(message, field_desc, i, GET_uint32(value, i)); @@ -821,7 +817,7 @@ case RAWSXP: case STRSXP: // For int64, we support chars. 
{ - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedUInt64(message, field_desc, i, GET_uint64(value, i)); @@ -849,7 +845,7 @@ case REALSXP: case LGLSXP: case RAWSXP: { - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedDouble(message, field_desc, i, GET_double(value, i)); @@ -876,7 +872,7 @@ case REALSXP: case LGLSXP: case RAWSXP: { - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedFloat(message, field_desc, i, GET_float(value, i)); @@ -904,7 +900,7 @@ case REALSXP: case LGLSXP: case RAWSXP: { - int i = 0; + R_xlen_t i = 0; /* in any case, fill the values up to field_size */ for (; i < field_size; i++) { ref->SetRepeatedBool(message, field_desc, i, GET_bool(value, i)); @@ -931,7 +927,7 @@ switch (TYPEOF(value)) { case STRSXP: { /* in any case, fill the values up to field_size */ - int i = 0; + R_xlen_t i = 0; for (; i < field_size; i++) { ref->SetRepeatedString(message, field_desc, i, COPYSTRING(CHAR(STRING_ELT(value, i)))); @@ -948,7 +944,7 @@ } case RAWSXP: { /* in any case, fill the values up to field_size */ - int i = 0; + R_xlen_t i = 0; for (; i < field_size; i++) { ref->SetRepeatedString(message, field_desc, i, GET_bytes(value, 0)); } @@ -975,7 +971,7 @@ // has been tested above if (LENGTH(value) > 0 && TYPEOF(VECTOR_ELT(value, 0)) == RAWSXP) { /* in any case, fill the values up to field_size */ - int i = 0; + R_xlen_t i = 0; for (; i < field_size; i++) { ref->SetRepeatedString(message, field_desc, i, GET_bytes(value, i)); } @@ -993,7 +989,7 @@ GPB::Message* __mess; /* in any case, fill the values up to field_size */ - int i = 0; + R_xlen_t i = 0; for (; i < field_size; i++) { __mess = GET_MESSAGE_POINTER_FROM_S4(VECTOR_ELT(value, i)); ref->SetRepeatedString(message, field_desc, i, @@ -1154,7 +1150,17 @@ // }}} // {{{ 
preliminary checks - int value_size = Rf_isVector(value) ? LENGTH(value) : 1; + R_xlen_t value_size = 1; + if (Rf_isVector(value)) { + if (IS_LONG_VEC(value)) { + // field_size is an int, so presumably it would generate + // a CHECK failure in protobuf code on 2^32 element anyway. + Rcpp_error("Long vectors not supported for repeated fields."); + } else { + value_size = LENGTH(value); + } + } + // if the R type is RAWSXP and the cpp type is string or bytes, // then value_size is actually one because the raw vector // is converted to a string Modified: pkg/src/rprotobuf.h =================================================================== --- pkg/src/rprotobuf.h 2013-12-31 23:18:45 UTC (rev 682) +++ pkg/src/rprotobuf.h 2014-01-01 06:02:34 UTC (rev 683) @@ -142,13 +142,13 @@ int GET_int(SEXP, int); double GET_double(SEXP, int); float GET_float(SEXP, int); -int32 GET_int32(SEXP, int); -int64 GET_int64(SEXP, int); -uint32 GET_uint32(SEXP, int); -uint64 GET_uint64(SEXP, int); -bool GET_bool(SEXP, int); -std::string GET_stdstring(SEXP, int); -std::string GET_bytes(SEXP, int); +int32 GET_int32(SEXP, R_xlen_t); +int64 GET_int64(SEXP, R_xlen_t); +uint32 GET_uint32(SEXP, R_xlen_t); +uint64 GET_uint64(SEXP, R_xlen_t); +bool GET_bool(SEXP, R_xlen_t); +std::string GET_stdstring(SEXP, R_xlen_t); +std::string GET_bytes(SEXP, R_xlen_t); void CHECK_values_for_enum(const GPB::FieldDescriptor*, SEXP); void CHECK_messages(const GPB::FieldDescriptor*, SEXP); From noreply at r-forge.r-project.org Wed Jan 1 07:55:25 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 1 Jan 2014 07:55:25 +0100 (CET) Subject: [Rprotobuf-commits] r684 - pkg/src Message-ID: <20140101065525.5D3651868A4@r-forge.r-project.org> Author: murray Date: 2014-01-01 07:55:24 +0100 (Wed, 01 Jan 2014) New Revision: 684 Modified: pkg/src/ConnectionCopyingInputStream.cpp pkg/src/ConnectionCopyingOutputStream.cpp pkg/src/ConnectionInputStream.cpp pkg/src/ConnectionOutputStream.cpp 
pkg/src/DescriptorPoolLookup.cpp pkg/src/RSourceTree.cpp pkg/src/RWarningErrorCollector.cpp pkg/src/RconnectionCopyingInputStream.cpp pkg/src/SocketCopyingInputStream.cpp pkg/src/ZeroCopyInputStreamWrapper.cpp pkg/src/ZeroCopyOutputStreamWrapper.cpp pkg/src/extensions.cpp pkg/src/extractors.cpp pkg/src/lookup.cpp pkg/src/mutators.cpp pkg/src/rprotobuf.cpp pkg/src/streams.cpp pkg/src/wrapper_ArrayInputStream.cpp pkg/src/wrapper_ArrayOutputStream.cpp pkg/src/wrapper_Descriptor.cpp pkg/src/wrapper_EnumDescriptor.cpp pkg/src/wrapper_EnumValueDescriptor.cpp pkg/src/wrapper_FieldDescriptor.cpp pkg/src/wrapper_FileDescriptor.cpp pkg/src/wrapper_Message.cpp pkg/src/wrapper_MethodDescriptor.cpp pkg/src/wrapper_ServiceDescriptor.cpp pkg/src/wrapper_ZeroCopyInputStream.cpp Log: Add uniform file-level comments to the first line of each file that match exactly what is specified in the directory level emacs config. Specifically: // -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- This is accomplished by running awk in a for loop over every .cpp file in this directory: awk -v s="// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*-" 'NR == 1 {print s} {print}' $x > $x.new Modified: pkg/src/ConnectionCopyingInputStream.cpp =================================================================== --- pkg/src/ConnectionCopyingInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/ConnectionCopyingInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "ConnectionCopyingInputStream.h" Modified: pkg/src/ConnectionCopyingOutputStream.cpp =================================================================== --- pkg/src/ConnectionCopyingOutputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ 
pkg/src/ConnectionCopyingOutputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "ConnectionCopyingOutputStream.h" Modified: pkg/src/ConnectionInputStream.cpp =================================================================== --- pkg/src/ConnectionInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/ConnectionInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "ConnectionInputStream.h" #include "ConnectionCopyingInputStream.h" Modified: pkg/src/ConnectionOutputStream.cpp =================================================================== --- pkg/src/ConnectionOutputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/ConnectionOutputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "ConnectionOutputStream.h" #include "ConnectionCopyingOutputStream.h" Modified: pkg/src/DescriptorPoolLookup.cpp =================================================================== --- pkg/src/DescriptorPoolLookup.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/DescriptorPoolLookup.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // DescriptorPoolLookup.cpp: R/C++ interface class library // // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois Modified: pkg/src/RSourceTree.cpp =================================================================== --- pkg/src/RSourceTree.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/RSourceTree.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ 
-1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RSourceTree.h" Modified: pkg/src/RWarningErrorCollector.cpp =================================================================== --- pkg/src/RWarningErrorCollector.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/RWarningErrorCollector.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RWarningErrorCollector.h" Modified: pkg/src/RconnectionCopyingInputStream.cpp =================================================================== --- pkg/src/RconnectionCopyingInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/RconnectionCopyingInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RconnectionCopyingInputStream.h" Modified: pkg/src/SocketCopyingInputStream.cpp =================================================================== --- pkg/src/SocketCopyingInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/SocketCopyingInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "SocketCopyingInputStream.h" Modified: pkg/src/ZeroCopyInputStreamWrapper.cpp =================================================================== --- pkg/src/ZeroCopyInputStreamWrapper.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/ZeroCopyInputStreamWrapper.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" namespace 
rprotobuf { Modified: pkg/src/ZeroCopyOutputStreamWrapper.cpp =================================================================== --- pkg/src/ZeroCopyOutputStreamWrapper.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/ZeroCopyOutputStreamWrapper.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" namespace rprotobuf { Modified: pkg/src/extensions.cpp =================================================================== --- pkg/src/extensions.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/extensions.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,4 +1,4 @@ -/* +/* -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- * Copyright 2012 Google Inc. All Rights Reserved. * Author: Murray Stokely * Modified: pkg/src/extractors.cpp =================================================================== --- pkg/src/extractors.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/extractors.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois // // This file is part of RProtoBuf. 
Modified: pkg/src/lookup.cpp =================================================================== --- pkg/src/lookup.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/lookup.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "DescriptorPoolLookup.h" Modified: pkg/src/mutators.cpp =================================================================== --- pkg/src/mutators.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/mutators.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois // // This file is part of RProtoBuf. Modified: pkg/src/rprotobuf.cpp =================================================================== --- pkg/src/rprotobuf.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/rprotobuf.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "DescriptorPoolLookup.h" #include "RcppMacros.h" Modified: pkg/src/streams.cpp =================================================================== --- pkg/src/streams.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/streams.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "ConnectionInputStream.h" #include "ConnectionOutputStream.h" Modified: pkg/src/wrapper_ArrayInputStream.cpp =================================================================== --- pkg/src/wrapper_ArrayInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_ArrayInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 
+1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_ArrayOutputStream.cpp =================================================================== --- pkg/src/wrapper_ArrayOutputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_ArrayOutputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_Descriptor.cpp =================================================================== --- pkg/src/wrapper_Descriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_Descriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_EnumDescriptor.cpp =================================================================== --- pkg/src/wrapper_EnumDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_EnumDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // wrapper_EnumDescriptor.cpp: R/C++ interface class library // // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois Modified: pkg/src/wrapper_EnumValueDescriptor.cpp =================================================================== --- pkg/src/wrapper_EnumValueDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_EnumValueDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // 
wrapper_EnumValueDescriptor.h: R/C++ interface class library // // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois Modified: pkg/src/wrapper_FieldDescriptor.cpp =================================================================== --- pkg/src/wrapper_FieldDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_FieldDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- // Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois // // This file is part of RProtoBuf. Modified: pkg/src/wrapper_FileDescriptor.cpp =================================================================== --- pkg/src/wrapper_FileDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_FileDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_Message.cpp =================================================================== --- pkg/src/wrapper_Message.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_Message.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "fieldtypes.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_MethodDescriptor.cpp =================================================================== --- pkg/src/wrapper_MethodDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_MethodDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: 
pkg/src/wrapper_ServiceDescriptor.cpp =================================================================== --- pkg/src/wrapper_ServiceDescriptor.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_ServiceDescriptor.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" #include "RcppMacros.h" Modified: pkg/src/wrapper_ZeroCopyInputStream.cpp =================================================================== --- pkg/src/wrapper_ZeroCopyInputStream.cpp 2014-01-01 06:02:34 UTC (rev 683) +++ pkg/src/wrapper_ZeroCopyInputStream.cpp 2014-01-01 06:55:24 UTC (rev 684) @@ -1,3 +1,4 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- #include "rprotobuf.h" namespace rprotobuf { From noreply at r-forge.r-project.org Wed Jan 1 08:02:20 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 1 Jan 2014 08:02:20 +0100 (CET) Subject: [Rprotobuf-commits] r685 - / Message-ID: <20140101070221.0A8321868E5@r-forge.r-project.org> Author: murray Date: 2014-01-01 08:02:20 +0100 (Wed, 01 Jan 2014) New Revision: 685 Modified: STYLE Log: Update with some emacs and style considerations (e.g. unused_ prefix for function arguments that are not used). Modified: STYLE =================================================================== --- STYLE 2014-01-01 06:55:24 UTC (rev 684) +++ STYLE 2014-01-01 07:02:20 UTC (rev 685) @@ -3,7 +3,7 @@ 1. Conventions: spacing, column limit, etc. 2. clang-format 3. excluded files -4. Emacs FoldingMode annotations. +4. Emacs considerations (file-local config, dir-level config, and FoldingMode annotations.) 1. Conventions: spacing, column limit, etc. @@ -19,6 +19,7 @@ 4) no space inside opening/closing parenthesis of argument lists '(arg1, arg2)' 5) space before open brackets. 
6) open brackets on same line as if conditional or switch statement. +7) Unused arguments in functions should be named with 'unused_' prefix. These conventions can be changed, so long as the code base remains consistent. @@ -50,11 +51,28 @@ We exclude Rcppsupport.h and RcppMacros.h because I don't like the result. I want BEGIN_RCPP on its own line, for example. -4. Emacs FoldingMode annotations. +4. Emacs considerations ------------------------------------------------------------------------------- -Some of the files include '{{{' '}}}' annotations in comments that are -used to fold/unfold sections of code with Emacs folding mode (M-x -folding-mode). You can then right click to expand/hide these sections -to get a higher level view of the file or drill down into specific -sections of the code. +A directory level configuration file is present in pkg/src to enforce +a common standard for files in that directory. Modern emacs should read +this file and apply its settings for new or existing file buffers in +that directory. + +The file level prop line at the top of a file can be generated from +the directory level config with +M-x copy-dir-locals-to-file-locals-prop-line + +The relevant prop line was added to all of the .cpp files in this +directory by running awk in a loop: + +awk -v s="// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*-" 'NR == 1 {print s} {print}' $x > $x.new + +In addition to the buffer C++ mode settings, some of the files include +'{{{' '}}}' annotations in comments that are used to fold/unfold +sections of code with Emacs folding mode (M-x folding-mode). You can +then right click to expand/hide these sections to get a higher level +view of the file or drill down into specific sections of the code. + +These folding annotations are also used by other editors and are not +specific to Emacs. 
\ No newline at end of file From noreply at r-forge.r-project.org Fri Jan 3 02:24:22 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 02:24:22 +0100 (CET) Subject: [Rprotobuf-commits] r686 - papers/rjournal Message-ID: <20140103012422.9C650186BCC@r-forge.r-project.org> Author: murray Date: 2014-01-03 02:24:22 +0100 (Fri, 03 Jan 2014) New Revision: 686 Modified: papers/rjournal/RJwrapper.tex papers/rjournal/eddelbuettel-francois-stokely.Rnw papers/rjournal/eddelbuettel-francois-stokely.bib Log: Remove Romain from authors list per his request, add an acknowledgements section which of course thanks him most of all. Add a section on unsigned integers in the type coercion section, now that they are handled better in RProtoBuf. Add more text in the Descriptors subsection. Add references to int64 (orphaned but still useful) and bit64 in the 64-bit integers subsection. Modified: papers/rjournal/RJwrapper.tex =================================================================== --- papers/rjournal/RJwrapper.tex 2014-01-01 07:02:20 UTC (rev 685) +++ papers/rjournal/RJwrapper.tex 2014-01-03 01:24:22 UTC (rev 686) @@ -4,6 +4,7 @@ \usepackage{RJournal} \usepackage{amsmath,amssymb,array} \usepackage{booktabs} +\usepackage{tabularx} %% load any required packages here Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-01 07:02:20 UTC (rev 685) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 01:24:22 UTC (rev 686) @@ -9,7 +9,7 @@ \renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} \title{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\author{by Dirk Eddelbuettel, Romain Fran\c{c}ois, and Murray Stokely} +\author{by Dirk Eddelbuettel and Murray Stokely} %% DE: I tend to have wider option(width=...) 
so this %% guarantees better line breaks @@ -540,12 +540,20 @@ \subsection{Descriptors} -Message descriptors are represented in R with the \emph{Descriptor} S4 +Descriptors describe the type of a Message. This includes what fields +a message contains and what the types of those fields are. Message +descriptors are represented in R with the \emph{Descriptor} S4 class. The class contains the slots \texttt{pointer} and \texttt{type}. Similarly to messages, the \verb|$| operator can be used to retrieve descriptors that are contained in the descriptor, or invoke pseudo-methods. +When \CRANpkg{RProtoBuf} is first loaded it calls +\texttt{readProtoFiles} to read in an example \texttt{.proto} file +included with the package. The \texttt{tutorial.Person} descriptor +and any other descriptors defined in loaded \texttt{.proto} files are +then available on the search path. + <<>>= # field descriptor tutorial.Person$email @@ -890,6 +898,23 @@ try(a$optional_bool <- NA,silent=TRUE) @ +\subsection{Unsigned Integers} + +R lacks a native unsigned integer type. Values between $2^{31}$ and +$2^{32} - 1$ read from unsigned int protocol buffer fields must be +stored as doubles in R. + +<<>>= +as.integer(2^31-1) +<>= +as.integer(2^31 - 1) + as.integer(1) +<>= +try(as.integer(2^31 - 1) + as.integer(1)) + +2^31 +class(2^31) +@ + \subsection{64-bit integers} \label{sec:int64} @@ -936,7 +961,10 @@ When reading the value back into R, numeric types are returned by default, but when the full precision is required a character value will be returned if the \texttt{RProtoBuf.int64AsString} option is set -to \texttt{TRUE}. +to \texttt{TRUE}. The character values are useful because they can +accurately be used as unique identifiers and can easily be passed to R +packages such as \CRANpkg{int64} \citep{int64} or \CRANpkg{bit64} +\citep{bit64} which represent 64-bit integers in R. <<>>= options("RProtoBuf.int64AsString" = FALSE) @@ -1211,6 +1239,24 @@ %This file is only a basic article template. 
For full details of \emph{The R Journal} style and information on how to prepare your article for submission, see the \href{http://journal.r-project.org/latex/RJauthorguide.pdf}{Instructions for Authors}. +\section{Acknowledgement} + +\CRANpkg{RProtoBuf} was originally written in 2009 by Romain +Fran\c{c}ois and Dirk Eddelbuettel. The authors would particularly +like to thank Romain for his initial implementation and continued +design discussions. Several features of this package are based +on the design of the \CRANpkg{rJava} package by Simon Urbanek +(dispatch on new, S4 class structures using external pointers). We'd +like to thank Simon for his indirect involvement in +\CRANpkg{RProtoBuf}. The user defined table mechanism, implemented by +Duncan Temple Lang for the purpose of the \pkg{RObjectTables} +package allowed the dynamic symbol lookup. Many thanks to Duncan for +this amazing feature. Kenton Varda was generous with his time in +reviewing code and explaining obscure protocol buffer semantics. Karl +Millar and Jeroen Ooms were helpful in reviewing code or offering +suggestions. Saptarshi Guha's contemporaneous work on \pkg{RHIPE} was a +strong motivator. 
+ \bibliography{eddelbuettel-francois-stokely} \address{Dirk Eddelbuettel\\ @@ -1219,12 +1265,6 @@ USA} \email{edd at debian.org} -\address{Author Two\\ - Affiliation\\ - Address\\ - Country} -\email{author2 at work} - \address{Murray Stokely\\ Google, Inc.\\ 1600 Amphitheatre Parkway\\ Modified: papers/rjournal/eddelbuettel-francois-stokely.bib =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-01 07:02:20 UTC (rev 685) +++ papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-03 01:24:22 UTC (rev 686) @@ -7,6 +7,20 @@ pages={1--18}, year={2011} } + at Manual{int64, + title = {int64: 64 bit integer types}, + author = {Romain Francois}, + year = {2011}, + note = {R package version 1.1.2}, + url = {http://CRAN.R-project.org/package=int64}, +} + at Manual{bit64, + title = {bit64: A S3 class for vectors of 64bit integers}, + author = {Jens Oehlschl{\"a}gel}, + year = {2012}, + note = {R package version 0.9-3}, + url = {http://CRAN.R-project.org/package=bit64}, +} @book{eddelbuettel2013seamless, title={Seamless R and C++ Integration with Rcpp}, author={Eddelbuettel, Dirk}, From noreply at r-forge.r-project.org Fri Jan 3 03:22:18 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 03:22:18 +0100 (CET) Subject: [Rprotobuf-commits] r687 - papers/rjournal Message-ID: <20140103022218.1FAF2186B65@r-forge.r-project.org> Author: murray Date: 2014-01-03 03:22:17 +0100 (Fri, 03 Jan 2014) New Revision: 687 Added: papers/rjournal/JSSwrapper.tex papers/rjournal/jss.bst papers/rjournal/jss.cls papers/rjournal/jss.dtx papers/rjournal/jsslogo.jpg Modified: papers/rjournal/RJwrapper.tex papers/rjournal/eddelbuettel-francois-stokely.Rnw Log: Add a JSSwrapper around the same content used for the RJournal article. 
Edits should still be made to the same eddelbuettel-francois-stokely.Rnw file, and the Makefile still only builds the RJournal version by default, but now you can type pdflatex JSSwrapper instead and manually see what it would look like when formatted for JSS instead. Added: papers/rjournal/JSSwrapper.tex =================================================================== --- papers/rjournal/JSSwrapper.tex (rev 0) +++ papers/rjournal/JSSwrapper.tex 2014-01-03 02:22:17 UTC (rev 687) @@ -0,0 +1,79 @@ +\documentclass[article]{jss} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% +% Local helpers to make this more compatible with R Journal style. +% +\newcommand{\CRANpkg}[1]{\pkg{#1}} +\RequirePackage{fancyvrb} +\RequirePackage{alltt} +\DefineVerbatimEnvironment{example}{Verbatim}{} + +%% almost as usual +\author{Dirk Eddelbuettel\\Debian and R Projects \And + Murray Stokely\\Google, Inc} +\title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} + +%% for pretty printing and a nice hypersummary also set: +\Plainauthor{Dirk Eddelbuettel, Murray Stokely} %% comma-separated +\Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} +\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in R} %% a short title (if necessary) + +%% an abstract and keywords +\Abstract{ +Modern data collection and analysis pipelines often involve +a sophisticated mix of applications written in general purpose and +specialized programming languages. Protocol Buffers are a popular +method of serializing structured data between applications---while remaining +independent of programming languages or operating system. The +\CRANpkg{RProtoBuf} package provides a complete interface to this +library. 
+} +\Keywords{r, protocol buffers, serialization, cross-platform} +\Plainkeywords{r, protocol buffers, serialization, cross-platform} %% without formatting +%% at least one keyword must be supplied + +%% publication information +%% NOTE: Typically, this can be left commented and will be filled out by the technical editor +%% \Volume{50} +%% \Issue{9} +%% \Month{June} +%% \Year{2012} +%% \Submitdate{2012-06-04} +%% \Acceptdate{2012-06-04} + +%% The address of (at least) one author should be given +%% in the following format: +\Address{ + Murray Stokely\\ + Google, Inc.\\ + 1600 Amphitheatre Parkway\\ + Mountain View, CA 94043\\ + USA\\ + E-mail: \email{mstokely at google.com}\\ + URL: \url{http://www.stokely.org/} +} +%% It is also possible to add a telephone and fax number +%% before the e-mail in the following format: +%% Telephone: +43/512/507-7103 +%% Fax: +43/512/507-2851 + +%% for those who use Sweave please include the following line (with % symbols): +%% need no \usepackage{Sweave.sty} + +%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +\begin{document} + +\include{eddelbuettel-francois-stokely} +%% include your article here, just as usual +%% Note that you should use the \pkg{}, \proglang{} and \code{} commands. + +%\section[About Java]{About \proglang{Java}} +%% Note: If there is markup in \(sub)section, then it has to be escaped as above. 
+ +\end{document} Modified: papers/rjournal/RJwrapper.tex =================================================================== --- papers/rjournal/RJwrapper.tex 2014-01-03 01:24:22 UTC (rev 686) +++ papers/rjournal/RJwrapper.tex 2014-01-03 02:22:17 UTC (rev 687) @@ -20,6 +20,20 @@ %% replace RJtemplate with your article \begin{article} \input{eddelbuettel-francois-stokely} + +\address{Dirk Eddelbuettel\\ + Debian and R Projects\\ + 711 Monroe Avenue, River Forest, IL 60305\\ + USA} +\email{edd at debian.org} + +\address{Murray Stokely\\ + Google, Inc.\\ + 1600 Amphitheatre Parkway\\ + Mountain View, CA 94043\\ + USA} +\email{mstokely at google.com} + \end{article} \end{document} Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 01:24:22 UTC (rev 686) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 02:22:17 UTC (rev 687) @@ -223,7 +223,7 @@ % \label{figure:rlogo} %\end{figure} -\subsection{Importing Message Descriptors from \texttt{.proto} files} +\subsection{Importing Message Descriptors from .proto files} %The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, %which encapsulate a data structure, Descriptors, which define the @@ -871,7 +871,7 @@ R booleans can accept three values: \texttt{TRUE}, \texttt{FALSE}, and \texttt{NA}. However, most other languages, including the protocol -buffer schema, only accept \text{TRUE} or \text{FALSE}. This means +buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. This means that we simply can not store R logical vectors that include all three possible values as booleans. The library will refuse to store \texttt{NA}s in protocol buffer boolean fields, and users must instead @@ -1067,7 +1067,7 @@ % % latex table generated in R 3.0.2 by xtable 1.7-0 package % Fri Dec 27 17:00:03 2013 -\begin{table}[ht] +\begin{table}[h!] 
\begin{center} \scalebox{0.9}{ \begin{tabular}{l|r|r|r|r|r} @@ -1258,16 +1258,3 @@ strong motivator. \bibliography{eddelbuettel-francois-stokely} - -\address{Dirk Eddelbuettel\\ - Debian and R Projects\\ - 711 Monroe Avenue, River Forest, IL 60305\\ - USA} -\email{edd at debian.org} - -\address{Murray Stokely\\ - Google, Inc.\\ - 1600 Amphitheatre Parkway\\ - Mountain View, CA 94043\\ - USA} -\email{mstokely at google.com} Added: papers/rjournal/jss.bst =================================================================== --- papers/rjournal/jss.bst (rev 0) +++ papers/rjournal/jss.bst 2014-01-03 02:22:17 UTC (rev 687) @@ -0,0 +1,1631 @@ +%% +%% This is file `jss.bst', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% merlin.mbs (with options: `ay,nat,nm-rvx,keyxyr,dt-beg,yr-par,note-yr,tit-qq,atit-u,trnum-it,vol-bf,volp-com,num-xser,pre-edn,isbn,issn,edpar,pp,ed,xedn,xand,etal-it,revdata,eprint,url,url-blk,doi,nfss') +%% +%% ** BibTeX style file for JSS publications (http://www.jstatsoft.org/) +%% +%% Copyright 1994-2007 Patrick W Daly +%% License: GPL-2 + % =============================================================== + % IMPORTANT NOTICE: + % This bibliographic style (bst) file has been generated from one or + % more master bibliographic style (mbs) files, listed above, provided + % with kind permission of Patrick W Daly. + % + % This generated file can be redistributed and/or modified under the terms + % of the General Public License (Version 2). + % =============================================================== + % Name and version information of the main mbs file: + % \ProvidesFile{merlin.mbs}[2007/04/24 4.20 (PWD, AO, DPC)] + % For use with BibTeX version 0.99a or later + %------------------------------------------------------------------- + % This bibliography style file is intended for texts in ENGLISH + % This is an author-year citation style bibliography. 
As such, it is + % non-standard LaTeX, and requires a special package file to function properly. + % Such a package is natbib.sty by Patrick W. Daly + % The form of the \bibitem entries is + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... + % The essential feature is that the label (the part in brackets) consists + % of the author names, as they should appear in the citation, with the year + % in parentheses following. There must be no space before the opening + % parenthesis! + % With natbib v5.3, a full list of authors may also follow the year. + % In natbib.sty, it is possible to define the type of enclosures that is + % really wanted (brackets or parentheses), but in either case, there must + % be parentheses in the label. + % The \cite command functions as follows: + % \citet{key} ==>> Jones et al. (1990) + % \citet*{key} ==>> Jones, Baker, and Smith (1990) + % \citep{key} ==>> (Jones et al., 1990) + % \citep*{key} ==>> (Jones, Baker, and Smith, 1990) + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.][]{key} ==>> (e.g. Jones et al., 1990) + % \citep[e.g.][p. 32]{key} ==>> (e.g. Jones et al., p. 32) + % \citeauthor{key} ==>> Jones et al. 
+ % \citeauthor*{key} ==>> Jones, Baker, and Smith + % \citeyear{key} ==>> 1990 + %--------------------------------------------------------------------- + +ENTRY + { address + archive + author + booktitle + chapter + collaboration + doi + edition + editor + eid + eprint + howpublished + institution + isbn + issn + journal + key + month + note + number + numpages + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label extra.label sort.label short.list } +INTEGERS { output.state before.all mid.sentence after.sentence after.block } +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} +STRINGS { s t} +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} +FUNCTION {add.blank} +{ " " * before.all 'output.state := +} + +FUNCTION {date.block} +{ + new.block +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} +FUNCTION {non.stop} +{ duplicate$ + "}" * add.period$ + #-1 #1 substring$ "." 
= +} + +STRINGS {z} +FUNCTION {remove.dots} +{ 'z := + "" + { z empty$ not } + { z #1 #1 substring$ + z #2 global.max$ substring$ 'z := + duplicate$ "." = 'pop$ + { * } + if$ + } + while$ +} +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "\emph{" swap$ * "}" * } + if$ +} +FUNCTION {bolden} +{ duplicate$ empty$ + { pop$ "" } + { "\textbf{" swap$ * "}" * } + if$ +} +FUNCTION {tie.or.space.prefix} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ +} + +FUNCTION {capitalize} +{ "u" change.case$ "t" change.case$ } + +FUNCTION {space.word} +{ " " swap$ * " " * } + % Here are the language-specific definitions for explicit words. + % Each function has a name bbl.xxx where xxx is the English word. + % The language selected here is ENGLISH +FUNCTION {bbl.and} +{ "and"} + +FUNCTION {bbl.etal} +{ "et~al." } + +FUNCTION {bbl.editors} +{ "eds." } + +FUNCTION {bbl.editor} +{ "ed." } + +FUNCTION {bbl.edby} +{ "edited by" } + +FUNCTION {bbl.edition} +{ "edition" } + +FUNCTION {bbl.volume} +{ "volume" } + +FUNCTION {bbl.of} +{ "of" } + +FUNCTION {bbl.number} +{ "number" } + +FUNCTION {bbl.nr} +{ "no." } + +FUNCTION {bbl.in} +{ "in" } + +FUNCTION {bbl.pages} +{ "pp." } + +FUNCTION {bbl.page} +{ "p." } + +FUNCTION {bbl.eidpp} +{ "pages" } + +FUNCTION {bbl.chapter} +{ "chapter" } + +FUNCTION {bbl.techrep} +{ "Technical Report" } + +FUNCTION {bbl.mthesis} +{ "Master's thesis" } + +FUNCTION {bbl.phdthesis} +{ "Ph.D. 
thesis" } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} +FUNCTION {bibinfo.check} +{ swap$ + duplicate$ missing$ + { + pop$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ pop$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {bibinfo.warn} +{ swap$ + duplicate$ missing$ + { + swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ "empty " swap$ * " in " * cite$ * warning$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {format.eprint} +{ eprint duplicate$ empty$ + 'skip$ + { "\eprint" + archive empty$ + 'skip$ + { "[" * archive * "]" * } + if$ + "{" * swap$ * "}" * + 
} + if$ +} +FUNCTION {format.url} +{ url empty$ + { "" } + { "\urlprefix\url{" url * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + + +STRINGS { bibinfo} + +FUNCTION {format.names} +{ 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}{ jj}{ f{}}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.names.ed} +{ + 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{f{}~}{vv~}{ll}{ jj}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author "author" format.names + duplicate$ empty$ 'skip$ + { collaboration "collaboration" bibinfo.check + duplicate$ empty$ 'skip$ + { " (" swap$ * ")" * } + if$ + * + } + if$ +} +FUNCTION {get.bbl.editor} +{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } + +FUNCTION {format.editors} +{ editor "editor" format.names duplicate$ empty$ 'skip$ + { + " " * + get.bbl.editor + "(" swap$ * ")" * + * + } + if$ +} +FUNCTION {format.isbn} +{ isbn "isbn" bibinfo.check + duplicate$ empty$ 'skip$ + { 
+ new.block + "ISBN " swap$ * + } + if$ +} + +FUNCTION {format.issn} +{ issn "issn" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "ISSN " swap$ * + } + if$ +} + +FUNCTION {format.doi} +{ doi "doi" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "\doi{" swap$ * "}" * + } + if$ +} +FUNCTION {format.note} +{ + note empty$ + { "" } + { note #1 #1 substring$ + duplicate$ "{" = + 'skip$ + { output.state mid.sentence = + { "l" } + { "u" } + if$ + change.case$ + } + if$ + note #2 global.max$ substring$ * "note" bibinfo.check + } + if$ +} + +FUNCTION {format.title} +{ title + "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + "\enquote{" swap$ * + add.period$ "}" * + } + if$ +} +FUNCTION {format.full.names} +{'s := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + t "others" = + { + " " * bbl.etal emphasize * + } + { + numnames #2 > + { "," * } + 'skip$ + if$ + bbl.and + space.word * t * + } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.key.full} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.key.full} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {editor.key.full} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.full + { type$ "proceedings" = + 'editor.key.full + 'author.key.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ 
newline$ + "\bibitem[{" write$ + label write$ + ")" make.full.names duplicate$ short.list = + { pop$ } + { * } + if$ + "}]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ + 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {word.in} +{ bbl.in capitalize + " " * } + +FUNCTION {format.date} +{ year "year" bibinfo.check duplicate$ empty$ + { + "empty year in " cite$ * "; set to ????" * warning$ + pop$ "????" + } + 'skip$ + if$ + extra.label * + before.all 'output.state := + " (" swap$ * ")" * +} +FUNCTION {format.btitle} +{ title "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + emphasize + } + if$ +} +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { bbl.volume volume tie.or.space.prefix + "volume" bibinfo.check * * + series "series" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ bbl.of space.word * swap$ + emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { series empty$ + { number "number" bibinfo.check } + { output.state mid.sentence = + { bbl.number } + { bbl.number capitalize } + if$ + number tie.or.space.prefix "number" bibinfo.check * * + bbl.in space.word * + series "series" bibinfo.check * + } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition duplicate$ empty$ 'skip$ + { + output.state mid.sentence = + { "l" } + { "t" } + if$ change.case$ + "edition" bibinfo.check + " " * bbl.edition * + } + if$ +} +INTEGERS { 
multiresult } +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} +FUNCTION {format.pages} +{ pages duplicate$ empty$ 'skip$ + { duplicate$ multi.page.check + { + bbl.pages swap$ + n.dashify + } + { + bbl.page swap$ + } + if$ + tie.or.space.prefix + "pages" bibinfo.check + * * + } + if$ +} +FUNCTION {format.journal.pages} +{ pages duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ + { pop$ pop$ format.pages } + { + ", " * + swap$ + n.dashify + "pages" bibinfo.check + * + } + if$ + } + if$ +} +FUNCTION {format.journal.eid} +{ eid "eid" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ 'skip$ + { + ", " * + } + if$ + swap$ * + numpages empty$ 'skip$ + { bbl.eidpp numpages tie.or.space.prefix + "numpages" bibinfo.check * * + " (" swap$ * ")" * * + } + if$ + } + if$ +} +FUNCTION {format.vol.num.pages} +{ volume field.or.null + duplicate$ empty$ 'skip$ + { + "volume" bibinfo.check + } + if$ + bolden + number "number" bibinfo.check duplicate$ empty$ 'skip$ + { + swap$ duplicate$ empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + swap$ + "(" swap$ * ")" * + } + if$ * + eid empty$ + { format.journal.pages } + { format.journal.eid } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { bbl.chapter } + { type "l" change.case$ + "type" bibinfo.check + } + if$ + chapter tie.or.space.prefix + "chapter" bibinfo.check + * * + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.booktitle} +{ + booktitle "booktitle" bibinfo.check + emphasize +} +FUNCTION {format.in.ed.booktitle} +{ format.booktitle duplicate$ empty$ 'skip$ + { + editor "editor" format.names.ed duplicate$ empty$ 'pop$ + { + " " * + get.bbl.editor + 
"(" swap$ * "), " * + * swap$ + * } + if$ + word.in swap$ * + } + if$ +} +FUNCTION {format.thesis.type} +{ type duplicate$ empty$ + 'pop$ + { swap$ pop$ + "t" change.case$ "type" bibinfo.check + } + if$ +} +FUNCTION {format.tr.number} +{ number "number" bibinfo.check + type duplicate$ empty$ + { pop$ bbl.techrep } + 'skip$ + if$ + "type" bibinfo.check + swap$ duplicate$ empty$ + { pop$ "t" change.case$ } + { tie.or.space.prefix * * } + if$ +} +FUNCTION {format.article.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.book.crossref} +{ volume duplicate$ empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + pop$ word.in + } + { bbl.volume + capitalize + swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word * + } + if$ + " \cite{" * crossref * "}" * +} +FUNCTION {format.incoll.inproc.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.org.or.pub} +{ 't := + "" + address empty$ t empty$ and + 'skip$ + { + t empty$ + { address "address" bibinfo.check * + } + { t * + address empty$ + 'skip$ + { ", " * address "address" bibinfo.check * } + if$ + } + if$ + } + if$ +} +FUNCTION {format.publisher.address} +{ publisher "publisher" bibinfo.warn format.org.or.pub +} + +FUNCTION {format.organization.address} +{ organization "organization" bibinfo.check format.org.or.pub +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { + journal + "journal" bibinfo.check + emphasize + "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + format.issn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" 
output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + new.block + format.book.crossref output.nonnull + } + if$ + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + howpublished "howpublished" bibinfo.check output + address "address" bibinfo.check output + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { + format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + crossref missing$ + { format.isbn output } + 'skip$ + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output 
+ format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.edition output + format.chapter.pages output + new.sentence + format.publisher.address output + format.isbn output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {inproceedings} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + new.sentence + publisher empty$ + { format.organization.address output } + { organization "organization" bibinfo.check output + format.publisher.address output + } + if$ + format.isbn output + format.issn output + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {conference} { inproceedings } +FUNCTION {manual} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.btitle "title" output.check + organization address new.block.checkb + organization "organization" bibinfo.check output + address "address" bibinfo.check output + format.edition output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {mastersthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.btitle + "title" output.check + 
new.block + bbl.mthesis format.thesis.type output.nonnull + school "school" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {misc} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.title output + new.block + howpublished "howpublished" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {phdthesis} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.btitle + "title" output.check + new.block + bbl.phdthesis format.thesis.type output.nonnull + school "school" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {proceedings} +{ output.bibitem + format.editors output + editor format.key output + format.date "year" output.check + date.block + format.btitle "title" output.check + format.bvolume output + format.number.series output + new.sentence + publisher empty$ + { format.organization.address output } + { organization "organization" bibinfo.check output + format.publisher.address output + } + if$ + format.isbn output + format.issn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {techreport} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title + "title" output.check + new.block + format.tr.number emphasize output.nonnull + institution "institution" bibinfo.warn output + address "address" bibinfo.check output + format.doi output + new.block + format.note output + format.eprint output + 
format.url output + fin.entry +} + +FUNCTION {unpublished} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + format.doi output + new.block + format.note "note" output.check + format.eprint output + format.url output + fin.entry +} + +FUNCTION {default.type} { misc } +READ +FUNCTION {sortify} +{ purify$ + "l" change.case$ +} +INTEGERS { len } +FUNCTION {chop.word} +{ 's := + 'len := + s #1 len substring$ = + { s len #1 + global.max$ substring$ } + 's + if$ +} +FUNCTION {format.lab.names} +{ 's := + "" 't := + s #1 "{vv~}{ll}" format.name$ + s num.names$ duplicate$ + #2 > + { pop$ + " " * bbl.etal emphasize * + } + { #2 < + 'skip$ [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 687 From noreply at r-forge.r-project.org Fri Jan 3 07:31:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 07:31:40 +0100 (CET) Subject: [Rprotobuf-commits] r688 - in pkg: . src Message-ID: <20140103063140.8EA22186CFC@r-forge.r-project.org> Author: murray Date: 2014-01-03 07:31:40 +0100 (Fri, 03 Jan 2014) New Revision: 688 Added: pkg/src/std.lnt Modified: pkg/.Rbuildignore Log: Check in the Flexelint configuration file I am using which greatly reduces the number of warnings produced by Flexelint over this code base. Modified: pkg/.Rbuildignore =================================================================== --- pkg/.Rbuildignore 2014-01-03 02:22:17 UTC (rev 687) +++ pkg/.Rbuildignore 2014-01-03 06:31:40 UTC (rev 688) @@ -2,3 +2,4 @@ configure.in vignettes/Sweave.sty src/.dir-locals.el +src/std.lnt Added: pkg/src/std.lnt =================================================================== --- pkg/src/std.lnt (rev 0) +++ pkg/src/std.lnt 2014-01-03 06:31:40 UTC (rev 688) @@ -0,0 +1,73 @@ +// Rcpp includes really long macro lines, so we need this. 
++linebuf ++linebuf ++linebuf ++linebuf + +// Make at least two passes so that cross-functional checks can be made. +-passes(2) + +// Output four text files with useful info ++program_info( output_prefix= ) + +// R and Rcpp include paths. +--i"/Library/Frameworks/R.framework/Resources/include" +--i"/Library/Frameworks/R.framework/Versions/3.0/Resources/library/Rcpp/include" + +// Many RProtoBuf/Rcpp classes do not have/need a default 0 argument constructor. +// This is discussed in: +// Koenig, Andrew +// Check List for Class Authors +// The C++ Journal, 2:1 (1992 Nov 1), 42-46 +// Reprinted in "Ruminations on C++", Chapter 4 +-esym(1712,S4_Message) +-esym(1712,S4_ArrayOutputStream) +-esym(1712,S4_ArrayInputStream) +-esym(1712,S4_MethodDescriptor) +-esym(1712,RconnectionCopyingInputStream) +-esym(1712,S4_Descriptor) +-esym(1712,S4_FileDescriptor) +-esym(1712,S4_FieldDescriptor) +-esym(1712,S4_ServiceDescriptor) +-esym(1712,S4_EnumDescriptor) +-esym(1712,S4_EnumValueDescriptor) +-esym(1712,ConnectionCopyingInputStream) +-esym(1712,Int64AsStringRepeatedFieldImporter) +-esym(1712,UInt64AsStringRepeatedFieldImporter) +-esym(1712,UInt32RepeatedFieldImporter) +-esym(1712,RepeatedFieldImporter) +-esym(1712,ZeroCopyOutputStreamWrapper) +-esym(1712,ZeroCopyInputStreamWrapper) + +// Turn off unused variable warnings for any identifier beginning with unused_ +// And const warnings as well. +-esym(715, unused_*) +-esym(818, unused_*) + +// Tell Flexelint about functions that never return. +// http://www.gimpel.com/Discussion.cfm?ThreadID=4476 +-function(exit, Rcpp::stop) +-function(exit, Rcpp_error) +-function(exit, Rf_error) + +// 527 unreachable code, sometimes we return for the benefit of -Wall with +// less sophisticated tools. +-esym(527, return) + +// 725 expected positive indentation - BEGIN_RCPP expands to a block but we don't indent. +// just turn off this message since we use clang-format to handle indentation. 
+-e725 + +// 27: illegal character, lets just ignore for anything in /usr/include +// e.g. There are some dtrace probes in /usr/include/c++/4.2.1/bits/os_defines.h on MacOS X that include + +-efile(27, "/usr/include/*") +-esym(27, "__dtrace*") + +// 578: declaration of symbol 'index' hides symbol index from strings.h +-esym(578, index) + +// Not interested in debugging at the moment, let the Rcpp maintainers deal. +-emacro( (*), VOID_END_RCPP) +-emacro( (*), END_RCPP) +-emacro( (*), BEGIN_RCPP) From noreply at r-forge.r-project.org Fri Jan 3 10:24:49 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 10:24:49 +0100 (CET) Subject: [Rprotobuf-commits] r689 - pkg/src Message-ID: <20140103092449.DB03A186BC8@r-forge.r-project.org> Author: murray Date: 2014-01-03 10:24:47 +0100 (Fri, 03 Jan 2014) New Revision: 689 Modified: pkg/src/RcppMacros.h Log: Remove unused macros RPB_FUNCTION_4 and RPB_FUNCTION_VOID_0. Reported by Flexelint. Modified: pkg/src/RcppMacros.h =================================================================== --- pkg/src/RcppMacros.h 2014-01-03 06:31:40 UTC (rev 688) +++ pkg/src/RcppMacros.h 2014-01-03 09:24:47 UTC (rev 689) @@ -80,26 +80,6 @@ } \ __OUT__ RCPP_DECORATE(__NAME__)(___0, ___1, ___2) -#define RPB_FUNCTION_4(__OUT__,__NAME__, ___0, ___1, ___2, ___3) \ -__OUT__ RCPP_DECORATE(__NAME__)(___0, ___1, ___2, ___3); \ -extern "C" SEXP __NAME__(SEXP x0, SEXP x1, SEXP x2, SEXP x3){ \ -SEXP res = R_NilValue ; \ -BEGIN_RCPP \ -res = ::Rcpp::wrap( RCPP_DECORATE(__NAME__)(::Rcpp::internal::converter( x0 ), ::Rcpp::internal::converter( x1 ), ::Rcpp::internal::converter( x2 )), ::Rcpp::internal::converter( x3 )) ) ; \ -return res ; \ -END_RCPP \ -} \ -__OUT__ RCPP_DECORATE(__NAME__)(___0, ___1, ___2, ___3) - -#define RPB_FUNCTION_VOID_0(__NAME__) \ -void RCPP_DECORATE(__NAME__)(); \ -extern "C" SEXP __NAME__(){ \ -BEGIN_RCPP \ -RCPP_DECORATE(__NAME__)(); \ -END_RCPP \ -} \ -void RCPP_DECORATE(__NAME__)() - 
#define RPB_FUNCTION_VOID_1(__NAME__, ___0) \ void RCPP_DECORATE(__NAME__)(___0); \ extern "C" SEXP __NAME__(SEXP x0){ \ From noreply at r-forge.r-project.org Fri Jan 3 10:31:22 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 10:31:22 +0100 (CET) Subject: [Rprotobuf-commits] r690 - pkg/src Message-ID: <20140103093122.A51FA186C1A@r-forge.r-project.org> Author: murray Date: 2014-01-03 10:31:22 +0100 (Fri, 03 Jan 2014) New Revision: 690 Modified: pkg/src/mutators.cpp Log: string -> std::string, remove a comment fold, and use the RPB_DEBUG_BEGIN/END macros for brevity. Modified: pkg/src/mutators.cpp =================================================================== --- pkg/src/mutators.cpp 2014-01-03 09:24:47 UTC (rev 689) +++ pkg/src/mutators.cpp 2014-01-03 09:31:22 UTC (rev 690) @@ -82,11 +82,11 @@ } template -ValueType Int64FromString(const string& value) { +ValueType Int64FromString(const std::string& value) { std::stringstream ss(value); ValueType ret; if ((ss >> ret).fail() || !(ss >> std::ws).eof()) { - string message = + std::string message = "Provided character value '" + value + "' cannot be cast to 64-bit integer."; Rcpp::stop(message.c_str()); } @@ -94,11 +94,11 @@ } template -ValueType Int32FromString(const string& value) { +ValueType Int32FromString(const std::string& value) { std::stringstream ss(value); ValueType ret; if ((ss >> ret).fail() || !(ss >> std::ws).eof()) { - string message = + std::string message = "Provided character value '" + value + "' cannot be cast to 32-bit integer."; Rcpp::stop(message.c_str()); } @@ -1127,14 +1127,10 @@ */ SEXP setMessageField(SEXP pointer, SEXP name, SEXP value) { BEGIN_RCPP - // {{{ grab data -#ifdef RPB_DEBUG - Rprintf("\n"); - + RPB_DEBUG_BEGIN("setMessageField") PRINT_DEBUG_INFO("pointer", pointer); PRINT_DEBUG_INFO("name", name); PRINT_DEBUG_INFO("value", value); -#endif /* grab the Message pointer */ GPB::Message* message = 
GET_MESSAGE_POINTER_FROM_XP(pointer); @@ -1148,7 +1144,6 @@ ref->ClearField(message, field_desc); return R_NilValue; } - // }}} // {{{ preliminary checks R_xlen_t value_size = 1; @@ -1188,9 +1183,7 @@ } else { setNonRepeatedMessageField(message, ref, field_desc, value, value_size); } -#ifdef RPB_DEBUG - Rprintf("\n"); -#endif + RPB_DEBUG_END("setMessageField") END_RCPP } From noreply at r-forge.r-project.org Fri Jan 3 10:32:00 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 10:32:00 +0100 (CET) Subject: [Rprotobuf-commits] r691 - pkg/src Message-ID: <20140103093200.72E0C186C22@r-forge.r-project.org> Author: murray Date: 2014-01-03 10:32:00 +0100 (Fri, 03 Jan 2014) New Revision: 691 Modified: pkg/src/extractors.cpp Log: string->std::string and use RPB_DEBUG_BEGIN/END macros for brevity. Modified: pkg/src/extractors.cpp =================================================================== --- pkg/src/extractors.cpp 2014-01-03 09:31:22 UTC (rev 690) +++ pkg/src/extractors.cpp 2014-01-03 09:32:00 UTC (rev 691) @@ -38,8 +38,8 @@ std::stringstream ss; if ((ss << value).fail()) { // This should not happen, its a bug in the code. 
- string message = string("Error converting int64 to string, unset ") + - kIntStringOptionName + " option."; + std::string message = std::string("Error converting int64 to string, unset ") + + kIntStringOptionName + " option."; Rcpp::stop(message.c_str()); } return Rcpp::CharacterVector(ss.str()); @@ -60,21 +60,16 @@ */ RcppExport SEXP getMessageField(SEXP pointer, SEXP name) { -#ifdef RPB_DEBUG - Rprintf("\n"); - + RPB_DEBUG_BEGIN("getMessageField") PRINT_DEBUG_INFO("pointer", pointer); PRINT_DEBUG_INFO("name", name); -#endif /* grab the Message pointer */ Rcpp::XPtr message(pointer); GPB::FieldDescriptor* field_desc = getFieldDescriptor(message, name); -#ifdef RPB_DEBUG - Rprintf("\n"); -#endif + RPB_DEBUG_END("getMessageField"); return (extractFieldAsSEXP(message, field_desc)); } From noreply at r-forge.r-project.org Fri Jan 3 10:32:54 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 10:32:54 +0100 (CET) Subject: [Rprotobuf-commits] r692 - pkg/src Message-ID: <20140103093254.E11C9186C51@r-forge.r-project.org> Author: murray Date: 2014-01-03 10:32:54 +0100 (Fri, 03 Jan 2014) New Revision: 692 Modified: pkg/src/rprotobuf.cpp pkg/src/rprotobuf.h Log: Use RPB_DEBUG macros for brevity. 
Modified: pkg/src/rprotobuf.cpp =================================================================== --- pkg/src/rprotobuf.cpp 2014-01-03 09:32:00 UTC (rev 691) +++ pkg/src/rprotobuf.cpp 2014-01-03 09:32:54 UTC (rev 692) @@ -6,8 +6,8 @@ namespace rprotobuf { GPB::Message* PROTOTYPE(const GPB::Descriptor* desc) { + RPB_DEBUG_BEGIN("PROTOTYPE") #ifdef RPB_DEBUG - Rprintf("\n"); Rprintf("desc = %d\n", desc); #endif /* first try the runtime factory */ @@ -25,21 +25,16 @@ Rprintf("runtime factory = %d\n", m); #endif } + RPB_DEBUG_END("PROTOTYPE") return m; } GPB::Message* CLONE(const GPB::Message* origin) { -#ifdef RPB_DEBUG - Rprintf(""); -#endif - + RPB_DEBUG_BEGIN("CLONE") const GPB::Descriptor* desc = origin->GetDescriptor(); GPB::Message* sheep = PROTOTYPE(desc); sheep->CopyFrom(*origin); - -#ifdef RPB_DEBUG - Rprintf(""); -#endif + RPB_DEBUG_END("CLONE") return sheep; } @@ -127,8 +122,8 @@ */ SEXP newProtoMessage(SEXP descriptor) { BEGIN_RCPP + RPB_DEBUG_BEGIN("newProtoMessage") #ifdef RPB_DEBUG - Rprintf("\n"); /* FIXME: the message type, we don't really need that*/ SEXP type = GET_SLOT(descriptor, Rf_install("type")); #endif @@ -146,9 +141,7 @@ if (!message) { Rcpp_error("could not call factory->GetPrototype(desc)->New()"); } -#ifdef RPB_DEBUG - Rprintf("\n"); -#endif + RPB_DEBUG_END("newProtoMessage") return (S4_Message(message)); END_RCPP @@ -206,19 +199,17 @@ * @return TRUE if m is a a message of the given type */ Rboolean isMessage(SEXP m, const char* target) { -#ifdef RPB_DEBUG - Rprintf("\n"); -#endif + RPB_DEBUG_BEGIN("isMessage") if (TYPEOF(m) != S4SXP || !Rf_inherits(m, "Message")) return _FALSE_; GPB::Message* message = (GPB::Message*)EXTPTR_PTR(GET_SLOT(m, Rf_install("pointer"))); const char* type = message->GetDescriptor()->full_name().c_str(); + RPB_DEBUG_END("isMessage") if (strcmp(type, target)) { return _FALSE_; } - return _TRUE_; } Modified: pkg/src/rprotobuf.h =================================================================== --- 
pkg/src/rprotobuf.h 2014-01-03 09:32:00 UTC (rev 691) +++ pkg/src/rprotobuf.h 2014-01-03 09:32:54 UTC (rev 692) @@ -73,9 +73,13 @@ // #define FIN_DBG(ptr, CLAZZ) Rprintf( "RProtoBuf finalizing %s (%p)\n", CLAZZ, // ptr ) +#ifdef RPB_DEBUG #define PRINT_DEBUG_INFO(name, o) \ Rprintf(" %s [%d] = ", name, TYPEOF(o)); \ Rf_PrintValue(o); +#else +#define PRINT_DEBUG_INFO(name, o) +#endif #define RPROTOBUF_LOOKUP 24 // #define LOOKUP_DEBUG From noreply at r-forge.r-project.org Fri Jan 3 10:46:50 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 10:46:50 +0100 (CET) Subject: [Rprotobuf-commits] r693 - pkg/src Message-ID: <20140103094650.C6C9E18671F@r-forge.r-project.org> Author: murray Date: 2014-01-03 10:46:50 +0100 (Fri, 03 Jan 2014) New Revision: 693 Modified: pkg/src/std.lnt Log: Additional parameters to remove false positives and provide better output. Modified: pkg/src/std.lnt =================================================================== --- pkg/src/std.lnt 2014-01-03 09:32:54 UTC (rev 692) +++ pkg/src/std.lnt 2014-01-03 09:46:50 UTC (rev 693) @@ -4,6 +4,15 @@ +linebuf +linebuf +// Verbose option, to see all the include files and which are treated as Library or not. +// -vf + +// error messages only in library headers +-wlib(1) +// But not illegal character warnings. +// Some MacOS X dtrace$ identifiers cause lots of illegal char errors. +-elib(27) // Error 27: Illegal character + // Make at least two passes so that cross-functional checks can be made. -passes(2) @@ -67,7 +76,13 @@ // 578: declaration of symbol 'index' hides symbol index from strings.h -esym(578, index) -// Not interested in debugging at the moment, let the Rcpp maintainers deal. --emacro( (*), VOID_END_RCPP) --emacro( (*), END_RCPP) --emacro( (*), BEGIN_RCPP) +// END_RCPP __ex__ reference parameter is not const, but maybe should be +-elibmacro( 1764 ) + +// These don't work, because they are in library headers e.g. 
need -elibmacro +// -emacro( (*), VOID_END_RCPP) +// -emacro( (*), END_RCPP) +//-emacro( (*), BEGIN_RCPP) + +// Print a summary at the end. +-summary From noreply at r-forge.r-project.org Fri Jan 3 20:41:57 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 20:41:57 +0100 (CET) Subject: [Rprotobuf-commits] r694 - papers/rjournal Message-ID: <20140103194157.7DDF0185D16@r-forge.r-project.org> Author: murray Date: 2014-01-03 20:41:57 +0100 (Fri, 03 Jan 2014) New Revision: 694 Added: papers/rjournal/protobuf-distributed-system-crop.pdf papers/rjournal/protobuf-distributed-system.dia Log: Add another figure showing how RProtoBuf is used to serialize/deserialize data for transmission to other distributed systems or files. This figure is inspired by a figure I see on slide 1 of : http://ganges.usc.edu/pgroupW/images/a/a9/Serializarion_Framework.pdf Added: papers/rjournal/protobuf-distributed-system-crop.pdf =================================================================== (Binary files differ) Property changes on: papers/rjournal/protobuf-distributed-system-crop.pdf ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: papers/rjournal/protobuf-distributed-system.dia =================================================================== (Binary files differ) Property changes on: papers/rjournal/protobuf-distributed-system.dia ___________________________________________________________________ Added: svn:mime-type + application/octet-stream From noreply at r-forge.r-project.org Fri Jan 3 21:21:48 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 21:21:48 +0100 (CET) Subject: [Rprotobuf-commits] r695 - papers/rjournal Message-ID: <20140103202148.2519618697E@r-forge.r-project.org> Author: murray Date: 2014-01-03 21:21:47 +0100 (Fri, 03 Jan 2014) New Revision: 695 Modified: 
papers/rjournal/eddelbuettel-francois-stokely.bib Log: Add citations for msgpackR and rmongodb: R interfaces to MessagePack and BSON. Modified: papers/rjournal/eddelbuettel-francois-stokely.bib =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-03 19:41:57 UTC (rev 694) +++ papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-03 20:21:47 UTC (rev 695) @@ -7,6 +7,20 @@ pages={1--18}, year={2011} } + at Manual{msgpackR, + title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, + author = {Mikiya TANIZAWA}, + year = {2013}, + note = {R package version 1.1}, + url = {http://CRAN.R-project.org/package=msgpackR}, +} + at Manual{rmongodb, + title={rmongodb: R-MongoDB driver}, + author={Gerald Lindsly}, + year = {2013}, + note = {R package version 1.3.3}, + url = {http://CRAN.R-project.org/package=rmongodb}, +} @Manual{int64, title = {int64: 64 bit integer types}, author = {Romain Francois}, From noreply at r-forge.r-project.org Fri Jan 3 21:39:06 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 21:39:06 +0100 (CET) Subject: [Rprotobuf-commits] r696 - papers/rjournal Message-ID: <20140103203906.C482218010D@r-forge.r-project.org> Author: murray Date: 2014-01-03 21:39:06 +0100 (Fri, 03 Jan 2014) New Revision: 696 Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw Log: Improve introductory text. Compare with the binary JSON formats like MessagePack and BSON. Add the new high level figure about serializing/deserializing data from an interactive R session to other distributed systems. 
Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 20:21:47 UTC (rev 695) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 20:39:06 UTC (rev 696) @@ -36,16 +36,24 @@ Modern data collection and analysis pipelines are increasingly being built using collections of components to better manage software complexity through reusability, modularity, and fault -isolation \citep{Wegiel:2010:CTT:1932682.1869479}. Different -programming languages are often used for the different phases of data +isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +Data analysis patterns such as Split-Apply-Combine +\citep{wickham2011split} explicitly break up large problems into +manageable pieces. These patterns are frequently employed with +different programming languages used for the different phases of data analysis -- collection, cleaning, analysis, post-processing, and presentation in order to take advantage of the unique combination of performance, speed of development, and library support offered by different environments. Each stage of the data analysis pipeline may involve storing intermediate results in a -file or sending them over the network. Programming languages such as -Java, Ruby, Python, and R include built-in serialization support, but -these formats are tied to the specific programming language in use. +file or sending them over the network. + +Programming languages such as Java, Ruby, Python, and R include +built-in serialization support, but these formats are tied to the +specific programming language in use and thus lock the user into a +single environment. +% +% do not facilitate % TODO(ms): and they often don't support versioning among other faults. CSV files can be read and written by many applications and so are often used for exporting tabular data. However, CSV files have a @@ -55,25 +63,43 @@ characters. 
JSON is another widely-supported format used mostly on the web that removes many of these disadvantages, but it too suffers from being too slow to parse and also does not provide strong typing -between integers and floating point. Large numbers of JSON messages -would also be required to duplicate the field names with each message. +between integers and floating point. Because the schema information +is not kept separately, multiple JSON messages of the same +type needlessly duplicate the field names with each message. +% +% +% +A number of binary formats based on JSON have been proposed that +reduce the parsing cost and improve the efficiency. MessagePack +\citep{msgpackR} and BSON \citep{rmongodb} both have R interfaces, but +these formats lack a separate schema for the serialized data and thus +still duplicate field names with each message sent over the network or +stored in a file. Such formats also lack support for versioning when +data storage needs evolve over time. -TODO(ms): Also work in reference to Split-Apply-Combine pattern for -data analysis \citep{wickham2011split}, since that is a great pattern -but it seems overly optimistic to expect all of those phases to always -be done in the same language. +% TODO(mstokely): Take a more conversational tone here asking +% questions and motivating protocol buffers? This article describes the basics of Google's Protocol Buffers through an easy to use R package, \CRANpkg{RProtoBuf}. After describing the basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate several common use cases for protocol buffers in data analysis. +\section{Protocol Buffers} +% This content is good. Maybe use and cite? +% http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html + +Protocol Buffers are a widely used modern language-neutral, +platform-neutral, extensible mechanism for sharing structured data. 
+ + +one of the more popular examples of the modern + + XXX Related work on IDLs (greatly expanded ) XXX Design tradeoffs: reflection vs proto compiler -\section{Protocol Buffers} - Once the data serialization needs get complex enough, application developers typically benefit from the use of an \emph{interface description language}, or \emph{IDL}. IDLs like Google's Protocol @@ -139,6 +165,14 @@ languages to support protocol buffers is compiled as part of the project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} +\begin{figure}[t] +\begin{center} +\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} +\end{center} +\caption{Example protobuf usage} +\label{fig:protobuf-distributed-usecase} +\end{figure} + \section{Basic Usage: Messages and Descriptors} This section describes how to use the R API to create and manipulate From noreply at r-forge.r-project.org Fri Jan 3 22:46:56 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 22:46:56 +0100 (CET) Subject: [Rprotobuf-commits] r697 - papers/rjournal Message-ID: <20140103214657.1221C1862A6@r-forge.r-project.org> Author: murray Date: 2014-01-03 22:46:56 +0100 (Fri, 03 Jan 2014) New Revision: 697 Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw Log: Further improve the introduction. Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 20:39:06 UTC (rev 696) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 21:46:56 UTC (rev 697) @@ -48,24 +48,23 @@ analysis pipeline may involve storing intermediate results in a file or sending them over the network. -Programming languages such as Java, Ruby, Python, and R include -built-in serialization support, but these formats are tied to the -specific programming language in use and thus lock the user into a -single environment. 
-% -% do not facilitate -% TODO(ms): and they often don't support versioning among other faults. -CSV files can be read and written by many applications and so are -often used for exporting tabular data. However, CSV files have a -number of disadvantages, such as a limitation of exporting only +Given these requirements, how do we safely share intermediate results +between different applications, possibly written in different +languages, and possibly running on different computers? Programming +languages such as R, Java, Julia, and Python include built-in +serialization support, but these formats are tied to the specific +programming language in use and thus lock the user into a single +environment. CSV files can be read and written by many applications +and so are often used for exporting tabular data. However, CSV files +have a number of disadvantages, such as a limitation of exporting only tabular datasets, lack of type-safety, inefficient text representation and parsing, and ambiguities in the format involving special characters. JSON is another widely-supported format used mostly on the web that removes many of these disadvantages, but it too suffers from being too slow to parse and also does not provide strong typing between integers and floating point. Because the schema information -is not kept separately, multiple JSON messages of the same -type needlessly duplicate the field names with each message. +is not kept separately, multiple JSON messages of the same type +needlessly duplicate the field names with each message. % % % @@ -77,6 +76,19 @@ stored in a file. Such formats also lack support for versioning when data storage needs evolve over time. +Once the data serialization needs of an application become complex +enough, developers typically benefit from the use of an +\emph{interface description language}, or \emph{IDL}. 
IDLs like +Google's Protocol Buffers, Apache Thrift, and Apache Avro provide a compact +well-documented schema for cross-language data structures and +efficient binary interchange formats. The schema can be used to +generate model classes for statically typed programming languages such +as C++ and Java, or can be used with reflection for dynamically typed +programming languages. Since the schema is provided separately from +the encoded data, the data can be efficiently encoded to minimize +storage costs of the stored data when compared with simple +``schema-less'' binary interchange formats. + % TODO(mstokely): Take a more conversational tone here asking % questions and motivating protocol buffers? @@ -85,12 +97,12 @@ basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate several common use cases for protocol buffers in data analysis. + \section{Protocol Buffers} % This content is good. Maybe use and cite? % http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html -Protocol Buffers are a widely used modern language-neutral, -platform-neutral, extensible mechanism for sharing structured data. +Protocol Buffers are a widely used modern language-neutral, platform-neutral, extensible mechanism for sharing structured data. one of the more popular examples of the modern @@ -100,18 +112,6 @@ XXX Design tradeoffs: reflection vs proto compiler -Once the data serialization needs get complex enough, application -developers typically benefit from the use of an \emph{interface -description language}, or \emph{IDL}. IDLs like Google's Protocol -Buffers and Apache Thrift provide a compact well-documented schema for -cross-language data structures as well efficient binary interchange -formats. The schema can be used to generate model classes for -statically typed programming languages such as C++ and Java, or can be -used with reflection for dynamically typed programming languages. 
-Since the schema is provided separately from the encoded data, the -data can be efficiently encoded to minimize storage costs of the -stored data when compared with simple ``schema-less'' binary -interchange formats like BSON. % TODO(ms) Also talk about versioning and why its useful. From noreply at r-forge.r-project.org Fri Jan 3 23:56:15 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 3 Jan 2014 23:56:15 +0100 (CET) Subject: [Rprotobuf-commits] r698 - papers/rjournal Message-ID: <20140103225616.0474B1869F7@r-forge.r-project.org> Author: murray Date: 2014-01-03 23:56:15 +0100 (Fri, 03 Jan 2014) New Revision: 698 Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw Log: Improve section 2 on protocol buffers. Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 21:46:56 UTC (rev 697) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 22:56:15 UTC (rev 698) @@ -51,7 +51,7 @@ Given these requirements, how do we safely share intermediate results between different applications, possibly written in different languages, and possibly running on different computers? Programming -languages such as R, Java, Julia, and Python include built-in +languages such as R, Julia, Java, and Python include built-in serialization support, but these formats are tied to the specific programming language in use and thus lock the user into a single environment. CSV files can be read and written by many applications @@ -79,7 +79,7 @@ Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an \emph{interface description language}, or \emph{IDL}. 
IDLs like -Google's Protocol Buffers, Apache Thrift, and Apache Avro provide a compact +Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact well-documented schema for cross-language data structures and efficient binary interchange formats. The schema can be used to generate model classes for statically typed programming languages such @@ -92,79 +92,113 @@ % TODO(mstokely): Take a more conversational tone here asking % questions and motivating protocol buffers? +% TODO(mstokely): If we go to JSS, include a larger paragraph here +% referencing each numbered section. I don't like these generally, +% but it's useful for this paper I think because we have a boring bit +% in the middle (full class/method details) and interesting +% applications at the end. This article describes the basics of Google's Protocol Buffers through an easy to use R package, \CRANpkg{RProtoBuf}. After describing the basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate several common use cases for protocol buffers in data analysis. +\section{Protocol Buffers} -\section{Protocol Buffers} +Introductory section which may include references in parentheses +\citep{R}, or cite a reference such as \citet{R} in the text. + % This content is good. Maybe use and cite? % http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html -Protocol Buffers are a widely used modern language-neutral, platform-neutral, extensible mechanism for sharing structured data. +%% TODO(de,ms) What follows is oooooold and was lifted from the webpage +%% Rewrite? +Protocol Buffers are a modern language-neutral, platform-neutral, +extensible mechanism for sharing and storing structured data. They +have been widely adopted in industry with applications as varied as Sony +Playstations, Twitter, Google Search, Hadoop, and Open Street Map. 
While +traditional IDLs were previously characterized by bloat and +complexity, Protocol Buffers is based on a simple list and records +model that is flexible and easy to use. Some of the key features +provided by Protocol Buffers for data analysis include: -one of the more popular examples of the modern +\begin{itemize} +\item \emph{Portable}: Allows users to send and receive data between + applications or different computers. +\item \emph{Efficient}: Data is serialized into a compact binary + representation for transmission or storage. +\item \emph{Exentsible}: New fields can be added to Protocol Buffer Schemas + in a forward-compatible way that do not break older applications. +\item \emph{Stable}: Protocol Buffers have been in wide use for over a + decade. +\end{itemize} +Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example +communication workflow with protocol buffers and an interactive R +session. Common use cases include populating a request RPC protocol +buffer in R that is then serialized and sent over the network to a +remote server. The server would then deserialize the message, act on +the request, and respond with a new protocol buffer over the network. -XXX Related work on IDLs (greatly expanded ) +%Protocol buffers are a language-neutral, platform-neutral, extensible +%way of serializing structured data for use in communications +%protocols, data storage, and more. -XXX Design tradeoffs: reflection vs proto compiler -% TODO(ms) Also talk about versioning and why its useful. +%Protocol Buffers offer key features such as an efficient data interchange +%format that is both language- and operating system-agnostic yet uses a +%lightweight and highly performant encoding, object serialization and +%de-serialization as well data and configuration management. Protocol +%buffers are also forward compatible: updates to the \texttt{proto} +%files do not break programs built against the previous specification. 
-%BSON, msgpack, Thrift, and Protocol Buffers take this latter approach, -%with the +%While benchmarks are not available, Google states on the project page that in +%comparison to XML, protocol buffers are at the same time \textsl{simpler}, +%between three to ten times \textsl{smaller}, between twenty and one hundred +%times \textsl{faster}, as well as less ambiguous and easier to program. -% There are references comparing these we should use here. +Many sources compare data serialization formats and show protocol +buffers very favorably to the alternatives, such +as \citep{Sumaray:2012:CDS:2184751.2184810} -TODO Also mention Thrift and msgpack and the references comparing some -of these tradeoffs. +%The flexibility of the reflection-based API is particularly well +%suited for interactive data analysis. -Introductory section which may include references in parentheses -\citep{R}, or cite a reference such as \citet{R} in the text. +% XXX Design tradeoffs: reflection vs proto compiler -%% TODO(de,ms) What follows is oooooold and was lifted from the webpage -%% Rewrite? -Protocol buffers are a language-neutral, platform-neutral, extensible -way of serializing structured data for use in communications -protocols, data storage, and more. +For added speed and efficiency, the C++, Java, and Python bindings to +Protocol Buffers are used with a compiler that translates a protocol +buffer schema description file (ending in \texttt{.proto}) into +language-specific classes that can be used to create, read, write and +manipulate protocol buffer messages. The R interface, in contrast, +uses a reflection-based API that is particularly well suited for +interactive data analysis. All messages in R have a single class +structure, but different accessor methods are created at runtime based +on the name fields of the specified message type. 
-Protocol Buffers offer key features such as an efficient data interchange -format that is both language- and operating system-agnostic yet uses a -lightweight and highly performant encoding, object serialization and -de-serialization as well data and configuration management. Protocol -buffers are also forward compatible: updates to the \texttt{proto} -files do not break programs built against the previous specification. +% In other words, given the 'proto' +%description file, code is automatically generated for the chosen +%target language(s). The project page contains a tutorial for each of +%these officially supported languages: +%\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} -While benchmarks are not available, Google states on the project page that in -comparison to XML, protocol buffers are at the same time \textsl{simpler}, -between three to ten times \textsl{smaller}, between twenty and one hundred -times \textsl{faster}, as well as less ambiguous and easier to program. +%The protocol buffers code is released under an open-source (BSD) license. The +%protocol buffer project (\url{http://code.google.com/p/protobuf/}) +%contains a C++ library and a set of runtime libraries and compilers for +%C++, Java and Python. -The protocol buffers code is released under an open-source (BSD) license. The -protocol buffer project (\url{http://code.google.com/p/protobuf/}) -contains a C++ library and a set of runtime libraries and compilers for -C++, Java and Python. +%With these languages, the workflow follows standard practice of so-called +%Interface Description Languages (IDL) +%(c.f. \href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia +% on IDL}). This consists of compiling a protocol buffer description file +%(ending in \texttt{.proto}) into language specific classes that can be used -With these languages, the workflow follows standard practice of so-called -Interface Description Languages (IDL) -(c.f. 
\href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia - on IDL}). This consists of compiling a protocol buffer description file -(ending in \texttt{.proto}) into language specific classes that can be used -to create, read, write and manipulate protocol buffer messages. In other -words, given the 'proto' description file, code is automatically generated -for the chosen target language(s). The project page contains a tutorial for -each of these officially supported languages: -\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} +%Besides the officially supported C++, Java and Python implementations, several projects have been +%created to support protocol buffers for many languages. The list of known +%languages to support protocol buffers is compiled as part of the +%project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} -Besides the officially supported C++, Java and Python implementations, several projects have been -created to support protocol buffers for many languages. The list of known -languages to support protocol buffers is compiled as part of the -project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} - \begin{figure}[t] \begin{center} \includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} @@ -184,18 +218,21 @@ and Descriptors. Messages provide a common abstract encapsulation of structured data fields of the type specified in a Message Descriptor. Message Descriptors are defined in \texttt{.proto} files and define a -schema for a particular named class of messages. This separation -between schema and the message objects is in contrast to -more verbose formats like JSON, and when combined with the efficient -binary representation of any Message object explains a large part of -the performance and storage-space advantage offered by Protocol -Buffers. TODO(ms): we already said some of this above. clean up. +schema for a particular named class of messages. 
Table~\ref{tab:proto} shows an example \texttt{.proto} file which defines the \texttt{tutorial.Person} type. The R code in the right column shows an example of creating a new message of this type and populating its fields. +% Commented out because we said this earlier. +%This separation +%between schema and the message objects is in contrast to +%more verbose formats like JSON, and when combined with the efficient +%binary representation of any Message object explains a large part of +%the performance and storage-space advantage offered by Protocol +%Buffers. TODO(ms): we already said some of this above. clean up. + % lifted from protobuf page: %With Protocol Buffers you define how you want your data to be %structured once, and then you can read or write structured data to and @@ -1262,12 +1299,8 @@ \section{Summary} -TODO(ms): random citations to work in: +% RProtoBuf has been used. -Many sources compare data serialization formats and show protocol -buffers very favorably to the alternatives, such -as \citep{Sumaray:2012:CDS:2184751.2184810} - %Its pretty useful. Murray to see if he can get approval to talk a %tiny bit about how much its used at Google. From noreply at r-forge.r-project.org Sat Jan 4 01:13:09 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 01:13:09 +0100 (CET) Subject: [Rprotobuf-commits] r699 - papers/rjournal Message-ID: <20140104001309.63965185A68@r-forge.r-project.org> Author: murray Date: 2014-01-04 01:13:08 +0100 (Sat, 04 Jan 2014) New Revision: 699 Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw Log: Submit dirk's rewrite of acknowledgments section. Add todo about OpenGPU or Poly at the bottom. 
Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-03 22:56:15 UTC (rev 698) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-04 00:13:08 UTC (rev 699) @@ -1130,7 +1130,7 @@ table is that RProtoBuf does not in general provide any significant space-savings over R's normal serialization mechanism. The benefit from RProtoBuf comes from its interoperability with other -environments. +environments, safe versioning, TODO comparison of protobuf serialization sizes/times for various vectors. Compared to R's native serialization. Discussion of the RHIPE approach of serializing any/all R objects, vs more specific protocol buffers for specific R objects. @@ -1297,6 +1297,9 @@ messages, send the serialized message to a remote server, read back a response, and then parse the response protocol buffer interactively. +TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy +Chu's Poly. + \section{Summary} % RProtoBuf has been used. @@ -1308,20 +1311,16 @@ \section{Acknowledgement} -\CRANpkg{RProtoBuf} was originally written in 2009 by Romain -Fran\c{c}ois and Dirk Eddelbuettel. The authors would particularly -like to thank Romain for his initial implementation and continued -design discussions. Several features of this package are based -on the design of the \CRANpkg{rJava} package by Simon Urbanek -(dispatch on new, S4 class structures using external pointers). We'd -like to thank Simon for his indirect involvment on -\CRANpkg{RProtoBuf}. The user defined table mechanism, implemented by -Duncan Temple Lang for the purpose of the \pkg{RObjectTables} -package allowed the dynamic symbol lookup. Many thanks to Duncan for -this amazing feature. Kenton Varda was generous with his time in -reviewing code and explaining obscure protocol buffer semantics. 
Karl -Millar and Jeroen Ooms were helpful in reviewing code or offering -suggestions. Saptarshi Guha's contemporaneous work on \pkg{RHIPE} was a -strong motivator. +The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010, +with very significant contributions, both in code and design, made by +Romain Fran\c{c}ois. His continued influence on design and code is +appreciated. Several features of the package are influenced +by on the design of the \CRANpkg{rJava} package by Simon Urbanek +The user-defined table mechanism, implemented by Duncan Temple Lang for the +purpose of the \pkg{RObjectTables} package allowed the dynamic symbol lookup. +Kenton Varda was generous with his time in reviewing code and explaining +obscure protocol buffer semantics. Karl Millar and Jeroen Ooms were +helpful in reviewing code or offering suggestions. The contemporaneous +work by Saptarshi Guha on \pkg{RHIPE} was a strong initial motivator. \bibliography{eddelbuettel-francois-stokely} From noreply at r-forge.r-project.org Sat Jan 4 02:01:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 02:01:55 +0100 (CET) Subject: [Rprotobuf-commits] r700 - papers/rjournal Message-ID: <20140104010155.37E64186C33@r-forge.r-project.org> Author: edd Date: 2014-01-04 02:01:54 +0100 (Sat, 04 Jan 2014) New Revision: 700 Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw papers/rjournal/eddelbuettel-francois-stokely.bib Log: one incomplete round of comments Modified: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-04 00:13:08 UTC (rev 699) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-04 01:01:54 UTC (rev 700) @@ -24,8 +24,8 @@ specialized programming languages. 
Protocol Buffers are a popular method of serializing structured data between applications---while remaining independent of programming languages or operating system. The - \CRANpkg{RProtoBuf} package provides a complete interface to this - library. + \CRANpkg{RProtoBuf} package provides a complete interface between this + library and the R environment for statistical computing. %TODO(ms) keep it less than 150 words. } @@ -36,7 +36,7 @@ Modern data collection and analysis pipelines are increasingly being built using collections of components to better manage software complexity through reusability, modularity, and fault -isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +isolation \citep{Wegiel:2010:CTT:1932682.1869479}. Data analysis patterns such as Split-Apply-Combine \citep{wickham2011split} explicitly break up large problems into manageable pieces. These patterns are frequently employed with @@ -47,12 +47,15 @@ different environments. Each stage of the data analysis pipeline may involve storing intermediate results in a file or sending them over the network. +% DE: Nice! Given these requirements, how do we safely share intermediate results between different applications, possibly written in different -languages, and possibly running on different computers? Programming +languages, and possibly running on different computer system, possibly +spanning different operating systems? Programming languages such as R, Julia, Java, and Python include built-in serialization support, but these formats are tied to the specific +% DE: need to define serialization? programming language in use and thus lock the user into a single environment. CSV files can be read and written by many applications and so are often used for exporting tabular data. However, CSV files @@ -74,7 +77,9 @@ these formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. 
Such formats also lack support for versioning when -data storage needs evolve over time. +data storage needs evolve over time, or when application logic and +requirement changes dictate update to the message format. +% DE: Need to talk about XML ? Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an @@ -82,8 +87,8 @@ Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact well-documented schema for cross-langauge data structures and efficient binary interchange formats. The schema can be used to -generate model classes for statically typed programming languages such -as C++ and Java, or can be used with reflection for dynamically typed +generate model classes for statically-typed programming languages such +as C++ and Java, or can be used with reflection for dynamically-typed programming languages. Since the schema is provided separately from the encoded data, the data can be efficiently encoded to minimize storage costs of the stored data when compared with simple @@ -104,7 +109,7 @@ \section{Protocol Buffers} -Introductory section which may include references in parentheses +FIXME Introductory section which may include references in parentheses \citep{R}, or cite a reference such as \citet{R} in the text. % This content is good. Maybe use and cite? @@ -113,15 +118,19 @@ %% TODO(de,ms) What follows is oooooold and was lifted from the webpage %% Rewrite? -Protocol Buffers are a modern language-neutral, platform-neutral, -extensible mechanism for sharing and storing structured data. They -have been widely adopted in industry with applications as varied as Sony -Playstations, Twitter, Google Search, Hadoop, and Open Street Map. While -traditional IDLs were previously characterized by bloat and -complexity, Protocol Buffers is based on a simple list and records -model that is flexible and easy to use. 
Some of the key features -provided by Protocol Buffers for data analysis include: +Protocol Buffers can be described as a modern, language-neutral, platform-neutral, +extensible mechanism for sharing and storing structured data. Since their +introduction, Protocol Buffers have been widely adopted in industry with +applications as varied as database-internal messaging (Drizzle), % DE: citation? +Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. While +% TODO(DE): This either needs a citation, or remove the name drop +traditional IDLs have at time been criticized for code bloat and +complexity, Protocol Buffers are based on a simple list and records +model that is compartively flexible and simple to use. +Some of the key features provided by Protocol Buffers for data analysis +include: + \begin{itemize} \item \emph{Portable}: Allows users to send and receive data between applications or different computers. @@ -138,14 +147,14 @@ session. Common use cases include populating a request RPC protocol buffer in R that is then serialized and sent over the network to a remote server. The server would then deserialize the message, act on -the request, and respond with a new protocol buffer over the network. +the request, and respond with a new protocol buffer over the network. The key +difference to, say, a request to an Rserve instance is that the remote server +may not even know the R language. %Protocol buffers are a language-neutral, platform-neutral, extensible %way of serializing structured data for use in communications %protocols, data storage, and more. 
- - %Protocol Buffers offer key features such as an efficient data interchange %format that is both language- and operating system-agnostic yet uses a %lightweight and highly performant encoding, object serialization and @@ -160,7 +169,7 @@ Many sources compare data serialization formats and show protocol buffers very favorably to the alternatives, such -as \citep{Sumaray:2012:CDS:2184751.2184810} +as \citet{Sumaray:2012:CDS:2184751.2184810} %The flexibility of the reflection-based API is particularly well %suited for interactive data analysis. Modified: papers/rjournal/eddelbuettel-francois-stokely.bib =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-04 00:13:08 UTC (rev 699) +++ papers/rjournal/eddelbuettel-francois-stokely.bib 2014-01-04 01:01:54 UTC (rev 700) @@ -9,7 +9,7 @@ } @Manual{msgpackR, title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, - author = {Mikiya TANIZAWA}, + author = {Mikiya Tanizawa}, year = {2013}, note = {R package version 1.1}, url = {http://CRAN.R-project.org/package=msgpackR}, From noreply at r-forge.r-project.org Sat Jan 4 02:06:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 02:06:19 +0100 (CET) Subject: [Rprotobuf-commits] r701 - papers/rjournal Message-ID: <20140104010619.F1C30186CAD@r-forge.r-project.org> Author: edd Date: 2014-01-04 02:06:19 +0100 (Sat, 04 Jan 2014) New Revision: 701 Added: papers/rjournal/eddelbuettel-stokely.Rnw papers/rjournal/eddelbuettel-stokely.bib Removed: papers/rjournal/eddelbuettel-francois-stokely.Rnw papers/rjournal/eddelbuettel-francois-stokely.bib Modified: papers/rjournal/Makefile papers/rjournal/RJwrapper.tex Log: renaming Modified: papers/rjournal/Makefile =================================================================== --- papers/rjournal/Makefile 2014-01-04 01:01:54 UTC (rev 700) +++ papers/rjournal/Makefile 2014-01-04 
01:06:19 UTC (rev 701) @@ -9,8 +9,8 @@ rm -fr RJwrapper.blg rm -fr RJwrapper.brf -RJwrapper.pdf: RJwrapper.tex eddelbuettel-francois-stokely.Rnw RJournal.sty - R CMD Sweave eddelbuettel-francois-stokely.Rnw +RJwrapper.pdf: RJwrapper.tex eddelbuettel-stokely.Rnw RJournal.sty + R CMD Sweave eddelbuettel-stokely.Rnw pdflatex RJwrapper.tex bibtex RJwrapper pdflatex RJwrapper.tex Modified: papers/rjournal/RJwrapper.tex =================================================================== --- papers/rjournal/RJwrapper.tex 2014-01-04 01:01:54 UTC (rev 700) +++ papers/rjournal/RJwrapper.tex 2014-01-04 01:06:19 UTC (rev 701) @@ -19,7 +19,7 @@ %% replace RJtemplate with your article \begin{article} - \input{eddelbuettel-francois-stokely} + \input{eddelbuettel-stokely} \address{Dirk Eddelbuettel\\ Debian and R Projects\\ Deleted: papers/rjournal/eddelbuettel-francois-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-04 01:01:54 UTC (rev 700) +++ papers/rjournal/eddelbuettel-francois-stokely.Rnw 2014-01-04 01:06:19 UTC (rev 701) @@ -1,1335 +0,0 @@ -% !TeX root = RJwrapper.tex -% We don't want a left margin for Sinput or Soutput for our table 1. -%\DefineVerbatimEnvironment{Sinput}{Verbatim} {xleftmargin=0em} -%\DefineVerbatimEnvironment{Soutput}{Verbatim}{xleftmargin=0em} -%\DefineVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em} -% Setting the topsep to 0 reduces spacing from input to output and -% improves table 1. -\fvset{listparameters={\setlength{\topsep}{0pt}}} -\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} - -\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\author{by Dirk Eddelbuettel and Murray Stokely} - -%% DE: I tend to have wider option(width=...) 
so this -%% guarantees better line breaks -<>= -options(width=65, prompt="R> ", digits=4) -@ - -\maketitle - -\abstract{Modern data collection and analysis pipelines often involve - a sophisticated mix of applications written in general purpose and - specialized programming languages. Protocol Buffers are a popular - method of serializing structured data between applications---while remaining - independent of programming languages or operating system. The - \CRANpkg{RProtoBuf} package provides a complete interface between this - library and the R environment for statistical computing. - %TODO(ms) keep it less than 150 words. -} - -%TODO(de) 'protocol buffers' or 'Protocol Buffers' ? - -\section{Introduction} - -Modern data collection and analysis pipelines are increasingly being -built using collections of components to better manage software -complexity through reusability, modularity, and fault -isolation \citep{Wegiel:2010:CTT:1932682.1869479}. -Data analysis patterns such as Split-Apply-Combine -\citep{wickham2011split} explicitly break up large problems into -manageable pieces. These patterns are frequently employed with -different programming languages used for the different phases of data -analysis -- collection, cleaning, analysis, post-processing, and -presentation in order to take advantage of the unique combination of -performance, speed of development, and library support offered by -different environments. Each stage of the data -analysis pipeline may involve storing intermediate results in a -file or sending them over the network. -% DE: Nice! - -Given these requirements, how do we safely share intermediate results -between different applications, possibly written in different -languages, and possibly running on different computer system, possibly -spanning different operating systems? 
Programming -languages such as R, Julia, Java, and Python include built-in -serialization support, but these formats are tied to the specific -% DE: need to define serialization? -programming language in use and thus lock the user into a single -environment. CSV files can be read and written by many applications -and so are often used for exporting tabular data. However, CSV files -have a number of disadvantages, such as a limitation of exporting only -tabular datasets, lack of type-safety, inefficient text representation -and parsing, and ambiguities in the format involving special -characters. JSON is another widely-supported format used mostly on -the web that removes many of these disadvantages, but it too suffers -from being too slow to parse and also does not provide strong typing -between integers and floating point. Because the schema information -is not kept separately, multiple JSON messages of the same type -needlessly duplicate the field names with each message. -% -% -% -A number of binary formats based on JSON have been proposed that -reduce the parsing cost and improve the efficiency. MessagePack -\citep{msgpackR} and BSON \citep{rmongodb} both have R interfaces, but -these formats lack a separate schema for the serialized data and thus -still duplicate field names with each message sent over the network or -stored in a file. Such formats also lack support for versioning when -data storage needs evolve over time, or when application logic and -requirement changes dictate update to the message format. -% DE: Need to talk about XML ? - -Once the data serialization needs of an application become complex -enough, developers typically benefit from the use of an -\emph{interface description language}, or \emph{IDL}. IDLs like -Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact -well-documented schema for cross-langauge data structures and -efficient binary interchange formats. 
The schema can be used to -generate model classes for statically-typed programming languages such -as C++ and Java, or can be used with reflection for dynamically-typed -programming languages. Since the schema is provided separately from -the encoded data, the data can be efficiently encoded to minimize -storage costs of the stored data when compared with simple -``schema-less'' binary interchange formats. - -% TODO(mstokely): Take a more conversational tone here asking -% questions and motivating protocol buffers? - -% TODO(mstokely): If we go to JSS, include a larger paragraph here -% referencing each numbered section. I don't like these generally, -% but its useful for this paper I think because we have a boring bit -% in the middle (full class/method details) and interesting -% applications at the end. -This article describes the basics of Google's Protocol Buffers through -an easy to use R package, \CRANpkg{RProtoBuf}. After describing the -basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate -several common use cases for protocol buffers in data analysis. - -\section{Protocol Buffers} - -FIXME Introductory section which may include references in parentheses -\citep{R}, or cite a reference such as \citet{R} in the text. - -% This content is good. Maybe use and cite? -% http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html - - -%% TODO(de,ms) What follows is oooooold and was lifted from the webpage -%% Rewrite? -Protocol Buffers can be described as a modern, language-neutral, platform-neutral, -extensible mechanism for sharing and storing structured data. Since their -introduction, Protocol Buffers have been widely adopted in industry with -applications as varied as database-internal messaging (Drizzle), % DE: citation? -Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. 
While -% TODO(DE): This either needs a citation, or remove the name drop -traditional IDLs have at time been criticized for code bloat and -complexity, Protocol Buffers are based on a simple list and records -model that is compartively flexible and simple to use. - -Some of the key features provided by Protocol Buffers for data analysis -include: - -\begin{itemize} -\item \emph{Portable}: Allows users to send and receive data between - applications or different computers. -\item \emph{Efficient}: Data is serialized into a compact binary - representation for transmission or storage. -\item \emph{Exentsible}: New fields can be added to Protocol Buffer Schemas - in a forward-compatible way that do not break older applications. -\item \emph{Stable}: Protocol Buffers have been in wide use for over a - decade. -\end{itemize} - -Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example -communication workflow with protocol buffers and an interactive R -session. Common use cases include populating a request RPC protocol -buffer in R that is then serialized and sent over the network to a -remote server. The server would then deserialize the message, act on -the request, and respond with a new protocol buffer over the network. The key -difference to, say, a request to an Rserve instance is that the remote server -may not even know the R language. - -%Protocol buffers are a language-neutral, platform-neutral, extensible -%way of serializing structured data for use in communications -%protocols, data storage, and more. - -%Protocol Buffers offer key features such as an efficient data interchange -%format that is both language- and operating system-agnostic yet uses a -%lightweight and highly performant encoding, object serialization and -%de-serialization as well data and configuration management. Protocol -%buffers are also forward compatible: updates to the \texttt{proto} -%files do not break programs built against the previous specification. 
- -%While benchmarks are not available, Google states on the project page that in -%comparison to XML, protocol buffers are at the same time \textsl{simpler}, -%between three to ten times \textsl{smaller}, between twenty and one hundred -%times \textsl{faster}, as well as less ambiguous and easier to program. - -Many sources compare data serialization formats and show protocol -buffers very favorably to the alternatives, such -as \citet{Sumaray:2012:CDS:2184751.2184810} - -%The flexibility of the reflection-based API is particularly well -%suited for interactive data analysis. - -% XXX Design tradeoffs: reflection vs proto compiler - -For added speed and efficiency, the C++, Java, and Python bindings to -Protocol Buffers are used with a compiler that translates a protocol -buffer schema description file (ending in \texttt{.proto}) into -language-specific classes that can be used to create, read, write and -manipulate protocol buffer messages. The R interface, in contrast, -uses a reflection-based API that is particularly well suited for -interactive data analysis. All messages in R have a single class -structure, but different accessor methods are created at runtime based -on the name fields of the specified message type. - -% In other words, given the 'proto' -%description file, code is automatically generated for the chosen -%target language(s). The project page contains a tutorial for each of -%these officially supported languages: -%\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} - -%The protocol buffers code is released under an open-source (BSD) license. The -%protocol buffer project (\url{http://code.google.com/p/protobuf/}) -%contains a C++ library and a set of runtime libraries and compilers for -%C++, Java and Python. - -%With these languages, the workflow follows standard practice of so-called -%Interface Description Languages (IDL) -%(c.f. \href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia -% on IDL}). 
This consists of compiling a protocol buffer description file -%(ending in \texttt{.proto}) into language specific classes that can be used - -%Besides the officially supported C++, Java and Python implementations, several projects have been -%created to support protocol buffers for many languages. The list of known -%languages to support protocol buffers is compiled as part of the -%project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} - -\begin{figure}[t] -\begin{center} -\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} -\end{center} -\caption{Example protobuf usage} -\label{fig:protobuf-distributed-usecase} -\end{figure} - -\section{Basic Usage: Messages and Descriptors} - -This section describes how to use the R API to create and manipulate -protocol buffer messages in R, and how to read and write the -binary \emph{payload} of the messages to files and arbitrary binary -R connections. - -The two fundamental building blocks of Protocol Buffers are Messages -and Descriptors. Messages provide a common abstract encapsulation of -structured data fields of the type specified in a Message Descriptor. -Message Descriptors are defined in \texttt{.proto} files and define a -schema for a particular named class of messages. - -Table~\ref{tab:proto} shows an example \texttt{.proto} file which -defines the \texttt{tutorial.Person} type. The R code in the right -column shows an example of creating a new message of this type and -populating its fields. - -% Commented out because we said this earlier. -%This separation -%between schema and the message objects is in contrast to -%more verbose formats like JSON, and when combined with the efficient -%binary representation of any Message object explains a large part of -%the performance and storage-space advantage offered by Protocol -%Buffers. TODO(ms): we already said some of this above. clean up. 
- -% lifted from protobuf page: -%With Protocol Buffers you define how you want your data to be -%structured once, and then you can read or write structured data to and -%from a variety of data streams using a variety of different -%languages. The definition - -%% TODO(de) Can we make this not break the width of the page? -\noindent -\begin{table} -\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .015\textwidth}|@{\hskip .015\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} -\hline -Schema : \texttt{addressbook.proto} & Example R Session\\ -\hline -\begin{minipage}{.35\textwidth} -\vspace{2mm} -\begin{example} -package tutorial; -message Person { - required string name = 1; - required int32 id = 2; - optional string email = 3; - enum PhoneType { - MOBILE = 0; HOME = 1; - WORK = 2; - } - message PhoneNumber { - required string number = 1; - optional PhoneType type = 2; - } - repeated PhoneNumber phone = 4; -} -\end{example} -\vspace{2mm} -\end{minipage} & \begin{minipage}{.45\textwidth} -<>= -library(RProtoBuf) -person <- new(tutorial.Person, id=1, name="Dirk") -person -person$name -person$name <- "Romain" -cat(as.character(person)) -serialize(person, NULL) -@ -\end{minipage} \\ -\hline -\end{tabular} -\caption{The schema representation from a \texttt{.proto} file for the - \texttt{tutorial.Person} class (left) and simple R code for creating - an object of this class and accessing its fields (right).} -\label{tab:proto} -\end{table} - -%This section may contain a figure such as Figure~\ref{figure:rlogo}. -% -%\begin{figure}[htbp] -% \centering -% \includegraphics{Rlogo} -% \caption{The logo of R.} -% \label{figure:rlogo} -%\end{figure} - -\subsection{Importing Message Descriptors from .proto files} - -%The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, -%which encapsulate a data structure, Descriptors, which define the -%schema used by one or more messages, and DescriptorPools, which -%provide access to descriptors. 
- -Before we can create a new Protocol Buffer Message or parse a -serialized stream of bytes as a Message, we must read in the message -type specification from a \texttt{.proto} file. - -New \texttt{.proto} files are imported with the \code{readProtoFiles} -function, which can import a single file, all files in a directory, or -all \texttt{.proto} files provided by another R package. - -The \texttt{.proto} file syntax for defining the structure of protocol -buffer data is described comprehensively on Google Code: -\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}. - -Once the proto files are imported, all message descriptors are -are available in the R search path in the \texttt{RProtoBuf:DescriptorPool} -special environment. The underlying mechanism used here is -described in more detail in Section~\ref{sec-lookup}. - -<<>>= -ls( "RProtoBuf:DescriptorPool" ) -@ - -%\subsection{Importing proto files} -%In contrast to the other languages (Java, C++, Python) that are officially -%supported by Google, the implementation used by the \texttt{RProtoBuf} -%package does not rely on the \texttt{protoc} compiler (with the exception of -%the two functions discussed in the previous section). This means that no -%initial step of statically compiling the proto file into C++ code that is -%then accessed by R code is necessary. Instead, \texttt{proto} files are -%parsed and processed \textsl{at runtime} by the protobuf C++ library---which -%is much more appropriate for a dynamic language. - -\subsection{Creating a message} - -New messages are created with the \texttt{new} function which accepts -a Message Descriptor and optionally a list of ``name = value'' pairs -to set in the message. -%The objects contained in the special environment are -%descriptors for their associated message types. 
Descriptors will be -%discussed in detail in another part of this document, but for the -%purpose of this section, descriptors are just used with the \texttt{new} -%function to create messages. - -<<>>= -p1 <- new( tutorial.Person ) -p <- new( tutorial.Person, name = "Romain", id = 1 ) -@ - -\subsection{Access and modify fields of a message} - -Once the message is created, its fields can be queried -and modified using the dollar operator of R, making protocol -buffer messages seem like lists. - -<<>>= -p$name -p$id -p$email <- "francoisromain at free.fr" -@ - -However, as opposed to R lists, no partial matching is performed -and the name must be given entirely. - -The \verb|[[| operator can also be used to query and set fields -of a mesages, supplying either their name or their tag number : - -<<>>= -p[["name"]] <- "Romain Francois" -p[[ 2 ]] <- 3 -p[[ "email" ]] -@ - -Protocol buffers include a 64-bit integer type, but R lacks native -64-bit integer support. A workaround is available and described in -Section~\ref{sec:int64} for working with large integer values. - -% TODO(mstokely): Document extensions here. -% There are none in addressbook.proto though. - -\subsection{Display messages} - -Protocol buffer messages and descriptors implement \texttt{show} -methods that provide basic information about the message : - -<<>>= -p -@ - -For additional information, such as for debugging purposes, -the \texttt{as.character} method provides a more complete ASCII -representation of the contents of a message. - -<<>>= -writeLines( as.character( p ) ) -@ - -\subsection{Serializing messages} - -However, the main focus of protocol buffer messages is -efficiency. Therefore, messages are transported as a sequence -of bytes. The \texttt{serialize} method is implemented for -protocol buffer messages to serialize a message into a sequence of -bytes that represents the message. -%(raw vector in R speech) that represents the message. 
- -<<>>= -serialize( p, NULL ) -@ - -The same method can also be used to serialize messages to files : - -<<>>= -tf1 <- tempfile() -serialize( p, tf1 ) -readBin( tf1, raw(0), 500 ) -@ - -Or to arbitrary binary connections: - -<<>>= -tf2 <- tempfile() -con <- file( tf2, open = "wb" ) -serialize( p, con ) -close( con ) -readBin( tf2, raw(0), 500 ) -@ - -\texttt{serialize} can also be used in a more traditional -object oriented fashion using the dollar operator : - -<<>>= -# serialize to a file -p$serialize( tf1 ) -# serialize to a binary connection -con <- file( tf2, open = "wb" ) -p$serialize( con ) -close( con ) -@ - - -\subsection{Parsing messages} - -The \texttt{RProtoBuf} package defines the \texttt{read} and -\texttt{readASCII} functions to read messages from files, raw vectors, -or arbitrary connections. \texttt{read} expects to read the message -payload from binary files or connections and \texttt{readASCII} parses -the human-readable ASCII output that is created with -\code{as.character}. 
- -The binary representation of the message (often called the payload) -does not contain information that can be used to dynamically -infer the message type, so we have to provide this information -to the \texttt{read} function in the form of a descriptor : - -<<>>= -msg <- read( tutorial.Person, tf1 ) -writeLines( as.character( msg ) ) -@ - -The \texttt{input} argument of \texttt{read} can also be a binary -readable R connection, such as a binary file connection: - -<<>>= -con <- file( tf2, open = "rb" ) -message <- read( tutorial.Person, con ) -close( con ) -writeLines( as.character( message ) ) -@ - -Finally, the payload of the message can be used : - -<<>>= -# reading the raw vector payload of the message -payload <- readBin( tf1, raw(0), 5000 ) -message <- read( tutorial.Person, payload ) -@ - - -\texttt{read} can also be used as a pseudo method of the descriptor -object : - -<<>>= -# reading from a file -message <- tutorial.Person$read( tf1 ) -# reading from a binary connection -con <- file( tf2, open = "rb" ) -message <- tutorial.Person$read( con ) -close( con ) -# read from the payload -message <- tutorial.Person$read( payload ) -@ - - -\section{Under the hood: S4 Classes, Methods, and Pseudo Methods} - -The \CRANpkg{RProtoBuf} package uses the S4 system to store -information about descriptors and messages. Using the S4 system -allows the \texttt{RProtoBuf} package to dispatch methods that are not -generic in the S3 sense, such as \texttt{new} and -\texttt{serialize}. - -Each R object stores an external pointer to an object managed by -the \texttt{protobuf} C++ library. -The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp} is used to -facilitate the integration of the R and C++ code for these objects. 
- -% Message, Descriptor, FieldDescriptor, EnumDescriptor, -% FileDescriptor, EnumValueDescriptor -% -% grep RPB_FUNC * | grep -v define|wc -l -% 84 -% grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l -% 33 - -There are over 100 C++ functions that provide the glue code between -the member functions of the 6 primary Message and Descriptor classes -in the protobuf library. Wrapping each method individually allows us -to add user friendly custom error handling, type coercion, and -performance improvements at the cost of a more verbose -implementation. The RProtoBuf implementation in many ways motivated -the development of Rcpp Modules \citep{eddelbuettel2010exposing}, -which provide a more concise way of wrapping C++ functions and classes -in a single entity. - -The \texttt{RProtoBuf} package combines the \emph{R typical} dispatch -of the form \verb|method( object, arguments)| and the more traditional -object oriented notation \verb|object$method(arguments)|. -Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function -(defined in the \texttt{utils} package) for all classes to enable tab -completion. Completion possibilities include pseudo method names for all -classes, plus dynamic dispatch on names or types specific to a given object. - -% TODO(ms): Add column check box for doing dynamic dispatch based on type. 
-\begin{table}[h] -\centering -\begin{tabular}{|l|c|c|l|} -\hline -\textbf{Class} & \textbf{Slots} & \textbf{Methods} & \textbf{Dynamic Dispatch}\\ -\hline -\hline -Message & 2 & 20 & yes (field names)\\ -\hline -Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ -\hline -FieldDescriptor & 4 & 18 & no\\ -\hline -EnumDescriptor & 4 & 11 & yes (enum constant names)\\ -\hline -FileDescriptor & 3 & 6 & yes (message/field definitions)\\ -\hline -EnumValueDescriptor & 3 & 6 & no\\ -\hline -\end{tabular} -\end{table} - -\subsection{Messages} - -The \texttt{Message} S4 class represents Protocol Buffer Messages and -is the core abstraction of \CRANpkg{RProtoBuf}. Each \texttt{Message} -contains a pointer to a \texttt{Descriptor} which defines the schema -of the data defined in the Message, as well as a number of -\texttt{FieldDescriptors} for the individual fields of the message. A -complete list of the slots and methods for \texttt{Messages} -is available in Table~\ref{Message-methods-table}. - -\begin{table}[h] -\centering -\begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{Slot} & \textbf{Description} \\ -\hline -\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ proto library. Documentation for the -\texttt{Message} class is available from the protocol buffer project page: -\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message} \\ -\hline -\texttt{type} & Fully qualified name of the message. For example a \texttt{Person} message -has its \texttt{type} slot set to \texttt{tutorial.Person} \\[.3cm] -\hline -\textbf{Method} & \textbf{Description} \\ -\hline -\texttt{has} & Indicates if a message has a given field. 
\\ -\texttt{clone} & Creates a clone of the message \\ -\texttt{isInitialized} & Indicates if a message has all its required fields set\\ -\texttt{serialize} & serialize a message to a file, binary connection, or raw vector\\ -\texttt{clear} & Clear one or several fields of a message, or the entire message\\ -\texttt{size} & The number of elements in a message field\\ -\texttt{bytesize} & The number of bytes the message would take once serialized\\ -\hline -\texttt{swap} & swap elements of a repeated field of a message\\ -\texttt{set} & set elements of a repeated field\\ -\texttt{fetch} & fetch elements of a repeated field\\ -\texttt{setExtension} & set an extension of a message\\ -\texttt{getExtension} & get the value of an extension of a message\\ -\texttt{add} & add elements to a repeated field \\ -\hline -\texttt{str} & the R structure of the message\\ -\texttt{as.character} & character representation of a message\\ -\texttt{toString} & character representation of a message (same as \texttt{as.character}) \\ -\texttt{as.list} & converts message to a named R list\\ -\texttt{update} & updates several fields of a message at once\\ -\texttt{descriptor} & get the descriptor of the message type of this message\\ -\texttt{fileDescriptor} & get the file descriptor of this message's descriptor\\ -\hline -\end{tabular} -\end{small} -\caption{\label{Message-methods-table}Description of slots and methods for the \texttt{Message} S4 class} -\end{table} - -\subsection{Descriptors} - -Descriptors describe the type of a Message. This includes what fields -a message contains and what the types of those fields are. Message -descriptors are represented in R with the \emph{Descriptor} S4 -class. The class contains the slots \texttt{pointer} and -\texttt{type}. Similarly to messages, the \verb|$| operator can be -used to retrieve descriptors that are contained in the descriptor, or -invoke pseudo-methods. 
- -When \CRANpkg{RProtoBuf} is first loaded it calls -\texttt{readProtoFiles} to read in an example \texttt{.proto} file -included with the package. The \texttt{tutorial.Person} descriptor -and any other descriptors defined in loaded \texttt{.proto} files are -then available on the search path. - -<<>>= -# field descriptor -tutorial.Person$email - -# enum descriptor -tutorial.Person$PhoneType - -# nested type descriptor -tutorial.Person$PhoneNumber -# same as -tutorial.Person.PhoneNumber -@ - -Table~\ref{Descriptor-methods-table} provides a complete list of the -slots and avalailable methods for Descriptors. - -\begin{table}[h] -\centering -\begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{Slot} & \textbf{Description} \\ -\hline -\texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the -\texttt{Descriptor} class is available from the protocol buffer project page: -\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ -\hline -\texttt{type} & Fully qualified path of the message type. \\[.3cm] -\hline -\textbf{Method} & \textbf{Description} \\ -\hline -\texttt{new} & Creates a prototype of a message described by this descriptor.\\ -\texttt{read} & Reads a message from a file or binary connection.\\ -\texttt{readASCII} & Read a message in ASCII format from a file or -text connection.\\ -\hline -\texttt{name} & Retrieve the name of the message type associated with -this descriptor.\\ -\texttt{as.character} & character representation of a descriptor\\ -\texttt{toString} & character representation of a descriptor (same as \texttt{as.character}) \\ -\texttt{as.list} & return a named -list of the field, enum, and nested descriptors included in this descriptor.\\ -\texttt{asMessage} & return DescriptorProto message. 
\\ -\hline -\texttt{fileDescriptor} & Retrieve the file descriptor of this -descriptor.\\ -\texttt{containing\_type} & Retrieve the descriptor describing the message type containing this descriptor.\\ -\texttt{field\_count} & Return the number of fields in this descriptor.\\ -\texttt{field} & Return the descriptor for the specified field in this descriptor.\\ -\texttt{nested\_type\_count} & The number of nested types in this descriptor.\\ -\texttt{nested\_type} & Return the descriptor for the specified nested -type in this descriptor.\\ -\texttt{enum\_type\_count} & The number of enum types in this descriptor.\\ -\texttt{enum\_type} & Return the descriptor for the specified enum -type in this descriptor.\\ -\hline -\end{tabular} -\end{small} -\caption{\label{Descriptor-methods-table}Description of slots and methods for the \texttt{Descriptor} S4 class} -\end{table} - -\subsection{Field Descriptors} -\label{subsec-field-descriptor} - -The class \emph{FieldDescriptor} represents field -descriptor in R. This is a wrapper S4 class around the -\texttt{google::protobuf::FieldDescriptor} C++ class. -Table~\ref{fielddescriptor-methods-table} describes the methods -defined for the \texttt{FieldDescriptor} class. - -\begin{table}[h] -\centering -\begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{Slot} & \textbf{Description} \\ -\hline -\texttt{pointer} & External pointer to the \texttt{FieldDescriptor} C++ variable \\ -\hline -\texttt{name} & Simple name of the field \\ -\hline -\texttt{full\_name} & Fully qualified name of the field \\ -\hline -\texttt{type} & Name of the message type where the field is declared \\[.3cm] -\hline -\textbf{Method} & \textbf{Description} \\ -\hline -\texttt{as.character} & Character representation of a descriptor\\ -\texttt{toString} & Character -representation of a descriptor (same as \texttt{as.character}) \\ -\texttt{asMessage} & Return FieldDescriptorProto message. 
\\ -\texttt{name} & Return the name of the field descriptor.\\ -\texttt{fileDescriptor} & Return the fileDescriptor where this field is defined.\\ -\texttt{containing\_type} & Return the containing descriptor of this field.\\ -\texttt{is\_extension} & Return TRUE if this field is an extension.\\ -\texttt{number} & Gets the declared tag number of the field.\\ -\texttt{type} & Gets the type of the field.\\ -\texttt{cpp\_type} & Gets the C++ type of the field.\\ -\texttt{label} & Gets the label of a field (optional, required, or repeated).\\ -\texttt{is\_repeated} & Return TRUE if this field is repeated.\\ -\texttt{is\_required} & Return TRUE if this field is required.\\ -\texttt{is\_optional} & Return TRUE if this field is optional.\\ -\texttt{has\_default\_value} & Return TRUE if this field has a default value.\\ -\texttt{default\_value} & Return the default value.\\ -\texttt{message\_type} & Return the message type if this is a message type field.\\ -\texttt{enum\_type} & Return the enum type if this is an enum type field.\\ -\hline -\end{tabular} -\end{small} -\caption{\label{fielddescriptor-methods-table}Description of slots and - methods for the \texttt{FieldDescriptor} S4 class} -\end{table} - -% TODO(ms): Useful distinction to make -- FieldDescriptor does not do -% separate '$' dispatch like Messages, Descriptors, and -% EnumDescriptors do. Should it? - -\subsection{Enum Descriptors} -\label{subsec-enum-descriptor} - -The class \emph{EnumDescriptor} is an R wrapper -class around the C++ class \texttt{google::protobuf::EnumDescriptor}. -Table~\ref{enumdescriptor-methods-table} describes the methods -defined for the \texttt{EnumDescriptor} class. - -The \verb|$| operator can be used to retrieve the value of enum -constants contained in the EnumDescriptor, or to invoke -pseudo-methods. 
- -<<>>= -tutorial.Person$PhoneType -tutorial.Person$PhoneType$WORK -@ - -\begin{table}[h] -\centering -\begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{Slot} & \textbf{Description} \\ -\hline -\texttt{pointer} & External pointer to the \texttt{EnumDescriptor} C++ variable \\ -\hline -\texttt{name} & Simple name of the enum \\ -\hline -\texttt{full\_name} & Fully qualified name of the enum \\ -\hline -\texttt{type} & Name of the message type where the enum is declared \\[.3cm] -\hline -\textbf{Method} & \textbf{Description} \\ -\hline -\texttt{as.list} & return a named -integer vector with the values of the enum and their names.\\ -\texttt{as.character} & character representation of a descriptor\\ [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 701 From noreply at r-forge.r-project.org Sat Jan 4 02:22:06 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 02:22:06 +0100 (CET) Subject: [Rprotobuf-commits] r702 - papers/rjournal Message-ID: <20140104012206.D9A4D186BAC@r-forge.r-project.org> Author: edd Date: 2014-01-04 02:22:05 +0100 (Sat, 04 Jan 2014) New Revision: 702 Modified: papers/rjournal/eddelbuettel-stokely.Rnw Log: table 1 Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 01:06:19 UTC (rev 701) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 01:22:05 UTC (rev 702) @@ -248,13 +248,14 @@ %from a variety of data streams using a variety of different %languages. The definition + %% TODO(de) Can we make this not break the width of the page? 
\noindent \begin{table} -\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .015\textwidth}|@{\hskip .015\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} -\hline +\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .02\textwidth}@{\hskip .02\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} +\toprule Schema : \texttt{addressbook.proto} & Example R Session\\ -\hline +\cmidrule{1-2} \begin{minipage}{.35\textwidth} \vspace{2mm} \begin{example} @@ -275,18 +276,18 @@ } \end{example} \vspace{2mm} -\end{minipage} & \begin{minipage}{.45\textwidth} +\end{minipage} & \begin{minipage}{.5\textwidth} <>= library(RProtoBuf) -person <- new(tutorial.Person, id=1, name="Dirk") -person -person$name -person$name <- "Romain" -cat(as.character(person)) -serialize(person, NULL) +p <- new(tutorial.Person, id=1, name="Dirk") +class(p) +p$name +p$name <- "Murray" +cat(as.character(p)) +serialize(p, NULL) @ \end{minipage} \\ -\hline +\bottomrule \end{tabular} \caption{The schema representation from a \texttt{.proto} file for the \texttt{tutorial.Person} class (left) and simple R code for creating @@ -1324,7 +1325,7 @@ with very significant contributions, both in code and design, made by Romain Fran\c{c}ois. His continued influence on design and code is appreciated. Several features of the package are influenced -by on the design of the \CRANpkg{rJava} package by Simon Urbanek +by the design of the \CRANpkg{rJava} package by Simon Urbanek The user-defined table mechanism, implemented by Duncan Temple Lang for the purpose of the \pkg{RObjectTables} package allowed the dynamic symbol lookup. 
Kenton Varda was generous with his time in reviewing code and explaining From noreply at r-forge.r-project.org Sat Jan 4 02:34:42 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 02:34:42 +0100 (CET) Subject: [Rprotobuf-commits] r703 - papers/rjournal Message-ID: <20140104013442.B0199186D4F@r-forge.r-project.org> Author: edd Date: 2014-01-04 02:34:42 +0100 (Sat, 04 Jan 2014) New Revision: 703 Modified: papers/rjournal/eddelbuettel-stokely.Rnw papers/rjournal/eddelbuettel-stokely.bib Log: another incomplete round of edits Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 01:22:05 UTC (rev 702) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 01:34:42 UTC (rev 703) @@ -311,8 +311,8 @@ %schema used by one or more messages, and DescriptorPools, which %provide access to descriptors. -Before we can create a new Protocol Buffer Message or parse a -serialized stream of bytes as a Message, we must read in the message +Before one can create a new Protocol Buffer Message or parse a +serialized stream of bytes as a Message, one must first read in the message type specification from a \texttt{.proto} file. New \texttt{.proto} files are imported with the \code{readProtoFiles} @@ -329,7 +329,7 @@ described in more detail in Section~\ref{sec-lookup}. <<>>= -ls( "RProtoBuf:DescriptorPool" ) +ls("RProtoBuf:DescriptorPool") @ %\subsection{Importing proto files} @@ -354,8 +354,8 @@ %function to create messages. 
<<>>= -p1 <- new( tutorial.Person ) -p <- new( tutorial.Person, name = "Romain", id = 1 ) +p1 <- new(tutorial.Person) +p <- new(tutorial.Person, name = "Murray", id = 1) @ \subsection{Access and modify fields of a message} @@ -367,7 +367,7 @@ <<>>= p$name p$id -p$email <- "francoisromain at free.fr" +p$email <- "murray at stokely.org" @ However, as opposed to R lists, no partial matching is performed @@ -377,7 +377,7 @@ of a mesages, supplying either their name or their tag number : <<>>= -p[["name"]] <- "Romain Francois" +p[["name"]] <- "Murray Stokely" p[[ 2 ]] <- 3 p[[ "email" ]] @ @@ -403,7 +403,7 @@ representation of the contents of a message. <<>>= -writeLines( as.character( p ) ) +writeLines(as.character(p)) @ \subsection{Serializing messages} @@ -416,25 +416,25 @@ %(raw vector in R speech) that represents the message. <<>>= -serialize( p, NULL ) +serialize(p, NULL) @ The same method can also be used to serialize messages to files : <<>>= tf1 <- tempfile() -serialize( p, tf1 ) -readBin( tf1, raw(0), 500 ) +serialize(p, tf1) +readBin(tf1, raw(0), 500) @ Or to arbitrary binary connections: <<>>= tf2 <- tempfile() -con <- file( tf2, open = "wb" ) -serialize( p, con ) -close( con ) -readBin( tf2, raw(0), 500 ) +con <- file(tf2, open = "wb") +serialize(p, con) +close(con) +readBin(tf2, raw(0), 500) @ \texttt{serialize} can also be used in a more traditional @@ -442,11 +442,11 @@ <<>>= # serialize to a file -p$serialize( tf1 ) +p$serialize(tf1) # serialize to a binary connection -con <- file( tf2, open = "wb" ) -p$serialize( con ) -close( con ) +con <- file(tf2, open = "wb") +p$serialize(con) +close(con) @ @@ -465,26 +465,26 @@ to the \texttt{read} function in the form of a descriptor : <<>>= -msg <- read( tutorial.Person, tf1 ) -writeLines( as.character( msg ) ) +msg <- read(tutorial.Person, tf1) +writeLines(as.character(msg)) @ The \texttt{input} argument of \texttt{read} can also be a binary readable R connection, such as a binary file connection: <<>>= -con <- 
file( tf2, open = "rb" ) -message <- read( tutorial.Person, con ) -close( con ) -writeLines( as.character( message ) ) +con <- file(tf2, open = "rb") +message <- read(tutorial.Person, con) +close(con) +writeLines(as.character(message)) @ Finally, the payload of the message can be used : <<>>= # reading the raw vector payload of the message -payload <- readBin( tf1, raw(0), 5000 ) -message <- read( tutorial.Person, payload ) +payload <- readBin(tf1, raw(0), 5000) +message <- read(tutorial.Person, payload) @ @@ -493,13 +493,13 @@ <<>>= # reading from a file -message <- tutorial.Person$read( tf1 ) +message <- tutorial.Person$read(tf1) # reading from a binary connection -con <- file( tf2, open = "rb" ) -message <- tutorial.Person$read( con ) -close( con ) +con <- file(tf2, open = "rb") +message <- tutorial.Person$read(con) +close(con) # read from the payload -message <- tutorial.Person$read( payload ) +message <- tutorial.Person$read(payload) @ @@ -513,7 +513,7 @@ Each R object stores an external pointer to an object managed by the \texttt{protobuf} C++ library. -The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp} is used to +The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to facilitate the integration of the R and C++ code for these objects. % Message, Descriptor, FieldDescriptor, EnumDescriptor, @@ -530,12 +530,12 @@ to add user friendly custom error handling, type coercion, and performance improvements at the cost of a more verbose implementation. The RProtoBuf implementation in many ways motivated -the development of Rcpp Modules \citep{eddelbuettel2010exposing}, +the development of Rcpp Modules \citep{eddelbuettel2013exposing}, which provide a more concise way of wrapping C++ functions and classes in a single entity. 
The \texttt{RProtoBuf} package combines the \emph{R typical} dispatch -of the form \verb|method( object, arguments)| and the more traditional +of the form \verb|method(object, arguments)| and the more traditional object oriented notation \verb|object$method(arguments)|. Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function (defined in the \texttt{utils} package) for all classes to enable tab Modified: papers/rjournal/eddelbuettel-stokely.bib =================================================================== --- papers/rjournal/eddelbuettel-stokely.bib 2014-01-04 01:22:05 UTC (rev 702) +++ papers/rjournal/eddelbuettel-stokely.bib 2014-01-04 01:34:42 UTC (rev 703) @@ -53,11 +53,12 @@ url = {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, year = {2003}, } - at article{eddelbuettel2010exposing, + at manual{eddelbuettel2013exposing, title={Exposing C++ functions and classes with Rcpp modules}, author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain}, - year={2010}, - publisher={Citeseer} + year={2013}, + note={Vignette included in R package Rcpp}, + url = {http://CRAN.R-project.org/package=Rcpp}, } @inproceedings{cantrill2004dynamic, title={Dynamic Instrumentation of Production Systems.}, From noreply at r-forge.r-project.org Sat Jan 4 03:15:44 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 03:15:44 +0100 (CET) Subject: [Rprotobuf-commits] r704 - pkg/src Message-ID: <20140104021544.B012F1868C7@r-forge.r-project.org> Author: murray Date: 2014-01-04 03:15:43 +0100 (Sat, 04 Jan 2014) New Revision: 704 Modified: pkg/src/rprotobuf.h Log: Add two missing headers found by clang IncludeWhatYouUse, sort includes, and address a TODO by making O_BINARY definition conditional. This is only needed in Windows, but I suppose there is hope that one day someone might port this package to Windows. 
Modified: pkg/src/rprotobuf.h =================================================================== --- pkg/src/rprotobuf.h 2014-01-04 01:34:42 UTC (rev 703) +++ pkg/src/rprotobuf.h 2014-01-04 02:15:43 UTC (rev 704) @@ -21,11 +21,17 @@ #ifndef RPROTOBUF_H #define RPROTOBUF_H +// TODO(mstokely): should we check if this header is available? +#include +#include // for strerror #include // g++-4.7 wants this -/* should we check this is available */ -#include -/* FIXME: need to include some header file instead of this define */ +#include // for string +// O_BINARY does not exist on Unix/Linux, since there is no distinction +// between text mode and binary mode files there, but if we ever got +// this code running on Windows this would be needed. +#ifndef O_BINARY #define O_BINARY 0 +#endif #include #include From noreply at r-forge.r-project.org Sat Jan 4 03:17:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 03:17:55 +0100 (CET) Subject: [Rprotobuf-commits] r705 - pkg/src Message-ID: <20140104021756.11983186978@r-forge.r-project.org> Author: murray Date: 2014-01-04 03:17:55 +0100 (Sat, 04 Jan 2014) New Revision: 705 Modified: pkg/src/S4_classes.h Log: Add forward declarations of two classes used in this header but not otherwise included (e.g. this previously depended on the include order of headers). Found by clang IncludeWhatYouUse. 
Modified: pkg/src/S4_classes.h =================================================================== --- pkg/src/S4_classes.h 2014-01-04 02:15:43 UTC (rev 704) +++ pkg/src/S4_classes.h 2014-01-04 02:17:55 UTC (rev 705) @@ -22,6 +22,9 @@ namespace rprotobuf { +class ZeroCopyInputStreamWrapper; +class ZeroCopyOutputStreamWrapper; + class S4_EnumValueDescriptor : public Rcpp::S4 { public: S4_EnumValueDescriptor(const GPB::EnumValueDescriptor* d) : S4("EnumValueDescriptor") { @@ -192,7 +195,7 @@ S4_ArrayInputStream(Rcpp::RawVector payload, int block_size) : S4("ArrayInputStream") { GPB::io::ArrayInputStream* stream = new GPB::io::ArrayInputStream(payload.begin(), payload.size(), block_size); - Rcpp::XPtr wrapper( + Rcpp::XPtr wrapper( new ZeroCopyInputStreamWrapper(stream), true, R_NilValue, payload); slot("pointer") = wrapper; } From noreply at r-forge.r-project.org Sat Jan 4 03:18:45 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 03:18:45 +0100 (CET) Subject: [Rprotobuf-commits] r706 - pkg/src Message-ID: <20140104021845.A39E6186996@r-forge.r-project.org> Author: murray Date: 2014-01-04 03:18:44 +0100 (Sat, 04 Jan 2014) New Revision: 706 Modified: pkg/src/DescriptorPoolLookup.h pkg/src/RWarningErrorCollector.h pkg/src/Rcppsupport.h Log: Add some missing standard includes. Found by clang iwyu. 
Modified: pkg/src/DescriptorPoolLookup.h =================================================================== --- pkg/src/DescriptorPoolLookup.h 2014-01-04 02:17:55 UTC (rev 705) +++ pkg/src/DescriptorPoolLookup.h 2014-01-04 02:18:44 UTC (rev 706) @@ -1,6 +1,9 @@ #ifndef RPROTOBUF_DescriptorPoolLookup_H #define RPROTOBUF_DescriptorPoolLookup_H +#include /* For set */ +#include /* For string */ + #include "RSourceTree.h" #include "RWarningErrorCollector.h" Modified: pkg/src/RWarningErrorCollector.h =================================================================== --- pkg/src/RWarningErrorCollector.h 2014-01-04 02:17:55 UTC (rev 705) +++ pkg/src/RWarningErrorCollector.h 2014-01-04 02:18:44 UTC (rev 706) @@ -1,3 +1,5 @@ +#include /* For string */ + #include "rprotobuf.h" namespace rprotobuf { Modified: pkg/src/Rcppsupport.h =================================================================== --- pkg/src/Rcppsupport.h 2014-01-04 02:17:55 UTC (rev 705) +++ pkg/src/Rcppsupport.h 2014-01-04 02:18:44 UTC (rev 706) @@ -20,6 +20,9 @@ #ifndef RPROTOBUF__RCPPSUPPORT__H #define RPROTOBUF__RCPPSUPPORT__H +#include /* for stringstream, basic_ostream */ +#include /* for string */ + #include "rprotobuf.h" namespace rprotobuf { From noreply at r-forge.r-project.org Sat Jan 4 03:19:53 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 03:19:53 +0100 (CET) Subject: [Rprotobuf-commits] r707 - pkg/src Message-ID: <20140104021953.26E66186B07@r-forge.r-project.org> Author: murray Date: 2014-01-04 03:19:52 +0100 (Sat, 04 Jan 2014) New Revision: 707 Modified: pkg/src/SocketCopyingInputStream.h Log: Sort includes. From clang iwyu. 
Modified: pkg/src/SocketCopyingInputStream.h =================================================================== --- pkg/src/SocketCopyingInputStream.h 2014-01-04 02:18:44 UTC (rev 706) +++ pkg/src/SocketCopyingInputStream.h 2014-01-04 02:19:52 UTC (rev 707) @@ -1,15 +1,14 @@ #ifndef RPROTOBUF_SocketCopyingInputStream_H #define RPROTOBUF_SocketCopyingInputStream_H -#include -#include - -#include "sisocks.h" - /* FIXME: this should be probably handled by sisocks we need it for the TCP_NODELAY socket option */ #include +#include +#include +#include "sisocks.h" + namespace rprotobuf { class SocketCopyingInputStream : public GPB::io::CopyingInputStream { From noreply at r-forge.r-project.org Sat Jan 4 03:29:01 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 03:29:01 +0100 (CET) Subject: [Rprotobuf-commits] r708 - in pkg: . R inst/proto inst/unitTests man Message-ID: <20140104022901.66F21186BE4@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-04 03:28:59 +0100 (Sat, 04 Jan 2014) New Revision: 708 Added: pkg/R/rexp_obj.R pkg/R/serialize_pb.R pkg/inst/proto/rexp.proto pkg/inst/unitTests/runit.serialize_pb.R pkg/man/serialize_pb.Rd Modified: pkg/NAMESPACE Log: Merge serialize_pb from RProtoBufUtils Modified: pkg/NAMESPACE =================================================================== --- pkg/NAMESPACE 2014-01-04 02:19:52 UTC (rev 707) +++ pkg/NAMESPACE 2014-01-04 02:28:59 UTC (rev 708) @@ -105,4 +105,7 @@ exportPattern( "^CPPTYPE_" ) exportPattern( "^LABEL_" ) +# copied from RProtoBufUtils +export( "serialize_pb", "unserialize_pb", "can_serialize_pb" ) + # export( run_unit_tests ) Added: pkg/R/rexp_obj.R =================================================================== --- pkg/R/rexp_obj.R (rev 0) +++ pkg/R/rexp_obj.R 2014-01-04 02:28:59 UTC (rev 708) @@ -0,0 +1,161 @@ +rexp_obj <- function(obj){ + sm <- typeof(obj); + msg <- switch(sm, + "character" = rexp_string(obj), + "raw" = rexp_raw(obj), + "double" = 
rexp_double(obj), + "complex" = rexp_complex(obj), + "integer" = rexp_integer(obj), + "list" = rexp_list(obj), + "logical" = rexp_logical(obj), + "NULL" = rexp_null(), + {warning("Unsupported R object type:", sm); rexp_null()} + ); + + attrib <- attributes(obj) + msg$attrName <- names(attrib) + msg$attrValue <- lapply(attrib, rexp_obj) + msg +} + +rexp_string <- function(obj){ + xvalue <- lapply(as.list(obj), function(x){ + new(pb(rexp.STRING), strval=x, isNA=is.na(x)) + }) + new(pb(rexp.REXP), rclass = 0, stringValue=xvalue) +} + +rexp_raw <- function(obj){ + new(pb(rexp.REXP), rclass= 1, rawValue = obj) +} + +rexp_double <- function(obj){ + new(pb(rexp.REXP), rclass=2, realValue = obj) +} + +rexp_complex <- function(obj){ + xvalue <- lapply(as.list(obj), function(x){ + new(pb(rexp.CMPLX), real=Re(x), imag=Im(x)) + }) + new(pb(rexp.REXP), rclass=3, complexValue = xvalue) +} + +rexp_integer <- function(obj){ + new(pb(rexp.REXP), rclass=4, intValue = obj) +} + +rexp_list <- function(obj){ + xobj <- lapply(obj, rexp_obj) + new(pb(rexp.REXP), rclass=5, rexpValue = xobj) +} + +rexp_logical <- function(obj){ + xobj <- as.integer(obj) + xobj[is.na(obj)] <- 2 + new(pb(rexp.REXP), rclass=6, booleanValue = xobj) +} + +rexp_null <- function(){ + new(pb(rexp.REXP), rclass=7) +} + +unrexp <- function(msg){ + stopifnot(is(msg, "Message")) + stopifnot(msg@type == "rexp.REXP") + + myrexp <- as.list(msg) + xobj <- switch(as.character(myrexp$rclass), + "0" = unrexp_string(myrexp), + "1" = unrexp_raw(myrexp), + "2" = unrexp_double(myrexp), + "3" = unrexp_complex(myrexp), + "4" = unrexp_integer(myrexp), + "5" = unrexp_list(myrexp), + "6" = unrexp_logical(myrexp), + "7" = unrexp_null(), + stop("Unsupported rclass:", myrexp$rclass) + ) + + if(length(myrexp$attrValue)){ + attrib <- lapply(myrexp$attrValue, unrexp) + names(attrib) <- myrexp$attrName + attributes(xobj) <- attrib + } + + xobj +} + +unrexp_string <- function(myrexp){ + mystring <- unlist(lapply(myrexp$stringValue, "[[", 
"strval")) + isNA <- unlist(lapply(myrexp$stringValue, "[[", "isNA")) + mystring[isNA] <- NA + mystring +} + +unrexp_raw <- function(myrexp){ + myrexp$rawValue +} + +unrexp_double <- function(myrexp){ + myrexp$realValue +} + +unrexp_complex <- function(myrexp){ + xvalue <- lapply(myrexp$complexValue, function(x){ + complex(real=x$real, imaginary=x$imag) + }) + unlist(xvalue) +} + +unrexp_integer <- function(myrexp){ + myrexp$intValue +} + +unrexp_list <- function(myrexp){ + lapply(myrexp$rexpValue, unrexp) +} + +unrexp_logical <- function(myrexp){ + xvalue <- myrexp$booleanValue + xvalue[xvalue==2] <- NA + as.logical(xvalue) +} + +unrexp_null <- function(){ + NULL +} + +#Helper function to lookup a PB descriptor +pb <- function(name){ + descriptor <- deparse(substitute(name)) + if(!exists(descriptor, "RProtoBuf:DescriptorPool")){ + stop("No ProtoBuf Descriptor for: ", descriptor) + } + get(descriptor, "RProtoBuf:DescriptorPool") +} + +#Checks if object can be serialized +can_serialize_pb <- rexp_valid <- function(obj) { + valid.types <- c("character", "raw", "double", "complex", "integer", + "list", "logical", "NULL") + sm <- typeof(obj) + if (sm %in% valid.types) { + if (sm == "list") { + if (any(! unlist(lapply(obj, rexp_valid)))) { + return(FALSE) + } + } + } else { + return(FALSE) + } + attrib <- attributes(obj) + if (is.null(attrib)) { + return(TRUE) + } + if (rexp_valid(names(attrib))) { + if (rexp_valid(unname(attrib))) { + return(TRUE) + } + } + return(FALSE) +} Added: pkg/R/serialize_pb.R =================================================================== --- pkg/R/serialize_pb.R (rev 0) +++ pkg/R/serialize_pb.R 2014-01-04 02:28:59 UTC (rev 708) @@ -0,0 +1,43 @@ +#' Serialize R object to Protocol Buffer Message. +#' +#' This function serializes R objects to a general purpose protobuf message. It +#' uses the same \code{rexp.proto} descriptor and mapping between R objects and +#' protobuf messages as RHIPE. 
+#' +#' Third party clients need both the message and the \code{rexp.proto} descriptor +#' to read serialized R objects. The latter is included in the the package +#' installation \code{proto} directory: +#' \code{system.file(package="RProtoBuf", "proto/rexp.proto")} +#' +#' Currently, the following storage types are supported: +#' \code{character}, \code{raw}, \code{double}, \code{complex}, \code{integer}, +#' \code{list}, and \code{NULL}. Objects with other storage types, such as +#' functions, environments, S4 classes, etc, will be skipped with a warning. +#' Missing values, attributes and numeric precision will be preserved. +#' +#' @param object R object to serialize +#' @param connection passed on to \code{\link{serialize}} +#' @param ... additional arguments passed on to \code{\link{serialize}} +#' @aliases unserialize_pb can_serialize_pb +#' @export unserialize_pb +#' @export can_serialize_pb +#' @export +#' @examples msg <- tempfile(); +#' serialize_pb(iris, msg); +#' obj <- unserialize_pb(msg); +#' identical(iris, obj); +#' +serialize_pb <- function(object, connection, ...){ + + #convert object to protobuf message + msg <- rexp_obj(object); + + #serialize the message + serialize(msg, connection = connection, ...); +} + +unserialize_pb <- function(connection){ + + #convert object to protobuf message + unrexp(read(pb(rexp.REXP), connection)); +} Added: pkg/inst/proto/rexp.proto =================================================================== --- pkg/inst/proto/rexp.proto (rev 0) +++ pkg/inst/proto/rexp.proto 2014-01-04 02:28:59 UTC (rev 708) @@ -0,0 +1,41 @@ +package rexp; + +message REXP { + enum RClass { + STRING = 0; + RAW = 1; + REAL = 2; + COMPLEX = 3; + INTEGER = 4; + LIST = 5; + LOGICAL = 6; + NULLTYPE = 7; + } + enum RBOOLEAN { + F=0; + T=1; + NA=2; + } + + required RClass rclass = 1 ; + repeated double realValue = 2 [packed=true]; + repeated sint32 intValue = 3 [packed=true]; + repeated RBOOLEAN booleanValue = 4; + repeated STRING stringValue = 
5; + + optional bytes rawValue = 6; + repeated CMPLX complexValue = 7; + repeated REXP rexpValue = 8; + + repeated string attrName = 11; + repeated REXP attrValue = 12; +} +message STRING { + optional string strval = 1; + optional bool isNA = 2 [default=false]; +} +message CMPLX { + optional double real = 1 [default=0]; + required double imag = 2; +} + Added: pkg/inst/unitTests/runit.serialize_pb.R =================================================================== --- pkg/inst/unitTests/runit.serialize_pb.R (rev 0) +++ pkg/inst/unitTests/runit.serialize_pb.R 2014-01-04 02:28:59 UTC (rev 708) @@ -0,0 +1,27 @@ +#Jeroen Ooms + +test.serialize_pb <- function() { + #verify that rexp.proto is loaded + RProtoBuf:::pb(rexp.REXP) + + #serialize a nested list + x <- list(foo=cars, bar=Titanic) + checkEquals(unserialize_pb(serialize_pb(x, NULL)), x) + + #a bit of everything, copied from jsonlite package + set.seed('123') + myobject <- list( + mynull = NULL, + mycomplex = lapply(eigen(matrix(-rnorm(9),3)), round, 3), + mymatrix = round(matrix(rnorm(9), 3),3), + myint = as.integer(c(1,2,3)), + mydf = cars, + mylist = list(foo='bar', 123, NA, NULL, list('test')), + mylogical = c(TRUE,FALSE,NA), + mychar = c('foo', NA, 'bar'), + somemissings = c(1,2,NA,NaN,5, Inf, 7 -Inf, 9, NA), + myrawvec = charToRaw('This is a test') + ); + + checkEquals(unserialize_pb(serialize_pb(myobject, NULL)), myobject) +} Added: pkg/man/serialize_pb.Rd =================================================================== --- pkg/man/serialize_pb.Rd (rev 0) +++ pkg/man/serialize_pb.Rd 2014-01-04 02:28:59 UTC (rev 708) @@ -0,0 +1,45 @@ +\name{serialize_pb} +\alias{can_serialize_pb} +\alias{serialize_pb} +\alias{unserialize_pb} +\title{Serialize R object to Protocol Buffer Message.} +\usage{ + serialize_pb(object, connection, ...) 
+} +\arguments{ + \item{object}{R object to serialize} + + \item{connection}{passed on to \code{\link{serialize}}} + + \item{...}{additional arguments passed on to + \code{\link{serialize}}} +} +\description{ + This function serializes R objects to a general purpose + protobuf message. It uses the same \code{rexp.proto} + descriptor and mapping between R objects and protobuf + messages as RHIPE. +} +\details{ + Third party clients need both the message and the + \code{rexp.proto} descriptor to read serialized R + objects. The latter is included in the the package + installation \code{proto} directory: + \code{system.file(package="RProtoBuf", + "proto/rexp.proto")} + + Currently, the following storage types are supported: + \code{character}, \code{raw}, \code{double}, + \code{complex}, \code{integer}, \code{list}, and + \code{NULL}. Objects with other storage types, such as + functions, environments, S4 classes, etc, will be skipped + with a warning. Missing values, attributes and numeric + precision will be preserved. +} +\examples{ +msg <- tempfile(); +serialize_pb(iris, msg); +obj <- unserialize_pb(msg); +identical(iris, obj); +} + From noreply at r-forge.r-project.org Sat Jan 4 04:01:46 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 04:01:46 +0100 (CET) Subject: [Rprotobuf-commits] r709 - papers/rjournal pkg Message-ID: <20140104030147.21B60186C2F@r-forge.r-project.org> Author: edd Date: 2014-01-04 04:01:43 +0100 (Sat, 04 Jan 2014) New Revision: 709 Modified: papers/rjournal/eddelbuettel-stokely.Rnw pkg/DESCRIPTION Log: increment minor version; small update re RProtoBufUtils Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 02:28:59 UTC (rev 708) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-04 03:01:43 UTC (rev 709) @@ -1067,22 +1067,20 @@ protocol buffer integration with R. 
However, this implementation takes a different approach: any R object is serialized into a message based on a single catch-all \texttt{proto} schema. Jeroen Ooms took a -similar approach influenced by Saptarshi in his \pkg{RProtoBufUtils} -package. Unlike Saptarshi's package, however, RProtoBufUtils depends -on RProtoBuf for underlying message operations. This package is -available at \url{https://github.com/jeroenooms/RProtoBufUtils}. +similar approach influenced by Saptarshi in the \pkg{RProtoBufUtils} +package (which has now been integrated in \pkg{RProtoBuf}). Unlike +Saptarshi's package, however, RProtoBufUtils depends +on, and extends, RProtoBuf for underlying message operations. -The \textbf{RProtoBufUtils} package by Jereoen Ooms provides a +One key extension of \pkg{RProtoBufUtils} is the \texttt{serialize\_pb} method to convert R objects into serialized -protocol buffers in this format, and the \texttt{can\_serialize\_pb} +protocol buffers in the catch-all schema. The \texttt{can\_serialize\_pb} method can be used to determine whether the given R object can safely -be expressed in this way. To show how how this method works, we +be expressed in this way. To illustrate how this method works, we attempt to convert all of the built-in datasets from R into this serialized protocol buffer representation. 
<>= -library(RProtoBufUtils) - datasets <- subset(as.data.frame(data()$results), Package=="datasets") datasets$load.name <- sub("\\s+.*$", "", datasets$Item) n <- nrow(datasets) Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-04 02:28:59 UTC (rev 708) +++ pkg/DESCRIPTION 2014-01-04 03:01:43 UTC (rev 709) @@ -1,5 +1,5 @@ Package: RProtoBuf -Version: 0.3.2.3 +Version: 0.3.2.4 Date: $Date$ Author: Romain Francois, Dirk Eddelbuettel and Murray Stokely Maintainer: Dirk Eddelbuettel From noreply at r-forge.r-project.org Sat Jan 4 04:11:47 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 04:11:47 +0100 (CET) Subject: [Rprotobuf-commits] r710 - papers/rjournal Message-ID: <20140104031147.A74D4184EBD@r-forge.r-project.org> Author: murray Date: 2014-01-04 04:11:46 +0100 (Sat, 04 Jan 2014) New Revision: 710 Modified: papers/rjournal/JSSwrapper.tex Log: Update the alternative JSSwrapper which gives a quick and dirty way to see what the article would basically look like in JSS style without the commitment of fully converting everything over just yet. Modified: papers/rjournal/JSSwrapper.tex =================================================================== --- papers/rjournal/JSSwrapper.tex 2014-01-04 03:01:43 UTC (rev 709) +++ papers/rjournal/JSSwrapper.tex 2014-01-04 03:11:46 UTC (rev 710) @@ -1,4 +1,5 @@ \documentclass[article]{jss} +\usepackage{booktabs} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -69,7 +70,7 @@ \begin{document} -\include{eddelbuettel-francois-stokely} +\include{eddelbuettel-stokely} %% include your article here, just as usual %% Note that you should use the \pkg{}, \proglang{} and \code{} commands. 
From noreply at r-forge.r-project.org Sat Jan 4 10:07:18 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 10:07:18 +0100 (CET) Subject: [Rprotobuf-commits] r711 - pkg/src Message-ID: <20140104090718.9795D186C8F@r-forge.r-project.org> Author: murray Date: 2014-01-04 10:07:18 +0100 (Sat, 04 Jan 2014) New Revision: 711 Modified: pkg/src/rprotobuf.cpp pkg/src/rprotobuf.h Log: Remove two unused macros and make a method parameter const. Suggested by Flexelint. Modified: pkg/src/rprotobuf.cpp =================================================================== --- pkg/src/rprotobuf.cpp 2014-01-04 03:11:46 UTC (rev 710) +++ pkg/src/rprotobuf.cpp 2014-01-04 09:07:18 UTC (rev 711) @@ -213,7 +213,7 @@ return _TRUE_; } -GPB::FieldDescriptor* getFieldDescriptor(GPB::Message* message, SEXP name) { +RcppExport GPB::FieldDescriptor* getFieldDescriptor(const GPB::Message* message, SEXP name) { GPB::FieldDescriptor* field_desc = (GPB::FieldDescriptor*)0; BEGIN_RCPP const GPB::Descriptor* desc = message->GetDescriptor(); Modified: pkg/src/rprotobuf.h =================================================================== --- pkg/src/rprotobuf.h 2014-01-04 03:11:46 UTC (rev 710) +++ pkg/src/rprotobuf.h 2014-01-04 09:07:18 UTC (rev 711) @@ -75,7 +75,7 @@ #endif -#define FIN_DBG(ptr, CLAZZ) +// #define FIN_DBG(ptr, CLAZZ) // #define FIN_DBG(ptr, CLAZZ) Rprintf( "RProtoBuf finalizing %s (%p)\n", CLAZZ, // ptr ) @@ -111,8 +111,6 @@ #define GET_ENUM_VALUE_DESCRIPTOR_POINTER_FROM_S4(m) \ (GPB::EnumValueDescriptor*) EXTPTR_PTR(GET_SLOT(m, Rf_install("pointer"))) -#define GET_METHOD(xp) (GPB::MethodDescriptor*) EXTPTR_PTR(xp) - #define COPYSTRING(s) s #define THROW_SOCKET_ERROR(message) Rf_error("%s : %s", message, strerror(sockerrno)) @@ -138,7 +136,7 @@ RcppExport SEXP getExtensionDescriptor(SEXP); RcppExport SEXP readProtoFiles(SEXP, SEXP); RcppExport Rboolean isMessage(SEXP, const char*); -RcppExport GPB::FieldDescriptor* 
getFieldDescriptor(GPB::Message*, SEXP); +RcppExport GPB::FieldDescriptor* getFieldDescriptor(const GPB::Message*, SEXP); /* in extractors.cpp */ RcppExport SEXP getMessageField(SEXP, SEXP); From noreply at r-forge.r-project.org Sat Jan 4 10:13:45 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 10:13:45 +0100 (CET) Subject: [Rprotobuf-commits] r712 - pkg/src Message-ID: <20140104091345.9E33718515F@r-forge.r-project.org> Author: murray Date: 2014-01-04 10:13:45 +0100 (Sat, 04 Jan 2014) New Revision: 712 Modified: pkg/src/std.lnt Log: Suppress lint warnings about unreachable code since lint doesn't know about the R .Calls Modified: pkg/src/std.lnt =================================================================== --- pkg/src/std.lnt 2014-01-04 09:07:18 UTC (rev 711) +++ pkg/src/std.lnt 2014-01-04 09:13:45 UTC (rev 712) @@ -76,9 +76,44 @@ // 578: declaration of symbol 'index' hides symbol index from strings.h -esym(578, index) +// 1790: Base class 'Rcpp::S4' has no non-destructor virtual functions - seems fine? +// Sutter, Herb, Exceptional C++, Item 22 +-esym(1790, Rcpp::S4) + // END_RCPP __ex__ reference parameter is not const, but maybe should be -elibmacro( 1764 ) +// O_BINARY is just set to 0 since it doesn't exist on Unix, which causes warnings. +-esym(835, |) + +// 765: external symbol could be made static. (R code calls these entry points.) 
+-esym(765, FileDescriptor__*) +-esym(765, FieldDescriptor__*) +-esym(765, Descriptor__*) +-esym(765, Message__*) +-esym(765, EnumDescriptor__*) +-esym(765, EnumValueDescriptor__*) +-esym(765, do_dollar_Descriptor) +-esym(765, getMessageField) +-esym(765, setMessageField) +-esym(765, getExtensionDescriptor) +// 00classes.R +-esym(765, update_message) +-esym(765, newProtoMessage) +// 714: not referenced (by C++, but it is in a .Call line in R code) +-esym(714, FileDescriptor__*) +-esym(714, FieldDescriptor__*) +-esym(714, Descriptor__*) +-esym(714, Message__*) +-esym(714, EnumDescriptor__*) +-esym(714, EnumValueDescriptor__*) +-esym(714, do_dollar_Descriptor) +-esym(714, getMessageField) +-esym(714, setMessageField) +-esym(714, getExtensionDescriptor) +-esym(714, update_message) +-esym(714, newProtoMessage) + // These don't work, because they are in library headers e.g. need -elibmacro // -emacro( (*), VOID_END_RCPP) // -emacro( (*), END_RCPP) From noreply at r-forge.r-project.org Sat Jan 4 18:21:35 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 18:21:35 +0100 (CET) Subject: [Rprotobuf-commits] r713 - old old/sockets pkg pkg/src Message-ID: <20140104172135.10D6818683C@r-forge.r-project.org> Author: edd Date: 2014-01-04 18:21:34 +0100 (Sat, 04 Jan 2014) New Revision: 713 Added: old/sockets/ old/sockets/SocketCopyingInputStream.cpp old/sockets/SocketCopyingInputStream.h old/sockets/sisocks.h Removed: pkg/src/SocketCopyingInputStream.cpp pkg/src/SocketCopyingInputStream.h pkg/src/sisocks.h Modified: pkg/ChangeLog Log: Deprecate files SocketCopyingInputStream.{cpp,h}, sisocks.h as the current RProtoBuf design does not provide networking capabilities Copied: old/sockets/SocketCopyingInputStream.cpp (from rev 712, pkg/src/SocketCopyingInputStream.cpp) =================================================================== --- old/sockets/SocketCopyingInputStream.cpp (rev 0) +++ old/sockets/SocketCopyingInputStream.cpp 
2014-01-04 17:21:34 UTC (rev 713) @@ -0,0 +1,22 @@ +// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- +#include "rprotobuf.h" +#include "SocketCopyingInputStream.h" + +namespace rprotobuf { + +SocketCopyingInputStream::SocketCopyingInputStream(int id) { socket_id = id; } + +/** + * read from the socket + * + * @param buffer buffer to fill with at most size bytes + * @param size maximum number of bytes + * + * @return the number of bytes actually read + */ +int SocketCopyingInputStream::Read(void* buffer, int size) { + int received = recv(socket_id, buffer, size, 0); + if (received < 0) THROW_SOCKET_ERROR("recv"); + return received; +} +} Copied: old/sockets/SocketCopyingInputStream.h (from rev 712, pkg/src/SocketCopyingInputStream.h) =================================================================== --- old/sockets/SocketCopyingInputStream.h (rev 0) +++ old/sockets/SocketCopyingInputStream.h 2014-01-04 17:21:34 UTC (rev 713) @@ -0,0 +1,26 @@ +#ifndef RPROTOBUF_SocketCopyingInputStream_H +#define RPROTOBUF_SocketCopyingInputStream_H + +/* FIXME: this should be probably handled by sisocks + we need it for the TCP_NODELAY socket option */ +#include +#include +#include + +#include "sisocks.h" + +namespace rprotobuf { + +class SocketCopyingInputStream : public GPB::io::CopyingInputStream { + public: + SocketCopyingInputStream(int socket_id); + + int Read(void* buffer, int size); + + private: + int socket_id; +}; + +} // namespace rprotobuf + +#endif Copied: old/sockets/sisocks.h (from rev 712, pkg/src/sisocks.h) =================================================================== --- old/sockets/sisocks.h (rev 0) +++ old/sockets/sisocks.h 2014-01-04 17:21:34 UTC (rev 713) @@ -0,0 +1,214 @@ +/* system independent sockets (basically for unix and Win) + (C)2000,1 Simon Urbanek + + conditional defines: + + MAIN + should be defined in just one file that will contain the fn definitions and variables + + 
USE_SNPRINTF + emulate snprintf on Win platforms (you will + lose the security which is provided under unix of course) + + SOCK_ERRORS + include error code handling and checking functions +*/ + +#ifndef __SISOCKS_H__ +#define __SISOCKS_H__ + +#if defined __GNUC__ && !defined unix && !defined Win32 /* MacOS X hack (gcc on any platform should behave as unix - except for Win32, where we need to keep using winsock) */ +#define unix +#endif + +#if defined SOCK_ERRORS || defined USE_SNPRINTF +#include +#endif +#include + +#ifdef unix +#include +#include +#include +#include +#include +#include +#include + +#define sockerrno errno + +#define SOCKET int +#define INVALID_SOCKET (-1) +#define closesocket(A) close(A) + +#else + +#define windows +#include +#include +#include +#include +#define inet_aton(A,B) (0, B.s_addr=inet_addr(A)) + +#define sockerrno WSAGetLastError() + +#define ECONNREFUSED WSAECONNREFUSED +#define EADDRINUSE WSAEADDRINUSE +#define ENOTSOCK WSAENOTSOCK +#define EISCONN WSAEISCONN +#define ETIMEDOUT WSAETIMEDOUT +#define ENETUNREACH WSAENETUNREACH +#define EINPROGRESS WSAEINPROGRESS +#define EALREADY WSAEALREADY +#define EAFNOSUPPORT WSAEAFNOSUPPORT +#define EBADF WSAEBADF +#define EINVAL WSAEINVAL +#define EOPNOTSUPP WSAEOPNOTSUPP +#define EFAULT WSAEFAULT +#define EWOULDBLOCK WSAEWOULDBLOCK +#define EACCES WSAEACCES + +#ifdef USE_SNPRINTF +#ifdef MAIN +int snprintf(char *buf, int len, char *fmt, ...) 
+{ + va_list argptr; + int cnt; + + va_start(argptr, fmt); + cnt = vsprintf(buf, fmt, argptr); + va_end(argptr); + + return(cnt); +} +#else +extern int snprintf(char *buf, int len, char *fmt, ...); +#endif +#endif + +#endif + +#define SA struct sockaddr +#define SAIN struct sockaddr_in + +#ifdef windows + +#ifdef MAIN +int initsocks(void) +{ + WSADATA dt; + /* initialize WinSock 1.1 */ + return (WSAStartup(0x0101,&dt))?-1:0; +} +#else +extern int initsocks(void); +#endif + +#define donesocks() WSACleanup() +#else + +/* no stupid stuff necessary for unix */ +#define initsocks() +#define donesocks() + +#endif + +#ifdef SOCK_ERRORS + +#ifdef MAIN +int suppmode=0; +int socklasterr; +FILE *sockerrlog=0; + +/* copy error description to buf or set *buf=0 if none */ +int sockerrorchecks(char *buf, int blen, int res) { + *buf=0; + if (res==-1) { + switch(sockerrno) { + case EBADF: strncpy(buf,"bad descriptor",blen); break; + case EINVAL: strncpy(buf,"already in use",blen); break; + case EACCES: strncpy(buf,"access denied",blen); break; + case ENOTSOCK: strncpy(buf,"descriptor is not a socket",blen); break; + case EOPNOTSUPP: strncpy(buf,"operation not supported",blen); break; + case EFAULT: strncpy(buf,"fault",blen); break; + case EWOULDBLOCK: strncpy(buf,"operation would block",blen); break; + case EISCONN: strncpy(buf,"is already connected",blen); break; + case ECONNREFUSED: strncpy(buf,"connection refused",blen); break; + case ETIMEDOUT: strncpy(buf,"operation timed out",blen); break; + case ENETUNREACH: strncpy(buf,"network is unreachable",blen); break; + case EADDRINUSE: strncpy(buf,"address already in use",blen); break; + case EINPROGRESS: strncpy(buf,"in progress",blen); break; + case EALREADY: strncpy(buf,"previous connect request not completed yet",blen); break; +#ifdef unix + default: snprintf(buf,blen,"unknown socket error %d",sockerrno); +#else + default: sprintf(buf,"unknown socket error %d",sockerrno); +#endif + } + } + return res; +} + +/* check socket error 
and add to log file if necessary */ +int sockerrorcheck(char *sn, int rtb, int res) { + if (!sockerrlog) sockerrlog=stderr; + if ((signed int)res==-1) { + if (socklasterr==sockerrno) { + suppmode++; + } else { + if (suppmode>0) { + fprintf(sockerrlog,"##> REP: (last error has been repeated %d times.)\n",suppmode); + suppmode=0; + } + fprintf(sockerrlog,"##> SOCK_ERROR: %s error #%d",sn,sockerrno); + switch(sockerrno) { + case EBADF: fprintf(sockerrlog,"(bad descriptor)"); break; + case EINVAL: fprintf(sockerrlog,"(already in use)"); break; + case EACCES: fprintf(sockerrlog,"(access denied)"); break; + case ENOTSOCK: fprintf(sockerrlog,"(descriptor is not a socket)"); break; + case EOPNOTSUPP: fprintf(sockerrlog,"(operation not supported)"); break; + case EFAULT: fprintf(sockerrlog,"(fault)"); break; + case EWOULDBLOCK: fprintf(sockerrlog,"(operation would block)"); break; + case EISCONN: fprintf(sockerrlog,"(is already connected)"); break; + case ECONNREFUSED: fprintf(sockerrlog,"(connection refused)"); break; + case ETIMEDOUT: fprintf(sockerrlog,"(operation timed out)"); break; + case ENETUNREACH: fprintf(sockerrlog,"(network is unreachable)"); break; + case EADDRINUSE: fprintf(sockerrlog,"(address already in use)"); break; + case EINPROGRESS: fprintf(sockerrlog,"(in progress)"); break; + case EALREADY: fprintf(sockerrlog,"(previous connect request not completed yet)"); break; + default: fprintf(sockerrlog,"(?)"); + } + fprintf(sockerrlog,"\n"); fflush(sockerrlog); + socklasterr=sockerrno; + } + if (rtb) exit(1); + } + return res; +} +#else +extern int suppmode=0; +extern int socklasterr; +extern FILE *sockerrlog=0; + +int sockerrorchecks(char *buf, int blen, int res); +int sockerrorcheck(char *sn, int rtb, int res); +#endif + +#define FCF(X,F) sockerrorcheck(X,1,F) +#define CF(X,F) sockerrorcheck(X,0,F) + +#endif + +#ifdef MAIN +struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port) { + memset(sa,0,sizeof(struct sockaddr_in)); + 
sa->sin_family=AF_INET; + sa->sin_port=htons(port); + sa->sin_addr.s_addr=(ip)?inet_addr(ip):htonl(INADDR_ANY); + return (struct sockaddr*)sa; +} +#else +struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port); +#endif + +#endif /* __SISOCKS_H__ */ Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-04 09:13:45 UTC (rev 712) +++ pkg/ChangeLog 2014-01-04 17:21:34 UTC (rev 713) @@ -1,3 +1,8 @@ +2014-01-04 Dirk Eddelbuettel + + * src: Deprecate files SocketCopyingInputStream.{cpp,h}, sisocks.h as + the current RProtoBuf design does not provide networking capabilities + 2013-12-31 Murray Stokely * src/wrapper_Message.cpp: Fix type coercion bug in add() method Deleted: pkg/src/SocketCopyingInputStream.cpp =================================================================== --- pkg/src/SocketCopyingInputStream.cpp 2014-01-04 09:13:45 UTC (rev 712) +++ pkg/src/SocketCopyingInputStream.cpp 2014-01-04 17:21:34 UTC (rev 713) @@ -1,22 +0,0 @@ -// -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- -#include "rprotobuf.h" -#include "SocketCopyingInputStream.h" - -namespace rprotobuf { - -SocketCopyingInputStream::SocketCopyingInputStream(int id) { socket_id = id; } - -/** - * read from the socket - * - * @param buffer buffer to fill with at most size bytes - * @param size maximum number of bytes - * - * @return the number of bytes actually read - */ -int SocketCopyingInputStream::Read(void* buffer, int size) { - int received = recv(socket_id, buffer, size, 0); - if (received < 0) THROW_SOCKET_ERROR("recv"); - return received; -} -} Deleted: pkg/src/SocketCopyingInputStream.h =================================================================== --- pkg/src/SocketCopyingInputStream.h 2014-01-04 09:13:45 UTC (rev 712) +++ pkg/src/SocketCopyingInputStream.h 2014-01-04 17:21:34 UTC (rev 713) @@ -1,26 +0,0 @@ -#ifndef 
RPROTOBUF_SocketCopyingInputStream_H -#define RPROTOBUF_SocketCopyingInputStream_H - -/* FIXME: this should be probably handled by sisocks - we need it for the TCP_NODELAY socket option */ -#include -#include -#include - -#include "sisocks.h" - -namespace rprotobuf { - -class SocketCopyingInputStream : public GPB::io::CopyingInputStream { - public: - SocketCopyingInputStream(int socket_id); - - int Read(void* buffer, int size); - - private: - int socket_id; -}; - -} // namespace rprotobuf - -#endif Deleted: pkg/src/sisocks.h =================================================================== --- pkg/src/sisocks.h 2014-01-04 09:13:45 UTC (rev 712) +++ pkg/src/sisocks.h 2014-01-04 17:21:34 UTC (rev 713) @@ -1,214 +0,0 @@ -/* system independent sockets (basically for unix and Win) - (C)2000,1 Simon Urbanek - - conditional defines: - - MAIN - should be defined in just one file that will contain the fn definitions and variables - - USE_SNPRINTF - emulate snprintf on Win platforms (you will - lose the security which is provided under unix of course) - - SOCK_ERRORS - include error code handling and checking functions -*/ - -#ifndef __SISOCKS_H__ -#define __SISOCKS_H__ - -#if defined __GNUC__ && !defined unix && !defined Win32 /* MacOS X hack (gcc on any platform should behave as unix - except for Win32, where we need to keep using winsock) */ -#define unix -#endif - -#if defined SOCK_ERRORS || defined USE_SNPRINTF -#include -#endif -#include - -#ifdef unix -#include -#include -#include -#include -#include -#include -#include - -#define sockerrno errno - -#define SOCKET int -#define INVALID_SOCKET (-1) -#define closesocket(A) close(A) - -#else - -#define windows -#include -#include -#include -#include -#define inet_aton(A,B) (0, B.s_addr=inet_addr(A)) - -#define sockerrno WSAGetLastError() - -#define ECONNREFUSED WSAECONNREFUSED -#define EADDRINUSE WSAEADDRINUSE -#define ENOTSOCK WSAENOTSOCK -#define EISCONN WSAEISCONN -#define ETIMEDOUT WSAETIMEDOUT -#define ENETUNREACH 
WSAENETUNREACH -#define EINPROGRESS WSAEINPROGRESS -#define EALREADY WSAEALREADY -#define EAFNOSUPPORT WSAEAFNOSUPPORT -#define EBADF WSAEBADF -#define EINVAL WSAEINVAL -#define EOPNOTSUPP WSAEOPNOTSUPP -#define EFAULT WSAEFAULT -#define EWOULDBLOCK WSAEWOULDBLOCK -#define EACCES WSAEACCES - -#ifdef USE_SNPRINTF -#ifdef MAIN -int snprintf(char *buf, int len, char *fmt, ...) -{ - va_list argptr; - int cnt; - - va_start(argptr, fmt); - cnt = vsprintf(buf, fmt, argptr); - va_end(argptr); - - return(cnt); -} -#else -extern int snprintf(char *buf, int len, char *fmt, ...); -#endif -#endif - -#endif - -#define SA struct sockaddr -#define SAIN struct sockaddr_in - -#ifdef windows - -#ifdef MAIN -int initsocks(void) -{ - WSADATA dt; - /* initialize WinSock 1.1 */ - return (WSAStartup(0x0101,&dt))?-1:0; -} -#else -extern int initsocks(void); -#endif - -#define donesocks() WSACleanup() -#else - -/* no stupid stuff necessary for unix */ -#define initsocks() -#define donesocks() - -#endif - -#ifdef SOCK_ERRORS - -#ifdef MAIN -int suppmode=0; -int socklasterr; -FILE *sockerrlog=0; - -/* copy error description to buf or set *buf=0 if none */ -int sockerrorchecks(char *buf, int blen, int res) { - *buf=0; - if (res==-1) { - switch(sockerrno) { - case EBADF: strncpy(buf,"bad descriptor",blen); break; - case EINVAL: strncpy(buf,"already in use",blen); break; - case EACCES: strncpy(buf,"access denied",blen); break; - case ENOTSOCK: strncpy(buf,"descriptor is not a socket",blen); break; - case EOPNOTSUPP: strncpy(buf,"operation not supported",blen); break; - case EFAULT: strncpy(buf,"fault",blen); break; - case EWOULDBLOCK: strncpy(buf,"operation would block",blen); break; - case EISCONN: strncpy(buf,"is already connected",blen); break; - case ECONNREFUSED: strncpy(buf,"connection refused",blen); break; - case ETIMEDOUT: strncpy(buf,"operation timed out",blen); break; - case ENETUNREACH: strncpy(buf,"network is unreachable",blen); break; - case EADDRINUSE: strncpy(buf,"address already 
in use",blen); break; - case EINPROGRESS: strncpy(buf,"in progress",blen); break; - case EALREADY: strncpy(buf,"previous connect request not completed yet",blen); break; -#ifdef unix - default: snprintf(buf,blen,"unknown socket error %d",sockerrno); -#else - default: sprintf(buf,"unknown socket error %d",sockerrno); -#endif - } - } - return res; -} - -/* check socket error and add to log file if necessary */ -int sockerrorcheck(char *sn, int rtb, int res) { - if (!sockerrlog) sockerrlog=stderr; - if ((signed int)res==-1) { - if (socklasterr==sockerrno) { - suppmode++; - } else { - if (suppmode>0) { - fprintf(sockerrlog,"##> REP: (last error has been repeated %d times.)\n",suppmode); - suppmode=0; - } - fprintf(sockerrlog,"##> SOCK_ERROR: %s error #%d",sn,sockerrno); - switch(sockerrno) { - case EBADF: fprintf(sockerrlog,"(bad descriptor)"); break; - case EINVAL: fprintf(sockerrlog,"(already in use)"); break; - case EACCES: fprintf(sockerrlog,"(access denied)"); break; - case ENOTSOCK: fprintf(sockerrlog,"(descriptor is not a socket)"); break; - case EOPNOTSUPP: fprintf(sockerrlog,"(operation not supported)"); break; - case EFAULT: fprintf(sockerrlog,"(fault)"); break; - case EWOULDBLOCK: fprintf(sockerrlog,"(operation would block)"); break; - case EISCONN: fprintf(sockerrlog,"(is already connected)"); break; - case ECONNREFUSED: fprintf(sockerrlog,"(connection refused)"); break; - case ETIMEDOUT: fprintf(sockerrlog,"(operation timed out)"); break; - case ENETUNREACH: fprintf(sockerrlog,"(network is unreachable)"); break; - case EADDRINUSE: fprintf(sockerrlog,"(address already in use)"); break; - case EINPROGRESS: fprintf(sockerrlog,"(in progress)"); break; - case EALREADY: fprintf(sockerrlog,"(previous connect request not completed yet)"); break; - default: fprintf(sockerrlog,"(?)"); - } - fprintf(sockerrlog,"\n"); fflush(sockerrlog); - socklasterr=sockerrno; - } - if (rtb) exit(1); - } - return res; -} -#else -extern int suppmode=0; -extern int socklasterr; 
-extern FILE *sockerrlog=0; - -int sockerrorchecks(char *buf, int blen, int res); -int sockerrorcheck(char *sn, int rtb, int res); -#endif - -#define FCF(X,F) sockerrorcheck(X,1,F) -#define CF(X,F) sockerrorcheck(X,0,F) - -#endif - -#ifdef MAIN -struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port) { - memset(sa,0,sizeof(struct sockaddr_in)); - sa->sin_family=AF_INET; - sa->sin_port=htons(port); - sa->sin_addr.s_addr=(ip)?inet_addr(ip):htonl(INADDR_ANY); - return (struct sockaddr*)sa; -} -#else -struct sockaddr *build_sin(struct sockaddr_in *sa,char *ip,int port); -#endif - -#endif /* __SISOCKS_H__ */ From noreply at r-forge.r-project.org Sat Jan 4 20:31:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 20:31:19 +0100 (CET) Subject: [Rprotobuf-commits] r714 - pkg Message-ID: <20140104193119.69B1A186B68@r-forge.r-project.org> Author: edd Date: 2014-01-04 20:31:18 +0100 (Sat, 04 Jan 2014) New Revision: 714 Modified: pkg/DESCRIPTION Log: adding Jeroen to Authors Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-04 17:21:34 UTC (rev 713) +++ pkg/DESCRIPTION 2014-01-04 19:31:18 UTC (rev 714) @@ -1,7 +1,7 @@ Package: RProtoBuf Version: 0.3.2.4 Date: $Date$ -Author: Romain Francois, Dirk Eddelbuettel and Murray Stokely +Author: Romain Francois, Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms Maintainer: Dirk Eddelbuettel Title: R Interface to the Protocol Buffers API Description: Protocol Buffers are a way of encoding structured data in an From noreply at r-forge.r-project.org Sat Jan 4 22:06:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 4 Jan 2014 22:06:34 +0100 (CET) Subject: [Rprotobuf-commits] r715 - in pkg/inst: . 
python Message-ID: <20140104210634.BE882186CD2@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-04 22:06:34 +0100 (Sat, 04 Jan 2014) New Revision: 715 Added: pkg/inst/python/ pkg/inst/python/readmsg.py pkg/inst/python/runtest.sh pkg/inst/python/writemsg.R Log: adding some python tests Added: pkg/inst/python/readmsg.py =================================================================== --- pkg/inst/python/readmsg.py (rev 0) +++ pkg/inst/python/readmsg.py 2014-01-04 21:06:34 UTC (rev 715) @@ -0,0 +1,16 @@ +#!/usr/bin/env python +# +# Simple test script to read a serialized message in python +# +import rexp_pb2; +import glob; + +messages = glob.glob("*.msg"); +for myfile in messages: + print("Reading message " + myfile + " ...") + f = open(myfile, 'rb') + msg = rexp_pb2.REXP(); + msg.ParseFromString(f.read()) + f.close(); + print(msg) + del msg Added: pkg/inst/python/runtest.sh =================================================================== --- pkg/inst/python/runtest.sh (rev 0) +++ pkg/inst/python/runtest.sh 2014-01-04 21:06:34 UTC (rev 715) @@ -0,0 +1,16 @@ +#!/bin/sh + +#write some PB data with R +Rscript writemsg.R + +#compile proto file +cp -f ../proto/rexp.proto . +protoc rexp.proto --python_out=. 
+rm rexp.proto + +#read with python and print +python readmsg.py + +#cleanup tmp files +rm rexp_pb2.py +rm *.msg Property changes on: pkg/inst/python/runtest.sh ___________________________________________________________________ Added: svn:executable + * Added: pkg/inst/python/writemsg.R =================================================================== --- pkg/inst/python/writemsg.R (rev 0) +++ pkg/inst/python/writemsg.R 2014-01-04 21:06:34 UTC (rev 715) @@ -0,0 +1,13 @@ +#simple R script to serialize some R objects +library(RProtoBuf) + +#some vectors +serialize_pb(c(1,2,pi, NA,NaN,Inf,-Inf), "double.msg") +serialize_pb(c(1L, 2L, NA), "integer.msg") +serialize_pb(c(TRUE, FALSE, NA), "logical.msg") +serialize_pb(c("foo", "bar", NA), "character.msg") +serialize_pb(charToRaw("This is a test"), "raw.msg") + +#lists +serialize_pb(list(foo=c(1,2,pi), bar=TRUE, baz="blabla", zoo=NULL), "list.msg") +serialize_pb(iris[1:3,], "dataframe.msg") From noreply at r-forge.r-project.org Sun Jan 5 00:11:08 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 00:11:08 +0100 (CET) Subject: [Rprotobuf-commits] r716 - pkg/inst/python Message-ID: <20140104231108.2CA9918459A@r-forge.r-project.org> Author: edd Date: 2014-01-05 00:11:07 +0100 (Sun, 05 Jan 2014) New Revision: 716 Modified: pkg/inst/python/runtest.sh Log: also remove .pyc file Modified: pkg/inst/python/runtest.sh =================================================================== --- pkg/inst/python/runtest.sh 2014-01-04 21:06:34 UTC (rev 715) +++ pkg/inst/python/runtest.sh 2014-01-04 23:11:07 UTC (rev 716) @@ -12,5 +12,5 @@ python readmsg.py #cleanup tmp files -rm rexp_pb2.py +rm rexp_pb2.py rexp_pb2.pyc rm *.msg From noreply at r-forge.r-project.org Sun Jan 5 00:40:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 00:40:19 +0100 (CET) Subject: [Rprotobuf-commits] r717 - pkg/src Message-ID: 
<20140104234019.BD005184313@r-forge.r-project.org> Author: murray Date: 2014-01-05 00:40:18 +0100 (Sun, 05 Jan 2014) New Revision: 717 Modified: pkg/src/wrapper_ZeroCopyInputStream.cpp Log: Add necessary BEGIN/END_RCPP macros so that we catch exceptions and don't crash R. This doesn't work currently, but now I see what it is for and it seems useful. Next() always throws the range_error when on an arrayinputobject instantiated with raw(0:10) as in the commented out examples from the man page. Will try to get that working. Modified: pkg/src/wrapper_ZeroCopyInputStream.cpp =================================================================== --- pkg/src/wrapper_ZeroCopyInputStream.cpp 2014-01-04 23:11:07 UTC (rev 716) +++ pkg/src/wrapper_ZeroCopyInputStream.cpp 2014-01-04 23:40:18 UTC (rev 717) @@ -4,6 +4,7 @@ namespace rprotobuf { SEXP ZeroCopyInputStream_Next(SEXP xp) { + BEGIN_RCPP GPB::io::ZeroCopyInputStream* stream = GET_ZCIS(xp); int s = 0; const void* in; @@ -15,9 +16,11 @@ result.assign(reinterpret_cast(in), reinterpret_cast(in) + s); } return result; + END_RCPP } SEXP ZeroCopyInputStream_BackUp(SEXP xp, SEXP size) { + BEGIN_RCPP GPB::io::ZeroCopyInputStream* stream = GET_ZCIS(xp); int s = GET_int(size, 0); if (s <= 0) { @@ -25,17 +28,22 @@ } stream->BackUp(s); return R_NilValue; + END_RCPP } SEXP ZeroCopyInputStream_Skip(SEXP xp, SEXP size) { + BEGIN_RCPP GPB::io::ZeroCopyInputStream* stream = GET_ZCIS(xp); int s = GET_int(size, 0); bool res = stream->Skip(s); return (Rf_ScalarLogical(res ? 
_TRUE_ : _FALSE_)); + END_RCPP } SEXP ZeroCopyInputStream_ByteCount(SEXP xp) { + BEGIN_RCPP GPB::io::ZeroCopyInputStream* stream = GET_ZCIS(xp); return (Rf_ScalarReal((double)stream->ByteCount())); + END_RCPP } } From noreply at r-forge.r-project.org Sun Jan 5 01:06:17 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 01:06:17 +0100 (CET) Subject: [Rprotobuf-commits] r718 - pkg Message-ID: <20140105000617.A98C3186D45@r-forge.r-project.org> Author: murray Date: 2014-01-05 01:06:16 +0100 (Sun, 05 Jan 2014) New Revision: 718 Modified: pkg/ChangeLog Log: document last change. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-04 23:40:18 UTC (rev 717) +++ pkg/ChangeLog 2014-01-05 00:06:16 UTC (rev 718) @@ -1,3 +1,9 @@ +2014-01-04 Murray Stokely + + * src/wrapper_ZeroCopyInputStream.cpp (rprotobuf): Add + BEGIN/END_RCPP macros to gracefully catch exceptions and return + them as R language errors. + 2014-01-04 Dirk Eddelbuettel * src: Deprecate files SocketCopyingInputStream.{cpp,h}, sisocks.h as From noreply at r-forge.r-project.org Sun Jan 5 03:02:16 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 03:02:16 +0100 (CET) Subject: [Rprotobuf-commits] r719 - in pkg: . inst/unitTests inst/unitTests/data Message-ID: <20140105020216.D5ABD186C69@r-forge.r-project.org> Author: murray Date: 2014-01-05 03:02:15 +0100 (Sun, 05 Jan 2014) New Revision: 719 Added: pkg/inst/unitTests/data/encoding.proto Modified: pkg/ChangeLog pkg/inst/unitTests/runit.serialize.R Log: Add example message types used in the encoding documentation and new unit test verifying the byte-by-byte encoding used for some examples. 
Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-05 00:06:16 UTC (rev 718) +++ pkg/ChangeLog 2014-01-05 02:02:15 UTC (rev 719) @@ -1,5 +1,10 @@ 2014-01-04 Murray Stokely + * inst/unitTests/data/encoding.proto: Add example messages used in + the encoding documentation. + * inst/unitTests/runit.serialize.R (test.encoding): Add tests + verifying the exact byte serialization as described in the + encoding documentation. * src/wrapper_ZeroCopyInputStream.cpp (rprotobuf): Add BEGIN/END_RCPP macros to gracefully catch exceptions and return them as R language errors. Added: pkg/inst/unitTests/data/encoding.proto =================================================================== --- pkg/inst/unitTests/data/encoding.proto (rev 0) +++ pkg/inst/unitTests/data/encoding.proto 2014-01-05 02:02:15 UTC (rev 719) @@ -0,0 +1,16 @@ +// Examples from: +// https://developers.google.com/protocol-buffers/docs/encoding +package protobuf_encoding_test; + +message Test1 { + required int32 a = 1; +} +message Test2 { + required string b = 2; +} +message Test3 { + required Test1 c = 3; +} +message Test4 { + repeated int32 d = 4 [packed=true]; +} Modified: pkg/inst/unitTests/runit.serialize.R =================================================================== --- pkg/inst/unitTests/runit.serialize.R 2014-01-05 00:06:16 UTC (rev 718) +++ pkg/inst/unitTests/runit.serialize.R 2014-01-05 02:02:15 UTC (rev 719) @@ -1,3 +1,4 @@ +# -*- indent-tabs-mode: nil; tab-width: 4; show-trailing-whitespace: t; c-indent-level: 4; c-basic-offset: 4; -*- # Copyright 2012 Google Inc. All Rights Reserved. # Author: Murray Stokely # @@ -15,7 +16,6 @@ # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- # this is executed before each test function .setUp <- function(){ if( !exists("tutorial.Person", "RProtoBuf:DescriptorPool") ) { @@ -25,6 +25,37 @@ } } +test.encoding <- function() { + if (!exists("protobuf_encoding_test.Test1", + "RProtoBuf:DescriptorPool")) { + unittest.proto.file <- system.file("unitTests", "data", + "encoding.proto", + package="RProtoBuf") + readProtoFiles(file=unittest.proto.file) + } + + # Encoding examples from: + # https://developers.google.com/protocol-buffers/docs/encoding + test1 <- new(protobuf_encoding_test.Test1) + test1$a <- 150 + checkIdentical(test1$serialize(NULL), as.raw(c(0x08,0x96,0x01))) + + test2 <- new(protobuf_encoding_test.Test2) + test2$b <- "testing" + checkIdentical(test2$serialize(NULL), + as.raw(c(0x12, 0x07, 0x74, 0x65, 0x73, 0x74, 0x69, 0x6e, 0x67))) + + test3 <- new(protobuf_encoding_test.Test3) + test3$c$a <- 150 + checkIdentical(test3$serialize(NULL), + as.raw(c(0x1a, 0x03, 0x08, 0x96, 0x01))) + + test4 <- new(protobuf_encoding_test.Test4) + test4$d <- c(3, 270, 86942) + checkIdentical(test4$serialize(NULL), + as.raw(c(0x22, 0x06, 0x03, 0x8e, 0x02, 0x9e, 0xa7, 0x05))) +} + test.serialize <- function() { person <- new(tutorial.Person) From noreply at r-forge.r-project.org Sun Jan 5 03:40:12 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 03:40:12 +0100 (CET) Subject: [Rprotobuf-commits] r720 - in pkg/inst: . 
opencpu Message-ID: <20140105024012.1F49A1839DE@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-05 03:40:05 +0100 (Sun, 05 Jan 2014) New Revision: 720 Added: pkg/inst/opencpu/ pkg/inst/opencpu/ocpu-getdata.R pkg/inst/opencpu/ocpu-getdata.py pkg/inst/opencpu/ocpu-rpc.R pkg/inst/opencpu/ocpu-rpc.py pkg/inst/opencpu/readme.txt pkg/inst/opencpu/rexp_pb2.py Log: adding examples/tests with protobuf over https using OpenCPU Added: pkg/inst/opencpu/ocpu-getdata.R =================================================================== --- pkg/inst/opencpu/ocpu-getdata.R (rev 0) +++ pkg/inst/opencpu/ocpu-getdata.R 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,10 @@ +# Jeroen Ooms +# +# HTTPS+ProtoBuf RPC POC using OpenCPU +# Script below downloads MASS::Animals using protobuf +library(RProtoBuf) +library(httr) + +req <- GET ('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb') +output <- unserialize_pb(req$content) +identical(output, MASS::Animals) Added: pkg/inst/opencpu/ocpu-getdata.py =================================================================== --- pkg/inst/opencpu/ocpu-getdata.py (rev 0) +++ pkg/inst/opencpu/ocpu-getdata.py 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# Jeroen Ooms +# +# HTTPS+ProtoBuf RPC POC using OpenCPU +# Script below downloads MASS::Animals using protobuf +import urllib2; +from rexp_pb2 import *; + +#HTTP GET +req = urllib2.Request('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb'); +res = urllib2.urlopen(req); + +#parse output pb +msg = REXP(); +msg.ParseFromString(res.read()); + +#the return value is a double vector in this case +print(msg); Added: pkg/inst/opencpu/ocpu-rpc.R =================================================================== --- pkg/inst/opencpu/ocpu-rpc.R (rev 0) +++ pkg/inst/opencpu/ocpu-rpc.R 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,27 @@ +# Jeroen Ooms +# +# HTTPS+ProtoBuf RPC POC using OpenCPU +# The call below maps to: do.call(stats::rnorm, list(n=42, 
mean=100)) + +# !! This requires httr (>= 0.2.99). Version 0.2 has a bug. +# library(devtools) +# install_github("httr") + +# Actual code +library(RProtoBuf) +library(httr) + +args <- list(n=42, mean=100) +payload <- serialize_pb(args, NULL) + +req <- POST ( + url = "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", + body = payload, + add_headers( + "Content-Type" = "application/x-protobuf" + ) +) + +#This is the output of stats::rnorm(n=42, mean=100) +output <- unserialize_pb(req$content) +print(length(output)) Added: pkg/inst/opencpu/ocpu-rpc.py =================================================================== --- pkg/inst/opencpu/ocpu-rpc.py (rev 0) +++ pkg/inst/opencpu/ocpu-rpc.py 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# Jeroen Ooms +# +# HTTPS+ProtoBuf RPC POC using OpenCPU +# The call below maps to: do.call(stats::rnorm, list(n=42, mean=100)) +import urllib2; +from rexp_pb2 import *; + +#create the post payload, i.e. list(n=42, mean=100) +payload = REXP( + rclass = 5, + rexpValue = [ + REXP(rclass = 2, realValue = [42]), + REXP(rclass = 2, realValue = [100]) + ], + attrName = [ + "names" + ], + attrValue = [ + REXP(rclass = 0, stringValue = [STRING(strval="n"), STRING(strval="mean")]) + ] +); + +#HTTP POST +req = urllib2.Request( + 'https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb', + data = payload.SerializeToString(), + headers = { + 'Content-type': 'application/x-protobuf' + } +); +res = urllib2.urlopen(req); + +#parse output pb +msg = REXP(); +msg.ParseFromString(res.read()); + +#the return value is a double vector in this case +print(msg.realValue); + + +##### To debug: +#f = open("payload.msg", "wb") +#f.write(payload.SerializeToString()) +#f.close() +# +# Then do in R do: +# library(RProtoBuf) +# payload <- unserialize_pb("payload.msg") +# do.call(stats::rnorm, payload) \ No newline at end of file Added: pkg/inst/opencpu/readme.txt 
=================================================================== --- pkg/inst/opencpu/readme.txt (rev 0) +++ pkg/inst/opencpu/readme.txt 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,2 @@ +These scripts illustrate how protocol buffers can be used as a data interchange format +or as the basis of an RPC protocol. \ No newline at end of file Added: pkg/inst/opencpu/rexp_pb2.py =================================================================== --- pkg/inst/opencpu/rexp_pb2.py (rev 0) +++ pkg/inst/opencpu/rexp_pb2.py 2014-01-05 02:40:05 UTC (rev 720) @@ -0,0 +1,281 @@ +# Generated by the protocol buffer compiler. DO NOT EDIT! + +from google.protobuf import descriptor +from google.protobuf import message +from google.protobuf import reflection +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + + + +DESCRIPTOR = descriptor.FileDescriptor( + name='rexp.proto', + package='rexp', + serialized_pb='\n\nrexp.proto\x12\x04rexp\"\xb3\x03\n\x04REXP\x12!\n\x06rclass\x18\x01 \x02(\x0e\x32\x11.rexp.REXP.RClass\x12\x15\n\trealValue\x18\x02 \x03(\x01\x42\x02\x10\x01\x12\x14\n\x08intValue\x18\x03 \x03(\x11\x42\x02\x10\x01\x12)\n\x0c\x62ooleanValue\x18\x04 \x03(\x0e\x32\x13.rexp.REXP.RBOOLEAN\x12!\n\x0bstringValue\x18\x05 \x03(\x0b\x32\x0c.rexp.STRING\x12\x10\n\x08rawValue\x18\x06 \x01(\x0c\x12!\n\x0c\x63omplexValue\x18\x07 \x03(\x0b\x32\x0b.rexp.CMPLX\x12\x1d\n\trexpValue\x18\x08 \x03(\x0b\x32\n.rexp.REXP\x12\x10\n\x08\x61ttrName\x18\x0b \x03(\t\x12\x1d\n\tattrValue\x18\x0c \x03(\x0b\x32\n.rexp.REXP\"f\n\x06RClass\x12\n\n\x06STRING\x10\x00\x12\x07\n\x03RAW\x10\x01\x12\x08\n\x04REAL\x10\x02\x12\x0b\n\x07\x43OMPLEX\x10\x03\x12\x0b\n\x07INTEGER\x10\x04\x12\x08\n\x04LIST\x10\x05\x12\x0b\n\x07LOGICAL\x10\x06\x12\x0c\n\x08NULLTYPE\x10\x07\" \n\x08RBOOLEAN\x12\x05\n\x01\x46\x10\x00\x12\x05\n\x01T\x10\x01\x12\x06\n\x02NA\x10\x02\"-\n\x06STRING\x12\x0e\n\x06strval\x18\x01 \x01(\t\x12\x13\n\x04isNA\x18\x02 
\x01(\x08:\x05\x66\x61lse\"&\n\x05\x43MPLX\x12\x0f\n\x04real\x18\x01 \x01(\x01:\x01\x30\x12\x0c\n\x04imag\x18\x02 \x02(\x01\x32(\n\x04ocpu\x12 \n\x06\x64oCall\x12\n.rexp.REXP\x1a\n.rexp.REXP') + + + +_REXP_RCLASS = descriptor.EnumDescriptor( + name='RClass', + full_name='rexp.REXP.RClass', + filename=None, + file=DESCRIPTOR, + values=[ + descriptor.EnumValueDescriptor( + name='STRING', index=0, number=0, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='RAW', index=1, number=1, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='REAL', index=2, number=2, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='COMPLEX', index=3, number=3, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='INTEGER', index=4, number=4, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='LIST', index=5, number=5, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='LOGICAL', index=6, number=6, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='NULLTYPE', index=7, number=7, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=320, + serialized_end=422, +) + +_REXP_RBOOLEAN = descriptor.EnumDescriptor( + name='RBOOLEAN', + full_name='rexp.REXP.RBOOLEAN', + filename=None, + file=DESCRIPTOR, + values=[ + descriptor.EnumValueDescriptor( + name='F', index=0, number=0, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='T', index=1, number=1, + options=None, + type=None), + descriptor.EnumValueDescriptor( + name='NA', index=2, number=2, + options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=424, + serialized_end=456, +) + + +_REXP = descriptor.Descriptor( + name='REXP', + full_name='rexp.REXP', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + descriptor.FieldDescriptor( + name='rclass', full_name='rexp.REXP.rclass', index=0, + 
number=1, type=14, cpp_type=8, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='realValue', full_name='rexp.REXP.realValue', index=1, + number=2, type=1, cpp_type=5, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=descriptor._ParseOptions(descriptor_pb2.FieldOptions(), '\020\001')), + descriptor.FieldDescriptor( + name='intValue', full_name='rexp.REXP.intValue', index=2, + number=3, type=17, cpp_type=1, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=descriptor._ParseOptions(descriptor_pb2.FieldOptions(), '\020\001')), + descriptor.FieldDescriptor( + name='booleanValue', full_name='rexp.REXP.booleanValue', index=3, + number=4, type=14, cpp_type=8, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='stringValue', full_name='rexp.REXP.stringValue', index=4, + number=5, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='rawValue', full_name='rexp.REXP.rawValue', index=5, + number=6, type=12, cpp_type=9, label=1, + has_default_value=False, default_value="", + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='complexValue', full_name='rexp.REXP.complexValue', index=6, + number=7, type=11, cpp_type=10, label=3, + has_default_value=False, 
default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='rexpValue', full_name='rexp.REXP.rexpValue', index=7, + number=8, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='attrName', full_name='rexp.REXP.attrName', index=8, + number=11, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='attrValue', full_name='rexp.REXP.attrValue', index=9, + number=12, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _REXP_RCLASS, + _REXP_RBOOLEAN, + ], + options=None, + is_extendable=False, + extension_ranges=[], + serialized_start=21, + serialized_end=456, +) + + +_STRING = descriptor.Descriptor( + name='STRING', + full_name='rexp.STRING', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + descriptor.FieldDescriptor( + name='strval', full_name='rexp.STRING.strval', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=unicode("", "utf-8"), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='isNA', full_name='rexp.STRING.isNA', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=True, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + 
options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + extension_ranges=[], + serialized_start=458, + serialized_end=503, +) + + +_CMPLX = descriptor.Descriptor( + name='CMPLX', + full_name='rexp.CMPLX', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + descriptor.FieldDescriptor( + name='real', full_name='rexp.CMPLX.real', index=0, + number=1, type=1, cpp_type=5, label=1, + has_default_value=True, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + descriptor.FieldDescriptor( + name='imag', full_name='rexp.CMPLX.imag', index=1, + number=2, type=1, cpp_type=5, label=2, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + options=None), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + options=None, + is_extendable=False, + extension_ranges=[], + serialized_start=505, + serialized_end=543, +) + +_REXP.fields_by_name['rclass'].enum_type = _REXP_RCLASS +_REXP.fields_by_name['booleanValue'].enum_type = _REXP_RBOOLEAN +_REXP.fields_by_name['stringValue'].message_type = _STRING +_REXP.fields_by_name['complexValue'].message_type = _CMPLX +_REXP.fields_by_name['rexpValue'].message_type = _REXP +_REXP.fields_by_name['attrValue'].message_type = _REXP +_REXP_RCLASS.containing_type = _REXP; +_REXP_RBOOLEAN.containing_type = _REXP; +DESCRIPTOR.message_types_by_name['REXP'] = _REXP +DESCRIPTOR.message_types_by_name['STRING'] = _STRING +DESCRIPTOR.message_types_by_name['CMPLX'] = _CMPLX + +class REXP(message.Message): + __metaclass__ = reflection.GeneratedProtocolMessageType + DESCRIPTOR = _REXP + + # @@protoc_insertion_point(class_scope:rexp.REXP) + +class STRING(message.Message): + __metaclass__ = reflection.GeneratedProtocolMessageType + DESCRIPTOR = _STRING + + # 
@@protoc_insertion_point(class_scope:rexp.STRING) + +class CMPLX(message.Message): + __metaclass__ = reflection.GeneratedProtocolMessageType + DESCRIPTOR = _CMPLX + + # @@protoc_insertion_point(class_scope:rexp.CMPLX) + +# @@protoc_insertion_point(module_scope) From noreply at r-forge.r-project.org Sun Jan 5 04:11:30 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 04:11:30 +0100 (CET) Subject: [Rprotobuf-commits] r721 - in pkg: . R Message-ID: <20140105031130.92A23184E39@r-forge.r-project.org> Author: murray Date: 2014-01-05 04:11:26 +0100 (Sun, 05 Jan 2014) New Revision: 721 Modified: pkg/ChangeLog pkg/R/00classes.R pkg/R/wrapper_CodedInputStream.R Log: Get ReadRaw() and ReadString() working with ArrayInputStream / ZeroCopyInputStream. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-05 02:40:05 UTC (rev 720) +++ pkg/ChangeLog 2014-01-05 03:11:26 UTC (rev 721) @@ -1,5 +1,10 @@ 2014-01-04 Murray Stokely + * R/wrapper_CodedInputStream.R: Accept numeric size arguments for + ReadRaw and ReadString to make this more user friendly for + interactive use by calling as.integer() as needed. + * R/00classes.R (P): Add missing object prameters in + ZeroCopyInputStream calls to ReadRaw and ReadString. * inst/unitTests/data/encoding.proto: Add example messages used in the encoding documentation. * inst/unitTests/runit.serialize.R (test.encoding): Add tests Modified: pkg/R/00classes.R =================================================================== --- pkg/R/00classes.R 2014-01-05 02:40:05 UTC (rev 720) +++ pkg/R/00classes.R 2014-01-05 03:11:26 UTC (rev 721) @@ -344,8 +344,8 @@ "BackUp" = function(...) BackUp(x, ...), # CodedInputStream related - "ReadRaw" = function(...) ReadRaw(...), - "ReadString" = function(...) ReadString(...), + "ReadRaw" = function(...) ReadRaw(x, ...), + "ReadString" = function(...) 
ReadString(x, ...), "ReadVarint32"= function() ReadVarint32(x), "ReadVarint64" = function() ReadVarint64(x), "ReadLittleEndian32" = function() ReadLittleEndian32(x), Modified: pkg/R/wrapper_CodedInputStream.R =================================================================== --- pkg/R/wrapper_CodedInputStream.R 2014-01-05 02:40:05 UTC (rev 720) +++ pkg/R/wrapper_CodedInputStream.R 2014-01-05 03:11:26 UTC (rev 721) @@ -5,6 +5,15 @@ setMethod( "ReadRaw", c( object="ZeroCopyInputStream", size = "integer" ), function(object, size){ .Call( "ZeroCopyInputStream_ReadRaw", object at pointer, size, PACKAGE = "RProtoBuf" ) } ) +setMethod("ReadRaw", c( object="ZeroCopyInputStream", size = "numeric" ), + function(object, size) { + if (size %% 1 == 0) { + .Call( "ZeroCopyInputStream_ReadRaw", object at pointer, as.integer(size), + PACKAGE = "RProtoBuf" ) + } else { + stop("Size must be a whole number.") + } +} ) setGeneric( "ReadString", function(object, size ){ standardGeneric( "ReadString" ) @@ -12,6 +21,15 @@ setMethod( "ReadString", c( object="ZeroCopyInputStream", size = "integer" ), function(object, size){ .Call( "ZeroCopyInputStream_ReadString", object at pointer, size, PACKAGE = "RProtoBuf" ) } ) +setMethod("ReadString", c( object="ZeroCopyInputStream", size = "numeric" ), + function(object, size) { + if (size %% 1 == 0) { + .Call("ZeroCopyInputStream_ReadString", object at pointer, as.integer(size), + PACKAGE = "RProtoBuf" ) + } else { + stop("Size must be a whole number.") + } +} ) setGeneric( "ReadVarint32", function(object){ standardGeneric( "ReadVarint32" ) @@ -40,4 +58,3 @@ setMethod( "ReadVarint64", c( object="ZeroCopyInputStream"), function(object){ .Call( "ZeroCopyInputStream_ReadVarint64", object at pointer, PACKAGE = "RProtoBuf" ) } ) - From noreply at r-forge.r-project.org Sun Jan 5 04:26:06 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 04:26:06 +0100 (CET) Subject: [Rprotobuf-commits] r722 - in pkg: . 
src Message-ID: <20140105032606.3AFCE186B02@r-forge.r-project.org> Author: murray Date: 2014-01-05 04:26:03 +0100 (Sun, 05 Jan 2014) New Revision: 722 Modified: pkg/ChangeLog pkg/src/streams.cpp Log: Add missing BEGIN/END_RCPP and gracefully catch exceptions rather than long jumping without freeing resources with Rf_error. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-05 03:11:26 UTC (rev 721) +++ pkg/ChangeLog 2014-01-05 03:26:03 UTC (rev 722) @@ -13,6 +13,7 @@ * src/wrapper_ZeroCopyInputStream.cpp (rprotobuf): Add BEGIN/END_RCPP macros to gracefully catch exceptions and return them as R language errors. + * src/streams.cpp (rprotobuf): Idem. 2014-01-04 Dirk Eddelbuettel Modified: pkg/src/streams.cpp =================================================================== --- pkg/src/streams.cpp 2014-01-05 03:11:26 UTC (rev 721) +++ pkg/src/streams.cpp 2014-01-05 03:26:03 UTC (rev 722) @@ -14,7 +14,7 @@ // {{{ FileInputStream SEXP FileInputStream_new(SEXP filename, SEXP block_size, SEXP close_on_delete) { - + BEGIN_RCPP NEW_S4_OBJECT("FileInputStream"); int fd = open(CHAR(STRING_ELT(filename, 0)), O_RDONLY | O_BINARY); @@ -28,25 +28,33 @@ UNPROTECT(2); /* oo, ptr */ return oo; + END_RCPP } SEXP FileInputStream_GetErrno(SEXP xp) { + BEGIN_RCPP GPB::io::FileInputStream* stream = GET_FIS(xp); return Rf_ScalarInteger(stream->GetErrno()); + END_RCPP } SEXP FileInputStream_SetCloseOnDelete(SEXP xp, SEXP close) { + BEGIN_RCPP GPB::io::FileInputStream* stream = GET_FIS(xp); stream->SetCloseOnDelete(LOGICAL(close)); return R_NilValue; + END_RCPP } SEXP FileInputStream_Close(SEXP xp) { + BEGIN_RCPP GPB::io::FileInputStream* stream = GET_FIS(xp); bool res = stream->Close(); return Rf_ScalarLogical(res ? 
_TRUE_ : _FALSE_); + END_RCPP } // }}} // {{{ ConnectionInputStream SEXP ConnectionInputStream_new(SEXP con, SEXP was_open) { + BEGIN_RCPP NEW_S4_OBJECT("ConnectionInputStream"); ConnectionInputStream* stream = new ConnectionInputStream(con, (bool)LOGICAL(was_open)[0]); ZeroCopyInputStreamWrapper* wrapper = new ZeroCopyInputStreamWrapper(stream); @@ -56,6 +64,7 @@ UNPROTECT(2); /* oo, ptr */ return oo; + END_RCPP } // }}} // }}} @@ -63,33 +72,40 @@ // {{{ output streams // {{{ ZeroCopyOutputStream SEXP ZeroCopyOutputStream_Next(SEXP xp, SEXP payload) { + BEGIN_RCPP GPB::io::ZeroCopyOutputStream* stream = GET_ZCOS(xp); void* out; int s = LENGTH(payload); bool res = stream->Next(&out, &s); if (!res) { - Rf_error("cannot write to stream"); + Rcpp_error("cannot write to stream"); } memcpy(out, RAW(payload), s); return Rf_ScalarInteger(s); + END_RCPP } SEXP ZeroCopyOutputStream_ByteCount(SEXP xp) { + BEGIN_RCPP GPB::io::ZeroCopyOutputStream* stream = GET_ZCOS(xp); return (Rf_ScalarReal((double)stream->ByteCount())); + END_RCPP } SEXP ZeroCopyOutputStream_BackUp(SEXP xp, SEXP count) { + BEGIN_RCPP GPB::io::ZeroCopyOutputStream* stream = GET_ZCOS(xp); int s = GET_int(count, 0); stream->BackUp(s); return R_NilValue; + END_RCPP } // }}} // {{{ ArrayOutputStream // }}} // {{{ FileOutputStream SEXP FileOutputStream_new(SEXP filename, SEXP block_size, SEXP close_on_delete) { + BEGIN_RCPP NEW_S4_OBJECT("FileOutputStream"); int fd = open(CHAR(STRING_ELT(filename, 0)), O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0666); @@ -103,29 +119,39 @@ UNPROTECT(2); /* oo, ptr */ return oo; + END_RCPP } SEXP FileOutputStream_Flush(SEXP xp) { + BEGIN_RCPP GPB::io::FileOutputStream* stream = GET_FOS(xp); bool res = stream->Flush(); return Rf_ScalarLogical(res ? _TRUE_ : _FALSE_); + END_RCPP } SEXP FileOutputStream_Close(SEXP xp) { + BEGIN_RCPP GPB::io::FileOutputStream* stream = GET_FOS(xp); bool res = stream->Close(); return Rf_ScalarLogical(res ? 
_TRUE_ : _FALSE_); + END_RCPP } SEXP FileOutputStream_GetErrno(SEXP xp) { + BEGIN_RCPP GPB::io::FileOutputStream* stream = GET_FOS(xp); return Rf_ScalarInteger(stream->GetErrno()); + END_RCPP } SEXP FileOutputStream_SetCloseOnDelete(SEXP xp, SEXP close) { + BEGIN_RCPP GPB::io::FileOutputStream* stream = GET_FOS(xp); stream->SetCloseOnDelete(LOGICAL(close)); return R_NilValue; + END_RCPP } // }}} // {{{ ConnectionOutputStream SEXP ConnectionOutputStream_new(SEXP con, SEXP was_open) { + BEGIN_RCPP NEW_S4_OBJECT("ConnectionOutputStream"); ConnectionOutputStream* stream = new ConnectionOutputStream(con, (bool)LOGICAL(was_open)[0]); ZeroCopyOutputStreamWrapper* wrapper = new ZeroCopyOutputStreamWrapper(stream); @@ -137,6 +163,7 @@ UNPROTECT(2); /* oo, ptr */ return oo; + END_RCPP } // }}} @@ -144,92 +171,116 @@ // {{{ Read*** functions using CodedInputStream SEXP ZeroCopyInputStream_ReadRaw(SEXP xp, SEXP size) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); int s = INTEGER(size)[0]; SEXP payload = PROTECT(Rf_allocVector(RAWSXP, s)); - if (!coded_stream->ReadRaw(RAW(payload), s)) Rf_error("error reading raw bytes"); + if (!coded_stream->ReadRaw(RAW(payload), s)) Rcpp_error("error reading raw bytes"); UNPROTECT(1); /* payload */ return payload; + END_RCPP } SEXP ZeroCopyInputStream_ReadString(SEXP xp, SEXP size) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); int s = INTEGER(size)[0]; std::string buffer(""); - if (!coded_stream->ReadString(&buffer, s)) Rf_error("error reading string"); + if (!coded_stream->ReadString(&buffer, s)) Rcpp_error("error reading string"); return Rf_mkString(buffer.c_str()); + END_RCPP } SEXP ZeroCopyInputStream_ReadVarint32(SEXP xp) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); uint32 res = 0; - if (!coded_stream->ReadVarint32(&res)) Rf_error("error reading varint32"); + if (!coded_stream->ReadVarint32(&res)) Rcpp_error("error reading varint32"); return Rf_ScalarInteger(res); + 
END_RCPP } SEXP ZeroCopyInputStream_ReadLittleEndian32(SEXP xp) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); uint32 res = 0; - if (!coded_stream->ReadVarint32(&res)) Rf_error("error reading little endian int32"); + if (!coded_stream->ReadVarint32(&res)) Rcpp_error("error reading little endian int32"); return Rf_ScalarInteger(res); + END_RCPP } SEXP ZeroCopyInputStream_ReadLittleEndian64(SEXP xp) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); uint64 res = 0; - if (!coded_stream->ReadVarint64(&res)) Rf_error("error reading little endian int32"); + if (!coded_stream->ReadVarint64(&res)) Rcpp_error("error reading little endian int32"); return Rf_ScalarReal((double)res); + END_RCPP } SEXP ZeroCopyInputStream_ReadVarint64(SEXP xp) { + BEGIN_RCPP GPB::io::CodedInputStream* coded_stream = GET_CIS(xp); uint64 res = 0; - if (!coded_stream->ReadVarint64(&res)) Rf_error("error reading varint64"); + if (!coded_stream->ReadVarint64(&res)) Rcpp_error("error reading varint64"); return Rf_ScalarReal((double)res); + END_RCPP } // }}} // {{{ Write*** functions using CodedOuputStream SEXP ZeroCopyOutputStream_WriteRaw(SEXP xp, SEXP payload) { + BEGIN_RCPP GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteRaw(RAW(payload), LENGTH(payload)); return R_NilValue; + END_RCPP } SEXP ZeroCopyOutputStream_WriteString(SEXP xp, SEXP payload) { + BEGIN_RCPP if (LENGTH(payload) > 1) { Rf_warning("only the first element is used"); } if (LENGTH(payload) == 0) { - Rf_error("need at least one element"); + Rcpp_error("need at least one element"); } GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteString(CHAR(STRING_ELT(payload, 0))); return R_NilValue; + END_RCPP } SEXP ZeroCopyOutputStream_WriteLittleEndian32(SEXP xp, SEXP payload) { + BEGIN_RCPP GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteLittleEndian32(GET_int32(payload, 0)); return R_NilValue; + END_RCPP } SEXP ZeroCopyOutputStream_WriteLittleEndian64(SEXP 
xp, SEXP payload) { + BEGIN_RCPP GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteLittleEndian64(GET_int64(payload, 0)); return R_NilValue; + END_RCPP } SEXP ZeroCopyOutputStream_WriteVarint32(SEXP xp, SEXP payload) { + BEGIN_RCPP GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteVarint32(GET_int32(payload, 0)); return R_NilValue; + END_RCPP } SEXP ZeroCopyOutputStream_WriteVarint64(SEXP xp, SEXP payload) { + BEGIN_RCPP GPB::io::CodedOutputStream* stream = GET_COS(xp); stream->WriteVarint64(GET_int64(payload, 0)); return R_NilValue; + END_RCPP } // }}} From noreply at r-forge.r-project.org Sun Jan 5 04:50:08 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 5 Jan 2014 04:50:08 +0100 (CET) Subject: [Rprotobuf-commits] r723 - in pkg: . inst/unitTests man Message-ID: <20140105035008.EEB4C186C0A@r-forge.r-project.org> Author: murray Date: 2014-01-05 04:50:05 +0100 (Sun, 05 Jan 2014) New Revision: 723 Modified: pkg/ChangeLog pkg/inst/unitTests/runit.serialize.R pkg/man/ArrayInputStream-class.Rd Log: Add a basic test and some examples to the man page of using ArrayInputStream, which now works. I think the design may be a bit muddled here as the CodedInputStream consumes from the ZeroCopyInputStream which precludes examples like the original ones commented out here from working. We may need to change htis a bit. This makes a great interactive way to understand how protocol buffer serialization works, but I don't have a strong need for this since serialize() of the whole message (done in C++) works fine. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-05 03:26:03 UTC (rev 722) +++ pkg/ChangeLog 2014-01-05 03:50:05 UTC (rev 723) @@ -10,10 +10,13 @@ * inst/unitTests/runit.serialize.R (test.encoding): Add tests verifying the exact byte serialization as described in the encoding documentation. 
+ (test.arrayinputstream): Add new test of ArrayInputStreams. * src/wrapper_ZeroCopyInputStream.cpp (rprotobuf): Add BEGIN/END_RCPP macros to gracefully catch exceptions and return them as R language errors. * src/streams.cpp (rprotobuf): Idem. + * man/ArrayInputStream-class.Rd: Add some examples of using + ArrayInputStream. 2014-01-04 Dirk Eddelbuettel Modified: pkg/inst/unitTests/runit.serialize.R =================================================================== --- pkg/inst/unitTests/runit.serialize.R 2014-01-05 03:26:03 UTC (rev 722) +++ pkg/inst/unitTests/runit.serialize.R 2014-01-05 03:50:05 UTC (rev 723) @@ -25,6 +25,19 @@ } } +test.arrayinputstream <- function() { + # Note: This class is experimental, and some parts of the design + # may need to change, but this works now. + stream <- ArrayInputStream(as.raw(0:10)) + checkEquals(stream$ReadRaw(5), as.raw(0:4)) + + stringstream <- ArrayInputStream(as.raw(c(0x74, 0x65, 0x73, 0x74, 0x69, 0x6e, 0x67))) + checkEquals(stringstream$ReadString(as.integer(7)), "testing") + + intstream <- ArrayInputStream(as.raw(c(0x9e, 0xa7, 0x05))) + checkEquals(intstream$ReadVarint32(), 86942) +} + test.encoding <- function() { if (!exists("protobuf_encoding_test.Test1", "RProtoBuf:DescriptorPool")) { Modified: pkg/man/ArrayInputStream-class.Rd =================================================================== --- pkg/man/ArrayInputStream-class.Rd 2014-01-05 03:26:03 UTC (rev 722) +++ pkg/man/ArrayInputStream-class.Rd 2014-01-05 03:50:05 UTC (rev 723) @@ -25,17 +25,24 @@ \seealso{ \linkS4class{ZeroCopyInputStream} for methods } -% \examples{ +\examples{ +stream <- ArrayInputStream(as.raw(0:10)) +stream$ReadRaw(5) + +stringsstream <- ArrayInputStream(as.raw(c(0x74, 0x65, 0x73, 0x74, 0x69, 0x6e, 0x67))) +stringsstream$ReadString(7) + +intstream <- ArrayInputStream(as.raw(c(0x9e, 0xa7, 0x05))) +intstream$ReadVarint32() +} % local({ % stream <- ArrayInputStream( as.raw(0:10) ) % stopifnot( identical( stream$Next(), as.raw(0:10) ) ) 
% stream$BackUp( 5 ) % stopifnot( identical( stream$Next(), as.raw(6:10) ) ) -% +% % # limiting the block size % stream <- ArrayInputStream( as.raw(0:10), 5 ) % stopifnot( identical(length( stream$Next() ), 5L ) ) % }) -% -% } \keyword{classes} From noreply at r-forge.r-project.org Tue Jan 7 22:14:02 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 7 Jan 2014 22:14:02 +0100 (CET) Subject: [Rprotobuf-commits] r724 - papers/rjournal Message-ID: <20140107211402.539121862E4@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-07 22:14:01 +0100 (Tue, 07 Jan 2014) New Revision: 724 Modified: papers/rjournal/eddelbuettel-stokely.Rnw papers/rjournal/eddelbuettel-stokely.bib Log: add section on opencpu Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-05 03:50:05 UTC (rev 723) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-07 21:14:01 UTC (rev 724) @@ -1,4 +1,4 @@ -% !TeX root = RJwrapper.tex + % We don't want a left margin for Sinput or Soutput for our table 1. %\DefineVerbatimEnvironment{Sinput}{Verbatim} {xleftmargin=0em} %\DefineVerbatimEnvironment{Soutput}{Verbatim}{xleftmargin=0em} @@ -1308,6 +1308,213 @@ TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy Chu's Poly. +\section{Application: Protocol Buffers for Data Interchange in Web Services} + +As the name implies, the primary application of protocol buffers is +data interchange in the context of inter-system communications. +Network protocols such as HTTP describe procedures on client-server +communication, i.e. how to iniate requests, authenticate, send messages, +etc. However, network +protocols generally do not regulate \emph{content} of messages: they allow +transfer of any media type, such as web pages, files or video. 
+When designing systems where various components require exchange of specific data +structures, we need something on top of the protocol that prescribes +how these structures are to be represented in messages (buffers) on the +network. Protocol buffers solve exactly this problem by providing +a cross platform method for serializing arbitrary structures into well defined +messages, that can be exchanged using any protocol. The descriptors +(\texttt{.proto} files) are used to formally define the interface of a +remote API or network application. Libraries to parse and generate protobuf +messages are available for many programming languages, making it +relatively straightforward to implement clients and servers. + + +\subsection{Interacting with R through HTTPS and Protocol Buffers} + +One example of a system that supports protocol buffers to interact +with R is OpenCPU \citep{opencpu}. OpenCPU is a framework for embedded statistical +computation and reproducible research based on R and Latex. It exposes a +HTTP(S) API to access and manipulate R objects and allows for performing +remote R function calls. Clients do not need to understand +or generate any R code: HTTP requests are automatically mapped to +function calls, and arguments/return values can be posted/retrieved +using several data interchange formats, such as protocol buffers. +OpenCPU uses the \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions +from the \texttt{RProtoBuf} package to convert between R objects and protobuf +messages. Therefore, clients need the \texttt{rexp.proto} descriptor mentioned +earlier to parse and generate protobuf messages when interacting with OpenCPU. + +\subsection{HTTP GET: Retrieving an R object} + +The \texttt{HTTP GET} method is used to read a resource from OpenCPU.
For example, +to access the dataset \texttt{Animals} from the package \texttt{MASS}, a +client performs the following HTTP request: + +\begin{verbatim} + GET https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb +\end{verbatim} +The postfix \texttt{/pb} in the URL tells the server to send this +object in the form of a protobuf message. Alternative formats include +\texttt{/json}, \texttt{/csv}, \texttt{/rds} and others. If the request +is successful, OpenCPU returns the serialized object with HTTP status +code 200 and HTTP response header \texttt{Content-Type: application/x-protobuf}. +The latter is the conventional MIME type that formally notifies the client to +interpret the response as a protobuf message. + +Because both HTTP and Protocol Buffers have libraries available for many +languages, clients can be implemented in just a few lines of code. Below +example code for both R and Python that retrieve a dataset from R with +OpenCPU using a protobuf message. In R, we use the HTTP client from +the \texttt{httr} package \citep{httr}, and the protobuf +parser from the \texttt{RProtoBuf} package. In this illustrative example we +download a dataset which is part of the base R distribution, so we can actually +verify that the object was transferred without loss of information. + +<>= +# Load packages +library(RProtoBuf) +library(httr) + +# Retrieve and parse message +req <- GET ('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb') +output <- unserialize_pb(req$content) + +# Check that no information was lost +identical(output, MASS::Animals) +@ +This code suggests a method for exchanging objects between R servers, however this can +also be done without protocol buffers. The main advantage of using an inter-operable format +is that we can actually access R objects from within another +programming language. For example, in a very similar fasion we can retrieve the same +dataset in a Python client. 
To parse messages in Python, we first compile the +\texttt{rexp.proto} descriptor into a python module using the \texttt{protoc} compiler: + +\begin{verbatim} + protoc rexp.proto --python_out=. +\end{verbatim} +This generates a Python module called \texttt{rexp\_pb2.py}, containing both the +descriptor information as well as methods to read and manipulate the R object +message. In the example below we use the HTTP client from the \texttt{urllib2} +module. + +\begin{verbatim} +# Import modules +import urllib2 +from rexp_pb2 import REXP + +# Retrieve message +req = urllib2.Request('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb') +res = urllib2.urlopen(req) + +# Parse rexp.proto message +msg = REXP() +msg.ParseFromString(res.read()) +print(msg) +\end{verbatim} +The \texttt{msg} object contains all data from the Animals dataset. From here we +can easily extract the desired fields for further use in Python. + + +\subsection{HTTP POST: Calling an R function} + +The example above shows how the \texttt{HTTP GET} method retrieves a +resource from OpenCPU, for example an R object. The \texttt{HTTP POST} +method on the other hand is used for calling functions and running scripts, +which is the primary purpose of the framework. As before, the \texttt{/pb} +postfix requests to retrieve the output as a protobuf message, in this +case the function return value. However, OpenCPU allows us to supply the +arguments of the function call in the form of protobuf messages as well. +This is a bit more work, because clients need to both generate messages +containing R objects to post to the server, as well as retrieve and parse +protobuf messages returned by the server. Using protocol buffers to post +function arguments is not required, and for simple (scalar) arguments +the standard \texttt{application/x-www-form-urlencoded} format might be sufficient. +However, with protocol buffers the client can perform function calls with +more complex arguments such as R vectors or lists.
The result is a complete +RPC system to do arbitrary R function calls from within +any programming language. + +The following example R client code performs the remote function call +\texttt{stats::rnorm(n=42, mean=100)}. The function arguments (in this +case \texttt{n} and \texttt{mean}) as well as the return value (a vector +with 42 random numbers) are transferred using a protobuf message. RPC in +OpenCPU works like the \texttt{do.call} function in R, hence all arguments +are contained within a list. + +<<>>= +#requires httr >= 0.2.99 +library(httr) +library(RProtoBuf) + +args <- list(n=42, mean=100) +payload <- serialize_pb(args, NULL) + +req <- POST ( + url = "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", + body = payload, + add_headers ( + "Content-Type" = "application/x-protobuf" + ) +) + +#This is the output of stats::rnorm(n=42, mean=100) +output <- unserialize_pb(req$content) +print(output) +@ +The OpenCPU server basically performs the following steps to process the above RPC request: + +<>= +fnargs <- unserialize_pb(inputmsg) +val <- do.call(stats::rnorm, fnargs) +outputmsg <- serialize_pb(val) +@ +In reality the OpenCPU provides a lot of meta functionality such as handling +of sessions, exceptions, security, and much more. OpenCPU also makes it possible to store +output of a function call on the server, instead of directly retrieving it. Thereby +objects can be shared with other users or used as arguments in a subsequent +function call. But in its essence, the HTTP API provides a simple way to perform remote +R function calls over HTTPS. The same request can be performed in Python as follows: + +\begin{verbatim} +import urllib2; +from rexp_pb2 import *; + +#create the post payload, i.e. 
list(n=42, mean=100) +payload = REXP( + rclass = 5, + rexpValue = [ + REXP(rclass = 2, realValue = [42]), + REXP(rclass = 2, realValue = [100]) + ], + attrName = [ + "names" + ], + attrValue = [ + REXP(rclass = 0, stringValue = [STRING(strval="n"), STRING(strval="mean")]) + ] +); + +#HTTP POST +req = urllib2.Request( + "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", + data = payload.SerializeToString(), + headers = { + 'Content-type': 'application/x-protobuf' + } +) +res = urllib2.urlopen(req); + +#parse output pb +msg = REXP(); +msg.ParseFromString(res.read()); + +#the return value is a double vector in this case +print(msg.realValue); +\end{verbatim} + + + + \section{Summary} % RProtoBuf has been used. Modified: papers/rjournal/eddelbuettel-stokely.bib =================================================================== --- papers/rjournal/eddelbuettel-stokely.bib 2014-01-05 03:50:05 UTC (rev 723) +++ papers/rjournal/eddelbuettel-stokely.bib 2014-01-07 21:14:01 UTC (rev 724) @@ -271,3 +271,17 @@ year={2009}, publisher={Wiley. 
com} } + at Manual{httr, + title = {httr: Tools for working with URLs and HTTP}, + author = {Hadley Wickham}, + year = {2012}, + note = {R package version 0.2}, + url = {http://CRAN.R-project.org/package=httr}, +} + at Manual{opencpu, + title = {OpenCPU system for embedded statistical computation and reproducible research}, + author = {Jeroen Ooms}, + year = {2013}, + note = {R package version 1.2.2}, + url = {http://www.opencpu.org}, +} From noreply at r-forge.r-project.org Thu Jan 9 01:13:28 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 01:13:28 +0100 (CET) Subject: [Rprotobuf-commits] r725 - papers/rjournal Message-ID: <20140109001328.7F8E7185F4D@r-forge.r-project.org> Author: murray Date: 2014-01-09 01:13:27 +0100 (Thu, 09 Jan 2014) New Revision: 725 Modified: papers/rjournal/eddelbuettel-stokely.Rnw Log: Comment out an unneeded section now obviated by Jereoen's OpenCPU section. Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-07 21:14:01 UTC (rev 724) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-09 00:13:27 UTC (rev 725) @@ -1060,7 +1060,6 @@ options("RProtoBuf.int64AsString" = FALSE) @ - \section{Evaluation: data.frame to Protocol Buffer Serialization} Saptarshi Guha wrote the RHIPE package \citep{rhipe} which includes @@ -1253,61 +1252,18 @@ %\section{Basic usage example - tutorial.Person} -\section{Application: Distributed Data Collection with MapReduce} +\include{app-mapreduce} -TODO(mstokely): Make this better. +%\section{Application: Sending/receiving Interaction With Servers} +% +%Combined +%with an RPC system this means that one can interactively craft request +%messages, send the serialized message to a remote server, read back a +%response, and then parse the response protocol buffer interactively. 
-Many large data sets in fields such as particle physics and -information processing are stored in binned or histogram form in order -to reduce the data storage requirements -\citep{scott2009multivariate}. Protocol Buffers make a particularly -good data transport format in distributed MapReduces environments -where large numbers of computers process a large data set for analysis. +%TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy +%Chu's Poly. -There are two common patterns for generating histograms of large data -sets with MapReduce. In the first method, each mapper task can -generate a histogram over a subset of the data that is has been -assigned, and then the histograms of each mapper are sent to one or -more reducer tasks to merge. - -In the second method, each mapper rounds a data point to a bucket -width and outputs that bucket as a key and '1' as a value. Reducers -then sum up all of the values with the same key and output to a data store. - -In both methods, the mapper tasks must choose identical -bucket boundaries even though they are analyzing disjoint parts of the -input set that may cover different ranges, or we must implement -multiple phases. - -\begin{figure}[h!] -\begin{center} -\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} -\end{center} -\caption{Diagram of MapReduce Histogram Generation Pattern} -\label{fig:mr-histogram-pattern1} -\end{figure} - -Figure~\ref{fig:mr-histogram-pattern1} illustrates the second method -described above for histogram generation of large data sets with -MapReduce. - -This package is designed to be helpful if some of the Map or Reduce -tasks are written in R, or if those components are written in other -languages and only the resulting output histograms need to be -manipulated in R. - -\section{Application: Sending/receiving Interaction With Servers} - -Unlike Apache Thrift, Protocol Buffers do not include a concrete RPC -implementation. 
However, serialized protocol buffers can trivially be -sent over TCP or integrated with a proprietary RPC system. Combined -with an RPC system this means that one can interactively craft request -messages, send the serialized message to a remote server, read back a -response, and then parse the response protocol buffer interactively. - -TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy -Chu's Poly. - \section{Application: Protocol Buffers for Data Interchange in Web Services} As the name implies, the primary application of protocol buffers is From noreply at r-forge.r-project.org Thu Jan 9 01:27:46 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 01:27:46 +0100 (CET) Subject: [Rprotobuf-commits] r726 - papers/rjournal Message-ID: <20140109002746.80647186A0D@r-forge.r-project.org> Author: murray Date: 2014-01-09 01:27:46 +0100 (Thu, 09 Jan 2014) New Revision: 726 Modified: papers/rjournal/eddelbuettel-stokely.Rnw Log: Fix some typos / add missing articles in Jeroen's new OpenCPU section. Move this example before the MapReduce one. It uses the serialize_pb method but we haven't described that here. A short introduction of that concept is needed for the application. I think some of the first paragraph here should be moved to the introduction so this section can focus only on an application.
Modified: papers/rjournal/eddelbuettel-stokely.Rnw =================================================================== --- papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-09 00:13:27 UTC (rev 725) +++ papers/rjournal/eddelbuettel-stokely.Rnw 2014-01-09 00:27:46 UTC (rev 726) @@ -1252,29 +1252,22 @@ %\section{Basic usage example - tutorial.Person} -\include{app-mapreduce} +\section{Application: Data Interchange in Web Services} -%\section{Application: Sending/receiving Interaction With Servers} -% -%Combined -%with an RPC system this means that one can interactively craft request -%messages, send the serialized message to a remote server, read back a -%response, and then parse the response protocol buffer interactively. +% TODO(jeroen): I think maybe some of this should go earlier in the +% paper, so this part can focus only on introducing the application, +% Can you integrate some of this text earlier, maybe into the the +% introduction? -%TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy -%Chu's Poly. - -\section{Application: Protocol Buffers for Data Interchange in Web Services} - -As the name implies, the primary application of protocol buffers is +As described earlier, the primary application of protocol buffers is data interchange in the context of inter-system communications. -Network protocols such as HTTP describe procedures on client-server -communication, i.e. how to iniate requests, authenticate, send messages, -etc. However, network +Network protocols such as HTTP provide mechanisms for client-server +communication, i.e. how to initiate requests, authenticate, send messages, +etc. However, many network protocols generally do not regulate \emph{content} of messages: they allow transfer of any media type, such as web pages, files or video. 
When designing systems where various components require exchange of specific data -structures, we need something on top of the protocol that prescribes +structures, we need something on top of the network protocol that prescribes how these structures are to be respresented in messages (buffers) on the network. Protocol buffers solve exactly this problem by providing a cross platform method for serializing arbitrary structures into well defined @@ -1284,12 +1277,11 @@ messages are available for many programming languages, making it relatively straight forward to implement clients and servers. - \subsection{Interacting with R through HTTPS and Protocol Buffers} One example of a system that supports protocol buffers to interact with R is OpenCPU \citep{opencpu}. OpenCPU is a framework for embedded statistical -computation and reproducible research based on R and Latex. It exposes a +computation and reproducible research based on R and \LaTeX. It exposes a HTTP(S) API to access and manipulate R objects and allows for performing remote R function calls. Clients do not need to understand or generate any R code: HTTP requests are automatically mapped to @@ -1319,11 +1311,13 @@ Because both HTTP and Protocol Buffers have libraries available for many languages, clients can be implemented in just a few lines of code. Below -example code for both R and Python that retrieve a dataset from R with +is example code for both R and Python that retrieves a dataset from R with OpenCPU using a protobuf message. In R, we use the HTTP client from -the \texttt{httr} package \citep{httr}, and the protobuf -parser from the \texttt{RProtoBuf} package. In this illustrative example we -download a dataset which is part of the base R distribution, so we can actually +the \texttt{httr} package \citep{httr}. +% superfluous? +%, and the protobuf parser from the \texttt{RProtoBuf} package. 
+In this example we +download a dataset which is part of the base R distribution, so we can verify that the object was transferred without loss of information. <>= @@ -1341,7 +1335,7 @@ This code suggests a method for exchanging objects between R servers, however this can also be done without protocol buffers. The main advantage of using an inter-operable format is that we can actually access R objects from within another -programming language. For example, in a very similar fasion we can retrieve the same +programming language. For example, in a very similar fashion we can retrieve the same dataset in a Python client. To parse messages in Python, we first compile the \texttt{rexp.proto} descriptor into a python module using the \texttt{protoc} compiler: @@ -1469,8 +1463,60 @@ \end{verbatim} +\section{Application: Distributed Data Collection with MapReduce} +TODO(mstokely): Make this better. +Many large data sets in fields such as particle physics and +information processing are stored in binned or histogram form in order +to reduce the data storage requirements +\citep{scott2009multivariate}. Protocol Buffers make a particularly +good data transport format in distributed MapReduces environments +where large numbers of computers process a large data set for analysis. + +There are two common patterns for generating histograms of large data +sets with MapReduce. In the first method, each mapper task can +generate a histogram over a subset of the data that is has been +assigned, and then the histograms of each mapper are sent to one or +more reducer tasks to merge. + +In the second method, each mapper rounds a data point to a bucket +width and outputs that bucket as a key and '1' as a value. Reducers +then sum up all of the values with the same key and output to a data store. 
+ +In both methods, the mapper tasks must choose identical +bucket boundaries even though they are analyzing disjoint parts of the +input set that may cover different ranges, or we must implement +multiple phases. + +\begin{figure}[h!] +\begin{center} +\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} +\end{center} +\caption{Diagram of MapReduce Histogram Generation Pattern} +\label{fig:mr-histogram-pattern1} +\end{figure} + +Figure~\ref{fig:mr-histogram-pattern1} illustrates the second method +described above for histogram generation of large data sets with +MapReduce. + +This package is designed to be helpful if some of the Map or Reduce +tasks are written in R, or if those components are written in other +languages and only the resulting output histograms need to be +manipulated in R. + +%\section{Application: Sending/receiving Interaction With Servers} +% +%Combined +%with an RPC system this means that one can interactively craft request +%messages, send the serialized message to a remote server, read back a +%response, and then parse the response protocol buffer interactively. + +%TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy +%Chu's Poly. + + \section{Summary} % RProtoBuf has been used. From noreply at r-forge.r-project.org Thu Jan 9 01:49:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 01:49:19 +0100 (CET) Subject: [Rprotobuf-commits] r727 - in papers: . 
jss rjournal Message-ID: <20140109004919.CE280186869@r-forge.r-project.org> Author: murray Date: 2014-01-09 01:49:19 +0100 (Thu, 09 Jan 2014) New Revision: 727 Added: papers/jss/ papers/jss/eddelbuettel-stokely.bib papers/jss/jss.bst papers/jss/jss.cls papers/jss/jss.dtx papers/jss/jsslogo.jpg papers/jss/protobuf-distributed-system-crop.pdf Removed: papers/rjournal/jss.bst papers/rjournal/jss.cls papers/rjournal/jss.dtx papers/rjournal/jsslogo.jpg Log: Move the paper over to a JSS subdirectory so we can better improve the presentation and organization aspects in the JSS style. It's over 20 pages now. Copied: papers/jss/eddelbuettel-stokely.bib (from rev 724, papers/rjournal/eddelbuettel-stokely.bib) =================================================================== --- papers/jss/eddelbuettel-stokely.bib (rev 0) +++ papers/jss/eddelbuettel-stokely.bib 2014-01-09 00:49:19 UTC (rev 727) @@ -0,0 +1,287 @@ + at article{eddelbuettel2011rcpp, + title={Rcpp: Seamless R and C++ integration}, + author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain}, + journal={Journal of Statistical Software}, + volume={40}, + number={8}, + pages={1--18}, + year={2011} +} + at Manual{msgpackR, + title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, + author = {Mikiya Tanizawa}, + year = {2013}, + note = {R package version 1.1}, + url = {http://CRAN.R-project.org/package=msgpackR}, +} + at Manual{rmongodb, + title={rmongodb: R-MongoDB driver}, + author={Gerald Lindsly}, + year = {2013}, + note = {R package version 1.3.3}, + url = {http://CRAN.R-project.org/package=rmongodb}, +} + at Manual{int64, + title = {int64: 64 bit integer types}, + author = {Romain Francois}, + year = {2011}, + note = {R package version 1.1.2}, + url = {http://CRAN.R-project.org/package=int64}, +} + at Manual{bit64, + title = {bit64: A S3 class for vectors of 64bit integers}, + author = {Jens Oehlschlägel}, + year = {2012}, + note = {R package version 0.9-3}, + url = 
{http://CRAN.R-project.org/package=bit64}, +} + at book{eddelbuettel2013seamless, + title={Seamless R and C++ Integration with Rcpp}, + author={Eddelbuettel, Dirk}, + year={2013}, + publisher={Springer} +} + at Manual{rhipe, + title = {RHIPE: A Distributed Environment for the Analysis of Large and Complex Datasets}, + author = {Saptarshi Guha}, + year = {2010}, + url = {http://www.stat.purdue.edu/~sguha/rhipe/}, +} + at misc{serialization, +author= {Tierney, Luke}, +title = {A New Serialization Mechanism for R}, +url = {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, +year = {2003}, +} + at manual{eddelbuettel2013exposing, + title={Exposing C++ functions and classes with Rcpp modules}, + author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain}, + year={2013}, + note={Vignette included in R package Rcpp}, + url = {http://CRAN.R-project.org/package=Rcpp}, +} + at inproceedings{cantrill2004dynamic, + title={Dynamic Instrumentation of Production Systems.}, + author={Cantrill, Bryan and Shapiro, Michael W and Leventhal, Adam H and others}, + booktitle={USENIX Annual Technical Conference, General Track}, + pages={15--28}, + year={2004} +} + at article{swain1991color, + title={Color indexing}, + author={Swain, Michael J and Ballard, Dana H}, + journal={International journal of computer vision}, + volume={7}, + number={1}, + pages={11--32}, + year={1991}, + publisher={Springer} +} + at article{rubner2000earth, + title={The earth mover's distance as a metric for image retrieval}, + author={Rubner, Yossi and Tomasi, Carlo and Guibas, Leonidas J}, + journal={International Journal of Computer Vision}, + volume={40}, + number={2}, + pages={99--121}, + year={2000}, + publisher={Springer} +} + at book{kullback1997information, + title={Information theory and statistics}, + author={Kullback, Solomon}, + year={1997}, + publisher={Courier Dover Publications} +} + at inproceedings{puzicha1997non, + title={Non-parametric similarity measures for unsupervised texture segmentation 
and image retrieval}, + author={Puzicha, Jan and Hofmann, Thomas and Buhmann, Joachim M}, + booktitle={Computer Vision and Pattern Recognition, 1997. Proceedings., 1997 IEEE Computer Society Conference on}, + pages={267--272}, + year={1997}, + organization={IEEE} +} + at inproceedings{fang1999computing, + title={Computing Iceberg Queries Efficiently.}, + author={Fang, Min and Shivakumar, Narayanan and Garcia-Molina, Hector and Motwani, Rajeev and Ullman, Jeffrey D}, + booktitle={Internaational Conference on Very Large Databases (VLDB'98), New York, August 1998}, + year={1999}, + organization={Stanford InfoLab} +} + at Manual{emdist, + title = {emdist: Earth Mover's Distance}, + author = {Simon Urbanek and Yossi Rubner}, + year = {2012}, + note = {R package version 0.3-1}, + url = {http://cran.r-project.org/package=emdist}, +} + at article{Wegiel:2010:CTT:1932682.1869479, + author = {Wegiel, Michal and Krintz, Chandra}, + title = {Cross-language, Type-safe, and Transparent Object Sharing for Co-located Managed Runtimes}, + journal = {SIGPLAN Not.}, + issue_date = {October 2010}, + volume = {45}, + number = {10}, + month = oct, + year = {2010}, + issn = {0362-1340}, + pages = {223--240}, + numpages = {18}, + url = {http://doi.acm.org/10.1145/1932682.1869479}, + doi = {10.1145/1932682.1869479}, + acmid = {1869479}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {collection, communication, cross-language, garbage, managed, memory, model, object, rpc, runtimes, shared, synchronization, transparent, type-safe}, +} + at article{wickham2011split, + title={The split-apply-combine strategy for data analysis}, + author={Wickham, Hadley}, + journal={Journal of Statistical Software}, + volume={40}, + number={1}, + pages={1--29}, + year={2011}, + publisher={Citeseer} +} + at inproceedings{Sumaray:2012:CDS:2184751.2184810, + author = {Sumaray, Audie and Makki, S. 
Kami}, + title = {A Comparison of Data Serialization Formats for Optimal Efficiency on a Mobile Platform}, + booktitle = {Proceedings of the 6th International Conference on Ubiquitous Information Management and Communication}, + series = {ICUIMC '12}, + year = {2012}, + isbn = {978-1-4503-1172-4}, + location = {Kuala Lumpur, Malaysia}, + pages = {48:1--48:6}, + articleno = {48}, + numpages = {6}, + url = {http://doi.acm.org/10.1145/2184751.2184810}, + doi = {10.1145/2184751.2184810}, + acmid = {2184810}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {Android, Dalvik, JSON, ProtoBuf, XML, data serialization, thrift}, +} + at Manual{RObjectTables, + title = {User-Defined Tables in the R Search Path}, + author = {Duncan Temple Lang}, + year = {2012}, + url = {http://www.omegahat.org/RObjectTables/RObjectTables.pdf}, +} + at Manual{rprotobuf, + title = {RProtoBuf: R Interface to the Protocol Buffers API}, + author = {Romain Francois and Dirk Eddelbuettel and Murray Stokely}, + note = {R package version 0.3.2}, + year = {2013}, + url = {http://cran.r-project.org/web/packages/RProtoBuf/index.html}, +} + at Manual{r, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2013}, + url = {http://www.R-project.org/}, + } + at article{dean2008mapreduce, + title={MapReduce: simplified data processing on large clusters}, + author={Dean, Jeffrey and Ghemawat, Sanjay}, + journal={Communications of the ACM}, + volume={51}, + number={1}, + pages={107--113}, + year={2008}, + publisher={ACM} +} + at article{bostock2011d3, + title={D$^3$ Data-Driven Documents}, + author={Bostock, Michael and Ogievetsky, Vadim and Heer, Jeffrey}, + journal={Visualization and Computer Graphics, IEEE Transactions on}, + volume={17}, + number={12}, + pages={2301--2309}, + year={2011}, + publisher={IEEE} +} +% celebrated article in this field. 
Also see the parallel paragraph. + at article{Manku:1998:AMO:276305.276342, + author = {Manku, Gurmeet Singh and Rajagopalan, Sridhar and Lindsay, Bruce G.}, + title = {Approximate medians and other quantiles in one pass and with limited memory}, + journal = {SIGMOD Rec.}, + issue_date = {June 1998}, + volume = {27}, + number = {2}, + month = jun, + year = {1998}, + issn = {0163-5808}, + pages = {426--435}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/276305.276342}, + doi = {10.1145/276305.276342}, + acmid = {276342}, + publisher = {ACM}, + address = {New York, NY, USA}, +} +% Has a section on protocol buffers + at article{Pike:2005:IDP:1239655.1239658, + author = {Pike, Rob and Dorward, Sean and Griesemer, Robert and Quinlan, Sean}, + title = {Interpreting the data: Parallel analysis with Sawzall}, + journal = {Sci. Program.}, + issue_date = {October 2005}, + volume = {13}, + number = {4}, + month = oct, + year = {2005}, + issn = {1058-9244}, + pages = {277--298}, + numpages = {22}, + acmid = {1239658}, + publisher = {IOS Press}, + address = {Amsterdam, The Netherlands, The Netherlands}, +} + at Manual{protobuf, + title = {Protocol Buffers: Developer Guide}, + author = {Google}, + year = {2012}, + url = {http://code.google.com/apis/protocolbuffers/docs/overview.html} +} + at article{sturges1926choice, + title={The choice of a class interval}, + author={Sturges, Herbert A}, + journal={Journal of the American Statistical Association}, + volume={21}, + number={153}, + pages={65--66}, + year={1926} +} + at article{scott1979optimal, + title={On optimal and data-based histograms}, + author={Scott, David W}, + journal={Biometrika}, + volume={66}, + number={3}, + pages={605--610}, + year={1979}, + publisher={Biometrika Trust} +} + at book{scott2009multivariate, + title={Multivariate density estimation: theory, practice, and visualization}, + author={Scott, David W}, + volume={383}, + year={2009}, + publisher={Wiley. 
com} +} + at Manual{httr, + title = {httr: Tools for working with URLs and HTTP}, + author = {Hadley Wickham}, + year = {2012}, + note = {R package version 0.2}, + url = {http://CRAN.R-project.org/package=httr}, +} + at Manual{opencpu, + title = {OpenCPU system for embedded statistical computation and reproducible research}, + author = {Jeroen Ooms}, + year = {2013}, + note = {R package version 1.2.2}, + url = {http://www.opencpu.org}, +} Copied: papers/jss/jss.bst (from rev 724, papers/rjournal/jss.bst) =================================================================== --- papers/jss/jss.bst (rev 0) +++ papers/jss/jss.bst 2014-01-09 00:49:19 UTC (rev 727) @@ -0,0 +1,1631 @@ +%% +%% This is file `jss.bst', +%% generated with the docstrip utility. +%% +%% The original source files were: +%% +%% merlin.mbs (with options: `ay,nat,nm-rvx,keyxyr,dt-beg,yr-par,note-yr,tit-qq,atit-u,trnum-it,vol-bf,volp-com,num-xser,pre-edn,isbn,issn,edpar,pp,ed,xedn,xand,etal-it,revdata,eprint,url,url-blk,doi,nfss') +%% +%% ** BibTeX style file for JSS publications (http://www.jstatsoft.org/) +%% +%% Copyright 1994-2007 Patrick W Daly +%% License: GPL-2 + % =============================================================== + % IMPORTANT NOTICE: + % This bibliographic style (bst) file has been generated from one or + % more master bibliographic style (mbs) files, listed above, provided + % with kind permission of Patrick W Daly. + % + % This generated file can be redistributed and/or modified under the terms + % of the General Public License (Version 2). + % =============================================================== + % Name and version information of the main mbs file: + % \ProvidesFile{merlin.mbs}[2007/04/24 4.20 (PWD, AO, DPC)] + % For use with BibTeX version 0.99a or later + %------------------------------------------------------------------- + % This bibliography style file is intended for texts in ENGLISH + % This is an author-year citation style bibliography. 
As such, it is + % non-standard LaTeX, and requires a special package file to function properly. + % Such a package is natbib.sty by Patrick W. Daly + % The form of the \bibitem entries is + % \bibitem[Jones et al.(1990)]{key}... + % \bibitem[Jones et al.(1990)Jones, Baker, and Smith]{key}... + % The essential feature is that the label (the part in brackets) consists + % of the author names, as they should appear in the citation, with the year + % in parentheses following. There must be no space before the opening + % parenthesis! + % With natbib v5.3, a full list of authors may also follow the year. + % In natbib.sty, it is possible to define the type of enclosures that is + % really wanted (brackets or parentheses), but in either case, there must + % be parentheses in the label. + % The \cite command functions as follows: + % \citet{key} ==>> Jones et al. (1990) + % \citet*{key} ==>> Jones, Baker, and Smith (1990) + % \citep{key} ==>> (Jones et al., 1990) + % \citep*{key} ==>> (Jones, Baker, and Smith, 1990) + % \citep[chap. 2]{key} ==>> (Jones et al., 1990, chap. 2) + % \citep[e.g.][]{key} ==>> (e.g. Jones et al., 1990) + % \citep[e.g.][p. 32]{key} ==>> (e.g. Jones et al., p. 32) + % \citeauthor{key} ==>> Jones et al. 
+ % \citeauthor*{key} ==>> Jones, Baker, and Smith + % \citeyear{key} ==>> 1990 + %--------------------------------------------------------------------- + +ENTRY + { address + archive + author + booktitle + chapter + collaboration + doi + edition + editor + eid + eprint + howpublished + institution + isbn + issn + journal + key + month + note + number + numpages + organization + pages + publisher + school + series + title + type + url + volume + year + } + {} + { label extra.label sort.label short.list } +INTEGERS { output.state before.all mid.sentence after.sentence after.block } +FUNCTION {init.state.consts} +{ #0 'before.all := + #1 'mid.sentence := + #2 'after.sentence := + #3 'after.block := +} +STRINGS { s t} +FUNCTION {output.nonnull} +{ 's := + output.state mid.sentence = + { ", " * write$ } + { output.state after.block = + { add.period$ write$ + newline$ + "\newblock " write$ + } + { output.state before.all = + 'write$ + { add.period$ " " * write$ } + if$ + } + if$ + mid.sentence 'output.state := + } + if$ + s +} +FUNCTION {output} +{ duplicate$ empty$ + 'pop$ + 'output.nonnull + if$ +} +FUNCTION {output.check} +{ 't := + duplicate$ empty$ + { pop$ "empty " t * " in " * cite$ * warning$ } + 'output.nonnull + if$ +} +FUNCTION {fin.entry} +{ add.period$ + write$ + newline$ +} + +FUNCTION {new.block} +{ output.state before.all = + 'skip$ + { after.block 'output.state := } + if$ +} +FUNCTION {new.sentence} +{ output.state after.block = + 'skip$ + { output.state before.all = + 'skip$ + { after.sentence 'output.state := } + if$ + } + if$ +} +FUNCTION {add.blank} +{ " " * before.all 'output.state := +} + +FUNCTION {date.block} +{ + new.block +} + +FUNCTION {not} +{ { #0 } + { #1 } + if$ +} +FUNCTION {and} +{ 'skip$ + { pop$ #0 } + if$ +} +FUNCTION {or} +{ { pop$ #1 } + 'skip$ + if$ +} +FUNCTION {non.stop} +{ duplicate$ + "}" * add.period$ + #-1 #1 substring$ "." 
= +} + +STRINGS {z} +FUNCTION {remove.dots} +{ 'z := + "" + { z empty$ not } + { z #1 #1 substring$ + z #2 global.max$ substring$ 'z := + duplicate$ "." = 'pop$ + { * } + if$ + } + while$ +} +FUNCTION {new.block.checkb} +{ empty$ + swap$ empty$ + and + 'skip$ + 'new.block + if$ +} +FUNCTION {field.or.null} +{ duplicate$ empty$ + { pop$ "" } + 'skip$ + if$ +} +FUNCTION {emphasize} +{ duplicate$ empty$ + { pop$ "" } + { "\emph{" swap$ * "}" * } + if$ +} +FUNCTION {bolden} +{ duplicate$ empty$ + { pop$ "" } + { "\textbf{" swap$ * "}" * } + if$ +} +FUNCTION {tie.or.space.prefix} +{ duplicate$ text.length$ #3 < + { "~" } + { " " } + if$ + swap$ +} + +FUNCTION {capitalize} +{ "u" change.case$ "t" change.case$ } + +FUNCTION {space.word} +{ " " swap$ * " " * } + % Here are the language-specific definitions for explicit words. + % Each function has a name bbl.xxx where xxx is the English word. + % The language selected here is ENGLISH +FUNCTION {bbl.and} +{ "and"} + +FUNCTION {bbl.etal} +{ "et~al." } + +FUNCTION {bbl.editors} +{ "eds." } + +FUNCTION {bbl.editor} +{ "ed." } + +FUNCTION {bbl.edby} +{ "edited by" } + +FUNCTION {bbl.edition} +{ "edition" } + +FUNCTION {bbl.volume} +{ "volume" } + +FUNCTION {bbl.of} +{ "of" } + +FUNCTION {bbl.number} +{ "number" } + +FUNCTION {bbl.nr} +{ "no." } + +FUNCTION {bbl.in} +{ "in" } + +FUNCTION {bbl.pages} +{ "pp." } + +FUNCTION {bbl.page} +{ "p." } + +FUNCTION {bbl.eidpp} +{ "pages" } + +FUNCTION {bbl.chapter} +{ "chapter" } + +FUNCTION {bbl.techrep} +{ "Technical Report" } + +FUNCTION {bbl.mthesis} +{ "Master's thesis" } + +FUNCTION {bbl.phdthesis} +{ "Ph.D. 
thesis" } + +MACRO {jan} {"January"} + +MACRO {feb} {"February"} + +MACRO {mar} {"March"} + +MACRO {apr} {"April"} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"August"} + +MACRO {sep} {"September"} + +MACRO {oct} {"October"} + +MACRO {nov} {"November"} + +MACRO {dec} {"December"} + +MACRO {acmcs} {"ACM Computing Surveys"} + +MACRO {acta} {"Acta Informatica"} + +MACRO {cacm} {"Communications of the ACM"} + +MACRO {ibmjrd} {"IBM Journal of Research and Development"} + +MACRO {ibmsj} {"IBM Systems Journal"} + +MACRO {ieeese} {"IEEE Transactions on Software Engineering"} + +MACRO {ieeetc} {"IEEE Transactions on Computers"} + +MACRO {ieeetcad} + {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"} + +MACRO {ipl} {"Information Processing Letters"} + +MACRO {jacm} {"Journal of the ACM"} + +MACRO {jcss} {"Journal of Computer and System Sciences"} + +MACRO {scp} {"Science of Computer Programming"} + +MACRO {sicomp} {"SIAM Journal on Computing"} + +MACRO {tocs} {"ACM Transactions on Computer Systems"} + +MACRO {tods} {"ACM Transactions on Database Systems"} + +MACRO {tog} {"ACM Transactions on Graphics"} + +MACRO {toms} {"ACM Transactions on Mathematical Software"} + +MACRO {toois} {"ACM Transactions on Office Information Systems"} + +MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"} + +MACRO {tcs} {"Theoretical Computer Science"} +FUNCTION {bibinfo.check} +{ swap$ + duplicate$ missing$ + { + pop$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ pop$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {bibinfo.warn} +{ swap$ + duplicate$ missing$ + { + swap$ "missing " swap$ * " in " * cite$ * warning$ pop$ + "" + } + { duplicate$ empty$ + { + swap$ "empty " swap$ * " in " * cite$ * warning$ + } + { swap$ + pop$ + } + if$ + } + if$ +} +FUNCTION {format.eprint} +{ eprint duplicate$ empty$ + 'skip$ + { "\eprint" + archive empty$ + 'skip$ + { "[" * archive * "]" * } + if$ + "{" * swap$ * "}" * + 
} + if$ +} +FUNCTION {format.url} +{ url empty$ + { "" } + { "\urlprefix\url{" url * "}" * } + if$ +} + +INTEGERS { nameptr namesleft numnames } + + +STRINGS { bibinfo} + +FUNCTION {format.names} +{ 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}{ jj}{ f{}}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.names.ed} +{ + 'bibinfo := + duplicate$ empty$ 'skip$ { + 's := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{f{}~}{vv~}{ll}{ jj}" + format.name$ + remove.dots + bibinfo bibinfo.check + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + "," * + t "others" = + { + + " " * bbl.etal emphasize * + } + { " " * t * } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ + } if$ +} +FUNCTION {format.key} +{ empty$ + { key field.or.null } + { "" } + if$ +} + +FUNCTION {format.authors} +{ author "author" format.names + duplicate$ empty$ 'skip$ + { collaboration "collaboration" bibinfo.check + duplicate$ empty$ 'skip$ + { " (" swap$ * ")" * } + if$ + * + } + if$ +} +FUNCTION {get.bbl.editor} +{ editor num.names$ #1 > 'bbl.editors 'bbl.editor if$ } + +FUNCTION {format.editors} +{ editor "editor" format.names duplicate$ empty$ 'skip$ + { + " " * + get.bbl.editor + "(" swap$ * ")" * + * + } + if$ +} +FUNCTION {format.isbn} +{ isbn "isbn" bibinfo.check + duplicate$ empty$ 'skip$ + { 
+ new.block + "ISBN " swap$ * + } + if$ +} + +FUNCTION {format.issn} +{ issn "issn" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "ISSN " swap$ * + } + if$ +} + +FUNCTION {format.doi} +{ doi "doi" bibinfo.check + duplicate$ empty$ 'skip$ + { + new.block + "\doi{" swap$ * "}" * + } + if$ +} +FUNCTION {format.note} +{ + note empty$ + { "" } + { note #1 #1 substring$ + duplicate$ "{" = + 'skip$ + { output.state mid.sentence = + { "l" } + { "u" } + if$ + change.case$ + } + if$ + note #2 global.max$ substring$ * "note" bibinfo.check + } + if$ +} + +FUNCTION {format.title} +{ title + "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + "\enquote{" swap$ * + add.period$ "}" * + } + if$ +} +FUNCTION {format.full.names} +{'s := + "" 't := + #1 'nameptr := + s num.names$ 'numnames := + numnames 'namesleft := + { namesleft #0 > } + { s nameptr + "{vv~}{ll}" format.name$ + 't := + nameptr #1 > + { + namesleft #1 > + { ", " * t * } + { + s nameptr "{ll}" format.name$ duplicate$ "others" = + { 't := } + { pop$ } + if$ + t "others" = + { + " " * bbl.etal emphasize * + } + { + numnames #2 > + { "," * } + 'skip$ + if$ + bbl.and + space.word * t * + } + if$ + } + if$ + } + 't + if$ + nameptr #1 + 'nameptr := + namesleft #1 - 'namesleft := + } + while$ +} + +FUNCTION {author.editor.key.full} +{ author empty$ + { editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {author.key.full} +{ author empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { author format.full.names } + if$ +} + +FUNCTION {editor.key.full} +{ editor empty$ + { key empty$ + { cite$ #1 #3 substring$ } + 'key + if$ + } + { editor format.full.names } + if$ +} + +FUNCTION {make.full.names} +{ type$ "book" = + type$ "inbook" = + or + 'author.editor.key.full + { type$ "proceedings" = + 'editor.key.full + 'author.key.full + if$ + } + if$ +} + +FUNCTION {output.bibitem} +{ 
newline$ + "\bibitem[{" write$ + label write$ + ")" make.full.names duplicate$ short.list = + { pop$ } + { * } + if$ + "}]{" * write$ + cite$ write$ + "}" write$ + newline$ + "" + before.all 'output.state := +} + +FUNCTION {n.dashify} +{ + 't := + "" + { t empty$ not } + { t #1 #1 substring$ "-" = + { t #1 #2 substring$ "--" = not + { "--" * + t #2 global.max$ substring$ 't := + } + { { t #1 #1 substring$ "-" = } + { "-" * + t #2 global.max$ substring$ 't := + } + while$ + } + if$ + } + { t #1 #1 substring$ * + t #2 global.max$ substring$ 't := + } + if$ + } + while$ +} + +FUNCTION {word.in} +{ bbl.in capitalize + " " * } + +FUNCTION {format.date} +{ year "year" bibinfo.check duplicate$ empty$ + { + "empty year in " cite$ * "; set to ????" * warning$ + pop$ "????" + } + 'skip$ + if$ + extra.label * + before.all 'output.state := + " (" swap$ * ")" * +} +FUNCTION {format.btitle} +{ title "title" bibinfo.check + duplicate$ empty$ 'skip$ + { + emphasize + } + if$ +} +FUNCTION {either.or.check} +{ empty$ + 'pop$ + { "can't use both " swap$ * " fields in " * cite$ * warning$ } + if$ +} +FUNCTION {format.bvolume} +{ volume empty$ + { "" } + { bbl.volume volume tie.or.space.prefix + "volume" bibinfo.check * * + series "series" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ bbl.of space.word * swap$ + emphasize * } + if$ + "volume and number" number either.or.check + } + if$ +} +FUNCTION {format.number.series} +{ volume empty$ + { number empty$ + { series field.or.null } + { series empty$ + { number "number" bibinfo.check } + { output.state mid.sentence = + { bbl.number } + { bbl.number capitalize } + if$ + number tie.or.space.prefix "number" bibinfo.check * * + bbl.in space.word * + series "series" bibinfo.check * + } + if$ + } + if$ + } + { "" } + if$ +} + +FUNCTION {format.edition} +{ edition duplicate$ empty$ 'skip$ + { + output.state mid.sentence = + { "l" } + { "t" } + if$ change.case$ + "edition" bibinfo.check + " " * bbl.edition * + } + if$ +} +INTEGERS { 
multiresult } +FUNCTION {multi.page.check} +{ 't := + #0 'multiresult := + { multiresult not + t empty$ not + and + } + { t #1 #1 substring$ + duplicate$ "-" = + swap$ duplicate$ "," = + swap$ "+" = + or or + { #1 'multiresult := } + { t #2 global.max$ substring$ 't := } + if$ + } + while$ + multiresult +} +FUNCTION {format.pages} +{ pages duplicate$ empty$ 'skip$ + { duplicate$ multi.page.check + { + bbl.pages swap$ + n.dashify + } + { + bbl.page swap$ + } + if$ + tie.or.space.prefix + "pages" bibinfo.check + * * + } + if$ +} +FUNCTION {format.journal.pages} +{ pages duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ + { pop$ pop$ format.pages } + { + ", " * + swap$ + n.dashify + "pages" bibinfo.check + * + } + if$ + } + if$ +} +FUNCTION {format.journal.eid} +{ eid "eid" bibinfo.check + duplicate$ empty$ 'pop$ + { swap$ duplicate$ empty$ 'skip$ + { + ", " * + } + if$ + swap$ * + numpages empty$ 'skip$ + { bbl.eidpp numpages tie.or.space.prefix + "numpages" bibinfo.check * * + " (" swap$ * ")" * * + } + if$ + } + if$ +} +FUNCTION {format.vol.num.pages} +{ volume field.or.null + duplicate$ empty$ 'skip$ + { + "volume" bibinfo.check + } + if$ + bolden + number "number" bibinfo.check duplicate$ empty$ 'skip$ + { + swap$ duplicate$ empty$ + { "there's a number but no volume in " cite$ * warning$ } + 'skip$ + if$ + swap$ + "(" swap$ * ")" * + } + if$ * + eid empty$ + { format.journal.pages } + { format.journal.eid } + if$ +} + +FUNCTION {format.chapter.pages} +{ chapter empty$ + 'format.pages + { type empty$ + { bbl.chapter } + { type "l" change.case$ + "type" bibinfo.check + } + if$ + chapter tie.or.space.prefix + "chapter" bibinfo.check + * * + pages empty$ + 'skip$ + { ", " * format.pages * } + if$ + } + if$ +} + +FUNCTION {format.booktitle} +{ + booktitle "booktitle" bibinfo.check + emphasize +} +FUNCTION {format.in.ed.booktitle} +{ format.booktitle duplicate$ empty$ 'skip$ + { + editor "editor" format.names.ed duplicate$ empty$ 'pop$ + { + " " * + get.bbl.editor + 
"(" swap$ * "), " * + * swap$ + * } + if$ + word.in swap$ * + } + if$ +} +FUNCTION {format.thesis.type} +{ type duplicate$ empty$ + 'pop$ + { swap$ pop$ + "t" change.case$ "type" bibinfo.check + } + if$ +} +FUNCTION {format.tr.number} +{ number "number" bibinfo.check + type duplicate$ empty$ + { pop$ bbl.techrep } + 'skip$ + if$ + "type" bibinfo.check + swap$ duplicate$ empty$ + { pop$ "t" change.case$ } + { tie.or.space.prefix * * } + if$ +} +FUNCTION {format.article.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.book.crossref} +{ volume duplicate$ empty$ + { "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + pop$ word.in + } + { bbl.volume + capitalize + swap$ tie.or.space.prefix "volume" bibinfo.check * * bbl.of space.word * + } + if$ + " \cite{" * crossref * "}" * +} +FUNCTION {format.incoll.inproc.crossref} +{ + word.in + " \cite{" * crossref * "}" * +} +FUNCTION {format.org.or.pub} +{ 't := + "" + address empty$ t empty$ and + 'skip$ + { + t empty$ + { address "address" bibinfo.check * + } + { t * + address empty$ + 'skip$ + { ", " * address "address" bibinfo.check * } + if$ + } + if$ + } + if$ +} +FUNCTION {format.publisher.address} +{ publisher "publisher" bibinfo.warn format.org.or.pub +} + +FUNCTION {format.organization.address} +{ organization "organization" bibinfo.check format.org.or.pub +} + +FUNCTION {article} +{ output.bibitem + format.authors "author" output.check + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { + journal + "journal" bibinfo.check + emphasize + "journal" output.check + format.vol.num.pages output + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + format.issn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {book} +{ output.bibitem + author empty$ + { format.editors "author and editor" 
output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { format.bvolume output + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + new.block + format.book.crossref output.nonnull + } + if$ + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {booklet} +{ output.bibitem + format.authors output + author format.key output + format.date "year" output.check + date.block + format.title "title" output.check + new.block + howpublished "howpublished" bibinfo.check output + address "address" bibinfo.check output + format.isbn output + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + author empty$ + { format.editors "author and editor" output.check + editor format.key output + } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + format.date "year" output.check + date.block + format.btitle "title" output.check + crossref missing$ + { + format.bvolume output + format.chapter.pages "chapter and pages" output.check + new.block + format.number.series output + format.edition output + new.sentence + format.publisher.address output + } + { + format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + crossref missing$ + { format.isbn output } + 'skip$ + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.authors "author" output.check + author format.key output 
+ format.date "year" output.check + date.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.edition output + format.chapter.pages output + new.sentence + format.publisher.address output + format.isbn output + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.doi output + new.block + format.note output + format.eprint output + format.url output + fin.entry +} +FUNCTION {inproceedings} +{ output.bibitem [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 727 From noreply at r-forge.r-project.org Thu Jan 9 01:51:36 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 01:51:36 +0100 (CET) Subject: [Rprotobuf-commits] r728 - papers/jss Message-ID: <20140109005136.6942918694C@r-forge.r-project.org> Author: murray Date: 2014-01-09 01:51:36 +0100 (Thu, 09 Jan 2014) New Revision: 728 Added: papers/jss/Makefile Log: Add Makefile to build the JSS paper. 
Added: papers/jss/Makefile =================================================================== --- papers/jss/Makefile (rev 0) +++ papers/jss/Makefile 2014-01-09 00:51:36 UTC (rev 728) @@ -0,0 +1,17 @@ +all: clean article.pdf + +clean: + rm -fr article.pdf + rm -fr article.out + rm -fr article.aux + rm -fr article.log + rm -fr article.bbl + rm -fr article.blg + rm -fr article.brf + +article.pdf: article.Rnw + R CMD Sweave article.Rnw + pdflatex article.tex + bibtex article + pdflatex article.tex + pdflatex article.tex From noreply at r-forge.r-project.org Thu Jan 9 02:35:01 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 02:35:01 +0100 (CET) Subject: [Rprotobuf-commits] r729 - papers/jss Message-ID: <20140109013501.C8802186A25@r-forge.r-project.org> Author: murray Date: 2014-01-09 02:35:01 +0100 (Thu, 09 Jan 2014) New Revision: 729 Added: papers/jss/article.tex Modified: papers/jss/eddelbuettel-stokely.bib Log: Check in the new JSS article, and rewrite/improve my MapReduce example application section. Added: papers/jss/article.tex =================================================================== --- papers/jss/article.tex (rev 0) +++ papers/jss/article.tex 2014-01-09 01:35:01 UTC (rev 729) @@ -0,0 +1,1813 @@ +\documentclass[article]{jss} +\usepackage{booktabs} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% +% Local helpers to make this more compatible with R Journal style. 
+% +\newcommand{\CRANpkg}[1]{\pkg{#1}} +\RequirePackage{fancyvrb} +\RequirePackage{alltt} +\DefineVerbatimEnvironment{example}{Verbatim}{} + +%% almost as usual +\author{Dirk Eddelbuettel\\Debian and R Projects \And + Murray Stokely\\Google, Inc} +\title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} + +%% for pretty printing and a nice hypersummary also set: +\Plainauthor{Dirk Eddelbuettel, Murray Stokely} %% comma-separated +\Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} +\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in R} %% a short title (if necessary) + +%% an abstract and keywords +\Abstract{ +Modern data collection and analysis pipelines often involve +a sophisticated mix of applications written in general purpose and +specialized programming languages. Protocol Buffers are a popular +method of serializing structured data between applications---while remaining +independent of programming languages or operating system. The +\CRANpkg{RProtoBuf} package provides a complete interface to this +library. 
+} +\Keywords{r, protocol buffers, serialization, cross-platform} +\Plainkeywords{r, protocol buffers, serialization, cross-platform} %% without formatting +%% at least one keyword must be supplied + +%% publication information +%% NOTE: Typically, this can be left commented and will be filled out by the technical editor +%% \Volume{50} +%% \Issue{9} +%% \Month{June} +%% \Year{2012} +%% \Submitdate{2012-06-04} +%% \Acceptdate{2012-06-04} + +%% The address of (at least) one author should be given +%% in the following format: +\Address{ + Dirk Eddelbuettel\\ + \\ + Murray Stokely\\ + Google, Inc.\\ + 1600 Amphitheatre Parkway\\ + Mountain View, CA 94040\\ + USA\\ + E-mail: \email{mstokely at google.com}\\ + URL: \url{http://www.stokely.org/} +} +%% It is also possible to add a telephone and fax number +%% before the e-mail in the following format: +%% Telephone: +43/512/507-7103 +%% Fax: +43/512/507-2851 + +%% for those who use Sweave please include the following line (with % symbols): +%% need no \usepackage{Sweave.sty} + +%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +\begin{document} + + +%% include your article here, just as usual +%% Note that you should use the \pkg{}, \proglang{} and \code{} commands. + + +% We don't want a left margin for Sinput or Soutput for our table 1. +%\DefineVerbatimEnvironment{Sinput}{Verbatim} {xleftmargin=0em} +%\DefineVerbatimEnvironment{Soutput}{Verbatim}{xleftmargin=0em} +%\DefineVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em} +% Setting the topsep to 0 reduces spacing from input to output and +% improves table 1. +\fvset{listparameters={\setlength{\topsep}{0pt}}} +\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} + +\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} +\author{by Dirk Eddelbuettel and Murray Stokely} + +%% DE: I tend to have wider option(width=...) 
so this +%% guarantees better line breaks + +\maketitle + +\abstract{Modern data collection and analysis pipelines often involve + a sophisticated mix of applications written in general purpose and + specialized programming languages. Protocol Buffers are a popular + method of serializing structured data between applications---while remaining + independent of programming languages or operating system. The + \CRANpkg{RProtoBuf} package provides a complete interface between this + library and the R environment for statistical computing. + %TODO(ms) keep it less than 150 words. +} + +%TODO(de) 'protocol buffers' or 'Protocol Buffers' ? + +\section{Introduction} + +Modern data collection and analysis pipelines are increasingly being +built using collections of components to better manage software +complexity through reusability, modularity, and fault +isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +Data analysis patterns such as Split-Apply-Combine +\citep{wickham2011split} explicitly break up large problems into +manageable pieces. These patterns are frequently employed with +different programming languages used for the different phases of data +analysis -- collection, cleaning, analysis, post-processing, and +presentation in order to take advantage of the unique combination of +performance, speed of development, and library support offered by +different environments. Each stage of the data +analysis pipeline may involve storing intermediate results in a +file or sending them over the network. +% DE: Nice! + +Given these requirements, how do we safely share intermediate results +between different applications, possibly written in different +languages, and possibly running on different computer systems, possibly +spanning different operating systems? Programming +languages such as R, Julia, Java, and Python include built-in +serialization support, but these formats are tied to the specific +% DE: need to define serialization?
+programming language in use and thus lock the user into a single +environment. CSV files can be read and written by many applications +and so are often used for exporting tabular data. However, CSV files +have a number of disadvantages, such as a limitation of exporting only +tabular datasets, lack of type-safety, inefficient text representation +and parsing, and ambiguities in the format involving special +characters. JSON is another widely-supported format used mostly on +the web that removes many of these disadvantages, but it too suffers +from being too slow to parse and also does not provide strong typing +between integers and floating point. Because the schema information +is not kept separately, multiple JSON messages of the same type +needlessly duplicate the field names with each message. +% +% +% +A number of binary formats based on JSON have been proposed that +reduce the parsing cost and improve the efficiency. MessagePack +\citep{msgpackR} and BSON \citep{rmongodb} both have R interfaces, but +these formats lack a separate schema for the serialized data and thus +still duplicate field names with each message sent over the network or +stored in a file. Such formats also lack support for versioning when +data storage needs evolve over time, or when application logic and +requirement changes dictate updates to the message format. +% DE: Need to talk about XML ? + +Once the data serialization needs of an application become complex +enough, developers typically benefit from the use of an +\emph{interface description language}, or \emph{IDL}. IDLs like +Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact +well-documented schema for cross-language data structures and +efficient binary interchange formats. The schema can be used to +generate model classes for statically-typed programming languages such +as C++ and Java, or can be used with reflection for dynamically-typed +programming languages.
Since the schema is provided separately from +the encoded data, the data can be efficiently encoded to minimize +storage costs of the stored data when compared with simple +``schema-less'' binary interchange formats. + +% TODO(mstokely): Take a more conversational tone here asking +% questions and motivating protocol buffers? + +% TODO(mstokely): If we go to JSS, include a larger paragraph here +% referencing each numbered section. I don't like these generally, +% but it's useful for this paper I think because we have a boring bit +% in the middle (full class/method details) and interesting +% applications at the end. +This article describes the basics of Google's Protocol Buffers through +an easy to use R package, \CRANpkg{RProtoBuf}. After describing the +basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate +several common use cases for protocol buffers in data analysis. + +\section{Protocol Buffers} + +FIXME Introductory section which may include references in parentheses +\citep{R}, or cite a reference such as \citet{R} in the text. + +% This content is good. Maybe use and cite? +% http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html + + +%% TODO(de,ms) What follows is oooooold and was lifted from the webpage +%% Rewrite? +Protocol Buffers can be described as a modern, language-neutral, platform-neutral, +extensible mechanism for sharing and storing structured data. Since their +introduction, Protocol Buffers have been widely adopted in industry with +applications as varied as database-internal messaging (Drizzle), % DE: citation? +Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. While +% TODO(DE): This either needs a citation, or remove the name drop +traditional IDLs have at times been criticized for code bloat and +complexity, Protocol Buffers are based on a simple list and records +model that is comparatively flexible and simple to use.
+ +Some of the key features provided by Protocol Buffers for data analysis +include: + +\begin{itemize} +\item \emph{Portable}: Allows users to send and receive data between + applications or different computers. +\item \emph{Efficient}: Data is serialized into a compact binary + representation for transmission or storage. +\item \emph{Extensible}: New fields can be added to Protocol Buffer Schemas + in a forward-compatible way that does not break older applications. +\item \emph{Stable}: Protocol Buffers have been in wide use for over a + decade. +\end{itemize} + +Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example +communication workflow with protocol buffers and an interactive R +session. Common use cases include populating a request RPC protocol +buffer in R that is then serialized and sent over the network to a +remote server. The server would then deserialize the message, act on +the request, and respond with a new protocol buffer over the network. The key +difference to, say, a request to an Rserve instance is that the remote server +may not even know the R language. + +%Protocol buffers are a language-neutral, platform-neutral, extensible +%way of serializing structured data for use in communications +%protocols, data storage, and more. + +%Protocol Buffers offer key features such as an efficient data interchange +%format that is both language- and operating system-agnostic yet uses a +%lightweight and highly performant encoding, object serialization and +%de-serialization as well as data and configuration management. Protocol +%buffers are also forward compatible: updates to the \texttt{proto} +%files do not break programs built against the previous specification.
+ +%While benchmarks are not available, Google states on the project page that in +%comparison to XML, protocol buffers are at the same time \textsl{simpler}, +%between three to ten times \textsl{smaller}, between twenty and one hundred +%times \textsl{faster}, as well as less ambiguous and easier to program. + +Many sources compare data serialization formats and show protocol +buffers very favorably to the alternatives, such +as \citet{Sumaray:2012:CDS:2184751.2184810} + +%The flexibility of the reflection-based API is particularly well +%suited for interactive data analysis. + +% XXX Design tradeoffs: reflection vs proto compiler + +For added speed and efficiency, the C++, Java, and Python bindings to +Protocol Buffers are used with a compiler that translates a protocol +buffer schema description file (ending in \texttt{.proto}) into +language-specific classes that can be used to create, read, write and +manipulate protocol buffer messages. The R interface, in contrast, +uses a reflection-based API that is particularly well suited for +interactive data analysis. All messages in R have a single class +structure, but different accessor methods are created at runtime based +on the name fields of the specified message type. + +% In other words, given the 'proto' +%description file, code is automatically generated for the chosen +%target language(s). The project page contains a tutorial for each of +%these officially supported languages: +%\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} + +%The protocol buffers code is released under an open-source (BSD) license. The +%protocol buffer project (\url{http://code.google.com/p/protobuf/}) +%contains a C++ library and a set of runtime libraries and compilers for +%C++, Java and Python. + +%With these languages, the workflow follows standard practice of so-called +%Interface Description Languages (IDL) +%(c.f. \href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia +% on IDL}). 
This consists of compiling a protocol buffer description file +%(ending in \texttt{.proto}) into language specific classes that can be used + +%Besides the officially supported C++, Java and Python implementations, several projects have been +%created to support protocol buffers for many languages. The list of known +%languages to support protocol buffers is compiled as part of the +%project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} + +\begin{figure}[t] +\begin{center} +\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} +\end{center} +\caption{Example protobuf usage} +\label{fig:protobuf-distributed-usecase} +\end{figure} + +\section{Basic Usage: Messages and Descriptors} + +This section describes how to use the R API to create and manipulate +protocol buffer messages in R, and how to read and write the +binary \emph{payload} of the messages to files and arbitrary binary +R connections. + +The two fundamental building blocks of Protocol Buffers are Messages +and Descriptors. Messages provide a common abstract encapsulation of +structured data fields of the type specified in a Message Descriptor. +Message Descriptors are defined in \texttt{.proto} files and define a +schema for a particular named class of messages. + +Table~\ref{tab:proto} shows an example \texttt{.proto} file which +defines the \texttt{tutorial.Person} type. The R code in the right +column shows an example of creating a new message of this type and +populating its fields. + +% Commented out because we said this earlier. +%This separation +%between schema and the message objects is in contrast to +%more verbose formats like JSON, and when combined with the efficient +%binary representation of any Message object explains a large part of +%the performance and storage-space advantage offered by Protocol +%Buffers. TODO(ms): we already said some of this above. clean up. 
+ +% lifted from protobuf page: +%With Protocol Buffers you define how you want your data to be +%structured once, and then you can read or write structured data to and +%from a variety of data streams using a variety of different +%languages. The definition + + +%% TODO(de) Can we make this not break the width of the page? +\noindent +\begin{table} +\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .02\textwidth}@{\hskip .02\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} +\toprule +Schema : \texttt{addressbook.proto} & Example R Session\\ +\cmidrule{1-2} +\begin{minipage}{.35\textwidth} +\vspace{2mm} +\begin{example} +package tutorial; +message Person { + required string name = 1; + required int32 id = 2; + optional string email = 3; + enum PhoneType { + MOBILE = 0; HOME = 1; + WORK = 2; + } + message PhoneNumber { + required string number = 1; + optional PhoneType type = 2; + } + repeated PhoneNumber phone = 4; +} +\end{example} +\vspace{2mm} +\end{minipage} & \begin{minipage}{.5\textwidth} +\begin{Schunk} +\begin{Sinput} +R> library(RProtoBuf) +R> p <- new(tutorial.Person, id=1, name="Dirk") +R> class(p) +\end{Sinput} +\begin{Soutput} +[1] "Message" +attr(,"package") +[1] "RProtoBuf" +\end{Soutput} +\begin{Sinput} +R> p$name +\end{Sinput} +\begin{Soutput} +[1] "Dirk" +\end{Soutput} +\begin{Sinput} +R> p$name <- "Murray" +R> cat(as.character(p)) +\end{Sinput} +\begin{Soutput} +name: "Murray" +id: 1 +\end{Soutput} +\begin{Sinput} +R> serialize(p, NULL) +\end{Sinput} +\begin{Soutput} + [1] 0a 06 4d 75 72 72 61 79 10 01 +\end{Soutput} +\end{Schunk} +\end{minipage} \\ +\bottomrule +\end{tabular} +\caption{The schema representation from a \texttt{.proto} file for the + \texttt{tutorial.Person} class (left) and simple R code for creating + an object of this class and accessing its fields (right).} +\label{tab:proto} +\end{table} + +%This section may contain a figure such as Figure~\ref{figure:rlogo}. 
+% +%\begin{figure}[htbp] +% \centering +% \includegraphics{Rlogo} +% \caption{The logo of R.} +% \label{figure:rlogo} +%\end{figure} + +\subsection{Importing Message Descriptors from .proto files} + +%The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, +%which encapsulate a data structure, Descriptors, which define the +%schema used by one or more messages, and DescriptorPools, which +%provide access to descriptors. + +Before one can create a new Protocol Buffer Message or parse a +serialized stream of bytes as a Message, one must first read in the message +type specification from a \texttt{.proto} file. + +New \texttt{.proto} files are imported with the \code{readProtoFiles} +function, which can import a single file, all files in a directory, or +all \texttt{.proto} files provided by another R package. + +The \texttt{.proto} file syntax for defining the structure of protocol +buffer data is described comprehensively on Google Code: +\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}. + +Once the proto files are imported, all message descriptors +are available in the R search path in the \texttt{RProtoBuf:DescriptorPool} +special environment. The underlying mechanism used here is +described in more detail in Section~\ref{sec-lookup}.
+ +\begin{Schunk} +\begin{Sinput} +R> ls("RProtoBuf:DescriptorPool") +\end{Sinput} +\begin{Soutput} + [1] "rexp.CMPLX" + [2] "rexp.REXP" + [3] "rexp.STRING" + [4] "rprotobuf.HelloWorldRequest" + [5] "rprotobuf.HelloWorldResponse" + [6] "tutorial.AddressBook" + [7] "tutorial.Person" + [8] "tutorial.Test1" + [9] "tutorial.Test2" +[10] "tutorial.Test3" +[11] "tutorial.Test4" +\end{Soutput} +\end{Schunk} + +%\subsection{Importing proto files} +%In contrast to the other languages (Java, C++, Python) that are officially +%supported by Google, the implementation used by the \texttt{RProtoBuf} +%package does not rely on the \texttt{protoc} compiler (with the exception of +%the two functions discussed in the previous section). This means that no +%initial step of statically compiling the proto file into C++ code that is +%then accessed by R code is necessary. Instead, \texttt{proto} files are +%parsed and processed \textsl{at runtime} by the protobuf C++ library---which +%is much more appropriate for a dynamic language. + +\subsection{Creating a message} + +New messages are created with the \texttt{new} function which accepts +a Message Descriptor and optionally a list of ``name = value'' pairs +to set in the message. +%The objects contained in the special environment are +%descriptors for their associated message types. Descriptors will be +%discussed in detail in another part of this document, but for the +%purpose of this section, descriptors are just used with the \texttt{new} +%function to create messages. + +\begin{Schunk} +\begin{Sinput} +R> p1 <- new(tutorial.Person) +R> p <- new(tutorial.Person, name = "Murray", id = 1) +\end{Sinput} +\end{Schunk} + +\subsection{Access and modify fields of a message} + +Once the message is created, its fields can be queried +and modified using the dollar operator of R, making protocol +buffer messages seem like lists. 
+ +\begin{Schunk} +\begin{Sinput} +R> p$name +\end{Sinput} +\begin{Soutput} +[1] "Murray" +\end{Soutput} +\begin{Sinput} +R> p$id +\end{Sinput} +\begin{Soutput} +[1] 1 +\end{Soutput} +\begin{Sinput} +R> p$email <- "murray at stokely.org" +\end{Sinput} +\end{Schunk} + +However, as opposed to R lists, no partial matching is performed +and the name must be given entirely. + +The \verb|[[| operator can also be used to query and set fields +of a message, supplying either their name or their tag number : + +\begin{Schunk} +\begin{Sinput} +R> p[["name"]] <- "Murray Stokely" +R> p[[ 2 ]] <- 3 +R> p[[ "email" ]] +\end{Sinput} +\begin{Soutput} +[1] "murray at stokely.org" +\end{Soutput} +\end{Schunk} + +Protocol buffers include a 64-bit integer type, but R lacks native +64-bit integer support. A workaround is available and described in +Section~\ref{sec:int64} for working with large integer values. + +% TODO(mstokely): Document extensions here. +% There are none in addressbook.proto though. + +\subsection{Display messages} + +Protocol buffer messages and descriptors implement \texttt{show} +methods that provide basic information about the message : + +\begin{Schunk} +\begin{Sinput} +R> p +\end{Sinput} +\begin{Soutput} +[1] "message of type 'tutorial.Person' with 3 fields set" +\end{Soutput} +\end{Schunk} + +For additional information, such as for debugging purposes, +the \texttt{as.character} method provides a more complete ASCII +representation of the contents of a message. + +\begin{Schunk} +\begin{Sinput} +R> writeLines(as.character(p)) +\end{Sinput} +\begin{Soutput} +name: "Murray Stokely" +id: 3 +email: "murray at stokely.org" +\end{Soutput} +\end{Schunk} + +\subsection{Serializing messages} + +However, the main focus of protocol buffer messages is +efficiency. Therefore, messages are transported as a sequence +of bytes.
The \texttt{serialize} method is implemented for +protocol buffer messages to serialize a message into a sequence of +bytes that represents the message. +%(raw vector in R speech) that represents the message. + +\begin{Schunk} +\begin{Sinput} +R> serialize(p, NULL) +\end{Sinput} +\begin{Soutput} + [1] 0a 0e 4d 75 72 72 61 79 20 53 74 6f 6b 65 6c 79 10 03 1a 12 +[21] 6d 75 72 72 61 79 40 73 74 6f 6b 65 6c 79 2e 6f 72 67 +\end{Soutput} +\end{Schunk} + +The same method can also be used to serialize messages to files : + +\begin{Schunk} +\begin{Sinput} +R> tf1 <- tempfile() +R> serialize(p, tf1) +R> readBin(tf1, raw(0), 500) +\end{Sinput} +\begin{Soutput} + [1] 0a 0e 4d 75 72 72 61 79 20 53 74 6f 6b 65 6c 79 10 03 1a 12 +[21] 6d 75 72 72 61 79 40 73 74 6f 6b 65 6c 79 2e 6f 72 67 +\end{Soutput} +\end{Schunk} + +Or to arbitrary binary connections: + +\begin{Schunk} +\begin{Sinput} +R> tf2 <- tempfile() +R> con <- file(tf2, open = "wb") +R> serialize(p, con) +R> close(con) +R> readBin(tf2, raw(0), 500) +\end{Sinput} +\begin{Soutput} + [1] 0a 0e 4d 75 72 72 61 79 20 53 74 6f 6b 65 6c 79 10 03 1a 12 +[21] 6d 75 72 72 61 79 40 73 74 6f 6b 65 6c 79 2e 6f 72 67 +\end{Soutput} +\end{Schunk} + +\texttt{serialize} can also be used in a more traditional +object oriented fashion using the dollar operator : + +\begin{Schunk} +\begin{Sinput} +R> # serialize to a file +R> p$serialize(tf1) +R> # serialize to a binary connection +R> con <- file(tf2, open = "wb") +R> p$serialize(con) +R> close(con) +\end{Sinput} +\end{Schunk} + + +\subsection{Parsing messages} + +The \texttt{RProtoBuf} package defines the \texttt{read} and +\texttt{readASCII} functions to read messages from files, raw vectors, +or arbitrary connections. \texttt{read} expects to read the message +payload from binary files or connections and \texttt{readASCII} parses +the human-readable ASCII output that is created with +\code{as.character}. 
+ +The binary representation of the message (often called the payload) +does not contain information that can be used to dynamically +infer the message type, so we have to provide this information +to the \texttt{read} function in the form of a descriptor : + +\begin{Schunk} +\begin{Sinput} +R> msg <- read(tutorial.Person, tf1) +R> writeLines(as.character(msg)) +\end{Sinput} +\begin{Soutput} +name: "Murray Stokely" +id: 3 +email: "murray at stokely.org" +\end{Soutput} +\end{Schunk} + +The \texttt{input} argument of \texttt{read} can also be a binary +readable R connection, such as a binary file connection: + +\begin{Schunk} +\begin{Sinput} +R> con <- file(tf2, open = "rb") +R> message <- read(tutorial.Person, con) +R> close(con) +R> writeLines(as.character(message)) +\end{Sinput} +\begin{Soutput} +name: "Murray Stokely" +id: 3 +email: "murray at stokely.org" +\end{Soutput} +\end{Schunk} + +Finally, the payload of the message can be used : + +\begin{Schunk} +\begin{Sinput} +R> # reading the raw vector payload of the message +R> payload <- readBin(tf1, raw(0), 5000) +R> message <- read(tutorial.Person, payload) +\end{Sinput} +\end{Schunk} + + +\texttt{read} can also be used as a pseudo method of the descriptor +object : + +\begin{Schunk} +\begin{Sinput} +R> # reading from a file +R> message <- tutorial.Person$read(tf1) +R> # reading from a binary connection +R> con <- file(tf2, open = "rb") +R> message <- tutorial.Person$read(con) +R> close(con) +R> # read from the payload +R> message <- tutorial.Person$read(payload) +\end{Sinput} +\end{Schunk} + + +\section{Under the hood: S4 Classes, Methods, and Pseudo Methods} + +The \CRANpkg{RProtoBuf} package uses the S4 system to store +information about descriptors and messages. Using the S4 system +allows the \texttt{RProtoBuf} package to dispatch methods that are not +generic in the S3 sense, such as \texttt{new} and +\texttt{serialize}. 
+ +Each R object stores an external pointer to an object managed by +the \texttt{protobuf} C++ library. +The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to +facilitate the integration of the R and C++ code for these objects. + +% Message, Descriptor, FieldDescriptor, EnumDescriptor, +% FileDescriptor, EnumValueDescriptor +% +% grep RPB_FUNC * | grep -v define|wc -l +% 84 +% grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l +% 33 + +There are over 100 C++ functions that provide the glue code between +the member functions of the 6 primary Message and Descriptor classes +in the protobuf library. Wrapping each method individually allows us +to add user friendly custom error handling, type coercion, and +performance improvements at the cost of a more verbose +implementation. The RProtoBuf implementation in many ways motivated +the development of Rcpp Modules \citep{eddelbuettel2013exposing}, +which provide a more concise way of wrapping C++ functions and classes +in a single entity. + +The \texttt{RProtoBuf} package combines the \emph{R typical} dispatch +of the form \verb|method(object, arguments)| and the more traditional +object oriented notation \verb|object$method(arguments)|. +Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function +(defined in the \texttt{utils} package) for all classes to enable tab +completion. Completion possibilities include pseudo method names for all +classes, plus dynamic dispatch on names or types specific to a given object. + +% TODO(ms): Add column check box for doing dynamic dispatch based on type. 
+\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|l|} +\hline +\textbf{Class} & \textbf{Slots} & \textbf{Methods} & \textbf{Dynamic Dispatch}\\ +\hline +\hline +Message & 2 & 20 & yes (field names)\\ +\hline +Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ +\hline +FieldDescriptor & 4 & 18 & no\\ +\hline +EnumDescriptor & 4 & 11 & yes (enum constant names)\\ +\hline +FileDescriptor & 3 & 6 & yes (message/field definitions)\\ +\hline +EnumValueDescriptor & 3 & 6 & no\\ +\hline +\end{tabular} +\end{table} + +\subsection{Messages} + +The \texttt{Message} S4 class represents Protocol Buffer Messages and +is the core abstraction of \CRANpkg{RProtoBuf}. Each \texttt{Message} +contains a pointer to a \texttt{Descriptor} which defines the schema +of the data defined in the Message, as well as a number of +\texttt{FieldDescriptors} for the individual fields of the message. A +complete list of the slots and methods for \texttt{Messages} +is available in Table~\ref{Message-methods-table}. + +\begin{table}[h] +\centering +\begin{small} +\begin{tabular}{l|p{10cm}} +\hline +\textbf{Slot} & \textbf{Description} \\ +\hline +\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ proto library. Documentation for the +\texttt{Message} class is available from the protocol buffer project page: +\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message} \\ +\hline +\texttt{type} & Fully qualified name of the message. For example a \texttt{Person} message +has its \texttt{type} slot set to \texttt{tutorial.Person} \\[.3cm] +\hline +\textbf{Method} & \textbf{Description} \\ +\hline +\texttt{has} & Indicates if a message has a given field. 
\\ +\texttt{clone} & Creates a clone of the message \\ +\texttt{isInitialized} & Indicates if a message has all its required fields set\\ +\texttt{serialize} & serialize a message to a file, binary connection, or raw vector\\ +\texttt{clear} & Clear one or several fields of a message, or the entire message\\ +\texttt{size} & The number of elements in a message field\\ +\texttt{bytesize} & The number of bytes the message would take once serialized\\ +\hline +\texttt{swap} & swap elements of a repeated field of a message\\ +\texttt{set} & set elements of a repeated field\\ +\texttt{fetch} & fetch elements of a repeated field\\ +\texttt{setExtension} & set an extension of a message\\ +\texttt{getExtension} & get the value of an extension of a message\\ +\texttt{add} & add elements to a repeated field \\ +\hline +\texttt{str} & the R structure of the message\\ +\texttt{as.character} & character representation of a message\\ +\texttt{toString} & character representation of a message (same as \texttt{as.character}) \\ +\texttt{as.list} & converts message to a named R list\\ +\texttt{update} & updates several fields of a message at once\\ +\texttt{descriptor} & get the descriptor of the message type of this message\\ +\texttt{fileDescriptor} & get the file descriptor of this message's descriptor\\ +\hline +\end{tabular} +\end{small} +\caption{\label{Message-methods-table}Description of slots and methods for the \texttt{Message} S4 class} +\end{table} + +\subsection{Descriptors} + +Descriptors describe the type of a Message. This includes what fields +a message contains and what the types of those fields are. Message +descriptors are represented in R with the \emph{Descriptor} S4 +class. The class contains the slots \texttt{pointer} and +\texttt{type}. Similarly to messages, the \verb|$| operator can be +used to retrieve descriptors that are contained in the descriptor, or +invoke pseudo-methods. 
+ +When \CRANpkg{RProtoBuf} is first loaded it calls +\texttt{readProtoFiles} to read in an example \texttt{.proto} file +included with the package. The \texttt{tutorial.Person} descriptor +and any other descriptors defined in loaded \texttt{.proto} files are +then available on the search path. + +\begin{Schunk} +\begin{Sinput} +R> # field descriptor +R> tutorial.Person$email +\end{Sinput} +\begin{Soutput} +[1] "descriptor for field 'email' of type 'tutorial.Person' " +\end{Soutput} +\begin{Sinput} +R> # enum descriptor +R> tutorial.Person$PhoneType +\end{Sinput} +\begin{Soutput} +[1] "descriptor for enum 'PhoneType' of type 'tutorial.Person' with 3 values" +\end{Soutput} +\begin{Sinput} +R> # nested type descriptor +R> tutorial.Person$PhoneNumber +\end{Sinput} +\begin{Soutput} +[1] "descriptor for type 'tutorial.Person.PhoneNumber' " +\end{Soutput} +\begin{Sinput} +R> # same as +R> tutorial.Person.PhoneNumber +\end{Sinput} +\begin{Soutput} +[1] "descriptor for type 'tutorial.Person.PhoneNumber' " +\end{Soutput} +\end{Schunk} + +Table~\ref{Descriptor-methods-table} provides a complete list of the +slots and available methods for Descriptors. + +\begin{table}[h] +\centering +\begin{small} +\begin{tabular}{l|p{10cm}} +\hline +\textbf{Slot} & \textbf{Description} \\ +\hline +\texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the +\texttt{Descriptor} class is available from the protocol buffer project page: +\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ +\hline +\texttt{type} & Fully qualified path of the message type. 
\\[.3cm] +\hline +\textbf{Method} & \textbf{Description} \\ +\hline +\texttt{new} & Creates a prototype of a message described by this descriptor.\\ +\texttt{read} & Reads a message from a file or binary connection.\\ +\texttt{readASCII} & Read a message in ASCII format from a file or +text connection.\\ +\hline +\texttt{name} & Retrieve the name of the message type associated with +this descriptor.\\ +\texttt{as.character} & character representation of a descriptor\\ +\texttt{toString} & character representation of a descriptor (same as \texttt{as.character}) \\ +\texttt{as.list} & return a named +list of the field, enum, and nested descriptors included in this descriptor.\\ [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 729 From noreply at r-forge.r-project.org Thu Jan 9 03:17:58 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 03:17:58 +0100 (CET) Subject: [Rprotobuf-commits] r730 - papers/jss Message-ID: <20140109021758.5A9831866FC@r-forge.r-project.org> Author: murray Date: 2014-01-09 03:17:58 +0100 (Thu, 09 Jan 2014) New Revision: 730 Added: papers/jss/article.Rnw Removed: papers/jss/article.tex Log: Oops check in the Rnw rather than the generated tex file. Also, add section \labels and add a roadmap section at the end of the introduction that highlights the main sections. Added: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw (rev 0) +++ papers/jss/article.Rnw 2014-01-09 02:17:58 UTC (rev 730) @@ -0,0 +1,1639 @@ +\documentclass[article]{jss} +\usepackage{booktabs} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +% +% Local helpers to make this more compatible with R Journal style. 
+% +\newcommand{\CRANpkg}[1]{\pkg{#1}} +\RequirePackage{fancyvrb} +\RequirePackage{alltt} +\DefineVerbatimEnvironment{example}{Verbatim}{} + +%% almost as usual +\author{Dirk Eddelbuettel\\Debian and R Projects \And + Murray Stokely\\Google, Inc} +\title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} + +%% for pretty printing and a nice hypersummary also set: +\Plainauthor{Dirk Eddelbuettel, Murray Stokely} %% comma-separated +\Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} +\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in R} %% a short title (if necessary) + +%% an abstract and keywords +\Abstract{ +Modern data collection and analysis pipelines often involve +a sophisticated mix of applications written in general purpose and +specialized programming languages. Protocol Buffers are a popular +method of serializing structured data between applications---while remaining +independent of programming languages or operating system. The +\CRANpkg{RProtoBuf} package provides a complete interface between this +library and the R environment for statistical computing. +%TODO(ms) keep it less than 150 words. 
+} +\Keywords{r, protocol buffers, serialization, cross-platform} +\Plainkeywords{r, protocol buffers, serialization, cross-platform} %% without formatting +%% at least one keyword must be supplied + +%% publication information +%% NOTE: Typically, this can be left commented and will be filled out by the technical editor +%% \Volume{50} +%% \Issue{9} +%% \Month{June} +%% \Year{2012} +%% \Submitdate{2012-06-04} +%% \Acceptdate{2012-06-04} + +%% The address of (at least) one author should be given +%% in the following format: +\Address{ + Dirk Eddelbuettel\\ + \\ + Murray Stokely\\ + Google, Inc.\\ + 1600 Amphitheatre Parkway\\ + Mountain View, CA 94040\\ + USA\\ + E-mail: \email{mstokely at google.com}\\ + URL: \url{http://www.stokely.org/} +} +%% It is also possible to add a telephone and fax number +%% before the e-mail in the following format: +%% Telephone: +43/512/507-7103 +%% Fax: +43/512/507-2851 + +%% for those who use Sweave please include the following line (with % symbols): +%% need no \usepackage{Sweave.sty} + +%% end of declarations %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + + +\begin{document} + + +%% include your article here, just as usual +%% Note that you should use the \pkg{}, \proglang{} and \code{} commands. + + +% We don't want a left margin for Sinput or Soutput for our table 1. +%\DefineVerbatimEnvironment{Sinput}{Verbatim} {xleftmargin=0em} +%\DefineVerbatimEnvironment{Soutput}{Verbatim}{xleftmargin=0em} +%\DefineVerbatimEnvironment{Scode}{Verbatim}{xleftmargin=2em} +% Setting the topsep to 0 reduces spacing from input to output and +% improves table 1. +\fvset{listparameters={\setlength{\topsep}{0pt}}} +\renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} + +\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} +\author{by Dirk Eddelbuettel and Murray Stokely} + +%% DE: I tend to have wider option(width=...) 
so this +%% guarantees better line breaks +<>= +options(width=65, prompt="R> ", digits=4) +@ + +\maketitle + +%TODO(de) 'protocol buffers' or 'Protocol Buffers' ? + +\section{Introduction} + +Modern data collection and analysis pipelines are increasingly being +built using collections of components to better manage software +complexity through reusability, modularity, and fault +isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +Data analysis patterns such as Split-Apply-Combine +\citep{wickham2011split} explicitly break up large problems into +manageable pieces. These patterns are frequently employed with +different programming languages used for the different phases of data +analysis -- collection, cleaning, analysis, post-processing, and +presentation -- in order to take advantage of the unique combination of +performance, speed of development, and library support offered by +different environments. Each stage of the data +analysis pipeline may involve storing intermediate results in a +file or sending them over the network. +% DE: Nice! + +Given these requirements, how do we safely share intermediate results +between different applications, possibly written in different +languages, and possibly running on different computer systems, possibly +spanning different operating systems? Programming +languages such as R, Julia, Java, and Python include built-in +serialization support, but these formats are tied to the specific +% DE: need to define serialization? +programming language in use and thus lock the user into a single +environment. CSV files can be read and written by many applications +and so are often used for exporting tabular data. However, CSV files +have a number of disadvantages, such as a limitation of exporting only +tabular datasets, lack of type-safety, inefficient text representation +and parsing, and ambiguities in the format involving special +characters. 
JSON is another widely-supported format used mostly on +the web that removes many of these disadvantages, but it too suffers +from being too slow to parse and also does not provide strong typing +between integers and floating point. Because the schema information +is not kept separately, multiple JSON messages of the same type +needlessly duplicate the field names with each message. +% +% +% +A number of binary formats based on JSON have been proposed that +reduce the parsing cost and improve the efficiency. MessagePack +\citep{msgpackR} and BSON \citep{rmongodb} both have R interfaces, but +these formats lack a separate schema for the serialized data and thus +still duplicate field names with each message sent over the network or +stored in a file. Such formats also lack support for versioning when +data storage needs evolve over time, or when application logic and +requirement changes dictate updates to the message format. +% DE: Need to talk about XML ? + +Once the data serialization needs of an application become complex +enough, developers typically benefit from the use of an +\emph{interface description language}, or \emph{IDL}. IDLs like +Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact +well-documented schema for cross-language data structures and +efficient binary interchange formats. The schema can be used to +generate model classes for statically-typed programming languages such +as C++ and Java, or can be used with reflection for dynamically-typed +programming languages. Since the schema is provided separately from +the encoded data, the data can be efficiently encoded to minimize +storage costs of the stored data when compared with simple +``schema-less'' binary interchange formats. + +% TODO(mstokely): Take a more conversational tone here asking +% questions and motivating protocol buffers? + +% TODO(mstokely): If we go to JSS, include a larger paragraph here +% referencing each numbered section. 
I don't like these generally, +% but it's useful for this paper I think because we have a boring bit +% in the middle (full class/method details) and interesting +% applications at the end. + +Section~\ref{sec:protobuf} provides a general overview of Protocol +Buffers. Section~\ref{sec:rprotobuf-basic} describes the interactive +R interface provided by \CRANpkg{RProtoBuf} and introduces the two +main abstractions: \emph{Messages} and \emph{Descriptors}. +Section~\ref{sec:rprotobuf-classes} describes the implementation +details of the main S4 classes making up this package. +Section~\ref{sec:types} describes the challenges of type coercion +between R and other languages. Section~\ref{sec:evaluation} +introduces a general R language schema for serializing arbitrary R +objects and evaluates it against R's built-in serialization. +Sections~\ref{sec:opencpu} and \ref{sec:mapreduce} provide +real-world use cases of \CRANpkg{RProtoBuf} in web service and +MapReduce environments, respectively. + +%This article describes the basics of Google's Protocol Buffers through +%an easy to use R package, \CRANpkg{RProtoBuf}. After describing the +%basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate +%several common use cases for protocol buffers in data analysis. + +\section{Protocol Buffers} +\label{sec:protobuf} + +%FIXME Introductory section which may include references in parentheses +%\citep{R}, or cite a reference such as \citet{R} in the text. + +% This content is good. Maybe use and cite? +% http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html + +%% TODO(de,ms) What follows is oooooold and was lifted from the webpage +%% Rewrite? +Protocol Buffers can be described as a modern, language-neutral, platform-neutral, +extensible mechanism for sharing and storing structured data. 
Since their +introduction, Protocol Buffers have been widely adopted in industry with +applications as varied as database-internal messaging (Drizzle), % DE: citation? +Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. While +% TODO(DE): This either needs a citation, or remove the name drop +traditional IDLs have at times been criticized for code bloat and +complexity, Protocol Buffers are based on a simple list and records +model that is comparatively flexible and simple to use. + +Some of the key features provided by Protocol Buffers for data analysis +include: + +\begin{itemize} +\item \emph{Portable}: Allows users to send and receive data between + applications or different computers. +\item \emph{Efficient}: Data is serialized into a compact binary + representation for transmission or storage. +\item \emph{Extensible}: New fields can be added to Protocol Buffer Schemas + in a forward-compatible way that does not break older applications. +\item \emph{Stable}: Protocol Buffers have been in wide use for over a + decade. +\end{itemize} + +Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example +communication workflow with protocol buffers and an interactive R +session. Common use cases include populating a request RPC protocol +buffer in R that is then serialized and sent over the network to a +remote server. The server would then deserialize the message, act on +the request, and respond with a new protocol buffer over the network. The key +difference to, say, a request to an Rserve instance is that the remote server +may not even know the R language. + +%Protocol buffers are a language-neutral, platform-neutral, extensible +%way of serializing structured data for use in communications +%protocols, data storage, and more. 
+ +%Protocol Buffers offer key features such as an efficient data interchange +%format that is both language- and operating system-agnostic yet uses a +%lightweight and highly performant encoding, object serialization and +%de-serialization as well data and configuration management. Protocol +%buffers are also forward compatible: updates to the \texttt{proto} +%files do not break programs built against the previous specification. + +%While benchmarks are not available, Google states on the project page that in +%comparison to XML, protocol buffers are at the same time \textsl{simpler}, +%between three to ten times \textsl{smaller}, between twenty and one hundred +%times \textsl{faster}, as well as less ambiguous and easier to program. + +Many sources compare data serialization formats and show that protocol +buffers compare very favorably to the alternatives; see, for example, +\citet{Sumaray:2012:CDS:2184751.2184810}. + +%The flexibility of the reflection-based API is particularly well +%suited for interactive data analysis. + +% XXX Design tradeoffs: reflection vs proto compiler + +For added speed and efficiency, the C++, Java, and Python bindings to +Protocol Buffers are used with a compiler that translates a protocol +buffer schema description file (ending in \texttt{.proto}) into +language-specific classes that can be used to create, read, write and +manipulate protocol buffer messages. The R interface, in contrast, +uses a reflection-based API that is particularly well suited for +interactive data analysis. All messages in R have a single class +structure, but different accessor methods are created at runtime based +on the field names of the specified message type. + +% In other words, given the 'proto' +%description file, code is automatically generated for the chosen +%target language(s). 
The project page contains a tutorial for each of +%these officially supported languages: +%\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} + +%The protocol buffers code is released under an open-source (BSD) license. The +%protocol buffer project (\url{http://code.google.com/p/protobuf/}) +%contains a C++ library and a set of runtime libraries and compilers for +%C++, Java and Python. + +%With these languages, the workflow follows standard practice of so-called +%Interface Description Languages (IDL) +%(c.f. \href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia +% on IDL}). This consists of compiling a protocol buffer description file +%(ending in \texttt{.proto}) into language specific classes that can be used + +%Besides the officially supported C++, Java and Python implementations, several projects have been +%created to support protocol buffers for many languages. The list of known +%languages to support protocol buffers is compiled as part of the +%project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} + +\begin{figure}[t] +\begin{center} +\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} +\end{center} +\caption{Example protobuf usage} +\label{fig:protobuf-distributed-usecase} +\end{figure} + +\section{Basic Usage: Messages and Descriptors} +\label{sec:rprotobuf-basic} + +This section describes how to use the R API to create and manipulate +protocol buffer messages in R, and how to read and write the +binary \emph{payload} of the messages to files and arbitrary binary +R connections. + +The two fundamental building blocks of Protocol Buffers are Messages +and Descriptors. Messages provide a common abstract encapsulation of +structured data fields of the type specified in a Message Descriptor. +Message Descriptors are defined in \texttt{.proto} files and define a +schema for a particular named class of messages. 
+ +Table~\ref{tab:proto} shows an example \texttt{.proto} file which +defines the \texttt{tutorial.Person} type. The R code in the right +column shows an example of creating a new message of this type and +populating its fields. + +% Commented out because we said this earlier. +%This separation +%between schema and the message objects is in contrast to +%more verbose formats like JSON, and when combined with the efficient +%binary representation of any Message object explains a large part of +%the performance and storage-space advantage offered by Protocol +%Buffers. TODO(ms): we already said some of this above. clean up. + +% lifted from protobuf page: +%With Protocol Buffers you define how you want your data to be +%structured once, and then you can read or write structured data to and +%from a variety of data streams using a variety of different +%languages. The definition + + +%% TODO(de) Can we make this not break the width of the page? +\noindent +\begin{table} +\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .02\textwidth}@{\hskip .02\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} +\toprule +Schema : \texttt{addressbook.proto} & Example R Session\\ +\cmidrule{1-2} +\begin{minipage}{.35\textwidth} +\vspace{2mm} +\begin{example} +package tutorial; +message Person { + required string name = 1; + required int32 id = 2; + optional string email = 3; + enum PhoneType { + MOBILE = 0; HOME = 1; + WORK = 2; + } + message PhoneNumber { + required string number = 1; + optional PhoneType type = 2; + } + repeated PhoneNumber phone = 4; +} +\end{example} +\vspace{2mm} +\end{minipage} & \begin{minipage}{.5\textwidth} +<>= +library(RProtoBuf) +p <- new(tutorial.Person, id=1, name="Dirk") +class(p) +p$name +p$name <- "Murray" +cat(as.character(p)) +serialize(p, NULL) +@ +\end{minipage} \\ +\bottomrule +\end{tabular} +\caption{The schema representation from a \texttt{.proto} file for the + \texttt{tutorial.Person} class (left) and simple R code for 
creating + an object of this class and accessing its fields (right).} +\label{tab:proto} +\end{table} + +%This section may contain a figure such as Figure~\ref{figure:rlogo}. +% +%\begin{figure}[htbp] +% \centering +% \includegraphics{Rlogo} +% \caption{The logo of R.} +% \label{figure:rlogo} +%\end{figure} + +\subsection{Importing Message Descriptors from .proto files} + +%The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, +%which encapsulate a data structure, Descriptors, which define the +%schema used by one or more messages, and DescriptorPools, which +%provide access to descriptors. + +Before one can create a new Protocol Buffer Message or parse a +serialized stream of bytes as a Message, one must first read in the message +type specification from a \texttt{.proto} file. + +New \texttt{.proto} files are imported with the \code{readProtoFiles} +function, which can import a single file, all files in a directory, or +all \texttt{.proto} files provided by another R package. + +The \texttt{.proto} file syntax for defining the structure of protocol +buffer data is described comprehensively on Google Code: +\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}. + +Once the proto files are imported, all message descriptors are +available in the R search path in the \texttt{RProtoBuf:DescriptorPool} +special environment. The underlying mechanism used here is +described in more detail in Section~\ref{sec-lookup}. + +<<>>= +ls("RProtoBuf:DescriptorPool") +@ + +%\subsection{Importing proto files} +%In contrast to the other languages (Java, C++, Python) that are officially +%supported by Google, the implementation used by the \texttt{RProtoBuf} +%package does not rely on the \texttt{protoc} compiler (with the exception of +%the two functions discussed in the previous section). This means that no +%initial step of statically compiling the proto file into C++ code that is +%then accessed by R code is necessary. 
Instead, \texttt{proto} files are +%parsed and processed \textsl{at runtime} by the protobuf C++ library---which +%is much more appropriate for a dynamic language. + +\subsection{Creating a message} + +New messages are created with the \texttt{new} function which accepts +a Message Descriptor and optionally a list of ``name = value'' pairs +to set in the message. +%The objects contained in the special environment are +%descriptors for their associated message types. Descriptors will be +%discussed in detail in another part of this document, but for the +%purpose of this section, descriptors are just used with the \texttt{new} +%function to create messages. + +<<>>= +p1 <- new(tutorial.Person) +p <- new(tutorial.Person, name = "Murray", id = 1) +@ + +\subsection{Access and modify fields of a message} + +Once the message is created, its fields can be queried +and modified using the dollar operator of R, making protocol +buffer messages seem like lists. + +<<>>= +p$name +p$id +p$email <- "murray at stokely.org" +@ + +However, as opposed to R lists, no partial matching is performed +and the name must be given entirely. + +The \verb|[[| operator can also be used to query and set fields +of a message, supplying either their name or their tag number : + +<<>>= +p[["name"]] <- "Murray Stokely" +p[[ 2 ]] <- 3 +p[[ "email" ]] +@ + +Protocol buffers include a 64-bit integer type, but R lacks native +64-bit integer support. A workaround is available and described in +Section~\ref{sec:int64} for working with large integer values. + +% TODO(mstokely): Document extensions here. +% There are none in addressbook.proto though. + +\subsection{Display messages} + +Protocol buffer messages and descriptors implement \texttt{show} +methods that provide basic information about the message : + +<<>>= +p +@ + +For additional information, such as for debugging purposes, +the \texttt{as.character} method provides a more complete ASCII +representation of the contents of a message. 
+ +<<>>= +writeLines(as.character(p)) +@ + +\subsection{Serializing messages} + +However, the main focus of protocol buffer messages is +efficiency. Therefore, messages are transported as a sequence +of bytes. The \texttt{serialize} method is implemented for +protocol buffer messages to serialize a message into a sequence of +bytes that represents the message. +%(raw vector in R speech) that represents the message. + +<<>>= +serialize(p, NULL) +@ + +The same method can also be used to serialize messages to files : + +<<>>= +tf1 <- tempfile() +serialize(p, tf1) +readBin(tf1, raw(0), 500) +@ + +Or to arbitrary binary connections: + +<<>>= +tf2 <- tempfile() +con <- file(tf2, open = "wb") +serialize(p, con) +close(con) +readBin(tf2, raw(0), 500) +@ + +\texttt{serialize} can also be used in a more traditional +object oriented fashion using the dollar operator : + +<<>>= +# serialize to a file +p$serialize(tf1) +# serialize to a binary connection +con <- file(tf2, open = "wb") +p$serialize(con) +close(con) +@ + + +\subsection{Parsing messages} + +The \texttt{RProtoBuf} package defines the \texttt{read} and +\texttt{readASCII} functions to read messages from files, raw vectors, +or arbitrary connections. \texttt{read} expects to read the message +payload from binary files or connections and \texttt{readASCII} parses +the human-readable ASCII output that is created with +\code{as.character}. 
+ +The binary representation of the message (often called the payload) +does not contain information that can be used to dynamically +infer the message type, so we have to provide this information +to the \texttt{read} function in the form of a descriptor : + +<<>>= +msg <- read(tutorial.Person, tf1) +writeLines(as.character(msg)) +@ + +The \texttt{input} argument of \texttt{read} can also be a binary +readable R connection, such as a binary file connection: + +<<>>= +con <- file(tf2, open = "rb") +message <- read(tutorial.Person, con) +close(con) +writeLines(as.character(message)) +@ + +Finally, the payload of the message can be used : + +<<>>= +# reading the raw vector payload of the message +payload <- readBin(tf1, raw(0), 5000) +message <- read(tutorial.Person, payload) +@ + + +\texttt{read} can also be used as a pseudo method of the descriptor +object : + +<<>>= +# reading from a file +message <- tutorial.Person$read(tf1) +# reading from a binary connection +con <- file(tf2, open = "rb") +message <- tutorial.Person$read(con) +close(con) +# read from the payload +message <- tutorial.Person$read(payload) +@ + + +\section{Under the hood: S4 Classes, Methods, and Pseudo Methods} +\label{sec:rprotobuf-classes} + +The \CRANpkg{RProtoBuf} package uses the S4 system to store +information about descriptors and messages. Using the S4 system +allows the \texttt{RProtoBuf} package to dispatch methods that are not +generic in the S3 sense, such as \texttt{new} and +\texttt{serialize}. + +Each R object stores an external pointer to an object managed by +the \texttt{protobuf} C++ library. +The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to +facilitate the integration of the R and C++ code for these objects. 
+ +% Message, Descriptor, FieldDescriptor, EnumDescriptor, +% FileDescriptor, EnumValueDescriptor +% +% grep RPB_FUNC * | grep -v define|wc -l +% 84 +% grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l +% 33 + +There are over 100 C++ functions that provide the glue code between +the member functions of the 6 primary Message and Descriptor classes +in the protobuf library. Wrapping each method individually allows us +to add user friendly custom error handling, type coercion, and +performance improvements at the cost of a more verbose +implementation. The RProtoBuf implementation in many ways motivated +the development of Rcpp Modules \citep{eddelbuettel2013exposing}, +which provide a more concise way of wrapping C++ functions and classes +in a single entity. + +The \texttt{RProtoBuf} package combines the \emph{R typical} dispatch +of the form \verb|method(object, arguments)| and the more traditional +object oriented notation \verb|object$method(arguments)|. +Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function +(defined in the \texttt{utils} package) for all classes to enable tab +completion. Completion possibilities include pseudo method names for all +classes, plus dynamic dispatch on names or types specific to a given object. + +% TODO(ms): Add column check box for doing dynamic dispatch based on type. 
+\begin{table}[h] +\centering +\begin{tabular}{|l|c|c|l|} +\hline +\textbf{Class} & \textbf{Slots} & \textbf{Methods} & \textbf{Dynamic Dispatch}\\ +\hline +\hline +Message & 2 & 20 & yes (field names)\\ +\hline +Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ +\hline +FieldDescriptor & 4 & 18 & no\\ +\hline +EnumDescriptor & 4 & 11 & yes (enum constant names)\\ +\hline +FileDescriptor & 3 & 6 & yes (message/field definitions)\\ +\hline +EnumValueDescriptor & 3 & 6 & no\\ +\hline +\end{tabular} +\end{table} + +\subsection{Messages} + +The \texttt{Message} S4 class represents Protocol Buffer Messages and +is the core abstraction of \CRANpkg{RProtoBuf}. Each \texttt{Message} +contains a pointer to a \texttt{Descriptor} which defines the schema +of the data defined in the Message, as well as a number of +\texttt{FieldDescriptors} for the individual fields of the message. A +complete list of the slots and methods for \texttt{Messages} +is available in Table~\ref{Message-methods-table}. + +\begin{table}[h] +\centering +\begin{small} +\begin{tabular}{l|p{10cm}} +\hline +\textbf{Slot} & \textbf{Description} \\ +\hline +\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ proto library. Documentation for the +\texttt{Message} class is available from the protocol buffer project page: +\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message} \\ +\hline +\texttt{type} & Fully qualified name of the message. For example a \texttt{Person} message +has its \texttt{type} slot set to \texttt{tutorial.Person} \\[.3cm] +\hline +\textbf{Method} & \textbf{Description} \\ +\hline +\texttt{has} & Indicates if a message has a given field. 
\\ +\texttt{clone} & Creates a clone of the message \\ +\texttt{isInitialized} & Indicates if a message has all its required fields set\\ +\texttt{serialize} & serialize a message to a file, binary connection, or raw vector\\ +\texttt{clear} & Clear one or several fields of a message, or the entire message\\ +\texttt{size} & The number of elements in a message field\\ +\texttt{bytesize} & The number of bytes the message would take once serialized\\ +\hline +\texttt{swap} & swap elements of a repeated field of a message\\ +\texttt{set} & set elements of a repeated field\\ +\texttt{fetch} & fetch elements of a repeated field\\ +\texttt{setExtension} & set an extension of a message\\ +\texttt{getExtension} & get the value of an extension of a message\\ +\texttt{add} & add elements to a repeated field \\ +\hline +\texttt{str} & the R structure of the message\\ +\texttt{as.character} & character representation of a message\\ +\texttt{toString} & character representation of a message (same as \texttt{as.character}) \\ +\texttt{as.list} & converts message to a named R list\\ +\texttt{update} & updates several fields of a message at once\\ +\texttt{descriptor} & get the descriptor of the message type of this message\\ +\texttt{fileDescriptor} & get the file descriptor of this message's descriptor\\ +\hline +\end{tabular} +\end{small} +\caption{\label{Message-methods-table}Description of slots and methods for the \texttt{Message} S4 class} +\end{table} + +\subsection{Descriptors} + +Descriptors describe the type of a Message. This includes what fields +a message contains and what the types of those fields are. Message +descriptors are represented in R with the \emph{Descriptor} S4 +class. The class contains the slots \texttt{pointer} and +\texttt{type}. Similarly to messages, the \verb|$| operator can be +used to retrieve descriptors that are contained in the descriptor, or +invoke pseudo-methods. 
+ +When \CRANpkg{RProtoBuf} is first loaded it calls +\texttt{readProtoFiles} to read in an example \texttt{.proto} file +included with the package. The \texttt{tutorial.Person} descriptor +and any other descriptors defined in loaded \texttt{.proto} files are +then available on the search path. + +<<>>= +# field descriptor +tutorial.Person$email + +# enum descriptor +tutorial.Person$PhoneType + +# nested type descriptor +tutorial.Person$PhoneNumber +# same as +tutorial.Person.PhoneNumber +@ + +Table~\ref{Descriptor-methods-table} provides a complete list of the +slots and avalailable methods for Descriptors. + +\begin{table}[h] +\centering +\begin{small} +\begin{tabular}{l|p{10cm}} +\hline +\textbf{Slot} & \textbf{Description} \\ +\hline +\texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the +\texttt{Descriptor} class is available from the protocol buffer project page: +\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ +\hline +\texttt{type} & Fully qualified path of the message type. \\[.3cm] +\hline +\textbf{Method} & \textbf{Description} \\ +\hline +\texttt{new} & Creates a prototype of a message described by this descriptor.\\ +\texttt{read} & Reads a message from a file or binary connection.\\ +\texttt{readASCII} & Read a message in ASCII format from a file or +text connection.\\ +\hline +\texttt{name} & Retrieve the name of the message type associated with +this descriptor.\\ +\texttt{as.character} & character representation of a descriptor\\ +\texttt{toString} & character representation of a descriptor (same as \texttt{as.character}) \\ +\texttt{as.list} & return a named +list of the field, enum, and nested descriptors included in this descriptor.\\ +\texttt{asMessage} & return DescriptorProto message. 
\\ +\hline +\texttt{fileDescriptor} & Retrieve the file descriptor of this +descriptor.\\ +\texttt{containing\_type} & Retrieve the descriptor describing the message type containing this descriptor.\\ +\texttt{field\_count} & Return the number of fields in this descriptor.\\ +\texttt{field} & Return the descriptor for the specified field in this descriptor.\\ +\texttt{nested\_type\_count} & The number of nested types in this descriptor.\\ +\texttt{nested\_type} & Return the descriptor for the specified nested +type in this descriptor.\\ +\texttt{enum\_type\_count} & The number of enum types in this descriptor.\\ +\texttt{enum\_type} & Return the descriptor for the specified enum +type in this descriptor.\\ +\hline +\end{tabular} +\end{small} +\caption{\label{Descriptor-methods-table}Description of slots and methods for the \texttt{Descriptor} S4 class} +\end{table} + +\subsection{Field Descriptors} +\label{subsec-field-descriptor} + +The class \emph{FieldDescriptor} represents field +descriptor in R. This is a wrapper S4 class around the +\texttt{google::protobuf::FieldDescriptor} C++ class. +Table~\ref{fielddescriptor-methods-table} describes the methods +defined for the \texttt{FieldDescriptor} class. + +\begin{table}[h] +\centering +\begin{small} +\begin{tabular}{l|p{10cm}} +\hline +\textbf{Slot} & \textbf{Description} \\ +\hline +\texttt{pointer} & External pointer to the \texttt{FieldDescriptor} C++ variable \\ +\hline +\texttt{name} & Simple name of the field \\ +\hline +\texttt{full\_name} & Fully qualified name of the field \\ +\hline +\texttt{type} & Name of the message type where the field is declared \\[.3cm] +\hline +\textbf{Method} & \textbf{Description} \\ +\hline +\texttt{as.character} & Character representation of a descriptor\\ +\texttt{toString} & Character +representation of a descriptor (same as \texttt{as.character}) \\ +\texttt{asMessage} & Return FieldDescriptorProto message. 
\\ +\texttt{name} & Return the name of the field descriptor.\\ +\texttt{fileDescriptor} & Return the fileDescriptor where this field is defined.\\ +\texttt{containing\_type} & Return the containing descriptor of this field.\\ +\texttt{is\_extension} & Return TRUE if this field is an extension.\\ +\texttt{number} & Gets the declared tag number of the field.\\ +\texttt{type} & Gets the type of the field.\\ +\texttt{cpp\_type} & Gets the C++ type of the field.\\ +\texttt{label} & Gets the label of a field (optional, required, or repeated).\\ +\texttt{is\_repeated} & Return TRUE if this field is repeated.\\ +\texttt{is\_required} & Return TRUE if this field is required.\\ [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 730 From noreply at r-forge.r-project.org Thu Jan 9 03:19:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 03:19:55 +0100 (CET) Subject: [Rprotobuf-commits] r731 - papers/jss Message-ID: <20140109021955.89AE01867C4@r-forge.r-project.org> Author: murray Date: 2014-01-09 03:19:54 +0100 (Thu, 09 Jan 2014) New Revision: 731 Added: papers/jss/histogram-mapreduce-diag1.pdf Log: move missing pdf from rjournal dir. 
Added: papers/jss/histogram-mapreduce-diag1.pdf =================================================================== (Binary files differ) Property changes on: papers/jss/histogram-mapreduce-diag1.pdf ___________________________________________________________________ Added: svn:mime-type + application/octet-stream From noreply at r-forge.r-project.org Thu Jan 9 03:23:02 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 03:23:02 +0100 (CET) Subject: [Rprotobuf-commits] r732 - papers/jss Message-ID: <20140109022303.1F63D1868DF@r-forge.r-project.org> Author: edd Date: 2014-01-09 03:23:01 +0100 (Thu, 09 Jan 2014) New Revision: 732 Modified: papers/jss/Makefile Log: minor tweak to makefile Modified: papers/jss/Makefile =================================================================== --- papers/jss/Makefile 2014-01-09 02:19:54 UTC (rev 731) +++ papers/jss/Makefile 2014-01-09 02:23:01 UTC (rev 732) @@ -1,13 +1,7 @@ all: clean article.pdf clean: - rm -fr article.pdf - rm -fr article.out - rm -fr article.aux - rm -fr article.log - rm -fr article.bbl - rm -fr article.blg - rm -fr article.brf + rm -fr article.{pdf,out,aux,log,bbl,blg,brf} article.pdf: article.Rnw R CMD Sweave article.Rnw From noreply at r-forge.r-project.org Thu Jan 9 03:24:20 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 03:24:20 +0100 (CET) Subject: [Rprotobuf-commits] r733 - papers/jss Message-ID: <20140109022420.EC6D518691D@r-forge.r-project.org> Author: edd Date: 2014-01-09 03:24:20 +0100 (Thu, 09 Jan 2014) New Revision: 733 Modified: papers/jss/Makefile Log: brown paper bag fix to makefile Modified: papers/jss/Makefile =================================================================== --- papers/jss/Makefile 2014-01-09 02:23:01 UTC (rev 732) +++ papers/jss/Makefile 2014-01-09 02:24:20 UTC (rev 733) @@ -1,7 +1,8 @@ all: clean article.pdf clean: - rm -fr article.{pdf,out,aux,log,bbl,blg,brf} + rm 
-fr article.pdf article.out article.aux article.log article.bbl \ + article.blg article.brf article.pdf: article.Rnw R CMD Sweave article.Rnw From noreply at r-forge.r-project.org Thu Jan 9 15:56:41 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 15:56:41 +0100 (CET) Subject: [Rprotobuf-commits] r734 - papers/jss Message-ID: <20140109145641.25BA9186A78@r-forge.r-project.org> Author: edd Date: 2014-01-09 15:56:40 +0100 (Thu, 09 Jan 2014) New Revision: 734 Modified: papers/jss/article.Rnw Log: small edits paper does NOT 'compile' without network connectivity which is BAD BAD BAD BAD Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-09 02:24:20 UTC (rev 733) +++ papers/jss/article.Rnw 2014-01-09 14:56:40 UTC (rev 734) @@ -137,18 +137,23 @@ between integers and floating point. Because the schema information is not kept separately, multiple JSON messages of the same type needlessly duplicate the field names with each message. +Lastly, XML is a well-established and widely-supported protocol with the ability to define +just about any arbitrarily complex schema. However, it pays for this +complexity with comparatively large and verbose messages, and added +complexities at the parsing side. % % % A number of binary formats based on JSON have been proposed that reduce the parsing cost and improve the efficiency. MessagePack -\citep{msgpackR} and BSON \citep{rmongodb} both have R interfaces, but +and BSON both have R interfaces \citep{msgpackR,rmongodb}, but +% DE Why do we cite these packages, but not the numerous JSON packages? these formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. 
Such formats also lack support for versioning when data storage needs evolve over time, or when application logic and requirement changes dictate update to the message format. -% DE: Need to talk about XML ? +% DE: Need to talk about XML -- added a few lines at previous paragraph Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an @@ -1060,7 +1065,7 @@ a <- new(protobuf_unittest.TestAllTypes) a$optional_bool <- TRUE a$optional_bool <- FALSE -<>= +<>= a$optional_bool <- NA <>= try(a$optional_bool <- NA,silent=TRUE) @@ -1074,7 +1079,7 @@ <<>>= as.integer(2^31-1) -<>= +<>= as.integer(2^31 - 1) + as.integer(1) <>= try(as.integer(2^31 - 1) + as.integer(1)) @@ -1421,6 +1426,7 @@ # Check that no information was lost identical(output, MASS::Animals) @ + This code suggests a method for exchanging objects between R servers, however this can also be done without protocol buffers. The main advantage of using an inter-operable format is that we can actually access R objects from within another @@ -1480,7 +1486,7 @@ OpenCPU works like the \texttt{do.call} function in R, hence all arguments are contained within a list. -<<>>= +<>= #requires httr >= 0.2.99 library(httr) library(RProtoBuf) From noreply at r-forge.r-project.org Thu Jan 9 22:11:39 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 22:11:39 +0100 (CET) Subject: [Rprotobuf-commits] r735 - in pkg: . R inst/unitTests src Message-ID: <20140109211139.A7DFA1861CF@r-forge.r-project.org> Author: murray Date: 2014-01-09 22:11:39 +0100 (Thu, 09 Jan 2014) New Revision: 735 Modified: pkg/ChangeLog pkg/R/00classes.R pkg/R/read.R pkg/inst/unitTests/runit.addressbook.R pkg/src/wrapper_Descriptor.cpp Log: Add a logical argument 'partial' to readASCII that allows one to read in incomplete message types in ASCII format. 
Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-09 14:56:40 UTC (rev 734) +++ pkg/ChangeLog 2014-01-09 21:11:39 UTC (rev 735) @@ -1,3 +1,10 @@ +2014-01-09 Murray Stokely + + * R/read.R: Add a logical argument 'partial' to readASCII that + accepts uninitialized message fragments. + * src/wrapper_Descriptor.cpp (rprotobuf): Idem + * inst/unitTests/runit.addressbook.R (test.ascii): Add tests for above. + 2014-01-04 Murray Stokely * R/wrapper_CodedInputStream.R: Accept numeric size arguments for Modified: pkg/R/00classes.R =================================================================== --- pkg/R/00classes.R 2014-01-09 14:56:40 UTC (rev 734) +++ pkg/R/00classes.R 2014-01-09 21:11:39 UTC (rev 735) @@ -203,7 +203,7 @@ switch( name, "new" = function( ... ) newProto( x, ... ) , "read" = function( input ) read( x, input ) , - "readASCII" = function( input ) readASCII( x, input ), + "readASCII" = function(input, ...) readASCII(x, input, ...), "toString" = function(...) toString(x, ...) , "as.character" = function(...) as.character(x, ...) , "as.list" = function(...) as.list(x, ...) 
, Modified: pkg/R/read.R =================================================================== --- pkg/R/read.R 2014-01-09 14:56:40 UTC (rev 734) +++ pkg/R/read.R 2014-01-09 21:11:39 UTC (rev 735) @@ -29,24 +29,28 @@ -setGeneric( "readASCII", function( descriptor, input ){ +setGeneric( "readASCII", function( descriptor, input, partial=FALSE ){ standardGeneric( "readASCII" ) } ) setMethod( "readASCII", c( descriptor = "Descriptor" , input = "character" ), -function(descriptor, input ){ - .Call( "Descriptor__readASCIIFromString", descriptor at pointer, input, PACKAGE = "RProtoBuf" ) +function(descriptor, input, partial=FALSE){ + stopifnot(is.logical(partial), length(partial) == 1) + .Call( "Descriptor__readASCIIFromString", descriptor at pointer, input, + partial, PACKAGE = "RProtoBuf" ) } ) setMethod( "readASCII", c( descriptor = "Descriptor" ), -function( descriptor, input ){ +function( descriptor, input, partial=FALSE){ + stopifnot(is.logical(partial), length(partial) == 1) if( !inherits( input, "connection" ) ){ stop( "can only read from connections" ) } wasopen <- identical( summary(input)[["opened"]], "opened" ) if( !wasopen ) open( input, "rb" ) stopifnot(summary(input)[["text"]] == "binary") - message <- .Call( "Descriptor__readASCIIFromConnection", descriptor at pointer, input, PACKAGE = "RProtoBuf" ) + message <- .Call( "Descriptor__readASCIIFromConnection", descriptor at pointer, input, + partial, PACKAGE = "RProtoBuf" ) if( !wasopen ) close( input ) message } ) Modified: pkg/inst/unitTests/runit.addressbook.R =================================================================== --- pkg/inst/unitTests/runit.addressbook.R 2014-01-09 14:56:40 UTC (rev 734) +++ pkg/inst/unitTests/runit.addressbook.R 2014-01-09 21:11:39 UTC (rev 735) @@ -66,4 +66,16 @@ # Verify that we get an exception if we forget the file() and thus treat the # path as a protobuf string. 
checkException( readASCII( tutorial.AddressBook, out.file2)) + + incomplete.msg <- new(tutorial.Person, name="Murray", email="murray at stokely.org") + tmp.file <- tempfile() + writeLines(as.character(incomplete.msg), file(tmp.file)) + + checkTrue(!incomplete.msg$isInitialized()) + # Verify we normally get an exception if we try to read an incomplete ASCII protocol buffer + checkException( tutorial.Person$readASCII(file(tmp.file))) + + # Verify we can however read it if we set partial=TRUE. + new.msg <- tutorial.Person$readASCII(file(tmp.file), TRUE) + checkEquals(incomplete.msg$name, new.msg$name) } Modified: pkg/src/wrapper_Descriptor.cpp =================================================================== --- pkg/src/wrapper_Descriptor.cpp 2014-01-09 14:56:40 UTC (rev 734) +++ pkg/src/wrapper_Descriptor.cpp 2014-01-09 21:11:39 UTC (rev 735) @@ -173,18 +173,31 @@ return (S4_Message(message)); } -RPB_FUNCTION_2(S4_Message, METHOD(readASCIIFromString), Rcpp::XPtr<GPB::Descriptor> desc, - std::string input) { +RPB_FUNCTION_3(S4_Message, METHOD(readASCIIFromString), Rcpp::XPtr<GPB::Descriptor> desc, + std::string input, bool partial) { GPB::Message* message = PROTOTYPE(desc); - if (GPB::TextFormat::ParseFromString(input, message)) { - return (S4_Message(message)); + if (partial) { + // Allow partial messages where object is not fully initialized + GPB::TextFormat::Parser parser; + parser.AllowPartialMessage(partial); + if (parser.ParseFromString(input, message)) { + return (S4_Message(message)); + } else { + throw std::range_error("Could not parse ASCII protocol buffer from text string." + " Consider setting partial=TRUE"); + } } else { - throw std::range_error("Could not parse ASCII protocol buffer from text string."); + // Default parser requires fully initialized ascii messages. 
+ if (GPB::TextFormat::ParseFromString(input, message)) { + return (S4_Message(message)); + } else { + throw std::range_error("Could not parse ASCII protocol buffer from text string."); + } } } RPB_FUNCTION_2(S4_Message, METHOD(readASCIIFromConnection), Rcpp::XPtr<GPB::Descriptor> desc, - int conn_id) { + int conn_id, bool partial) { RconnectionCopyingInputStream wrapper(conn_id); GPB::io::CopyingInputStreamAdaptor stream(&wrapper); @@ -193,14 +206,23 @@ if (!message) { throw std::range_error("could not call factory->GetPrototype(desc)->New()"); } - if (!GPB::TextFormat::Parse(&stream, message)) { - throw std::range_error("Could not parse ASCII protocol buffer."); + if (partial) { + // Allow partial messages where object is not fully initialized + GPB::TextFormat::Parser parser; + parser.AllowPartialMessage(partial); + if (!parser.Parse(&stream, message)) { + throw std::range_error("Could not parse ASCII protocol buffer."); + } } else { - if (wrapper.Failure()) { - throw std::range_error("Could not read ASCII protocol buffer."); + // Default parser requires fully initialized ascii messages. + if (!GPB::TextFormat::Parse(&stream, message)) { + throw std::range_error("Could not parse ASCII protocol buffer."); } - return (S4_Message(message)); } + if (wrapper.Failure()) { + throw std::range_error("Could not read ASCII protocol buffer."); + } + return (S4_Message(message)); } #undef METHOD From noreply at r-forge.r-project.org Thu Jan 9 22:42:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 9 Jan 2014 22:42:55 +0100 (CET) Subject: [Rprotobuf-commits] r736 - pkg/src Message-ID: <20140109214256.01AEC1853B6@r-forge.r-project.org> Author: murray Date: 2014-01-09 22:42:55 +0100 (Thu, 09 Jan 2014) New Revision: 736 Modified: pkg/src/wrapper_Descriptor.cpp Log: Oops, update rpb macro to reflect 3rd argument. 
Modified: pkg/src/wrapper_Descriptor.cpp =================================================================== --- pkg/src/wrapper_Descriptor.cpp 2014-01-09 21:11:39 UTC (rev 735) +++ pkg/src/wrapper_Descriptor.cpp 2014-01-09 21:42:55 UTC (rev 736) @@ -196,7 +196,7 @@ } } -RPB_FUNCTION_2(S4_Message, METHOD(readASCIIFromConnection), Rcpp::XPtr<GPB::Descriptor> desc, +RPB_FUNCTION_3(S4_Message, METHOD(readASCIIFromConnection), Rcpp::XPtr<GPB::Descriptor> desc, int conn_id, bool partial) { RconnectionCopyingInputStream wrapper(conn_id); GPB::io::CopyingInputStreamAdaptor stream(&wrapper); From noreply at r-forge.r-project.org Fri Jan 10 19:45:18 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 10 Jan 2014 19:45:18 +0100 (CET) Subject: [Rprotobuf-commits] r737 - pkg/src Message-ID: <20140110184518.196BB186044@r-forge.r-project.org> Author: murray Date: 2014-01-10 19:45:17 +0100 (Fri, 10 Jan 2014) New Revision: 737 Modified: pkg/src/wrapper_Descriptor.cpp Log: Move more specific help text to correct if branch. Noticed by Karl in code review. Modified: pkg/src/wrapper_Descriptor.cpp =================================================================== --- pkg/src/wrapper_Descriptor.cpp 2014-01-09 21:42:55 UTC (rev 736) +++ pkg/src/wrapper_Descriptor.cpp 2014-01-10 18:45:17 UTC (rev 737) @@ -183,15 +183,15 @@ if (parser.ParseFromString(input, message)) { return (S4_Message(message)); } else { - throw std::range_error("Could not parse ASCII protocol buffer from text string." - " Consider setting partial=TRUE"); + throw std::range_error("Could not parse ASCII protocol buffer from text string."); } } else { // Default parser requires fully initialized ascii messages. if (GPB::TextFormat::ParseFromString(input, message)) { return (S4_Message(message)); } else { - throw std::range_error("Could not parse ASCII protocol buffer from text string."); + throw std::range_error("Could not parse ASCII protocol buffer from text string." 
+ " Consider setting partial=TRUE"); } } } From noreply at r-forge.r-project.org Fri Jan 10 21:16:15 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 10 Jan 2014 21:16:15 +0100 (CET) Subject: [Rprotobuf-commits] r738 - / pkg/src windows Message-ID: <20140110201615.9C4691868F2@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-10 21:16:15 +0100 (Fri, 10 Jan 2014) New Revision: 738 Added: pkg/src/Makevars.win windows/ windows/cross-compile.sh windows/lib/ Log: adding some windows stuff Added: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win (rev 0) +++ pkg/src/Makevars.win 2014-01-10 20:16:15 UTC (rev 738) @@ -0,0 +1,7 @@ +# -*- mode: Makefile -*- +## +## The folders C:/protobuf-2.5.0/lib/{i386,x64}/ contain static libraries for windows. +RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") +PROTOBUFROOT= "C:/protobuf-2.5.0" +PKG_CPPFLAGS= -I$(PROTOBUFROOT)/src +PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf Added: windows/cross-compile.sh =================================================================== --- windows/cross-compile.sh (rev 0) +++ windows/cross-compile.sh 2014-01-10 20:16:15 UTC (rev 738) @@ -0,0 +1,26 @@ +# Script to create static libraries for windows using on mingw-w64 v3.0 +# cross compiler on Ubuntu 14.04 (older versions of mingw-w64 didn't work) + +# install cross compilers +sudo apt-get install make gcc-mingw-w64-x86-64 gcc-mingw-w64-i686 mingw-w64 + +# create output dirs +mkdir -p lib/{i386,x64} + +# get libprotobuf +wget https://protobuf.googlecode.com/files/protobuf-2.5.0.tar.gz +tar xzvf protobuf-2.5.0.tar.gz +cd protobuf-2.5.0 + +# Build for win32 +./configure --host=i686-w64-mingw32 --disable-shared +make +cp src/.libs/libprotobuf.a ../lib/i386/ +make clean + +# Build for win64 +./configure --host=x86_64-w64-mingw32 --disable-shared +make +cp src/.libs/libprotobuf.a 
../lib/x64/ +make clean + From noreply at r-forge.r-project.org Fri Jan 10 23:27:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 10 Jan 2014 23:27:55 +0100 (CET) Subject: [Rprotobuf-commits] r739 - pkg/src windows/lib windows/lib/i386 windows/lib/x64 Message-ID: <20140110222756.04481186990@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-10 23:27:55 +0100 (Fri, 10 Jan 2014) New Revision: 739 Added: windows/lib/i386/ windows/lib/i386/libprotobuf.a windows/lib/x64/ windows/lib/x64/libprotobuf.a Modified: pkg/src/Makevars.win Log: add one-time static libraries for windows for CRAN Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-10 20:16:15 UTC (rev 738) +++ pkg/src/Makevars.win 2014-01-10 22:27:55 UTC (rev 739) @@ -1,7 +1,13 @@ # -*- mode: Makefile -*- ## ## The folders C:/protobuf-2.5.0/lib/{i386,x64}/ contain static libraries for windows. -RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") -PROTOBUFROOT= "C:/protobuf-2.5.0" +## There is a problem with mingw64 and libprotobuf that results in ld reading symbols multiple times +## We can workaround it using "-Wl,-allow-multiple-definition". 
+## See also https://sourceware.org/bugzilla/show_bug.cgi?id=12762 +## +MINGW64HACK= "-Wl,-allow-multiple-definition" +RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") $(MINGW64HACK) +PROTOBUFROOT= "../protobuf-2.5.0" PKG_CPPFLAGS= -I$(PROTOBUFROOT)/src PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf + Added: windows/lib/i386/libprotobuf.a =================================================================== (Binary files differ) Property changes on: windows/lib/i386/libprotobuf.a ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: windows/lib/x64/libprotobuf.a =================================================================== (Binary files differ) Property changes on: windows/lib/x64/libprotobuf.a ___________________________________________________________________ Added: svn:mime-type + application/octet-stream From noreply at r-forge.r-project.org Fri Jan 10 23:41:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 10 Jan 2014 23:41:27 +0100 (CET) Subject: [Rprotobuf-commits] r740 - pkg Message-ID: <20140110224127.D9B83184633@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-10 23:41:27 +0100 (Fri, 10 Jan 2014) New Revision: 740 Modified: pkg/DESCRIPTION Log: windows build Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-10 22:27:55 UTC (rev 739) +++ pkg/DESCRIPTION 2014-01-10 22:41:27 UTC (rev 740) @@ -17,4 +17,3 @@ License: GPL-2 URL: http://r-forge.r-project.org/projects/rprotobuf/, http://romainfrancois.blog.free.fr/index.php?category/R-package/RProtoBuf, http://dirk.eddelbuettel.com/blog/code/rprotobuf/ BugReports: http://r-forge.r-project.org/tracker/index.php?group_id=576&atid=2338 -OS_type: unix From noreply at r-forge.r-project.org Sat Jan 11 05:02:28 2014 From: noreply at r-forge.r-project.org (noreply at 
r-forge.r-project.org) Date: Sat, 11 Jan 2014 05:02:28 +0100 (CET) Subject: [Rprotobuf-commits] r741 - in pkg: . inst/unitTests src Message-ID: <20140111040229.2E1711869F0@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-11 05:02:26 +0100 (Sat, 11 Jan 2014) New Revision: 741 Added: pkg/configure.win Modified: pkg/inst/unitTests/runit.addressbook.R pkg/src/Makevars.win Log: fixes for windows Added: pkg/configure.win =================================================================== Modified: pkg/inst/unitTests/runit.addressbook.R =================================================================== --- pkg/inst/unitTests/runit.addressbook.R 2014-01-10 22:41:27 UTC (rev 740) +++ pkg/inst/unitTests/runit.addressbook.R 2014-01-11 04:02:26 UTC (rev 741) @@ -51,12 +51,15 @@ # (better than silently getting an empty proto.) book4 <- checkException( readASCII( tutorial.AddressBook, file(out.file, "rt"))) - # Verify that we get an exception if the file is not readable. - old.mode <- file.info(out.file)[["mode"]] - Sys.chmod(out.file, "0000") - book5 <- checkException( readASCII( tutorial.AddressBook, file(out.file, "rb"))) - # Set the permissions back to ensure the file is cleaned up properly. - Sys.chmod(out.file, old.mode) + # Test does not work on windows because of chmod + if(!grepl("mingw", R.Version()$platform)){ + # Verify that we get an exception if the file is not readable. + old.mode <- file.info(out.file)[["mode"]] + Sys.chmod(out.file, "0000") + book5 <- checkException( readASCII( tutorial.AddressBook, file(out.file, "rb"))) + # Set the permissions back to ensure the file is cleaned up properly. + Sys.chmod(out.file, old.mode) + } # Verify that we get an exception if the file is not parseable. 
out.file2 <- tempfile() Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-10 22:41:27 UTC (rev 740) +++ pkg/src/Makevars.win 2014-01-11 04:02:26 UTC (rev 741) @@ -1,13 +1,14 @@ # -*- mode: Makefile -*- ## -## The folders C:/protobuf-2.5.0/lib/{i386,x64}/ contain static libraries for windows. -## There is a problem with mingw64 and libprotobuf that results in ld reading symbols multiple times -## We can workaround it using "-Wl,-allow-multiple-definition". +## The folders ../protobuf-2.5.0/lib/{i386,x64}/ contain static +libraries for windows. +## mingw64-4.8.1 has an issue that results in ld reading symbols multiple times. +## A workaround is to add "-Wl,-allow-multiple-definition" to RCPP_LDFLAGS. ## See also https://sourceware.org/bugzilla/show_bug.cgi?id=12762 +## However the problem did not appear for mingw 4.7.3. ## -MINGW64HACK= "-Wl,-allow-multiple-definition" -RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") $(MINGW64HACK) +## MINGW64HACK= "-Wl,-allow-multiple-definition" +RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") PROTOBUFROOT= "../protobuf-2.5.0" PKG_CPPFLAGS= -I$(PROTOBUFROOT)/src PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf - From noreply at r-forge.r-project.org Sat Jan 11 05:19:47 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 05:19:47 +0100 (CET) Subject: [Rprotobuf-commits] r742 - papers/jss Message-ID: <20140111041948.0572A186903@r-forge.r-project.org> Author: edd Date: 2014-01-11 05:19:46 +0100 (Sat, 11 Jan 2014) New Revision: 742 Modified: papers/jss/article.Rnw Log: a few fixes Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 04:02:26 UTC (rev 741) +++ papers/jss/article.Rnw 2014-01-11 04:19:46 UTC (rev 742) @@ -464,7 
+464,7 @@ and the name must be given entirely. The \verb|[[| operator can also be used to query and set fields -of a mesages, supplying either their name or their tag number : +of a messages, supplying either their name or their tag number : <<>>= p[["name"]] <- "Murray Stokely" @@ -740,7 +740,7 @@ @ Table~\ref{Descriptor-methods-table} provides a complete list of the -slots and avalailable methods for Descriptors. +slots and available methods for Descriptors. \begin{table}[h] \centering @@ -993,7 +993,7 @@ like Protocol Buffers is that it provides a highly portable basic type system that different language and hardware implementations can map to the most appropriate type in different environments. -Table~\ref{table-get-types} details the correspondance between the +Table~\ref{table-get-types} details the correspondence between the field type and the type of data that is retrieved by \verb|$| and \verb|[[| extractors. @@ -1033,7 +1033,7 @@ \hline \end{tabular} \end{small} -\caption{\label{table-get-types}Correspondance between field type and +\caption{\label{table-get-types}Correspondence between field type and R type retrieved by the extractors. \footnotesize{1. R lacks native 64-bit integers, so the \texttt{RProtoBuf.int64AsString} option is available to return large integers as characters to avoid losing @@ -1190,7 +1190,7 @@ \Sexpr{m} data sets could be converted to Protocol Buffers (\Sexpr{format(100*m/n,digits=1)}\%). The next section illustrates how -many bytes were usued to store the data sets under four different +many bytes were used to store the data sets under four different situations (1) normal R serialization, (2) R serialization followed by gzip, (3) normal protocol buffer serialization, (4) protocol buffer serialization followed by gzip. @@ -1362,7 +1362,7 @@ transfer of any media type, such as web pages, files or video. 
When designing systems where various components require exchange of specific data structures, we need something on top of the network protocol that prescribes -how these structures are to be respresented in messages (buffers) on the +how these structures are to be represented in messages (buffers) on the network. Protocol buffers solve exactly this problem by providing a cross platform method for serializing arbitrary structures into well defined messages, that can be exchanged using any protocol. The descriptors @@ -1473,7 +1473,7 @@ containing R objects to post to the server, as well as retrieve and parse protobuf messages returned by the server. Using protocol buffers to post function arguments is not required, and for simple (scalar) arguments -the standard \texttt{appliation/www-url-encoded} format might be sufficient. +the standard \texttt{application/www-url-encoded} format might be sufficient. However, with protocol buffers the client can perform function calls with more complex arguments such as R vectors or lists. The result is a complete RPC system to do arbitrary R function calls from within From noreply at r-forge.r-project.org Sat Jan 11 05:20:18 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 05:20:18 +0100 (CET) Subject: [Rprotobuf-commits] r743 - papers/jss Message-ID: <20140111042018.8C1BE18690C@r-forge.r-project.org> Author: edd Date: 2014-01-11 05:20:14 +0100 (Sat, 11 Jan 2014) New Revision: 743 Modified: papers/jss/article.Rnw Log: another one Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 04:19:46 UTC (rev 742) +++ papers/jss/article.Rnw 2014-01-11 04:20:14 UTC (rev 743) @@ -225,7 +225,7 @@ applications or different computers. \item \emph{Efficient}: Data is serialized into a compact binary representation for transmission or storage. 
-\item \emph{Exentsible}: New fields can be added to Protocol Buffer Schemas +\item \emph{Extensible}: New fields can be added to Protocol Buffer Schemas in a forward-compatible way that do not break older applications. \item \emph{Stable}: Protocol Buffers have been in wide use for over a decade. From noreply at r-forge.r-project.org Sat Jan 11 05:32:33 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 05:32:33 +0100 (CET) Subject: [Rprotobuf-commits] r744 - pkg/src windows Message-ID: <20140111043233.E8589186B0C@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-11 05:32:33 +0100 (Sat, 11 Jan 2014) New Revision: 744 Added: windows/protobuf-2.5.0-windows.zip windows/readme.txt Removed: windows/lib/ Modified: pkg/src/Makevars.win Log: update the windows library Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-11 04:20:14 UTC (rev 743) +++ pkg/src/Makevars.win 2014-01-11 04:32:33 UTC (rev 744) @@ -10,5 +10,5 @@ ## MINGW64HACK= "-Wl,-allow-multiple-definition" RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") PROTOBUFROOT= "../protobuf-2.5.0" -PKG_CPPFLAGS= -I$(PROTOBUFROOT)/src +PKG_CPPFLAGS= -I$(PROTOBUFROOT)/include PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf Added: windows/protobuf-2.5.0-windows.zip =================================================================== (Binary files differ) Property changes on: windows/protobuf-2.5.0-windows.zip ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: windows/readme.txt =================================================================== --- windows/readme.txt (rev 0) +++ windows/readme.txt 2014-01-11 04:32:33 UTC (rev 744) @@ -0,0 +1,4 @@ +i386 was built using msys + rtools 3.1 (which has mingw 4.6.3) +x64 was built using msys + 
mingw-x64-4.7.3-posix-sjlj-rev1 (from mingw-builds installer) + +All other combinations I tried (including mingw 4.8.1) resulted in one error or another \ No newline at end of file From noreply at r-forge.r-project.org Sat Jan 11 06:24:57 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 06:24:57 +0100 (CET) Subject: [Rprotobuf-commits] r745 - pkg/src Message-ID: <20140111052457.2D55018696E@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-11 06:24:56 +0100 (Sat, 11 Jan 2014) New Revision: 745 Modified: pkg/src/Makevars.win Log: typo Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-11 04:32:33 UTC (rev 744) +++ pkg/src/Makevars.win 2014-01-11 05:24:56 UTC (rev 745) @@ -1,14 +1,15 @@ # -*- mode: Makefile -*- ## -## The folders ../protobuf-2.5.0/lib/{i386,x64}/ contain static -libraries for windows. -## mingw64-4.8.1 has an issue that results in ld reading symbols multiple times. -## A workaround is to add "-Wl,-allow-multiple-definition" to RCPP_LDFLAGS. +## The folders ../protobuf-2.5.0/lib/{i386,x64}/ contain static libraries +## for windows. Note that mingw64-4.8.1 has an issue that results in ld +## reading symbols multiple times. As a workaround, we can add a flag +## "-Wl,-allow-multiple-definition" to RCPP_LDFLAGS. ## See also https://sourceware.org/bugzilla/show_bug.cgi?id=12762 -## However the problem did not appear for mingw 4.7.3. +## However the problem did not appear for mingw 4.7.3. So that's what we +## ended up using. 
## ## MINGW64HACK= "-Wl,-allow-multiple-definition" -RCPP_LDFLAGS = $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") +RCPP_LDFLAGS= $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") PROTOBUFROOT= "../protobuf-2.5.0" PKG_CPPFLAGS= -I$(PROTOBUFROOT)/include PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf From noreply at r-forge.r-project.org Sat Jan 11 06:53:01 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 06:53:01 +0100 (CET) Subject: [Rprotobuf-commits] r746 - papers/jss Message-ID: <20140111055301.72F85186799@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-11 06:53:01 +0100 (Sat, 11 Jan 2014) New Revision: 746 Modified: papers/jss/article.Rnw Log: fix Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 05:24:56 UTC (rev 745) +++ papers/jss/article.Rnw 2014-01-11 05:53:01 UTC (rev 746) @@ -1473,7 +1473,7 @@ containing R objects to post to the server, as well as retrieve and parse protobuf messages returned by the server. Using protocol buffers to post function arguments is not required, and for simple (scalar) arguments -the standard \texttt{application/www-url-encoded} format might be sufficient. +the standard \texttt{application/x-www-form-urlencoded} format might be sufficient. However, with protocol buffers the client can perform function calls with more complex arguments such as R vectors or lists. 
The result is a complete RPC system to do arbitrary R function calls from within From noreply at r-forge.r-project.org Sat Jan 11 17:39:52 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 17:39:52 +0100 (CET) Subject: [Rprotobuf-commits] r747 - papers/jss Message-ID: <20140111163953.0B3BF180983@r-forge.r-project.org> Author: edd Date: 2014-01-11 17:39:52 +0100 (Sat, 11 Jan 2014) New Revision: 747 Modified: papers/jss/eddelbuettel-stokely.bib Log: Firstname Lastname and Otherfirst OtherLast ordering for authors Modified: papers/jss/eddelbuettel-stokely.bib =================================================================== --- papers/jss/eddelbuettel-stokely.bib 2014-01-11 05:53:01 UTC (rev 746) +++ papers/jss/eddelbuettel-stokely.bib 2014-01-11 16:39:52 UTC (rev 747) @@ -1,6 +1,6 @@ @article{eddelbuettel2011rcpp, title={Rcpp: Seamless R and C++ integration}, - author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain}, + author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, journal={Journal of Statistical Software}, volume={40}, number={8}, @@ -23,21 +23,21 @@ } @Manual{int64, title = {int64: 64 bit integer types}, - author = {Romain Francois}, + author = {Romain Fran{\c{c}}ois}, year = {2011}, note = {R package version 1.1.2}, url = {http://CRAN.R-project.org/package=int64}, } @Manual{bit64, title = {bit64: A S3 class for vectors of 64bit integers}, - author = {Jens Oehlschl?gel}, + author = {Jens Oehlschl\"{a}gel}, year = {2012}, note = {R package version 0.9-3}, url = {http://CRAN.R-project.org/package=bit64}, } @book{eddelbuettel2013seamless, title={Seamless R and C++ Integration with Rcpp}, - author={Eddelbuettel, Dirk}, + author={Dirk Eddelbuettel}, year={2013}, publisher={Springer} } @@ -48,28 +48,28 @@ url = {http://www.stat.purdue.edu/~sguha/rhipe/}, } @misc{serialization, -author= {Tierney, Luke}, +author= {Luke Tierney}, title = {A New Serialization Mechanism for R}, url = 
{http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, year = {2003}, } @manual{eddelbuettel2013exposing, title={Exposing C++ functions and classes with Rcpp modules}, - author={Eddelbuettel, Dirk and Fran{\c{c}}ois, Romain}, + author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, year={2013}, note={Vignette included in R package Rcpp}, url = {http://CRAN.R-project.org/package=Rcpp}, } @inproceedings{cantrill2004dynamic, title={Dynamic Instrumentation of Production Systems.}, - author={Cantrill, Bryan and Shapiro, Michael W and Leventhal, Adam H and others}, + author={Bryan Cantrill and Michael W Shapiro and Adam H Leventhal and others}, booktitle={USENIX Annual Technical Conference, General Track}, pages={15--28}, year={2004} } @article{swain1991color, title={Color indexing}, - author={Swain, Michael J and Ballard, Dana H}, + author={Michael J Swain and Dana H Ballard}, journal={International journal of computer vision}, volume={7}, number={1}, @@ -79,7 +79,7 @@ } @article{rubner2000earth, title={The earth mover's distance as a metric for image retrieval}, - author={Rubner, Yossi and Tomasi, Carlo and Guibas, Leonidas J}, + author={Yossi Rubner and Carlo Tomasi and Leonidas J Guibas}, journal={International Journal of Computer Vision}, volume={40}, number={2}, @@ -89,13 +89,13 @@ } @book{kullback1997information, title={Information theory and statistics}, - author={Kullback, Solomon}, + author={Solomon Kullback}, year={1997}, publisher={Courier Dover Publications} } @inproceedings{puzicha1997non, title={Non-parametric similarity measures for unsupervised texture segmentation and image retrieval}, - author={Puzicha, Jan and Hofmann, Thomas and Buhmann, Joachim M}, + author={Jan Puzicha and Thomas Hofmann and Joachim M Buhmann}, booktitle={Computer Vision and Pattern Recognition, 1997. 
Proceedings., 1997 IEEE Computer Society Conference on}, pages={267--272}, year={1997}, @@ -103,7 +103,7 @@ } @inproceedings{fang1999computing, title={Computing Iceberg Queries Efficiently.}, - author={Fang, Min and Shivakumar, Narayanan and Garcia-Molina, Hector and Motwani, Rajeev and Ullman, Jeffrey D}, + author={Min Fang and Narayanan Shivakumar and Hector Garcia-Molina and Rajeev Motwani and Jeffrey D Ullman}, booktitle={Internaational Conference on Very Large Databases (VLDB'98), New York, August 1998}, year={1999}, organization={Stanford InfoLab} @@ -116,7 +116,7 @@ url = {http://cran.r-project.org/package=emdist}, } @article{Wegiel:2010:CTT:1932682.1869479, - author = {Wegiel, Michal and Krintz, Chandra}, + author = {Michal Wegiel and Chandra Krintz}, title = {Cross-language, Type-safe, and Transparent Object Sharing for Co-located Managed Runtimes}, journal = {SIGPLAN Not.}, issue_date = {October 2010}, @@ -136,7 +136,7 @@ } @article{wickham2011split, title={The split-apply-combine strategy for data analysis}, - author={Wickham, Hadley}, + author={Hadley Wickham}, journal={Journal of Statistical Software}, volume={40}, number={1}, @@ -145,7 +145,7 @@ publisher={Citeseer} } @inproceedings{Sumaray:2012:CDS:2184751.2184810, - author = {Sumaray, Audie and Makki, S. Kami}, + author = {Audie Sumaray and S. 
Kami Makki}, title = {A Comparison of Data Serialization Formats for Optimal Efficiency on a Mobile Platform}, booktitle = {Proceedings of the 6th International Conference on Ubiquitous Information Management and Communication}, series = {ICUIMC '12}, @@ -164,7 +164,7 @@ } @Manual{RObjectTables, title = {User-Defined Tables in the R Search Path}, - author = {Duncan Temple Lang}, + author = {Duncan {Temple Lang}}, year = {2012}, url = {http://www.omegahat.org/RObjectTables/RObjectTables.pdf}, } @@ -185,7 +185,7 @@ } @article{dean2008mapreduce, title={MapReduce: simplified data processing on large clusters}, - author={Dean, Jeffrey and Ghemawat, Sanjay}, + author={Jeffrey Dean and Sanjay Ghemawat}, journal={Communications of the ACM}, volume={51}, number={1}, @@ -195,7 +195,7 @@ } @article{bostock2011d3, title={D$^3$ Data-Driven Documents}, - author={Bostock, Michael and Ogievetsky, Vadim and Heer, Jeffrey}, + author={Michael Bostock and Vadim Ogievetsky and Jeffrey Heer}, journal={Visualization and Computer Graphics, IEEE Transactions on}, volume={17}, number={12}, @@ -205,7 +205,7 @@ } % celebrated article in this field. Also see the parallel paragraph. @article{Manku:1998:AMO:276305.276342, - author = {Manku, Gurmeet Singh and Rajagopalan, Sridhar and Lindsay, Bruce G.}, + author = {Gurmeet Singh Manku and Sridhar Rajagopalan and Bruce G. Lindsay}, title = {Approximate medians and other quantiles in one pass and with limited memory}, journal = {SIGMOD Rec.}, issue_date = {June 1998}, @@ -224,7 +224,7 @@ } % Has a section on protocol buffers @article{Pike:2005:IDP:1239655.1239658, - author = {Pike, Rob and Dorward, Sean and Griesemer, Robert and Quinlan, Sean}, + author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan}, title = {Interpreting the data: Parallel analysis with Sawzall}, journal = {Sci. 
Program.}, issue_date = {October 2005}, @@ -247,7 +247,7 @@ } @article{sturges1926choice, title={The choice of a class interval}, - author={Sturges, Herbert A}, + author={Herbert A Sturges}, journal={Journal of the American Statistical Association}, volume={21}, number={153}, @@ -263,7 +263,7 @@ } @article{scott1979optimal, title={On optimal and data-based histograms}, - author={Scott, David W}, + author={David W Scott}, journal={Biometrika}, volume={66}, number={3}, @@ -273,7 +273,7 @@ } @book{scott2009multivariate, title={Multivariate density estimation: theory, practice, and visualization}, - author={Scott, David W}, + author={David W Scott}, volume={383}, year={2009}, publisher={Wiley. com} From noreply at r-forge.r-project.org Sat Jan 11 18:17:46 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 18:17:46 +0100 (CET) Subject: [Rprotobuf-commits] r748 - pkg Message-ID: <20140111171746.93766186746@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-11 18:17:46 +0100 (Sat, 11 Jan 2014) New Revision: 748 Modified: pkg/configure.win Log: configure.win Modified: pkg/configure.win =================================================================== --- pkg/configure.win 2014-01-11 16:39:52 UTC (rev 747) +++ pkg/configure.win 2014-01-11 17:17:46 UTC (rev 748) @@ -0,0 +1,5 @@ +# This is a temporary solution for when the headers/lib are not avaialble on the machine +"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' +download.file("http://www.stat.ucla.edu/~jeroen/files/protobuf-2.5.0-windows.zip", "lib.zip"); +unzip("lib.zip"); +' \ No newline at end of file From noreply at r-forge.r-project.org Sat Jan 11 18:28:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 18:28:04 +0100 (CET) Subject: [Rprotobuf-commits] r749 - papers/jss Message-ID: <20140111172804.99C181867ED@r-forge.r-project.org> Author: edd Date: 2014-01-11 18:28:04 +0100 (Sat, 11 Jan 2014) New Revision: 
749 Added: papers/jss/article.bib Removed: papers/jss/eddelbuettel-stokely.bib Modified: papers/jss/article.Rnw Log: rename bib file Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 17:17:46 UTC (rev 748) +++ papers/jss/article.Rnw 2014-01-11 17:28:04 UTC (rev 749) @@ -14,8 +14,9 @@ \DefineVerbatimEnvironment{example}{Verbatim}{} %% almost as usual -\author{Dirk Eddelbuettel\\Debian and R Projects \And - Murray Stokely\\Google, Inc} +\author{Dirk Eddelbuettel\\Debian Project \And + Murray Stokely\\Google, Inc \And + Jeroen Ooms\\UCLA} \title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} %% for pretty printing and a nice hypersummary also set: @@ -50,7 +51,11 @@ %% The address of (at least) one author should be given %% in the following format: \Address{ - Dirk Eddelbuettel\\ + Dirk Eddelbuettel \\ + Debian Project \\ + River Forest, IL, USA\\ + E-mail: \email{edd at debian.org}\\ + URL: \url{http://dirk.eddelbuettel.com}\\ \\ Murray Stokely\\ Google, Inc.\\ @@ -58,7 +63,13 @@ Mountain View, CA 94040\\ USA\\ E-mail: \email{mstokely at google.com}\\ - URL: \url{http://www.stokely.org/} + URL: \url{http://www.stokely.org/}\\ + \\ + Jeroen Ooms\\ + UCLA Department of Statistics\\ + University of California\\ + E-mail: \email{jeroen.ooms at stat.ucla.edu}\\ + URL: \url{http://jeroenooms.github.io} } %% It is also possible to add a telephone and fax number %% before the e-mail in the following format: @@ -636,24 +647,23 @@ % TODO(ms): Add column check box for doing dynamic dispatch based on type. 
\begin{table}[h] \centering -\begin{tabular}{|l|c|c|l|} -\hline -\textbf{Class} & \textbf{Slots} & \textbf{Methods} & \textbf{Dynamic Dispatch}\\ -\hline -\hline -Message & 2 & 20 & yes (field names)\\ -\hline -Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ -\hline -FieldDescriptor & 4 & 18 & no\\ -\hline -EnumDescriptor & 4 & 11 & yes (enum constant names)\\ -\hline -FileDescriptor & 3 & 6 & yes (message/field definitions)\\ -\hline -EnumValueDescriptor & 3 & 6 & no\\ -\hline +\begin{tabular}{lccl} +\toprule +\textbf{Class} & + \textbf{Slots} & + \textbf{Methods} & + \textbf{Dynamic Dispatch}\\ +\cmidrule{1-4} +Message & 2 & 20 & yes (field names)\\ +Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ +FieldDescriptor & 4 & 18 & no\\ +EnumDescriptor & 4 & 11 & yes (enum constant names)\\ +FileDescriptor & 3 & \phantom{1}6 & yes (message/field definitions)\\ +EnumValueDescriptor & 3 & \phantom{1}6 & no\\ +\bottomrule \end{tabular} +\caption{\label{Message-methods-table}Overview of Class, Slot, Method and + Dispatch Relationships} \end{table} \subsection{Messages} @@ -669,34 +679,32 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline +\begin{tabular}{lp{10cm}} +\toprule \textbf{Slot} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{Message} object of the C++ proto library. Documentation for the \texttt{Message} class is available from the protocol buffer project page: \url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message} \\ -\hline \texttt{type} & Fully qualified name of the message. For example a \texttt{Person} message has its \texttt{type} slot set to \texttt{tutorial.Person} \\[.3cm] -\hline \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{has} & Indicates if a message has a given field. 
\\ \texttt{clone} & Creates a clone of the message \\ \texttt{isInitialized} & Indicates if a message has all its required fields set\\ \texttt{serialize} & serialize a message to a file, binary connection, or raw vector\\ \texttt{clear} & Clear one or several fields of a message, or the entire message\\ \texttt{size} & The number of elements in a message field\\ -\texttt{bytesize} & The number of bytes the message would take once serialized\\ -\hline +\texttt{bytesize} & The number of bytes the message would take once serialized\\[3mm] +% \texttt{swap} & swap elements of a repeated field of a message\\ \texttt{set} & set elements of a repeated field\\ \texttt{fetch} & fetch elements of a repeated field\\ \texttt{setExtension} & set an extension of a message\\ \texttt{getExtension} & get the value of an extension of a message\\ -\texttt{add} & add elements to a repeated field \\ -\hline +\texttt{add} & add elements to a repeated field \\[3mm] +% \texttt{str} & the R structure of the message\\ \texttt{as.character} & character representation of a message\\ \texttt{toString} & character representation of a message (same as \texttt{as.character}) \\ @@ -745,23 +753,21 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline +\begin{tabular}{lp{10cm}} +\toprule \textbf{Slot} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the \texttt{Descriptor} class is available from the protocol buffer project page: \url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ -\hline \texttt{type} & Fully qualified path of the message type. 
\\[.3cm] -\hline +% \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{new} & Creates a prototype of a message described by this descriptor.\\ \texttt{read} & Reads a message from a file or binary connection.\\ \texttt{readASCII} & Read a message in ASCII format from a file or text connection.\\ -\hline \texttt{name} & Retrieve the name of the message type associated with this descriptor.\\ \texttt{as.character} & character representation of a descriptor\\ @@ -769,7 +775,6 @@ \texttt{as.list} & return a named list of the field, enum, and nested descriptors included in this descriptor.\\ \texttt{asMessage} & return DescriptorProto message. \\ -\hline \texttt{fileDescriptor} & Retrieve the file descriptor of this descriptor.\\ \texttt{containing\_type} & Retrieve the descriptor describing the message type containing this descriptor.\\ @@ -781,7 +786,7 @@ \texttt{enum\_type\_count} & The number of enum types in this descriptor.\\ \texttt{enum\_type} & Return the descriptor for the specified enum type in this descriptor.\\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{Descriptor-methods-table}Description of slots and methods for the \texttt{Descriptor} S4 class} @@ -799,23 +804,19 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline +\begin{tabular}{lp{10cm}} +\toprule \textbf{Slot} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{FieldDescriptor} C++ variable \\ -\hline \texttt{name} & Simple name of the field \\ -\hline \texttt{full\_name} & Fully qualified name of the field \\ -\hline \texttt{type} & Name of the message type where the field is declared \\[.3cm] -\hline +% \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{as.character} & Character representation of a descriptor\\ -\texttt{toString} & Character -representation of a descriptor (same as \texttt{as.character}) \\ +\texttt{toString} & Character 
representation of a descriptor (same as \texttt{as.character}) \\ \texttt{asMessage} & Return FieldDescriptorProto message. \\ \texttt{name} & Return the name of the field descriptor.\\ \texttt{fileDescriptor} & Return the fileDescriptor where this field is defined.\\ @@ -832,7 +833,7 @@ \texttt{default\_value} & Return the default value.\\ \texttt{message\_type} & Return the message type if this is a message type field.\\ \texttt{enum\_type} & Return the enum type if this is an enum type field.\\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{fielddescriptor-methods-table}Description of slots and @@ -863,20 +864,17 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline +\begin{tabular}{lp{10cm}} +\toprule \textbf{Slot} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{EnumDescriptor} C++ variable \\ -\hline \texttt{name} & Simple name of the enum \\ -\hline \texttt{full\_name} & Fully qualified name of the enum \\ -\hline \texttt{type} & Name of the message type where the enum is declared \\[.3cm] -\hline +% \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{as.list} & return a named integer vector with the values of the enum and their names.\\ \texttt{as.character} & character representation of a descriptor\\ @@ -890,7 +888,7 @@ \texttt{has} & Return TRUE if this enum contains the specified named constant string.\\ \texttt{value\_count} & Return the number of constants in this enum (same as \texttt{length}).\\ \texttt{value} & Return the EnumValueDescriptor of an enum value of specified index, name, or number.\\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{enumdescriptor-methods-table}Description of slots and methods for the \texttt{EnumDescriptor} S4 class} @@ -916,27 +914,24 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{slot} & \textbf{description} \\ -\hline 
+\begin{tabular}{lp{10cm}} +\toprule +\textbf{Slot} & \textbf{Description} \\ +\cmidrule(r){2-2} \texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the C++ proto library. Documentation for the \texttt{FileDescriptor} class is available from the protocol buffer project page: \url{http://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.descriptor.html#FileDescriptor} \\ -\hline \texttt{filename} & fully qualified pathname of the \texttt{.proto} file.\\ -\hline \texttt{package} & package name defined in this \texttt{.proto} file.\\[.3cm] -\hline \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{name} & Return the filename for this FileDescriptorProto.\\ \texttt{package} & Return the file-level package name specified in this FileDescriptorProto.\\ \texttt{as.character} & character representation of a descriptor. \\ \texttt{toString} & character representation of a descriptor (same as \texttt{as.character}). \\ \texttt{asMessage} & return FileDescriptorProto message. \\ \texttt{as.list} & return named list of descriptors defined in this file descriptor.\\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{filedescriptor-methods-table}Description of slots and methods for the \texttt{FileDescriptor} S4 class} @@ -961,25 +956,23 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{l|p{10cm}} -\hline -\textbf{slot} & \textbf{description} \\ -\hline +\begin{tabular}{lp{10cm}} +\toprule +\textbf{Slot} & \textbf{Description} \\ +\cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{EnumValueDescriptor} C++ variable \\ -\hline \texttt{name} & simple name of the enum value \\ -\hline \texttt{full\_name} & fully qualified name of the enum value \\[.3cm] -\hline +% \textbf{Method} & \textbf{Description} \\ -\hline +\cmidrule(r){2-2} \texttt{number} & return the number of this EnumValueDescriptor. 
\\ \texttt{name} & Return the name of the enum value descriptor.\\ \texttt{enum\_type} & return the EnumDescriptor type of this EnumValueDescriptor. \\ \texttt{as.character} & character representation of a descriptor. \\ \texttt{toString} & character representation of a descriptor (same as \texttt{as.character}). \\ \texttt{asMessage} & return EnumValueDescriptorProto message. \\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{EnumValueDescriptor-methods-table}Description of slots @@ -1000,37 +993,29 @@ \begin{table}[h] \centering \begin{small} -\begin{tabular}{|c|p{5cm}p{5cm}|} -\hline -field type & R type (non repeated) & R type (repeated) \\ -\hline -\hline +\begin{tabular}{lp{5cm}p{5cm}} +\toprule +Field type & R type (non repeated) & R type (repeated) \\ +\cmidrule(r){2-3} double & \texttt{double} vector & \texttt{double} vector \\ -float & \texttt{double} vector & \texttt{double} vector \\ -\hline +float & \texttt{double} vector & \texttt{double} vector \\[3mm] uint32 & \texttt{double} vector & \texttt{double} vector \\ -fixed32 & \texttt{double} vector & \texttt{double} vector \\ -\hline +fixed32 & \texttt{double} vector & \texttt{double} vector \\[3mm] int32 & \texttt{integer} vector & \texttt{integer} vector \\ sint32 & \texttt{integer} vector & \texttt{integer} vector \\ -sfixed32 & \texttt{integer} vector & \texttt{integer} vector \\ -\hline +sfixed32 & \texttt{integer} vector & \texttt{integer} vector \\[3mm] int64 & \texttt{integer} or \texttt{character} vector \footnotemark & \texttt{integer} or \texttt{character} vector \\ uint64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ sint64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ fixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ -sfixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} 
vector \\ -\hline -bool & \texttt{logical} vector & \texttt{logical} vector \\ -\hline +sfixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\\hline +bool & \texttt{logical} vector & \texttt{logical} vector \\[3mm] string & \texttt{character} vector & \texttt{character} vector \\ -bytes & \texttt{character} vector & \texttt{character} vector \\ -\hline -enum & \texttt{integer} vector & \texttt{integer} vector \\ -\hline +bytes & \texttt{character} vector & \texttt{character} vector \\[3mm] +enum & \texttt{integer} vector & \texttt{integer} vector \\[3mm] message & \texttt{S4} object of class \texttt{Message} & \texttt{list} of \texttt{S4} objects of class \texttt{Message} \\ -\hline +\bottomrule \end{tabular} \end{small} \caption{\label{table-get-types}Correspondence between field type and @@ -1240,68 +1225,69 @@ % Fri Dec 27 17:00:03 2013 \begin{table}[h!] \begin{center} + \small \scalebox{0.9}{ -\begin{tabular}{l|r|r|r|r|r} - \hline -Data Set & object.size & \multicolumn{2}{c|}{R Serialization} & -\multicolumn{2}{c}{RProtoBuf Serialization} \\ - & & Default & gzipped & Default & gzipped \\ - \hline -uspop & 584.00 & 268 & 172 & 211 & 148 \\ - Titanic & 1960.00 & 633 & 257 & 481 & 249 \\ - volcano & 42656.00 & 42517 & 5226 & 42476 & 4232 \\ - euro.cross & 2728.00 & 1319 & 910 & 1207 & 891 \\ - attenu & 14568.00 & 8234 & 2165 & 7771 & 2336 \\ - ToothGrowth & 2568.00 & 1486 & 349 & 1239 & 391 \\ - lynx & 1344.00 & 1028 & 429 & 971 & 404 \\ - nottem & 2352.00 & 2036 & 627 & 1979 & 641 \\ - sleep & 2752.00 & 746 & 282 & 483 & 260 \\ - co2 & 4176.00 & 3860 & 1473 & 3803 & 1453 \\ - austres & 1144.00 & 828 & 439 & 771 & 410 \\ - ability.cov & 1944.00 & 716 & 357 & 589 & 341 \\ - EuStockMarkets & 60664.00 & 59785 & 21232 & 59674 & 19882 \\ - treering & 64272.00 & 63956 & 17647 & 63900 & 17758 \\ - freeny.x & 1944.00 & 1445 & 1311 & 1372 & 1289 \\ - Puromycin & 2088.00 & 813 & 306 & 620 & 320 \\ - warpbreaks & 2768.00 & 
1231 & 310 & 811 & 343 \\ - BOD & 1088.00 & 334 & 182 & 226 & 168 \\ - sunspots & 22992.00 & 22676 & 6482 & 22620 & 6742 \\ - beaver2 & 4184.00 & 3423 & 751 & 3468 & 840 \\ - anscombe & 2424.00 & 991 & 375 & 884 & 352 \\ - esoph & 5624.00 & 3111 & 548 & 2240 & 665 \\ - PlantGrowth & 1680.00 & 646 & 303 & 459 & 314 \\ - infert & 15848.00 & 14328 & 1172 & 13197 & 1404 \\ - BJsales & 1632.00 & 1316 & 496 & 1259 & 465 \\ - stackloss & 1688.00 & 917 & 293 & 844 & 283 \\ - crimtab & 7936.00 & 4641 & 713 & 1655 & 576 \\ - LifeCycleSavings & 6048.00 & 3014 & 1420 & 2825 & 1407 \\ - Harman74.cor & 9144.00 & 6056 & 2045 & 5861 & 2070 \\ - nhtemp & 912.00 & 596 & 240 & 539 & 223 \\ - faithful & 5136.00 & 4543 & 1339 & 4936 & 1776 \\ - freeny & 5296.00 & 2465 & 1518 & 2271 & 1507 \\ - discoveries & 1232.00 & 916 & 199 & 859 & 180 \\ - state.x77 & 7168.00 & 4251 & 1754 & 4068 & 1756 \\ - pressure & 1096.00 & 498 & 277 & 427 & 273 \\ - fdeaths & 1008.00 & 692 & 291 & 635 & 272 \\ - euro & 976.00 & 264 & 186 & 202 & 161 \\ - LakeHuron & 1216.00 & 900 & 420 & 843 & 404 \\ - mtcars & 6736.00 & 3798 & 1204 & 3633 & 1206 \\ - precip & 4992.00 & 1793 & 813 & 1615 & 815 \\ - state.area & 440.00 & 422 & 246 & 405 & 235 \\ - attitude & 3024.00 & 1990 & 544 & 1920 & 561 \\ - randu & 10496.00 & 9794 & 8859 & 10441 & 9558 \\ - state.name & 3088.00 & 844 & 408 & 724 & 415 \\ - airquality & 5496.00 & 4551 & 1241 & 2874 & 1294 \\ - airmiles & 624.00 & 308 & 170 & 251 & 148 \\ - quakes & 33112.00 & 32246 & 9898 & 29063 & 11595 \\ - islands & 3496.00 & 1232 & 563 & 1098 & 561 \\ - OrchardSprays & 3600.00 & 2164 & 445 & 1897 & 483 \\ - WWWusage & 1232.00 & 916 & 274 & 859 & 251 \\ - \hline +\begin{tabular}{lrrrrr} + \toprule + Data Set & object.size & \multicolumn{2}{c}{R Serialization} & + \multicolumn{2}{c}{RProtoBuf Serial.} \\ + & & default & gzipped & default & gzipped \\ + \cmidrule(r){2-6} + uspop & 584 & 268 & 172 & 211 & 148 \\ + Titanic & 1960 & 633 & 257 & 481 & 249 \\ + volcano & 
42656 & 42517 & 5226 & 42476 & 4232 \\ + euro.cross & 2728 & 1319 & 910 & 1207 & 891 \\ + attenu & 14568 & 8234 & 2165 & 7771 & 2336 \\ + ToothGrowth & 2568 & 1486 & 349 & 1239 & 391 \\ + lynx & 1344 & 1028 & 429 & 971 & 404 \\ + nottem & 2352 & 2036 & 627 & 1979 & 641 \\ + sleep & 2752 & 746 & 282 & 483 & 260 \\ + co2 & 4176 & 3860 & 1473 & 3803 & 1453 \\ + austres & 1144 & 828 & 439 & 771 & 410 \\ + ability.cov & 1944 & 716 & 357 & 589 & 341 \\ + EuStockMarkets & 60664 & 59785 & 21232 & 59674 & 19882 \\ + treering & 64272 & 63956 & 17647 & 63900 & 17758 \\ + freeny.x & 1944 & 1445 & 1311 & 1372 & 1289 \\ + Puromycin & 2088 & 813 & 306 & 620 & 320 \\ + warpbreaks & 2768 & 1231 & 310 & 811 & 343 \\ + BOD & 1088 & 334 & 182 & 226 & 168 \\ + sunspots & 22992 & 22676 & 6482 & 22620 & 6742 \\ + beaver2 & 4184 & 3423 & 751 & 3468 & 840 \\ + anscombe & 2424 & 991 & 375 & 884 & 352 \\ + esoph & 5624 & 3111 & 548 & 2240 & 665 \\ + PlantGrowth & 1680 & 646 & 303 & 459 & 314 \\ + infert & 15848 & 14328 & 1172 & 13197 & 1404 \\ + BJsales & 1632 & 1316 & 496 & 1259 & 465 \\ + stackloss & 1688 & 917 & 293 & 844 & 283 \\ + crimtab & 7936 & 4641 & 713 & 1655 & 576 \\ + LifeCycleSavings & 6048 & 3014 & 1420 & 2825 & 1407 \\ + Harman74.cor & 9144 & 6056 & 2045 & 5861 & 2070 \\ + nhtemp & 912 & 596 & 240 & 539 & 223 \\ + faithful & 5136 & 4543 & 1339 & 4936 & 1776 \\ + freeny & 5296 & 2465 & 1518 & 2271 & 1507 \\ + discoveries & 1232 & 916 & 199 & 859 & 180 \\ + state.x77 & 7168 & 4251 & 1754 & 4068 & 1756 \\ + pressure & 1096 & 498 & 277 & 427 & 273 \\ + fdeaths & 1008 & 692 & 291 & 635 & 272 \\ + euro & 976 & 264 & 186 & 202 & 161 \\ + LakeHuron & 1216 & 900 & 420 & 843 & 404 \\ + mtcars & 6736 & 3798 & 1204 & 3633 & 1206 \\ + precip & 4992 & 1793 & 813 & 1615 & 815 \\ + state.area & 440 & 422 & 246 & 405 & 235 \\ + attitude & 3024 & 1990 & 544 & 1920 & 561 \\ + randu & 10496 & 9794 & 8859 & 10441 & 9558 \\ + state.name & 3088 & 844 & 408 & 724 & 415 \\ + airquality & 5496 & 4551 
& 1241 & 2874 & 1294 \\ + airmiles & 624 & 308 & 170 & 251 & 148 \\ + quakes & 33112 & 32246 & 9898 & 29063 & 11595 \\ + islands & 3496 & 1232 & 563 & 1098 & 561 \\ + OrchardSprays & 3600 & 2164 & 445 & 1897 & 483 \\ + WWWusage & 1232 & 916 & 274 & 859 & 251 \\ + \bottomrule \end{tabular} } -\caption{Serialization sizes with R's built-in serialization and - RProtoBuf for 50 sample R datasets.} +\caption{Serialization sizes for default serialization in R and + RProtoBuf for 50 R datasets.} \label{tab:compression} \end{center} \end{table} @@ -1637,7 +1623,7 @@ helpful in reviewing code or offering suggestions. The contemporaneous work by Saptarshi Guha on \pkg{RHIPE} was a strong initial motivator. -\bibliography{eddelbuettel-stokely} +\bibliography{article} %\section[About Java]{About \proglang{Java}} %% Note: If there is markup in \(sub)section, then it has to be escape as above. Copied: papers/jss/article.bib (from rev 747, papers/jss/eddelbuettel-stokely.bib) =================================================================== --- papers/jss/article.bib (rev 0) +++ papers/jss/article.bib 2014-01-11 17:28:04 UTC (rev 749) @@ -0,0 +1,294 @@ + at article{eddelbuettel2011rcpp, + title={Rcpp: Seamless R and C++ integration}, + author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, + journal={Journal of Statistical Software}, + volume={40}, + number={8}, + pages={1--18}, + year={2011} +} + at Manual{msgpackR, + title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, + author = {Mikiya Tanizawa}, + year = {2013}, + note = {R package version 1.1}, + url = {http://CRAN.R-project.org/package=msgpackR}, +} + at Manual{rmongodb, + title={rmongodb: R-MongoDB driver}, + author={Gerald Lindsly}, + year = {2013}, + note = {R package version 1.3.3}, + url = {http://CRAN.R-project.org/package=rmongodb}, +} + at Manual{int64, + title = {int64: 64 bit integer types}, + author = {Romain Fran{\c{c}}ois}, + year = {2011}, + note = {R package version 1.1.2}, 
+ url = {http://CRAN.R-project.org/package=int64}, +} + at Manual{bit64, + title = {bit64: A S3 class for vectors of 64bit integers}, + author = {Jens Oehlschl\"{a}gel}, + year = {2012}, + note = {R package version 0.9-3}, + url = {http://CRAN.R-project.org/package=bit64}, +} + at book{eddelbuettel2013seamless, + title={Seamless R and C++ Integration with Rcpp}, + author={Dirk Eddelbuettel}, + year={2013}, + publisher={Springer} +} + at Manual{rhipe, + title = {RHIPE: A Distributed Environment for the Analysis of Large and Complex Datasets}, + author = {Saptarshi Guha}, + year = {2010}, + url = {http://www.stat.purdue.edu/~sguha/rhipe/}, +} + at misc{serialization, +author= {Luke Tierney}, +title = {A New Serialization Mechanism for R}, +url = {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, +year = {2003}, +} + at manual{eddelbuettel2013exposing, + title={Exposing C++ functions and classes with Rcpp modules}, + author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, + year={2013}, + note={Vignette included in R package Rcpp}, + url = {http://CRAN.R-project.org/package=Rcpp}, +} + at inproceedings{cantrill2004dynamic, + title={Dynamic Instrumentation of Production Systems.}, + author={Bryan Cantrill and Michael W Shapiro and Adam H Leventhal and others}, + booktitle={USENIX Annual Technical Conference, General Track}, + pages={15--28}, + year={2004} +} + at article{swain1991color, + title={Color indexing}, + author={Michael J Swain and Dana H Ballard}, + journal={International journal of computer vision}, + volume={7}, + number={1}, + pages={11--32}, + year={1991}, + publisher={Springer} +} + at article{rubner2000earth, + title={The earth mover's distance as a metric for image retrieval}, + author={Yossi Rubner and Carlo Tomasi and Leonidas J Guibas}, + journal={International Journal of Computer Vision}, + volume={40}, + number={2}, + pages={99--121}, + year={2000}, + publisher={Springer} +} + at book{kullback1997information, + title={Information theory and 
statistics}, + author={Solomon Kullback}, + year={1997}, + publisher={Courier Dover Publications} +} + at inproceedings{puzicha1997non, + title={Non-parametric similarity measures for unsupervised texture segmentation and image retrieval}, + author={Jan Puzicha and Thomas Hofmann and Joachim M Buhmann}, + booktitle={Computer Vision and Pattern Recognition, 1997. Proceedings., 1997 IEEE Computer Society Conference on}, + pages={267--272}, + year={1997}, + organization={IEEE} +} + at inproceedings{fang1999computing, + title={Computing Iceberg Queries Efficiently.}, + author={Min Fang and Narayanan Shivakumar and Hector Garcia-Molina and Rajeev Motwani and Jeffrey D Ullman}, + booktitle={International Conference on Very Large Databases (VLDB'98), New York, August 1998}, + year={1999}, + organization={Stanford InfoLab} +} + at Manual{emdist, + title = {emdist: Earth Mover's Distance}, + author = {Simon Urbanek and Yossi Rubner}, + year = {2012}, + note = {R package version 0.3-1}, + url = {http://cran.r-project.org/package=emdist}, +} + at article{Wegiel:2010:CTT:1932682.1869479, + author = {Michal Wegiel and Chandra Krintz}, + title = {Cross-language, Type-safe, and Transparent Object Sharing for Co-located Managed Runtimes}, + journal = {SIGPLAN Not.}, + issue_date = {October 2010}, + volume = {45}, + number = {10}, + month = oct, + year = {2010}, + issn = {0362-1340}, + pages = {223--240}, + numpages = {18}, + url = {http://doi.acm.org/10.1145/1932682.1869479}, + doi = {10.1145/1932682.1869479}, + acmid = {1869479}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {collection, communication, cross-language, garbage, managed, memory, model, object, rpc, runtimes, shared, synchronization, transparent, type-safe}, +} + at article{wickham2011split, + title={The split-apply-combine strategy for data analysis}, + author={Hadley Wickham}, + journal={Journal of Statistical Software}, + volume={40}, + number={1}, + pages={1--29}, + year={2011}, +
publisher={Citeseer} +} + at inproceedings{Sumaray:2012:CDS:2184751.2184810, + author = {Audie Sumaray and S. Kami Makki}, + title = {A Comparison of Data Serialization Formats for Optimal Efficiency on a Mobile Platform}, + booktitle = {Proceedings of the 6th International Conference on Ubiquitous Information Management and Communication}, + series = {ICUIMC '12}, + year = {2012}, + isbn = {978-1-4503-1172-4}, + location = {Kuala Lumpur, Malaysia}, + pages = {48:1--48:6}, + articleno = {48}, + numpages = {6}, + url = {http://doi.acm.org/10.1145/2184751.2184810}, + doi = {10.1145/2184751.2184810}, + acmid = {2184810}, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {Android, Dalvik, JSON, ProtoBuf, XML, data serialization, thrift}, +} + at Manual{RObjectTables, + title = {User-Defined Tables in the R Search Path}, + author = {Duncan {Temple Lang}}, + year = {2012}, + url = {http://www.omegahat.org/RObjectTables/RObjectTables.pdf}, +} + at Manual{rprotobuf, + title = {RProtoBuf: R Interface to the Protocol Buffers API}, + author = {Romain Francois and Dirk Eddelbuettel and Murray Stokely}, + note = {R package version 0.3.2}, + year = {2013}, + url = {http://cran.r-project.org/web/packages/RProtoBuf/index.html}, +} + at Manual{r, + title = {R: A Language and Environment for Statistical Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2013}, + url = {http://www.R-project.org/}, + } + at article{dean2008mapreduce, + title={MapReduce: simplified data processing on large clusters}, + author={Jeffrey Dean and Sanjay Ghemawat}, + journal={Communications of the ACM}, + volume={51}, + number={1}, + pages={107--113}, + year={2008}, + publisher={ACM} +} + at article{bostock2011d3, + title={D$^3$ Data-Driven Documents}, + author={Michael Bostock and Vadim Ogievetsky and Jeffrey Heer}, + journal={Visualization and Computer Graphics, IEEE Transactions on}, + volume={17}, 
+ number={12}, + pages={2301--2309}, + year={2011}, + publisher={IEEE} +} +% celebrated article in this field. Also see the parallel paragraph. + at article{Manku:1998:AMO:276305.276342, + author = {Gurmeet Singh Manku and Sridhar Rajagopalan and Bruce G. Lindsay}, + title = {Approximate medians and other quantiles in one pass and with limited memory}, + journal = {SIGMOD Rec.}, + issue_date = {June 1998}, + volume = {27}, + number = {2}, + month = jun, + year = {1998}, + issn = {0163-5808}, + pages = {426--435}, + numpages = {10}, + url = {http://doi.acm.org/10.1145/276305.276342}, + doi = {10.1145/276305.276342}, + acmid = {276342}, + publisher = {ACM}, + address = {New York, NY, USA}, +} +% Has a section on protocol buffers + at article{Pike:2005:IDP:1239655.1239658, + author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan}, + title = {Interpreting the data: Parallel analysis with Sawzall}, + journal = {Sci. Program.}, + issue_date = {October 2005}, + volume = {13}, + number = {4}, + month = oct, + year = {2005}, + issn = {1058-9244}, + pages = {277--298}, + numpages = {22}, + acmid = {1239658}, + publisher = {IOS Press}, + address = {Amsterdam, The Netherlands, The Netherlands}, +} + at Manual{protobuf, + title = {Protocol Buffers: Developer Guide}, + author = {Google}, + year = {2012}, + url = {http://code.google.com/apis/protocolbuffers/docs/overview.html} +} + at article{sturges1926choice, + title={The choice of a class interval}, + author={Herbert A Sturges}, + journal={Journal of the American Statistical Association}, + volume={21}, + number={153}, + pages={65--66}, + year={1926} +} + at Manual{histogramtools, + title = {HistogramTools: Utility Functions for R Histograms}, + author = {Murray Stokely}, + year = {2013}, + note = {R package version 0.3}, + url = {https://r-forge.r-project.org/projects/histogramtools/}, +} + at article{scott1979optimal, + title={On optimal and data-based histograms}, + author={David W Scott}, + 
journal={Biometrika}, + volume={66}, + number={3}, + pages={605--610}, + year={1979}, + publisher={Biometrika Trust} +} + at book{scott2009multivariate, + title={Multivariate density estimation: theory, practice, and visualization}, + author={David W Scott}, + volume={383}, + year={2009}, + publisher={Wiley. com} +} + at Manual{httr, + title = {httr: Tools for working with URLs and HTTP}, + author = {Hadley Wickham}, + year = {2012}, + note = {R package version 0.2}, + url = {http://CRAN.R-project.org/package=httr}, +} + at Manual{opencpu, + title = {OpenCPU system for embedded statistical computation and reproducible research}, + author = {Jeroen Ooms}, + year = {2013}, + note = {R package version 1.2.2}, + url = {http://www.opencpu.org}, +} Deleted: papers/jss/eddelbuettel-stokely.bib =================================================================== --- papers/jss/eddelbuettel-stokely.bib 2014-01-11 17:17:46 UTC (rev 748) +++ papers/jss/eddelbuettel-stokely.bib 2014-01-11 17:28:04 UTC (rev 749) @@ -1,294 +0,0 @@ - at article{eddelbuettel2011rcpp, - title={Rcpp: Seamless R and C++ integration}, - author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, - journal={Journal of Statistical Software}, - volume={40}, - number={8}, - pages={1--18}, - year={2011} -} - at Manual{msgpackR, - title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, - author = {Mikiya Tanizawa}, - year = {2013}, - note = {R package version 1.1}, - url = {http://CRAN.R-project.org/package=msgpackR}, -} - at Manual{rmongodb, - title={rmongodb: R-MongoDB driver}, - author={Gerald Lindsly}, - year = {2013}, - note = {R package version 1.3.3}, - url = {http://CRAN.R-project.org/package=rmongodb}, -} - at Manual{int64, - title = {int64: 64 bit integer types}, - author = {Romain Fran{\c{c}}ois}, - year = {2011}, - note = {R package version 1.1.2}, - url = {http://CRAN.R-project.org/package=int64}, -} - at Manual{bit64, - title = {bit64: A S3 class for vectors of 64bit 
integers}, - author = {Jens Oehlschl\"{a}gel}, - year = {2012}, - note = {R package version 0.9-3}, - url = {http://CRAN.R-project.org/package=bit64}, -} - at book{eddelbuettel2013seamless, - title={Seamless R and C++ Integration with Rcpp}, - author={Dirk Eddelbuettel}, - year={2013}, - publisher={Springer} -} - at Manual{rhipe, - title = {RHIPE: A Distributed Environment for the Analysis of Large and Complex Datasets}, - author = {Saptarshi Guha}, - year = {2010}, - url = {http://www.stat.purdue.edu/~sguha/rhipe/}, -} - at misc{serialization, -author= {Luke Tierney}, -title = {A New Serialization Mechanism for R}, -url = {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, -year = {2003}, -} - at manual{eddelbuettel2013exposing, - title={Exposing C++ functions and classes with Rcpp modules}, - author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, - year={2013}, - note={Vignette included in R package Rcpp}, - url = {http://CRAN.R-project.org/package=Rcpp}, -} - at inproceedings{cantrill2004dynamic, - title={Dynamic Instrumentation of Production Systems.}, - author={Bryan Cantrill and Michael W Shapiro and Adam H Leventhal and others}, - booktitle={USENIX Annual Technical Conference, General Track}, - pages={15--28}, - year={2004} -} - at article{swain1991color, - title={Color indexing}, - author={Michael J Swain and Dana H Ballard}, - journal={International journal of computer vision}, - volume={7}, - number={1}, - pages={11--32}, - year={1991}, - publisher={Springer} -} - at article{rubner2000earth, - title={The earth mover's distance as a metric for image retrieval}, - author={Yossi Rubner and Carlo Tomasi and Leonidas J Guibas}, - journal={International Journal of Computer Vision}, - volume={40}, - number={2}, - pages={99--121}, - year={2000}, - publisher={Springer} -} - at book{kullback1997information, - title={Information theory and statistics}, - author={Solomon Kullback}, - year={1997}, - publisher={Courier Dover Publications} -} - at 
inproceedings{puzicha1997non, [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 749 From noreply at r-forge.r-project.org Sat Jan 11 22:12:39 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 11 Jan 2014 22:12:39 +0100 (CET) Subject: [Rprotobuf-commits] r750 - papers/jss Message-ID: <20140111211239.D7E53185F7E@r-forge.r-project.org> Author: edd Date: 2014-01-11 22:12:39 +0100 (Sat, 11 Jan 2014) New Revision: 750 Modified: papers/jss/article.Rnw Log: bunch of edits Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 17:28:04 UTC (rev 749) +++ papers/jss/article.Rnw 2014-01-11 21:12:39 UTC (rev 750) @@ -20,7 +20,7 @@ \title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} %% for pretty printing and a nice hypersummary also set: -\Plainauthor{Dirk Eddelbuettel, Murray Stokely} %% comma-separated +\Plainauthor{Dirk Eddelbuettel, Murray Stokely, Jeroen Ooms} %% comma-separated \Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} \Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in R} %% a short title (if necessary) @@ -121,13 +121,12 @@ \citep{wickham2011split} explicitly break up large problems into manageable pieces. These patterns are frequently employed with different programming languages used for the different phases of data -analysis -- collection, cleaning, analysis, post-processing, and +analysis -- collection, cleaning, modeling, analysis, post-processing, and presentation in order to take advantage of the unique combination of performance, speed of development, and library support offered by -different environments. Each stage of the data +different environments and languages. Each stage of such a data analysis pipeline may involve storing intermediate results in a file or sending them over the network. -% DE: Nice! 
Given these requirements, how do we safely share intermediate results between different applications, possibly written in different @@ -137,34 +136,35 @@ serialization support, but these formats are tied to the specific % DE: need to define serialization? programming language in use and thus lock the user into a single -environment. CSV files can be read and written by many applications -and so are often used for exporting tabular data. However, CSV files -have a number of disadvantages, such as a limitation of exporting only -tabular datasets, lack of type-safety, inefficient text representation -and parsing, and ambiguities in the format involving special -characters. JSON is another widely-supported format used mostly on -the web that removes many of these disadvantages, but it too suffers -from being too slow to parse and also does not provide strong typing -between integers and floating point. Because the schema information -is not kept separately, multiple JSON messages of the same type -needlessly duplicate the field names with each message. -Lastly, XML is a well-established and widely-supported protocol with the ability to define -just about any arbitrarily complex schema. However, it pays for this -complexity with comparatively large and verbose messages, and added -complexities at the parsing side. -% -% -% +environment. + +\emph{Comma-separated values} (CSV) files can be read and written by many +applications and so are often used for exporting tabular data. However, CSV +files have a number of disadvantages, such as a limitation of exporting only +tabular datasets, lack of type-safety, inefficient text representation and +parsing, possibly limited precision and ambiguities in the format involving +special characters. 
\emph{JavaScript Object Notation} (JSON) is another +widely-supported format used mostly on the web that removes many of these +disadvantages, but it too suffers from being too slow to parse and also does +not provide strong typing between integers and floating point. Because the +schema information is not kept separately, multiple JSON messages of the same +type needlessly duplicate the field names with each message. Lastly, +\emph{Extensible Markup Language} (XML) is a well-established and widely-supported +protocol with the ability to define just about any arbitrarily complex +schema. However, it pays for this complexity with comparatively large and +verbose messages, and added complexities at the parsing side (which are +somewhat mitigated by the availability of mature libraries and +parsers). + A number of binary formats based on JSON have been proposed that reduce the parsing cost and improve the efficiency. MessagePack -and BSON both have R interfaces \citep{msgpackR,rmongodb}, but +and BSON both have R interfaces, but % \citep{msgpackR,rmongodb}, but % DE Why do we cite these packages, but not the numerous JSON packages? these formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. Such formats also lack support for versioning when data storage needs evolve over time, or when application logic and requirement changes dictate update to the message format. -% DE: Need to talk about XML -- added a few lines at previous paragraph Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an @@ -188,19 +188,19 @@ % in the middle (full class/method details) and interesting % applications at the end. -Section~\ref{sec:protobuf} provides a general overview of Protocol -Buffers.
Section~\ref{sec:rprotobuf-basic} describes the interactive -R interface provided by \CRANpkg{RProtoBuf} and introduces the two -main abstractions: \emph{Messages} and \emph{Descriptors}. -Section~\ref{sec:rprotobuf-classes} describes the implementation -details of the main S4 classes making up this package. -Section~\ref{sec:types} describes the challenges of type coercion -between R and other languages. Section~\ref{sec:evaluation} -introduces a general R language schema for serializing arbitrary R -objects and evaluates it against R's built-in serialization. -Sections~\label{sec:opencpu} and \label{sec:mapreduce} provide -real-world use cases of \CRANpkg{RProtoBuf} in web service and -MapReduce environments, respectively. +The rest of the paper is organized as follows. Section~\ref{sec:protobuf} +provides a general overview of Protocol Buffers. +Section~\ref{sec:rprotobuf-basic} describes the interactive R interface +provided by \CRANpkg{RProtoBuf} and introduces the two main abstractions: +\emph{Messages} and \emph{Descriptors}. Section~\ref{sec:rprotobuf-classes} +describes the implementation details of the main S4 classes making up this +package. Section~\ref{sec:types} describes the challenges of type coercion +between R and other languages. Section~\ref{sec:evaluation} introduces a +general R language schema for serializing arbitrary R objects and evaluates +it against R's built-in serialization. Sections~\ref{sec:opencpu} +and \ref{sec:mapreduce} provide real-world use cases of \CRANpkg{RProtoBuf} +in web service and MapReduce environments, respectively, before +Section~\ref{sec:summary} concludes. %This article describes the basics of Google's Protocol Buffers through %an easy to use R package, \CRANpkg{RProtoBuf}. After describing the @@ -221,10 +221,10 @@ Protocol Buffers can be described as a modern, language-neutral, platform-neutral, extensible mechanism for sharing and storing structured data. 
Since their introduction, Protocol Buffers have been widely adopted in industry with -applications as varied as database-internal messaging (Drizzle), % DE: citation? -Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. While +applications as varied as %database-internal messaging (Drizzle), % DE: citation? +Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. % TODO(DE): This either needs a citation, or remove the name drop -traditional IDLs have at time been criticized for code bloat and +While traditional IDLs have at time been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records model that is compartively flexible and simple to use. @@ -232,22 +232,22 @@ include: \begin{itemize} -\item \emph{Portable}: Allows users to send and receive data between - applications or different computers. +\item \emph{Portable}: Enable users to send and receive data between + applications as well as different computers or operating systems. \item \emph{Efficient}: Data is serialized into a compact binary representation for transmission or storage. \item \emph{Extensible}: New fields can be added to Protocol Buffer Schemas - in a forward-compatible way that do not break older applications. + in a forward-compatible way that does not break older applications. \item \emph{Stable}: Protocol Buffers have been in wide use for over a decade. \end{itemize} Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example -communication workflow with protocol buffers and an interactive R -session. Common use cases include populating a request RPC protocol -buffer in R that is then serialized and sent over the network to a -remote server. The server would then deserialize the message, act on -the request, and respond with a new protocol buffer over the network. The key +communication workflow with Protocol Buffers and an interactive R session. 
+Common use cases include populating a request remote-procedure call (RPC) +Protocol Buffer in R that is then serialized and sent over the network to a +remote server. The server would then deserialize the message, act on the +request, and respond with a new Protocol Buffer over the network. The key difference to, say, a request to an Rserve instance is that the remote server may not even know the R language. @@ -267,9 +267,9 @@ %between three to ten times \textsl{smaller}, between twenty and one hundred %times \textsl{faster}, as well as less ambiguous and easier to program. -Many sources compare data serialization formats and show protocol -buffers very favorably to the alternatives, such -as \citet{Sumaray:2012:CDS:2184751.2184810} +Many sources compare data serialization formats and show Protocol +Buffers very favorably to the alternatives; see +\citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. %The flexibility of the reflection-based API is particularly well %suited for interactive data analysis. @@ -277,11 +277,11 @@ % XXX Design tradeoffs: reflection vs proto compiler For added speed and efficiency, the C++, Java, and Python bindings to -Protocol Buffers are used with a compiler that translates a protocol -buffer schema description file (ending in \texttt{.proto}) into +Protocol Buffers are used with a compiler that translates a Protocol +Buffer schema description file (ending in \texttt{.proto}) into language-specific classes that can be used to create, read, write and -manipulate protocol buffer messages. The R interface, in contrast, -uses a reflection-based API that is particularly well suited for +manipulate Protocol Buffer messages. The R interface, in contrast, +uses a reflection-based API that is particularly well-suited for interactive data analysis. All messages in R have a single class structure, but different accessor methods are created at runtime based on the name fields of the specified message type. 
@@ -324,8 +324,8 @@ binary \emph{payload} of the messages to files and arbitrary binary R connections. -The two fundamental building blocks of Protocol Buffers are Messages -and Descriptors. Messages provide a common abstract encapsulation of +The two fundamental building blocks of Protocol Buffers are \emph{Messages} +and \emph{Descriptors}. Messages provide a common abstract encapsulation of structured data fields of the type specified in a Message Descriptor. Message Descriptors are defined in \texttt{.proto} files and define a schema for a particular named class of messages. @@ -353,11 +353,11 @@ %% TODO(de) Can we make this not break the width of the page? \noindent \begin{table} -\begin{tabular}{@{\hskip .01\textwidth}p{.40\textwidth}@{\hskip .02\textwidth}@{\hskip .02\textwidth}p{0.55\textwidth}@{\hskip .01\textwidth}} +\begin{tabular}{p{.40\textwidth}p{0.55\textwidth}} \toprule Schema : \texttt{addressbook.proto} & Example R Session\\ \cmidrule{1-2} -\begin{minipage}{.35\textwidth} +\begin{minipage}{.40\textwidth} \vspace{2mm} \begin{example} package tutorial; @@ -377,10 +377,10 @@ } \end{example} \vspace{2mm} -\end{minipage} & \begin{minipage}{.5\textwidth} +\end{minipage} & \begin{minipage}{.55\textwidth} <>= library(RProtoBuf) -p <- new(tutorial.Person, id=1, name="Dirk") +p <- new(tutorial.Person,id=1,name="Dirk") class(p) p$name p$name <- "Murray" @@ -421,8 +421,8 @@ all \texttt{.proto} files provided by another R package. The \texttt{.proto} file syntax for defining the structure of protocol -buffer data is described comprehensively on Google Code: -\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}. +buffer data is described comprehensively on Google Code\footnote{See +\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. 
Once the proto files are imported, all message descriptors are available in the R search path in the \texttt{RProtoBuf:DescriptorPool} @@ -473,7 +473,6 @@ However, as opposed to R lists, no partial matching is performed and the name must be given entirely. - The \verb|[[| operator can also be used to query and set fields of a message, supplying either their name or their tag number : @@ -483,7 +482,7 @@ p[[ "email" ]] @ -Protocol buffers include a 64-bit integer type, but R lacks native +Protocol Buffers include a 64-bit integer type, but R lacks native 64-bit integer support. A workaround is available and described in Section~\ref{sec:int64} for working with large integer values. @@ -492,7 +491,7 @@ \subsection{Display messages} -Protocol buffer messages and descriptors implement \texttt{show} +Protocol Buffer messages and descriptors implement \texttt{show} methods that provide basic information about the message : <<>>= @@ -509,10 +508,10 @@ \subsection{Serializing messages} -However, the main focus of protocol buffer messages is +However, the main focus of Protocol Buffer messages is efficiency. Therefore, messages are transported as a sequence of bytes. The \texttt{serialize} method is implemented for -protocol buffer messages to serialize a message into a sequence of +Protocol Buffer messages to serialize a message into a sequence of bytes that represents the message. %(raw vector in R speech) that represents the message. @@ -589,7 +588,7 @@ @ -\texttt{read} can also be used as a pseudo method of the descriptor +\texttt{read} can also be used as a pseudo-method of the descriptor object : <<>>= @@ -614,8 +613,9 @@ \texttt{serialize}. Each R object stores an external pointer to an object managed by -the \texttt{protobuf} C++ library. -The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to +the \texttt{protobuf} C++ library which implements the core Protocol Buffer +functionality.
The \CRANpkg{Rcpp} package +\citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to facilitate the integration of the R and C++ code for these objects. % Message, Descriptor, FieldDescriptor, EnumDescriptor, @@ -636,12 +636,12 @@ which provide a more concise way of wrapping C++ functions and classes in a single entity. -The \texttt{RProtoBuf} package combines the \emph{R typical} dispatch -of the form \verb|method(object, arguments)| and the more traditional -object oriented notation \verb|object$method(arguments)|. +The \texttt{RProtoBuf} package combines a dispatch mechanism +of the form \verb|method(object, arguments)| (common to R) and the more +traditional object oriented notation \verb|object$method(arguments)|. Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function (defined in the \texttt{utils} package) for all classes to enable tab -completion. Completion possibilities include pseudo method names for all +completion. Completion possibilities include pseudo-method names for all classes, plus dynamic dispatch on names or types specific to a given object. % TODO(ms): Add column check box for doing dynamic dispatch based on type. @@ -683,9 +683,9 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ proto library. Documentation for the -\texttt{Message} class is available from the protocol buffer project page: -\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message} \\ +\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ protobuf library. Documentation for the +\texttt{Message} class is available from the Protocol Buffer project page. \\ +%(\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message}) \\ \texttt{type} & Fully qualified name of the message. 
For example a \texttt{Person} message has its \texttt{type} slot set to \texttt{tutorial.Person} \\[.3cm] \textbf{Method} & \textbf{Description} \\ @@ -758,8 +758,8 @@ \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} \texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the -\texttt{Descriptor} class is available from the protocol buffer project page: -\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ +\texttt{Descriptor} class is available from the Protocol Buffer project page.\\ +%\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ \texttt{type} & Fully qualified path of the message type. \\[.3cm] % \textbf{Method} & \textbf{Description} \\ @@ -781,7 +781,7 @@ \texttt{field\_count} & Return the number of fields in this descriptor.\\ \texttt{field} & Return the descriptor for the specified field in this descriptor.\\ \texttt{nested\_type\_count} & The number of nested types in this descriptor.\\ -\texttt{nested\_type} & Return the descriptor for the specified nested +\texttt{nested\_type} & Return the descriptor for the specified nested type in this descriptor.\\ \texttt{enum\_type\_count} & The number of enum types in this descriptor.\\ \texttt{enum\_type} & Return the descriptor for the specified enum @@ -984,8 +984,9 @@ One of the benefits of using an Interface Definition Language (IDL) like Protocol Buffers is that it provides a highly portable basic type -system that different language and hardware implementations can map to +system. This permits different language and hardware implementations to map to the most appropriate type in different environments. + Table~\ref{table-get-types} details the correspondence between the field type and the type of data that is retrieved by \verb|$| and \verb|[[| extractors. 
@@ -1005,11 +1006,11 @@ sint32 & \texttt{integer} vector & \texttt{integer} vector \\ sfixed32 & \texttt{integer} vector & \texttt{integer} vector \\[3mm] int64 & \texttt{integer} or \texttt{character} -vector \footnotemark & \texttt{integer} or \texttt{character} vector \\ +vector & \texttt{integer} or \texttt{character} vector \\ uint64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ sint64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ fixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\ -sfixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\\hline +sfixed64 & \texttt{integer} or \texttt{character} vector & \texttt{integer} or \texttt{character} vector \\[3mm] bool & \texttt{logical} vector & \texttt{logical} vector \\[3mm] string & \texttt{character} vector & \texttt{character} vector \\ bytes & \texttt{character} vector & \texttt{character} vector \\[3mm] @@ -1019,17 +1020,17 @@ \end{tabular} \end{small} \caption{\label{table-get-types}Correspondence between field type and - R type retrieved by the extractors. \footnotesize{1. R lacks native + R type retrieved by the extractors. Note that R lacks native 64-bit integers, so the \texttt{RProtoBuf.int64AsString} option is available to return large integers as characters to avoid losing - precision. This option is described in Section~\ref{sec:int64}}.} + precision. This option is described in Section~\ref{sec:int64}.} \end{table} \subsection{Booleans} R booleans can accept three values: \texttt{TRUE}, \texttt{FALSE}, and -\texttt{NA}. However, most other languages, including the protocol -buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. This means +\texttt{NA}. However, most other languages, including the Protocol +Buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. 
This means that we simply can not store R logical vectors that include all three possible values as booleans. The library will refuse to store \texttt{NA}s in protocol buffer boolean fields, and users must instead @@ -1059,7 +1060,7 @@ \subsection{Unsigned Integers} R lacks a native unsigned integer type. Values between $2^{31}$ and -$2^{32} - 1$ read from unsigned int protocol buffer fields must be +$2^{32} - 1$ read from unsigned int Protocol Buffer fields must be stored as doubles in R. <<>>= @@ -1140,22 +1141,21 @@ \section{Evaluation: data.frame to Protocol Buffer Serialization} \label{sec:evaluation} -Saptarshi Guha wrote the RHIPE package \citep{rhipe} which includes -protocol buffer integration with R. However, this implementation -takes a different approach: any R object is serialized into a message -based on a single catch-all \texttt{proto} schema. Jeroen Ooms took a -similar approach influenced by Saptarshi in the \pkg{RProtoBufUtils} -package (which has now been integrated in \pkg{RProtoBuf}). Unlike -Saptarshi's package, however, RProtoBufUtils depends -on, and extends, RProtoBuf for underlying message operations. +The \pkg{RHIPE} package \citep{rhipe} also includes a Protocol Buffer integration with R. +However, its implementation takes a different approach: any R object is +serialized into a message based on a single catch-all \texttt{proto} schema. +A similar approach was taken by the \pkg{RProtoBufUtils} package (which has now been integrated in +\pkg{RProtoBuf}). Unlike \pkg{RHIPE}, however, \pkg{RProtoBufUtils} +depended upon, and extended, \pkg{RProtoBuf} for underlying message operations. +%DE Shall this go away now that we sucked RPBUtils into RBP? One key extension of \pkg{RProtoBufUtils} is the \texttt{serialize\_pb} method to convert R objects into serialized -protocol buffers in the catch-all schema. The \texttt{can\_serialize\_pb} -method can be used to determine whether the given R object can safely +Protocol Buffers in the catch-all schema.
The \texttt{can\_serialize\_pb} +method can then be used to determine whether the given R object can safely be expressed in this way. To illustrate how this method works, we attempt to convert all of the built-in datasets from R into this -serialized protocol buffer representation. +serialized Protocol Buffer representation. <>= datasets <- subset(as.data.frame(data()$results), Package=="datasets") @@ -1165,7 +1165,7 @@ There are \Sexpr{n} standard data sets included in R. We use the \texttt{can\_serialize\_pb} method to determine how many of those can -be safely converted to a serialized protocol buffer representation. +be safely converted to a serialized Protocol Buffer representation. <>= #datasets$valid.proto <- sapply(datasets$load.name, function(x) can_serialize_pb(eval(as.name(x)))) @@ -1177,7 +1177,7 @@ (\Sexpr{format(100*m/n,digits=1)}\%). The next section illustrates how many bytes were used to store the data sets under four different situations (1) normal R serialization, (2) R serialization followed by -gzip, (3) normal protocol buffer serialization, (4) protocol buffer +gzip, (3) normal Protocol Buffer serialization, (4) Protocol Buffer serialization followed by gzip. \subsection{Compression Performance} @@ -1601,6 +1601,7 @@ \section{Summary} +\label{sec:summary} % RProtoBuf has been used. 
From noreply at r-forge.r-project.org Sun Jan 12 00:44:54 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 00:44:54 +0100 (CET) Subject: [Rprotobuf-commits] r751 - papers/jss Message-ID: <20140111234454.98246186726@r-forge.r-project.org> Author: edd Date: 2014-01-12 00:44:54 +0100 (Sun, 12 Jan 2014) New Revision: 751 Modified: papers/jss/article.Rnw Log: more edits Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-11 21:12:39 UTC (rev 750) +++ papers/jss/article.Rnw 2014-01-11 23:44:54 UTC (rev 751) @@ -1149,7 +1149,7 @@ depended upon on, and extended, \pkg{RProtoBuf} for underlying message operations. %DE Shall this go away now that we sucket RPBUtils into RBP? -One key extension of \pkg{RProtoBufUtils} is the +One key extension which \pkg{RProtoBufUtils} brought to \pkg{RProtoBuf} is the \texttt{serialize\_pb} method to convert R objects into serialized Protocol Buffers in the catch-all schema. The \texttt{can\_serialize\_pb} method can then be used to determine whether the given R object can safely @@ -1168,17 +1168,22 @@ be safely converted to a serialized Protocol Buffer representation. <>= -#datasets$valid.proto <- sapply(datasets$load.name, function(x) can_serialize_pb(eval(as.name(x)))) -#datasets <- subset(datasets, valid.proto==TRUE) +datasets$valid.proto <- sapply(datasets$load.name, + function(x) can_serialize_pb(eval(as.name(x)))) +datasets <- subset(datasets, valid.proto==TRUE) m <- nrow(datasets) @ \Sexpr{m} data sets could be converted to Protocol Buffers (\Sexpr{format(100*m/n,digits=1)}\%). The next section illustrates how many bytes were used to store the data sets under four different -situations (1) normal R serialization, (2) R serialization followed by -gzip, (3) normal Protocol Buffer serialization, (4) Protocol Buffer -serialization followed by gzip. 
+situations: +\begin{itemize} +\item normal R serialization, +\item R serialization followed by gzip, +\item normal Protocol Buffer serialization, and +\item Protocol Buffer serialization followed by gzip. +\end{itemize} \subsection{Compression Performance} \label{sec:compression} @@ -1207,17 +1212,21 @@ Table~\ref{tab:compression} shows the sizes of 50 sample R datasets as returned by object.size() compared to the serialized sizes. -The summary compression sizes are listed below, and a full table for a -sample of 50 datasets is included on the next page. Sizes are comparable -but protocol buffers provide simple getters and setters in multiple -languages instead of requiring other programs to parse the R -serialization format \citep{serialization}. One takeaway from this -table is that RProtoBuf does not in general provide any significant -space-savings over R's normal serialization mechanism. The benefit -from RProtoBuf comes from its interoperability with other -environments, safe versioning, +%The summary compression sizes are listed below, and a full table for a +%sample of 50 datasets is included on the next page. +Sizes are comparable but Protocol Buffers provide simple getters and setters +in multiple languages instead of requiring other programs to parse the R +serialization format.% \citep{serialization}. +One takeaway from this table is that RProtoBuf does not in general provide +any significant saving in file size compared to the normal serialization +mechanism in R which is seen as equally compact. The benefit from RProtoBuf +comes from its interoperability with other environments, as well as its safe +versioning, -TODO comparison of protobuf serialization sizes/times for various vectors. Compared to R's native serialization. Discussion of the RHIPE approach of serializing any/all R objects, vs more specific protocol buffers for specific R objects. +TODO comparison of protobuf serialization sizes/times for various vectors. 
+Compared to R's native serialization. Discussion of the RHIPE approach of +serializing any/all R objects, vs more specific Protocol Buffers for specific +R objects. % N.B. see table.Rnw for how this table is created. % @@ -1296,8 +1305,8 @@ TODO RProtoBuf is quite flexible and easy to use for interactive analysis, but it is not designed for certain classes of operations one -might like to do with protocol buffers. For example, taking a list of -10,000 protocol buffers, extracting a named field from each one, and +might like to do with Protocol Buffers. For example, taking a list of +10,000 Protocol Buffers, extracting a named field from each one, and computing a aggregate statistics on those values would be extremely slow with RProtoBuf, and while this is a useful class of operations, it is outside of the scope of RProtoBuf. We should be very clear @@ -1339,27 +1348,26 @@ % Can you integrate some of this text earlier, maybe into the the % introduction? -As described earlier, the primary application of protocol buffers is -data interchange in the context of inter-system communications. -Network protocols such as HTTP provide mechanisms for client-server -communication, i.e. how to initiate requests, authenticate, send messages, -etc. However, many network -protocols generally do not regulate \emph{content} of messages: they allow -transfer of any media type, such as web pages, files or video. -When designing systems where various components require exchange of specific data -structures, we need something on top of the network protocol that prescribes -how these structures are to be represented in messages (buffers) on the -network. Protocol buffers solve exactly this problem by providing -a cross platform method for serializing arbitrary structures into well defined -messages, that can be exchanged using any protocol. The descriptors -(\texttt{.proto} files) are used to formally define the interface of a -remote API or network application. 
Libraries to parse and generate protobuf -messages are available for many programming languages, making it -relatively straight forward to implement clients and servers. +As described earlier, the primary application of Protocol Buffers is data +interchange in the context of inter-system communications. Network protocols +such as HTTP provide mechanisms for client-server communication, i.e. how to +initiate requests, authenticate, send messages, etc. However, many network +protocols generally do not regulate the \emph{content} of messages: they +allow transfer of any media type, such as web pages, static files or +multimedia content. When designing systems where various components require +exchange of specific data structures, we need something on top of the network +protocol that prescribes how these structures are to be represented in +messages (buffers) on the network. Protocol Buffers solve exactly this +problem by providing a cross-platform method for serializing arbitrary +structures into well defined messages, which can then be exchanged using any +protocol. The descriptors (\texttt{.proto} files) are used to formally define +the interface of a remote API or network application. Libraries to parse and +generate protobuf messages are available for many programming languages, +making it relatively straightforward to implement clients and servers. \subsection{Interacting with R through HTTPS and Protocol Buffers} -One example of a system that supports protocol buffers to interact +One example of a system that supports Protocol Buffers to interact with R is OpenCPU \citep{opencpu}. OpenCPU is a framework for embedded statistical computation and reproducible research based on R and \LaTeX. 
It exposes a HTTP(S) API to access and manipulate R objects and allows for performing @@ -1406,7 +1414,7 @@ library(httr) # Retrieve and parse message -req <- GET ('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb') +req <- GET('https://public.opencpu.org/ocpu/library/MASS/data/Animals/pb') output <- unserialize_pb(req$content) # Check that no information was lost @@ -1414,7 +1422,7 @@ @ This code suggests a method for exchanging objects between R servers, however this can -also be done without protocol buffers. The main advantage of using an inter-operable format +also be done without Protocol Buffers. The main advantage of using an inter-operable format is that we can actually access R objects from within another programming language. For example, in a very similar fashion we can retrieve the same dataset in a Python client. To parse messages in Python, we first compile the @@ -1423,7 +1431,7 @@ \begin{verbatim} protoc rexp.proto --python_out=. \end{verbatim} -This generates python module called \texttt{rexp\_pb2.py}, containing both the +This generates Python module called \texttt{rexp\_pb2.py}, containing both the descriptor information as well as methods to read and manipulate the R object message. In the example below we use the HTTP client from the \texttt{urllib2} module. @@ -1457,7 +1465,7 @@ arguments of the function call in the form of protobuf messages as well. This is a bit more work, because clients needs to both generate messages containing R objects to post to the server, as well as retrieve and parse -protobuf messages returned by the server. Using protocol buffers to post +protobuf messages returned by the server. Using Protocol Buffers to post function arguments is not required, and for simple (scalar) arguments the standard \texttt{application/x-www-form-urlencoded} format might be sufficient. 
However, with protocol buffers the client can perform function calls with @@ -1499,8 +1507,9 @@ val <- do.call(stats::rnorm, fnargs) outputmsg <- serialize_pb(val) @ -In reality the OpenCPU provides a lot of meta functionality such as handling -of sessions, exceptions, security, and much more. OpenCPU also makes it possible to store + +OpenCPU also provides a lot of meta-functionality such as handling +of sessions, exceptions, security, and more. OpenCPU also makes it possible to store output of a function call on the server, instead of directly retrieving it. Thereby objects can be shared with other users or used as arguments in a subsequent function call. But in its essence, the HTTP API provides a simple way to perform remote @@ -1612,17 +1621,18 @@ \section{Acknowledgement} -The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010, -with very significant contributions, both in code and design, made by -Romain Fran\c{c}ois. His continued influence on design and code is -appreciated. Several features of the package are influenced +The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. +Very significant contributions, both in code and design, were made by +Romain Fran\c{c}ois whose continued influence on design and code is +greatly appreciated. Several features of the package are influenced by the design of the \CRANpkg{rJava} package by Simon Urbanek The user-defined table mechanism, implemented by Duncan Temple Lang for the -purpose of the \pkg{RObjectTables} package allowed the dynamic symbol lookup. +purpose of the \pkg{RObjectTables} package, allows for the dynamic symbol lookup. Kenton Varda was generous with his time in reviewing code and explaining -obscure protocol buffer semantics. Karl Millar and Jeroen Ooms were -helpful in reviewing code or offering suggestions. The contemporaneous -work by Saptarshi Guha on \pkg{RHIPE} was a strong initial motivator. +obscure protocol buffer semantics. 
Karl Millar was very +helpful in reviewing code and offering suggestions. +%The contemporaneous work by Saptarshi Guha on \pkg{RHIPE} was a strong +%initial motivator. \bibliography{article} @@ -1630,3 +1640,4 @@ %% Note: If there is markup in \(sub)section, then it has to be escape as above. \end{document} + From noreply at r-forge.r-project.org Sun Jan 12 20:28:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 20:28:27 +0100 (CET) Subject: [Rprotobuf-commits] r752 - in pkg: . src Message-ID: <20140112192828.00637186629@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 20:28:26 +0100 (Sun, 12 Jan 2014) New Revision: 752 Modified: pkg/configure.win pkg/src/Makevars.win Log: update makevars for CRAN Modified: pkg/configure.win =================================================================== --- pkg/configure.win 2014-01-11 23:44:54 UTC (rev 751) +++ pkg/configure.win 2014-01-12 19:28:26 UTC (rev 752) @@ -1,5 +1,7 @@ -# This is a temporary solution for when the headers/lib are not avaialble on the machine -"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' -download.file("http://www.stat.ucla.edu/~jeroen/files/protobuf-2.5.0-windows.zip", "lib.zip"); -unzip("lib.zip"); -' \ No newline at end of file +# Uncomment this to build on machines where the headers/lib are not installed +# on the machine already. +# +# +# "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' +# download.file("http://www.stat.ucla.edu/~jeroen/files/protobuf-2.5.0-windows.zip", "lib.zip"); +# unzip("lib.zip");' Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-11 23:44:54 UTC (rev 751) +++ pkg/src/Makevars.win 2014-01-12 19:28:26 UTC (rev 752) @@ -1,15 +1,14 @@ # -*- mode: Makefile -*- ## -## The folders ../protobuf-2.5.0/lib/{i386,x64}/ contain static libraries -## for windows. 
Note that mingw64-4.8.1 has an issue that results in ld -## reading symbols multiple times. As a workaround, we can add a flag -## "-Wl,-allow-multiple-definition" to RCPP_LDFLAGS. -## See also https://sourceware.org/bugzilla/show_bug.cgi?id=12762 -## However the problem did not appear for mingw 4.7.3. So that's what we -## ended up using. +## On the CRAN windows builder, the heades and library are installed in +## a directory ${LIB_PROTOBUF}. On other machines, the configure.win file +## contains instructions for getting these files. ## -## MINGW64HACK= "-Wl,-allow-multiple-definition" +## Note that mingw64 4.8.1 had an issue that results in ld reading +## symbols multiple times. As a workaround, we can add a flag +## "-Wl,-allow-multiple-definition" to RCPP_LDFLAGS. However the +## problem did not appear for mingw 4.7.3. So that's what we used. +## RCPP_LDFLAGS= $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") -PROTOBUFROOT= "../protobuf-2.5.0" -PKG_CPPFLAGS= -I$(PROTOBUFROOT)/include -PKG_LIBS=$(RCPP_LDFLAGS) -L$(PROTOBUFROOT)/lib${R_ARCH} -lprotobuf +PKG_CPPFLAGS=-I${LIB_PROTOBUF}/include -I../protobuf-2.5.0/include +PKG_LIBS=$(RCPP_LDFLAGS) -L${LIB_PROTOBUF}/lib${R_ARCH} -L../protobuf-2.5.0/lib${R_ARCH} -lprotobuf From noreply at r-forge.r-project.org Sun Jan 12 21:40:25 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 21:40:25 +0100 (CET) Subject: [Rprotobuf-commits] r753 - in pkg: . 
src Message-ID: <20140112204025.A1C2B184657@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 21:40:24 +0100 (Sun, 12 Jan 2014) New Revision: 753 Added: pkg/configure.win.readme Modified: pkg/DESCRIPTION pkg/configure.win pkg/src/Makevars.win Log: changes for CRAN win builder Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-12 19:28:26 UTC (rev 752) +++ pkg/DESCRIPTION 2014-01-12 20:40:24 UTC (rev 753) @@ -1,5 +1,5 @@ Package: RProtoBuf -Version: 0.3.2.4 +Version: 0.3.2.5 Date: $Date$ Author: Romain Francois, Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms Maintainer: Dirk Eddelbuettel Modified: pkg/configure.win =================================================================== --- pkg/configure.win 2014-01-12 19:28:26 UTC (rev 752) +++ pkg/configure.win 2014-01-12 20:40:24 UTC (rev 753) @@ -1,7 +0,0 @@ -# Uncomment this to build on machines where the headers/lib are not installed -# on the machine already. -# -# -# "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' -# download.file("http://www.stat.ucla.edu/~jeroen/files/protobuf-2.5.0-windows.zip", "lib.zip"); -# unzip("lib.zip");' Added: pkg/configure.win.readme =================================================================== --- pkg/configure.win.readme (rev 0) +++ pkg/configure.win.readme 2014-01-12 20:40:24 UTC (rev 753) @@ -0,0 +1,7 @@ +# Rename this file to configure.win to build on machines where the +# headers/lib are not installed on the machine already. 
Make sure to +# use R CMD INSTALL --force-biarch +# +"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' +download.file("http://r-forge.r-project.org/scm/viewvc.php/*checkout*/windows/protobuf-2.5.0-windows.zip?root=rprotobuf", "lib.zip"); +unzip("lib.zip");' Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-12 19:28:26 UTC (rev 752) +++ pkg/src/Makevars.win 2014-01-12 20:40:24 UTC (rev 753) @@ -1,7 +1,7 @@ # -*- mode: Makefile -*- ## ## On the CRAN windows builder, the heades and library are installed in -## a directory ${LIB_PROTOBUF}. On other machines, the configure.win file +## a directory ${LIB_PROTOBUF}. On other machines, the configure.win.readme ## contains instructions for getting these files. ## ## Note that mingw64 4.8.1 had an issue that results in ld reading From noreply at r-forge.r-project.org Sun Jan 12 23:02:58 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:02:58 +0100 (CET) Subject: [Rprotobuf-commits] r754 - / papers/jss Message-ID: <20140112220258.51EF31868B7@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:02:57 +0100 (Sun, 12 Jan 2014) New Revision: 754 Added: rprotobuf.Rproj Modified: / papers/jss/article.Rnw Log: tiny changes for that my IDE needs Property changes on: ___________________________________________________________________ Added: svn:ignore + .Rproj.user .Rhistory .RData Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 20:40:24 UTC (rev 753) +++ papers/jss/article.Rnw 2014-01-12 22:02:57 UTC (rev 754) @@ -83,6 +83,7 @@ \begin{document} +\SweaveOpts{concordance=FALSE} %% include your article here, just as usual Added: rprotobuf.Rproj =================================================================== --- rprotobuf.Rproj (rev 0) +++ rprotobuf.Rproj 2014-01-12 22:02:57 UTC (rev 754) @@ -0,0 +1,17 @@ 
+Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackagePath: pkg +PackageInstallArgs: --no-multiarch --with-keep.source From noreply at r-forge.r-project.org Sun Jan 12 23:06:32 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:06:32 +0100 (CET) Subject: [Rprotobuf-commits] r755 - papers/jss Message-ID: <20140112220632.B9A001868E8@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:06:32 +0100 (Sun, 12 Jan 2014) New Revision: 755 Modified: papers/jss/article.Rnw Log: move a strayed sentence Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 22:02:57 UTC (rev 754) +++ papers/jss/article.Rnw 2014-01-12 22:06:32 UTC (rev 755) @@ -179,6 +179,9 @@ the encoded data, the data can be efficiently encoded to minimize storage costs of the stored data when compared with simple ``schema-less'' binary interchange formats. +Many sources compare data serialization formats and show Protocol +Buffers very favorably to the alternatives; see +\citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. % TODO(mstokely): Take a more conversational tone here asking % questions and motivating protocol buffers? @@ -268,10 +271,6 @@ %between three to ten times \textsl{smaller}, between twenty and one hundred %times \textsl{faster}, as well as less ambiguous and easier to program. -Many sources compare data serialization formats and show Protocol -Buffers very favorably to the alternatives; see -\citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. - %The flexibility of the reflection-based API is particularly well %suited for interactive data analysis. 
From noreply at r-forge.r-project.org Sun Jan 12 23:11:07 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:11:07 +0100 (CET) Subject: [Rprotobuf-commits] r756 - papers/jss Message-ID: <20140112221107.E6271185065@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:11:07 +0100 (Sun, 12 Jan 2014) New Revision: 756 Modified: papers/jss/ Log: SVN ignore tmp files Property changes on: papers/jss ___________________________________________________________________ Added: svn:ignore + article.aux article.bbl article.log article.pdf article.tex From noreply at r-forge.r-project.org Sun Jan 12 23:20:07 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:20:07 +0100 (CET) Subject: [Rprotobuf-commits] r757 - papers/jss Message-ID: <20140112222008.0F11A1860CF@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:20:07 +0100 (Sun, 12 Jan 2014) New Revision: 757 Modified: papers/jss/article.Rnw Log: rewrite paragraph Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 22:11:07 UTC (rev 756) +++ papers/jss/article.Rnw 2014-01-12 22:20:07 UTC (rev 757) @@ -412,23 +412,22 @@ %schema used by one or more messages, and DescriptorPools, which %provide access to descriptors. -Before one can create a new Protocol Buffer Message or parse a -serialized stream of bytes as a Message, one must first read in the message -type specification from a \texttt{.proto} file. - -New \texttt{.proto} files are imported with the \code{readProtoFiles} -function, which can import a single file, all files in a directory, or -all \texttt{.proto} files provided by another R package. - +To create or parse a Protocol Buffer Message, one must first read in +the message type specification from a \texttt{.proto} file. 
The +\texttt{.proto} files are imported using the \code{readProtoFiles} +function, which can either import a single file, all files in a directory, +or every \texttt{.proto} file provided by a particular R package. The \texttt{.proto} file syntax for defining the structure of protocol buffer data is described comprehensively on Google Code\footnote{See \url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. -Once the proto files are imported, all message descriptors are -are available in the R search path in the \texttt{RProtoBuf:DescriptorPool} -special environment. The underlying mechanism used here is +After importing proto files, the corresponding message descriptors are +available from the \texttt{RProtoBuf:DescriptorPool} environment in +the R search path. The underlying mechanism used here is described in more detail in Section~\ref{sec-lookup}. +%JO: can we just move the section 7 to here? It's only one paragraph% + <<>>= ls("RProtoBuf:DescriptorPool") @ From noreply at r-forge.r-project.org Sun Jan 12 23:31:58 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:31:58 +0100 (CET) Subject: [Rprotobuf-commits] r758 - papers/jss Message-ID: <20140112223158.BEF2C18681F@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:31:58 +0100 (Sun, 12 Jan 2014) New Revision: 758 Modified: papers/jss/article.Rnw Log: words Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 22:20:07 UTC (rev 757) +++ papers/jss/article.Rnw 2014-01-12 22:31:58 UTC (rev 758) @@ -518,7 +518,7 @@ serialize(p, NULL) @ -The same method can also be used to serialize messages to files : +The same method can be used to serialize messages to files : <<>>= tf1 <- tempfile() @@ -536,8 +536,8 @@ readBin(tf2, raw(0), 500) @ -\texttt{serialize} can also be used in a more traditional -object oriented fashion using the dollar operator : 
+\texttt{serialize} can also be called in a more traditional +object oriented fashion using the dollar operator: <<>>= # serialize to a file From noreply at r-forge.r-project.org Sun Jan 12 23:46:25 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 12 Jan 2014 23:46:25 +0100 (CET) Subject: [Rprotobuf-commits] r759 - papers/jss Message-ID: <20140112224625.BB9F4185E50@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-12 23:46:25 +0100 (Sun, 12 Jan 2014) New Revision: 759 Modified: papers/jss/article.Rnw Log: tiny changes Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 22:31:58 UTC (rev 758) +++ papers/jss/article.Rnw 2014-01-12 22:46:25 UTC (rev 759) @@ -635,7 +635,7 @@ which provide a more concise way of wrapping C++ functions and classes in a single entity. -The \texttt{RProtoBuf} package combines a dispatch mechanism +The \texttt{RProtoBuf} package combines a functional dispatch mechanism of the form \verb|method(object, arguments)| (common to R) and the more traditional object oriented notation \verb|object$method(arguments)|. Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function @@ -1033,7 +1033,7 @@ that we simply can not store R logical vectors that include all three possible values as booleans. The library will refuse to store \texttt{NA}s in protocol buffer boolean fields, and users must instead -choose another type (such as integers) capable of storing three +choose another type (such as enum or integer) capable of storing three distinct values. 
<>= From noreply at r-forge.r-project.org Mon Jan 13 00:11:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 00:11:04 +0100 (CET) Subject: [Rprotobuf-commits] r760 - papers/jss Message-ID: <20140112231104.D4576180AFC@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 00:11:03 +0100 (Mon, 13 Jan 2014) New Revision: 760 Modified: papers/jss/article.Rnw Log: some rephrasing on int64 Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 22:46:25 UTC (rev 759) +++ papers/jss/article.Rnw 2014-01-12 23:11:03 UTC (rev 760) @@ -1076,18 +1076,20 @@ \subsection{64-bit integers} \label{sec:int64} -R does not have native 64-bit integer support. Instead, R treats -large integers as doubles which have limited precision. For example, -it loses the ability to distinguish some distinct integers: +R also does not support the native 64-bit integer type. Numeric vectors +with values $\geq 2^{31}$ can only be stored as doubles, which have +limited precision. Thereby R loses the ability to distinguish some +distinct integers: <<>>= 2^53 == (2^53 + 1) @ -Protocol Buffers are frequently used to pass data between different -systems, however, and most other modern systems do have support for -64-bit integers. To work around this, RProtoBuf allows users to get -and set 64-bit integer types by treating them as characters. +However, most modern languages do have support for 64-bit integers, +which becomes problematic when \pkg{RProtoBuf} is used to exchange data +with a system that requires this integer type. To work around this, +RProtoBuf allows users to get and set 64-bit integer values by specifying +them as character strings. 
<>= if (!exists("protobuf_unittest.TestAllTypes", @@ -1108,9 +1110,9 @@ length(unique(test$repeated_int64)) @ -However, we can specify the values as character strings so that the -C++ library on which RProtoBuf is based can store a true 64-bit -integer representation of the data. +But when the values are specified as character strings, RProtoBuf +will automatically coerse them into a true 64-bit integer types +before storing them in the Protocol Buffer message: <<>>= test$repeated_int64 <- c("9007199254740992", "9007199254740993") From noreply at r-forge.r-project.org Mon Jan 13 00:31:38 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 00:31:38 +0100 (CET) Subject: [Rprotobuf-commits] r761 - papers/jss Message-ID: <20140112233138.1ACA3186869@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 00:31:31 +0100 (Mon, 13 Jan 2014) New Revision: 761 Modified: papers/jss/article.Rnw Log: first pass at data frame section Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 23:11:03 UTC (rev 760) +++ papers/jss/article.Rnw 2014-01-12 23:31:31 UTC (rev 761) @@ -1159,18 +1159,19 @@ serialized Protocol Buffer representation. <>= -datasets <- subset(as.data.frame(data()$results), Package=="datasets") -datasets$load.name <- sub("\\s+.*$", "", datasets$Item) +datasets <- as.data.frame(data(package="datasets")$results) +datasets$name <- sub("\\s+.*$", "", datasets$Item) n <- nrow(datasets) @ -There are \Sexpr{n} standard data sets included in R. We use the -\texttt{can\_serialize\_pb} method to determine how many of those can -be safely converted to a serialized Protocol Buffer representation. +There are \Sexpr{n} standard data sets included in the base-r \pkg{datasets} +package. These datasets include data frames, matices, timeseries, tables lists, +and some more exotic data classes. 
The \texttt{can\_serialize\_pb} method can be +used to determine which of those can fully be converted to the \textt{rexp.proto} +Protocol Buffer representation: <>= -datasets$valid.proto <- sapply(datasets$load.name, - function(x) can_serialize_pb(eval(as.name(x)))) +datasets$valid.proto <- sapply(datasets$name, function(x) can_serialize_pb(get(x))) datasets <- subset(datasets, valid.proto==TRUE) m <- nrow(datasets) @ @@ -1190,19 +1191,19 @@ \label{sec:compression} <>= -datasets$object.size <- unname(sapply(datasets$load.name, function(x) object.size(eval(as.name(x))))) +datasets$object.size <- unname(sapply(datasets$name, function(x) object.size(eval(as.name(x))))) -datasets$R.serialize.size <- unname(sapply(datasets$load.name, function(x) length(serialize(eval(as.name(x)), NULL)))) +datasets$R.serialize.size <- unname(sapply(datasets$name, function(x) length(serialize(eval(as.name(x)), NULL)))) -datasets$R.serialize.size <- unname(sapply(datasets$load.name, function(x) length(serialize(eval(as.name(x)), NULL)))) +datasets$R.serialize.size <- unname(sapply(datasets$name, function(x) length(serialize(eval(as.name(x)), NULL)))) -datasets$R.serialize.size.gz <- unname(sapply(datasets$load.name, function(x) length(memCompress(serialize(eval(as.name(x)), NULL), "gzip")))) +datasets$R.serialize.size.gz <- unname(sapply(datasets$name, function(x) length(memCompress(serialize(eval(as.name(x)), NULL), "gzip")))) -datasets$RProtoBuf.serialize.size <- unname(sapply(datasets$load.name, function(x) length(serialize_pb(eval(as.name(x)), NULL)))) +datasets$RProtoBuf.serialize.size <- unname(sapply(datasets$name, function(x) length(serialize_pb(eval(as.name(x)), NULL)))) -datasets$RProtoBuf.serialize.size.gz <- unname(sapply(datasets$load.name, function(x) length(memCompress(serialize_pb(eval(as.name(x)), NULL), "gzip")))) +datasets$RProtoBuf.serialize.size.gz <- unname(sapply(datasets$name, function(x) length(memCompress(serialize_pb(eval(as.name(x)), NULL), "gzip")))) 
-clean.df <- data.frame(dataset=datasets$load.name, +clean.df <- data.frame(dataset=datasets$name, object.size=datasets$object.size, "serialized"=datasets$R.serialize.size, "gzipped serialized"=datasets$R.serialize.size.gz, From noreply at r-forge.r-project.org Mon Jan 13 00:42:00 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 00:42:00 +0100 (CET) Subject: [Rprotobuf-commits] r762 - papers/jss Message-ID: <20140112234201.0E399181173@r-forge.r-project.org> Author: edd Date: 2014-01-13 00:42:00 +0100 (Mon, 13 Jan 2014) New Revision: 762 Modified: papers/jss/article.Rnw Log: mostly spelling Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 23:31:31 UTC (rev 761) +++ papers/jss/article.Rnw 2014-01-12 23:42:00 UTC (rev 762) @@ -100,7 +100,7 @@ \renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} \title{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\author{by Dirk Eddelbuettel and Murray Stokely} +\author{by Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms} %% DE: I tend to have wider option(width=...) so this %% guarantees better line breaks @@ -154,7 +154,7 @@ protocol with the ability to define just about any arbitrarily complex schema. However, it pays for this complexity with comparatively large and verbose messages, and added complexities at the parsing side (which are -somewhat metigated by the availability of mature libraries and +somewhat mitigated by the availability of mature libraries and parsers). A number of binary formats based on JSON have been proposed that @@ -171,7 +171,7 @@ enough, developers typically benefit from the use of an \emph{interface description language}, or \emph{IDL}. 
IDLs like Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact -well-documented schema for cross-langauge data structures and +well-documented schema for cross-language data structures and efficient binary interchange formats. The schema can be used to generate model classes for statically-typed programming languages such as C++ and Java, or can be used with reflection for dynamically-typed @@ -1111,7 +1111,7 @@ @ But when the values are specified as character strings, RProtoBuf -will automatically coerse them into a true 64-bit integer types +will automatically coerce them into a true 64-bit integer types before storing them in the Protocol Buffer message: <<>>= @@ -1165,7 +1165,7 @@ @ There are \Sexpr{n} standard data sets included in the base-r \pkg{datasets} -package. These datasets include data frames, matices, timeseries, tables lists, +package. These datasets include data frames, matrices, time series, tables lists, and some more exotic data classes. The \texttt{can\_serialize\_pb} method can be used to determine which of those can fully be converted to the \textt{rexp.proto} Protocol Buffer representation: From noreply at r-forge.r-project.org Mon Jan 13 01:14:02 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 01:14:02 +0100 (CET) Subject: [Rprotobuf-commits] r763 - papers/jss Message-ID: <20140113001402.ED868185E69@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 01:14:02 +0100 (Mon, 13 Jan 2014) New Revision: 763 Modified: papers/jss/article.Rnw Log: sync Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-12 23:42:00 UTC (rev 762) +++ papers/jss/article.Rnw 2014-01-13 00:14:02 UTC (rev 763) @@ -1176,10 +1176,23 @@ m <- nrow(datasets) @ -\Sexpr{m} data sets could be converted to Protocol Buffers -(\Sexpr{format(100*m/n,digits=1)}\%). 
The next section illustrates how -many bytes were used to store the data sets under four different -situations: +\Sexpr{m} data sets can be converted to Protocol Buffers +(\Sexpr{format(100*m/n,digits=1)}\%) without loss of information. Upon closer +inspection, all other datasets are objects of class \texttt{nfnGroupedData}. +This class represents a special type of data frame that has some additional +attributes used by the \pkg{nlme} package, among which a \emph{formula} object. +Because formulas are R \emph{language} objects, they have little meaning to +other systems, and are not supported by the \texttt{rexp.proto} descriptor. +When \texttt{serialize_pb} is used on objects of this class (or other objects +containing unsupported data types), it will serialize all other values and +attributes of the object, but skip over the unsupported types with a warning. + +\subsection{Compression Performance} +\label{sec:compression} + +This section compares how many bytes are used to store data sets +using four different methods: + \begin{itemize} \item normal R serialization, \item R serialization followed by gzip, @@ -1187,9 +1200,6 @@ \item Protocol Buffer serialization followed by gzip. 
\end{itemize} -\subsection{Compression Performance} -\label{sec:compression} - <>= datasets$object.size <- unname(sapply(datasets$name, function(x) object.size(eval(as.name(x))))) From noreply at r-forge.r-project.org Mon Jan 13 01:21:24 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 01:21:24 +0100 (CET) Subject: [Rprotobuf-commits] r764 - papers/jss Message-ID: <20140113002124.51AFE1866BB@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 01:21:23 +0100 (Mon, 13 Jan 2014) New Revision: 764 Modified: papers/jss/article.Rnw Log: underscore FTW Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-13 00:14:02 UTC (rev 763) +++ papers/jss/article.Rnw 2014-01-13 00:21:23 UTC (rev 764) @@ -1183,7 +1183,7 @@ attributes used by the \pkg{nlme} package, among which a \emph{formula} object. Because formulas are R \emph{language} objects, they have little meaning to other systems, and are not supported by the \texttt{rexp.proto} descriptor. -When \texttt{serialize_pb} is used on objects of this class (or other objects +When \texttt{serialize\_pb} is used on objects of this class (or other objects containing unsupported data types), it will serialize all other values and attributes of the object, but skip over the unsupported types with a warning. 
From noreply at r-forge.r-project.org Mon Jan 13 03:19:36 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 03:19:36 +0100 (CET) Subject: [Rprotobuf-commits] r765 - papers/jss Message-ID: <20140113021937.24FD6186299@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 03:19:36 +0100 (Mon, 13 Jan 2014) New Revision: 765 Modified: papers/jss/article.Rnw Log: second pass at data frames Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-13 00:21:23 UTC (rev 764) +++ papers/jss/article.Rnw 2014-01-13 02:19:36 UTC (rev 765) @@ -1,5 +1,6 @@ \documentclass[article]{jss} \usepackage{booktabs} +\usepackage[toc,page]{appendix} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -1139,25 +1140,35 @@ options("RProtoBuf.int64AsString" = FALSE) @ -\section{Evaluation: data.frame to Protocol Buffer Serialization} +\section{Converting R Data Structures into Protocol Buffers} \label{sec:evaluation} -The \pkg{RHIPE} package \citep{rhipe} also includes a Protocol integration with R. -However, its implementation takes a different approach: any R object is -serialized into a message based on a single catch-all \texttt{proto} schema. -A similar approach was taken by \pkg{RProtoBufUtils} package (which has now been integrated in -\pkg{RProtoBuf}). Unlike \pkg{RHIPE}, however, \pkg{RProtoBufUtils} -depended upon on, and extended, \pkg{RProtoBuf} for underlying message operations. -%DE Shall this go away now that we sucket RPBUtils into RBP? +The previous sections discussed functionality in the \pkg{RProtoBuf} package +for creating, manipulating, parsing and serializing Protocol Buffer messages. +In addition to these low-level methods, the package also has some high level +functionality for automatically converting R data structures into protocol +buffers and vice versa. 
The \texttt{serialize\_pb} and \texttt{unserialize\_pb} +functions convert arbitrary R objects into a universal Protocol Buffer structure: -One key extension which \pkg{RProtoBufUtils} brought to \pkg{RProtoBuf} is the -\texttt{serialize\_pb} method to convert R objects into serialized -Protocol Buffers in the catch-all schema. The \texttt{can\_serialize\_pb} -method can then be used to determine whether the given R object can safely -be expressed in this way. To illustrate how this method works, we -attempt to convert all of the built-in datasets from R into this -serialized Protocol Buffer representation. +<<>>= +msg <- serialize_pb(iris, NULL) +identical(iris, unserialize_pb(msg)) +@ +In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} +schema that \pkg{RHIPE} uses for exchanging R data with Hadoop \citep{rhipe}. This +schema, which we will refer to as \texttt{rexp.proto} is printed in appendix +\ref{rex.proto}. Even though the \texttt{RHIPE} implementation is written in Java and +\texttt{RProtoBuf} is writting in R and \texttt{C++}, the Protocol Buffer messages +are naturally compatible between the two systems because they use the same schema. +This shows the power of using a schema based cross-platform format such as Protocol +Buffers: interoperability is archieved without tight coordination or collaboration. + +\subsection{Evaluation: Converting R Data Sets} + +To illustrate how this method works, we attempt to convert all of the built-in +datasets from R into this serialized Protocol Buffer representation. + <>= datasets <- as.data.frame(data(package="datasets")$results) datasets$name <- sub("\\s+.*$", "", datasets$Item) @@ -1167,7 +1178,7 @@ There are \Sexpr{n} standard data sets included in the base-r \pkg{datasets} package. These datasets include data frames, matrices, time series, tables lists, and some more exotic data classes. 
The \texttt{can\_serialize\_pb} method can be -used to determine which of those can fully be converted to the \textt{rexp.proto} +used to determine which of those can fully be converted to the \texttt{rexp.proto} Protocol Buffer representation: <>= @@ -1646,6 +1657,59 @@ %The contemporaneous work by Saptarshi Guha on \pkg{RHIPE} was a strong %initial motivator. +\newpage +\begin{appendices} + +\section{The rexp.proto schema descriptor} +\label{rexp.proto} + +Below a print of the \texttt{rexp.proto} schema (originally designed by \cite{rhipe}) +that is included with the \pkg{RProtoBuf} package and used by \texttt{serialize\_pb} and +\texttt{unserialize\_pb}. + +\begin{verbatim} +package rexp; + +message REXP { + enum RClass { + STRING = 0; + RAW = 1; + REAL = 2; + COMPLEX = 3; + INTEGER = 4; + LIST = 5; + LOGICAL = 6; + NULLTYPE = 7; + } + enum RBOOLEAN { + F=0; + T=1; + NA=2; + } + + required RClass rclass = 1 ; + repeated double realValue = 2 [packed=true]; + repeated sint32 intValue = 3 [packed=true]; + repeated RBOOLEAN booleanValue = 4; + repeated STRING stringValue = 5; + optional bytes rawValue = 6; + repeated CMPLX complexValue = 7; + repeated REXP rexpValue = 8; + repeated string attrName = 11; + repeated REXP attrValue = 12; +} +message STRING { + optional string strval = 1; + optional bool isNA = 2 [default=false]; +} +message CMPLX { + optional double real = 1 [default=0]; + required double imag = 2; +} +\end{verbatim} +\end{appendices} + + \bibliography{article} %\section[About Java]{About \proglang{Java}} From noreply at r-forge.r-project.org Mon Jan 13 07:36:03 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 07:36:03 +0100 (CET) Subject: [Rprotobuf-commits] r766 - papers/jss Message-ID: <20140113063603.92DD8186827@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-13 07:36:03 +0100 (Mon, 13 Jan 2014) New Revision: 766 Modified: papers/jss/article.Rnw Log: pass 3 Modified: 
papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-13 02:19:36 UTC (rev 765) +++ papers/jss/article.Rnw 2014-01-13 06:36:03 UTC (rev 766) @@ -1148,7 +1148,8 @@ In addition to these low-level methods, the package also has some high level functionality for automatically converting R data structures into protocol buffers and vice versa. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} -functions convert arbitrary R objects into a universal Protocol Buffer structure: +functions serialize arbitrary R objects into a universal Protocol Buffer +message: <<>>= msg <- serialize_pb(iris, NULL) @@ -1156,14 +1157,26 @@ @ In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} -schema that \pkg{RHIPE} uses for exchanging R data with Hadoop \citep{rhipe}. This +schema used by \pkg{RHIPE} for exchanging R data with Hadoop \citep{rhipe}. This schema, which we will refer to as \texttt{rexp.proto} is printed in appendix -\ref{rex.proto}. Even though the \texttt{RHIPE} implementation is written in Java and -\texttt{RProtoBuf} is writting in R and \texttt{C++}, the Protocol Buffer messages -are naturally compatible between the two systems because they use the same schema. -This shows the power of using a schema based cross-platform format such as Protocol -Buffers: interoperability is archieved without tight coordination or collaboration. +\ref{rexp.proto}. The Protocol Buffer messages generated by \pkg{RProtoBuf} and +\pkg{RHIPE} are naturally compatible between the two systems because they use the +same schema. This shows the power of using a schema based cross-platform format such +as Protocol Buffers: interoperability is achieved without effort or close coordination. +The \texttt{rexp.proto} schema supports all main R storage types holding \emph{data}. 
+These include \texttt{NULL}, \texttt{list} and vectors of type \texttt{logical}, +\texttt{character}, \texttt{double}, \texttt{integer} and \texttt{complex}. In addition, +every type can contain a named set of attributes, as is the case in R. The \texttt{rexp.proto} +schema does not support some of the special R specific storage types, such as \texttt{function}, +\texttt{language} or \texttt{environment}. Such objects have no native equivalent +type in Protocol Buffers, and have little meaning outside the context of R. +When serializing R objects using \texttt{serialize\_pb}, values or attributes of +unsupported types are skipped with a warning. If the user really wishes to serialize these +objects, they need to be converted into a supported type. For example, the user can use +\texttt{deparse} to convert functions or language objects into strings, or \texttt{as.list} +for environments. + \subsection{Evaluation: Converting R Data Sets} To illustrate how this method works, we attempt to convert all of the built-in @@ -1177,14 +1190,13 @@ There are \Sexpr{n} standard data sets included in the base-r \pkg{datasets} package. These datasets include data frames, matrices, time series, tables lists, -and some more exotic data classes. The \texttt{can\_serialize\_pb} method can be +and some more exotic data classes. The \texttt{can\_serialize\_pb} method is used to determine which of those can fully be converted to the \texttt{rexp.proto} -Protocol Buffer representation: +Protocol Buffer representation. 
This method simply checks if any of the values or +attributes in an object is of an unsupported type: <>= -datasets$valid.proto <- sapply(datasets$name, function(x) can_serialize_pb(get(x))) -datasets <- subset(datasets, valid.proto==TRUE) -m <- nrow(datasets) +m <- sum(sapply(datasets$name, function(x) can_serialize_pb(get(x)))) @ \Sexpr{m} data sets can be converted to Protocol Buffers @@ -1194,10 +1206,19 @@ attributes used by the \pkg{nlme} package, among which a \emph{formula} object. Because formulas are R \emph{language} objects, they have little meaning to other systems, and are not supported by the \texttt{rexp.proto} descriptor. -When \texttt{serialize\_pb} is used on objects of this class (or other objects -containing unsupported data types), it will serialize all other values and -attributes of the object, but skip over the unsupported types with a warning. +When \texttt{serialize\_pb} is used on objects of this class, it will serialize +the data frame and all attributes, except for the formula. +<<>>= +attr(CO2, "formula") +msg <- serialize_pb(CO2, NULL) +object <- unserialize_pb(msg) +identical(CO2, object) +identical(class(CO2), class(object)) +identical(dim(CO2), dim(object)) +attr(object, "formula") +@ + \subsection{Compression Performance} \label{sec:compression} @@ -1246,11 +1267,6 @@ comes from its interoperability with other environments, as well as its safe versioning, -TODO comparison of protobuf serialization sizes/times for various vectors. -Compared to R's native serialization. Discussion of the RHIPE approach of -serializing any/all R objects, vs more specific Protocol Buffers for specific -R objects. - % N.B. see table.Rnw for how this table is created. % % latex table generated in R 3.0.2 by xtable 1.7-0 package @@ -1340,6 +1356,8 @@ \section{Descriptor lookup} \label{sec-lookup} +%JO: is this section really relevant? Maybe just a citation will do instead? 
+ The \texttt{RProtoBuf} package uses the user defined tables framework that is defined as part of the \texttt{RObjectTables} package available from the OmegaHat project \citep{RObjectTables}. @@ -1374,7 +1392,7 @@ As described earlier, the primary application of Protocol Buffers is data interchange in the context of inter-system communications. Network protocols such as HTTP provide mechanisms for client-server communication, i.e. how to -initiate requests, authenticate, send messages, etc. However, many network +initiate requests, authenticate, send messages, etc. However, network protocols generally do not regulate the \emph{content} of messages: they allow transfer of any media type, such as web pages, static files or multimedia content. When designing systems where various components require @@ -1424,10 +1442,7 @@ languages, clients can be implemented in just a few lines of code. Below is example code for both R and Python that retrieves a dataset from R with OpenCPU using a protobuf message. In R, we use the HTTP client from -the \texttt{httr} package \citep{httr}. -% superfluous? -%, and the protobuf parser from the \texttt{RProtoBuf} package. -In this example we +the \texttt{httr} package \citep{httr}. In this example we download a dataset which is part of the base R distribution, so we can verify that the object was transferred without loss of information. @@ -1444,8 +1459,8 @@ identical(output, MASS::Animals) @ -This code suggests a method for exchanging objects between R servers, however this can -also be done without Protocol Buffers. The main advantage of using an inter-operable format +This code suggests a method for exchanging objects between R servers, however this might as +well be done without Protocol Buffers. The main advantage of using an inter-operable format is that we can actually access R objects from within another programming language. For example, in a very similar fashion we can retrieve the same dataset in a Python client. 
To parse messages in Python, we first compile the @@ -1536,7 +1551,10 @@ output of a function call on the server, instead of directly retrieving it. Thereby objects can be shared with other users or used as arguments in a subsequent function call. But in its essence, the HTTP API provides a simple way to perform remote -R function calls over HTTPS. The same request can be performed in Python as follows: +R function calls over HTTPS. The same request can be performed in Python as demonstrated +below. The code is a bit verbose because it shows how the REXP message is created from +scratch. In practice one would probably write a function or small module to construct a Protocol +Buffer message representing an R list from a Python dictionary object. \begin{verbatim} import urllib2; @@ -1578,8 +1596,8 @@ \section{Application: Distributed Data Collection with MapReduce} \label{sec:mapreduce} -The MapReduce programming model \citep{dean2008mapreduce} has emerged -in the last decade as a popular framework for processing large data +Over the past years, the MapReduce programming model \citep{dean2008mapreduce} +has emerged as a powerful design pattern for processing large data sets in parallel on large compute clusters. Protocol Buffers provide a convenient mechanism to send structured data between tasks in a MapReduce cluster. In particular, the large data sets in fields From noreply at r-forge.r-project.org Mon Jan 13 20:59:07 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 20:59:07 +0100 (CET) Subject: [Rprotobuf-commits] r767 - pkg/inst/unitTests Message-ID: <20140113195907.7D3E31867DE@r-forge.r-project.org> Author: murray Date: 2014-01-13 20:59:07 +0100 (Mon, 13 Jan 2014) New Revision: 767 Modified: pkg/inst/unitTests/runit.int64.R Log: Check .Machine$sizeof.longlong >= 8 before some of the int64 tests. 
Modified: pkg/inst/unitTests/runit.int64.R =================================================================== --- pkg/inst/unitTests/runit.int64.R 2014-01-13 06:36:03 UTC (rev 766) +++ pkg/inst/unitTests/runit.int64.R 2014-01-13 19:59:07 UTC (rev 767) @@ -28,10 +28,15 @@ # Now just test that we can use add to set int64 fields. a$add("repeated_int64", 2:10) checkEquals(length(a$repeated_int64), 10) - - # Verify we can set character strings of large 64-bit ints - a$repeated_int64 <- c("9007199254740992", "9007199254740993") - checkEquals(length(a$repeated_int64), 2) + + if (.Machine$sizeof.longlong >= 8) { + # Verify we can set character strings of large 64-bit ints + a$repeated_int64 <- c("9007199254740992", "9007199254740993") + checkEquals(length(a$repeated_int64), 2) + } else { + warning("Can't test 64-bit int type on platform with sizeof(long long) < 8") + } + # Verify we can't set any garbage string to a repeated int64. checkException(a$repeated_int64 <-c("invalid", "invalid")) @@ -52,6 +57,7 @@ options("RProtoBuf.int64AsString" = TRUE) # But we can see they are different if we treat them as strings. - checkEquals(length(unique(a$repeated_int64)), 2) - + if (.Machine$sizeof.longlong >= 8) { + checkEquals(length(unique(a$repeated_int64)), 2) + } } From noreply at r-forge.r-project.org Mon Jan 13 22:26:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 22:26:04 +0100 (CET) Subject: [Rprotobuf-commits] r768 - pkg/vignettes Message-ID: <20140113212604.EBFD518674F@r-forge.r-project.org> Author: murray Date: 2014-01-13 22:26:03 +0100 (Mon, 13 Jan 2014) New Revision: 768 Modified: pkg/vignettes/RProtoBuf-intro.Rnw Log: Wrap int64 usage behind an if statement for .Machine$sizeof.longlong and correct a typo. 
Modified: pkg/vignettes/RProtoBuf-intro.Rnw =================================================================== --- pkg/vignettes/RProtoBuf-intro.Rnw 2014-01-13 19:59:07 UTC (rev 767) +++ pkg/vignettes/RProtoBuf-intro.Rnw 2014-01-13 21:26:03 UTC (rev 768) @@ -2038,7 +2038,8 @@ Protocol Buffers are frequently used to pass data between different systems, however, and most other systems these days have support for 64-bit integers. To work around this, RProtoBuf allows users to get -and set 64-bit integer types by treating them as characters. +and set 64-bit integer types by treating them as characters when +running on a platform with a 64-bit long long type available. <>= if (!exists("protobuf_unittest.TestAllTypes", @@ -2054,9 +2055,11 @@ precision: <<>>= -test <- new(protobuf_unittest.TestAllTypes) -test$repeated_int64 <- c(2^53, 2^53+1) -length(unique(test$repeated_int64)) +if (.Machine$sizeof.longlong >= 8) { + test <- new(protobuf_unittest.TestAllTypes) + test$repeated_int64 <- c(2^53, 2^53+1) + length(unique(test$repeated_int64)) +} @ However, we can specify the values as character strings so that the @@ -2064,7 +2067,9 @@ integer representation of the data. <<>>= -test$repeated_int64 <- c("9007199254740992", "9007199254740993") +if (.Machine$sizeof.longlong >= 8) { + test$repeated_int64 <- c("9007199254740992", "9007199254740993") +} @ When reading the value back into R, numeric types are returned by @@ -2072,13 +2077,19 @@ will be returned if the \texttt{RProtoBuf.int64AsString} option is set to \texttt{TRUE}. -<<>>= -options("RProtoBuf.int64AsString" = FALSE) -test$repeated_int64 -length(unique(test$repeated_int64)) -options("RProtoBuf.int64AsString" = TRUE) -test$repeated_int64 -length(unique(test$repeated_int64)) +% TODO: the if statement prevents us from seeing line by line +% output inside the block (E.g. the output of test$repeated_int64). +% This reduces the usefulness of this example when we are on a 64-bit +% platform. E.g. 
only the final ``2'' is returned. +<>= +if (.Machine$sizeof.longlong >= 8) { + options("RProtoBuf.int64AsString" = FALSE) + test$repeated_int64 + length(unique(test$repeated_int64)) + options("RProtoBuf.int64AsString" = TRUE) + test$repeated_int64 + length(unique(test$repeated_int64)) +} @ <>= @@ -2128,7 +2139,7 @@ structures using external pointers, etc ...). We'd like to thank Simon for his indirect involvment on \texttt{RProtoBuf}. -The user defined table mechasnism, implemented by Duncan Temple Lang +The user defined table mechanism, implemented by Duncan Temple Lang for the purpose of the \texttt{RObjectTables} package allowed the dynamic symbol lookup (see section~\ref{sec-lookup}). Many thanks to Duncan for this amazing feature. From noreply at r-forge.r-project.org Mon Jan 13 22:30:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 22:30:55 +0100 (CET) Subject: [Rprotobuf-commits] r769 - in pkg: . inst/unitTests Message-ID: <20140113213055.B198B1867C6@r-forge.r-project.org> Author: murray Date: 2014-01-13 22:30:55 +0100 (Mon, 13 Jan 2014) New Revision: 769 Modified: pkg/ChangeLog pkg/inst/unitTests/runit.int64.R Log: Check for sizeof(long long) right at the beginning of this test and exit immediately if we're not on a 64-bit platform. The previous behavior here I think would still have been broken on a 32-bit machine. Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-13 21:26:03 UTC (rev 768) +++ pkg/ChangeLog 2014-01-13 21:30:55 UTC (rev 769) @@ -1,3 +1,12 @@ +2014-01-13 Murray Stokely + + * inst/unitTests/runit.int64.R (test.int64): Skip this test with a + warning if running on a machine with sizeof(long long) < 8. + * vignettes/RProtoBuf-intro.Rnw (subsection{64-bit integer + issues}): Hide 64-bit field accesses with if conditional to + avoid running code that won't work on platforms without 64-bit + long longs. 
+ 2014-01-09 Murray Stokely * R/read.R: Add a logical argument 'partial' to readASCII that Modified: pkg/inst/unitTests/runit.int64.R =================================================================== --- pkg/inst/unitTests/runit.int64.R 2014-01-13 21:26:03 UTC (rev 768) +++ pkg/inst/unitTests/runit.int64.R 2014-01-13 21:30:55 UTC (rev 769) @@ -23,19 +23,20 @@ readProtoFiles(file=unittest.proto.file) } + if (.Machine$sizeof.longlong < 8) { + warning("Can't test 64-bit int type on platform with sizeof(long long) < 8") + return + } + a <- new(protobuf_unittest.TestAllTypes) a$repeated_int64 <- 1 # Now just test that we can use add to set int64 fields. a$add("repeated_int64", 2:10) checkEquals(length(a$repeated_int64), 10) - if (.Machine$sizeof.longlong >= 8) { - # Verify we can set character strings of large 64-bit ints - a$repeated_int64 <- c("9007199254740992", "9007199254740993") - checkEquals(length(a$repeated_int64), 2) - } else { - warning("Can't test 64-bit int type on platform with sizeof(long long) < 8") - } + # Verify we can set character strings of large 64-bit ints + a$repeated_int64 <- c("9007199254740992", "9007199254740993") + checkEquals(length(a$repeated_int64), 2) # Verify we can't set any garbage string to a repeated int64. checkException(a$repeated_int64 <-c("invalid", "invalid")) @@ -57,7 +58,5 @@ options("RProtoBuf.int64AsString" = TRUE) # But we can see they are different if we treat them as strings. 
- if (.Machine$sizeof.longlong >= 8) { - checkEquals(length(unique(a$repeated_int64)), 2) - } + checkEquals(length(unique(a$repeated_int64)), 2) } From noreply at r-forge.r-project.org Mon Jan 13 23:03:09 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 23:03:09 +0100 (CET) Subject: [Rprotobuf-commits] r770 - pkg/vignettes Message-ID: <20140113220309.7303118684F@r-forge.r-project.org> Author: murray Date: 2014-01-13 23:03:09 +0100 (Mon, 13 Jan 2014) New Revision: 770 Modified: pkg/vignettes/RProtoBuf-intro.Rnw Log: Hard code output of Sweave on a 64-bit platform for the int64 section, as there is no good way to conditionally influence the sweaving and we still need to show 32-bit users what they are missing out on. Modified: pkg/vignettes/RProtoBuf-intro.Rnw =================================================================== --- pkg/vignettes/RProtoBuf-intro.Rnw 2014-01-13 21:30:55 UTC (rev 769) +++ pkg/vignettes/RProtoBuf-intro.Rnw 2014-01-13 22:03:09 UTC (rev 770) @@ -2054,23 +2054,40 @@ If we try to set an int64 field in R to double values, we lose precision: -<<>>= -if (.Machine$sizeof.longlong >= 8) { - test <- new(protobuf_unittest.TestAllTypes) - test$repeated_int64 <- c(2^53, 2^53+1) - length(unique(test$repeated_int64)) -} -@ +% We comment out the raw Sweave here because it doesn't work on 32-bit +% platforms but it is important to illustrate what the user is missing +% out on so we hard code the output from a platform with 64-bit long +% long type. 
+% +% <<>>= +% test <- new(protobuf_unittest.TestAllTypes) +% test$repeated_int64 <- c(2^53, 2^53+1) +% length(unique(test$repeated_int64)) +% @ +\begin{Schunk} +\begin{Sinput} +> test <- new(protobuf_unittest.TestAllTypes) +> test$repeated_int64 <- c(2^53, 2^53+1) +> length(unique(test$repeated_int64)) +\end{Sinput} +\begin{Soutput} +[1] 1 +\end{Soutput} +\end{Schunk} + However, we can specify the values as character strings so that the C++ library on which RProtoBuf is based can store a true 64-bit integer representation of the data. -<<>>= -if (.Machine$sizeof.longlong >= 8) { - test$repeated_int64 <- c("9007199254740992", "9007199254740993") -} -@ +% <<>>= +% test$repeated_int64 <- c("9007199254740992", "9007199254740993") +% @ +\begin{Schunk} +\begin{Sinput} +> test$repeated_int64 <- c("9007199254740992", "9007199254740993") +\end{Sinput} +\end{Schunk} When reading the value back into R, numeric types are returned by default, but when the full precision is required a character value @@ -2081,16 +2098,54 @@ % output inside the block (E.g. the output of test$repeated_int64). % This reduces the usefulness of this example when we are on a 64-bit % platform. E.g. only the final ``2'' is returned. 
-<>= -if (.Machine$sizeof.longlong >= 8) { - options("RProtoBuf.int64AsString" = FALSE) - test$repeated_int64 - length(unique(test$repeated_int64)) - options("RProtoBuf.int64AsString" = TRUE) - test$repeated_int64 - length(unique(test$repeated_int64)) -} -@ +% <>= +% options("RProtoBuf.int64AsString" = FALSE) +% test$repeated_int64 +% length(unique(test$repeated_int64)) +% options("RProtoBuf.int64AsString" = TRUE) +% test$repeated_int64 +% length(unique(test$repeated_int64)) +%@ +\begin{Schunk} +\begin{Sinput} +> options("RProtoBuf.int64AsString" = FALSE) +\end{Sinput} +\begin{Soutput} +$RProtoBuf.int64AsString +[1] FALSE +\end{Soutput} +\begin{Sinput} +> test$repeated_int64 +\end{Sinput} +\begin{Soutput} +[1] 9.007199e+15 9.007199e+15 +\end{Soutput} +\begin{Sinput} +> length(unique(test$repeated_int64)) +\end{Sinput} +\begin{Soutput} +[1] 1 +\end{Soutput} +\begin{Sinput} +> options("RProtoBuf.int64AsString" = TRUE) +\end{Sinput} +\begin{Soutput} +$RProtoBuf.int64AsString +[1] FALSE +\end{Soutput} +\begin{Sinput} +> test$repeated_int64 +\end{Sinput} +\begin{Soutput} +[1] "9007199254740992" "9007199254740993" +\end{Soutput} +\begin{Sinput} +> length(unique(test$repeated_int64)) +\end{Sinput} +\begin{Soutput} +[1] 2 +\end{Soutput} +\end{Schunk} <>= options("RProtoBuf.int64AsString" = FALSE) From noreply at r-forge.r-project.org Mon Jan 13 23:12:59 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 13 Jan 2014 23:12:59 +0100 (CET) Subject: [Rprotobuf-commits] r771 - papers/jss Message-ID: <20140113221300.012F2184632@r-forge.r-project.org> Author: murray Date: 2014-01-13 23:12:59 +0100 (Mon, 13 Jan 2014) New Revision: 771 Modified: papers/jss/article.Rnw Log: Remove descriptor lookup section and incorporate a few sentences and the citation from that section into an earlier section where we first talk about the descriptorpool environment, per Jeroen's comment. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-13 22:03:09 UTC (rev 770) +++ papers/jss/article.Rnw 2014-01-13 22:12:59 UTC (rev 771) @@ -424,15 +424,35 @@ After importing proto files, the corresponding message descriptors are available from the \texttt{RProtoBuf:DescriptorPool} environment in -the R search path. The underlying mechanism used here is -described in more detail in Section~\ref{sec-lookup}. +the R search path. This environment is implemented with the user +defined tables framework from the \pkg{RObjectTables} package +available from the OmegaHat project \citep{RObjectTables}. Instead of +being associated with a static hash table, this environment +dynamically queries the in-memory database of loaded descriptors +during normal variable lookup. %JO: can we just move the section 7 to here? It's only one paragraph% +%MS: I replaced section 7 with 2 sentences above. <<>>= ls("RProtoBuf:DescriptorPool") @ +% The old section 7 in entirety: +%The \texttt{RProtoBuf} package uses the user defined tables framework +%that is defined as part of the \texttt{RObjectTables} package available +%from the OmegaHat project \citep{RObjectTables}. +% +%The feature allows \texttt{RProtoBuf} to install the +%special environment \emph{RProtoBuf:DescriptorPool} in the R search path. +%The environment is special in that, instead of being associated with a +%static hash table, it is dynamically queried by R as part of R's usual +%variable lookup. In other words, it means that when the R interpreter +%looks for a binding to a symbol (foo) in its search path, +%it asks to our package if it knows the binding "foo", this is then +%implemented by the \texttt{RProtoBuf} package by calling an internal +%method of the \texttt{protobuf} C++ library. 
+ %\subsection{Importing proto files} %In contrast to the other languages (Java, C++, Python) that are officially %supported by Google, the implementation used by the \texttt{RProtoBuf} @@ -1353,25 +1373,6 @@ reflection and object mapping. -\section{Descriptor lookup} -\label{sec-lookup} - -%JO: is this section really relevant? Maybe just a citation will do instead? - -The \texttt{RProtoBuf} package uses the user defined tables framework -that is defined as part of the \texttt{RObjectTables} package available -from the OmegaHat project \citep{RObjectTables}. - -The feature allows \texttt{RProtoBuf} to install the -special environment \emph{RProtoBuf:DescriptorPool} in the R search path. -The environment is special in that, instead of being associated with a -static hash table, it is dynamically queried by R as part of R's usual -variable lookup. In other words, it means that when the R interpreter -looks for a binding to a symbol (foo) in its search path, -it asks to our package if it knows the binding "foo", this is then -implemented by the \texttt{RProtoBuf} package by calling an internal -method of the \texttt{protobuf} C++ library. - %\section{Other approaches} % Phillip Yelland wrote another implementation, currently proprietary, From noreply at r-forge.r-project.org Tue Jan 14 01:41:36 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 01:41:36 +0100 (CET) Subject: [Rprotobuf-commits] r772 - papers/jss Message-ID: <20140114004136.6682B183AD8@r-forge.r-project.org> Author: murray Date: 2014-01-14 01:41:35 +0100 (Tue, 14 Jan 2014) New Revision: 772 Modified: papers/jss/article.Rnw Log: Edits to section 1 suggested by Karl. 1) Remove the Hadley Split-Apply-Combine reference for now as it is confusing and more narrowly R-only than the multi-platform-data-analysis pipeline pattern otherwise being discussed in the first paragraph. We might be able to add it back with suitable distinctions added. 
2) Note that our goal is not just to safely transfer the data, but to safely _and efficiently_ do so. 3) Rewrite the second to last paragraph a bit, and add a note specifically at the end that this paper describes an R implementation of protocol buffers. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-13 22:12:59 UTC (rev 771) +++ papers/jss/article.Rnw 2014-01-14 00:41:35 UTC (rev 772) @@ -119,9 +119,10 @@ built using collections of components to better manage software complexity through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. -Data analysis patterns such as Split-Apply-Combine -\citep{wickham2011split} explicitly break up large problems into -manageable pieces. These patterns are frequently employed with +% This is really a different pattern not connected well here. +%Data analysis patterns such as Split-Apply-Combine +%\citep{wickham2011split} explicitly break up large problems into manageable pieces. +These pipelines are frequently built with different programming languages used for the different phases of data analysis -- collection, cleaning, modeling, analysis, post-processing, and presentation in order to take advantage of the unique combination of @@ -130,7 +131,7 @@ analysis pipeline may involve storing intermediate results in a file or sending them over the network. -Given these requirements, how do we safely share intermediate results +Given these requirements, how do we safely and efficiently share intermediate results between different applications, possibly written in different languages, and possibly running on different computer system, possibly spanning different operating systems? Programming @@ -173,17 +174,27 @@ \emph{interface description language}, or \emph{IDL}. 
IDLs like Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact well-documented schema for cross-language data structures and -efficient binary interchange formats. The schema can be used to -generate model classes for statically-typed programming languages such -as C++ and Java, or can be used with reflection for dynamically-typed -programming languages. Since the schema is provided separately from -the encoded data, the data can be efficiently encoded to minimize -storage costs of the stored data when compared with simple -``schema-less'' binary interchange formats. -Many sources compare data serialization formats and show Protocol -Buffers very favorably to the alternatives; see +efficient binary interchange formats. +Since the schema is provided separately from the encoded data, the data can be +efficiently encoded to minimize storage costs of the stored data when compared with simple +``schema-less'' binary interchange formats. Many sources compare data serialization formats +and show Protocol Buffers compare very favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. +The schema can be used to generate classes for statically-typed programming languages +such as C++ and Java, or can be used with reflection for dynamically-typed programming +languages. +% The schema can be used to +%generate model classes for statically-typed programming languages such +%as C++ and Java, or can be used with reflection for dynamically-typed +%programming languages. Since the schema is provided separately from +%the encoded data, the data can be efficiently encoded to minimize +%storage costs of the stored data when compared with simple +%``schema-less'' binary interchange formats. +%Many sources compare data serialization formats and show Protocol +%Buffers very + + % TODO(mstokely): Take a more conversational tone here asking % questions and motivating protocol buffers? 
@@ -193,6 +204,7 @@ % in the middle (full class/method details) and interesting % applications at the end. +This paper describes an R interface to protocol buffers. The rest of the paper is organized as follows. Section~\ref{sec:protobuf} provides a general overview of Protocol Buffers. Section~\ref{sec:rprotobuf-basic} describes the interactive R interface From noreply at r-forge.r-project.org Tue Jan 14 02:06:48 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 02:06:48 +0100 (CET) Subject: [Rprotobuf-commits] r773 - papers/jss Message-ID: <20140114010648.A03EC186806@r-forge.r-project.org> Author: murray Date: 2014-01-14 02:06:46 +0100 (Tue, 14 Jan 2014) New Revision: 773 Modified: papers/jss/article.Rnw Log: Correct a typo: s/at time/at times/ Implement some suggestions from Karl: * Improve the sentence comparing to Rserve * Add more context about the structure of the protocol buffer schema. Section 2 is very high level / abstract, then section 3 dives into the R interface without ever describing field types, tag numbers, etc. Added three sentences or so to the beginning of section 3 before diving into the R examples. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 00:41:35 UTC (rev 772) +++ papers/jss/article.Rnw 2014-01-14 01:06:46 UTC (rev 773) @@ -241,7 +241,11 @@ applications as varied as %database-internal messaging (Drizzle), % DE: citation? Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. % TODO(DE): This either needs a citation, or remove the name drop -While traditional IDLs have at time been criticized for code bloat and +% MS: These are mostly from blog posts, I can't find a good reference +% that has a long list, and the name and year citation style seems +% less conducive to long lists of marginal citations like blog posts +% compared to say concise CS/math style citations [3,4,5,6]. 
Thoughts? +While traditional IDLs have at times been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records model that is compartively flexible and simple to use. @@ -264,9 +268,10 @@ Common use cases include populating a request remote-procedure call (RPC) Protocol Buffer in R that is then serialized and sent over the network to a remote server. The server would then deserialize the message, act on the -request, and respond with a new Protocol Buffer over the network. The key -difference to, say, a request to an Rserve instance is that the remote server -may not even know the R language. +request, and respond with a new Protocol Buffer over the network. +The key difference to, say, a request to an Rserve instance is that +the remote server may be implemented in any language, with no +dependence on R. %Protocol buffers are a language-neutral, platform-neutral, extensible %way of serializing structured data for use in communications @@ -295,9 +300,10 @@ language-specific classes that can be used to create, read, write and manipulate Protocol Buffer messages. The R interface, in contrast, uses a reflection-based API that is particularly well-suited for -interactive data analysis. All messages in R have a single class +interactive data analysis. +All messages in R have a single class structure, but different accessor methods are created at runtime based -on the name fields of the specified message type. +on the named fields of the specified message type. % In other words, given the 'proto' %description file, code is automatically generated for the chosen @@ -346,7 +352,15 @@ Table~\ref{tab:proto} shows an example \texttt{.proto} file which defines the \texttt{tutorial.Person} type. The R code in the right column shows an example of creating a new message of this type and -populating its fields. +populating its fields. A \texttt{.proto} file may contain one or more +message types, and each message type has one or more fields. 
A field +is specified with a unique number, a name, a value type, and a field +rule specifying whether the field is optional, required, or repeated. +The supported value types are numbers, enumerations, booleans, +strings, raw bytes, or other nested message types. +The \texttt{.proto} file syntax for defining the structure of protocol +buffer data is described comprehensively on Google Code\footnote{See +\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. % Commented out because we said this earlier. %This separation @@ -430,9 +444,6 @@ \texttt{.proto} files are imported using the \code{readProtoFiles} function, which can either import a single file, all files in a directory, or every \texttt{.proto} file provided by a particular R package. -The \texttt{.proto} file syntax for defining the structure of protocol -buffer data is described comprehensively on Google Code\footnote{See -\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. After importing proto files, the corresponding message descriptors are available from the \texttt{RProtoBuf:DescriptorPool} environment in From noreply at r-forge.r-project.org Tue Jan 14 02:19:02 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 02:19:02 +0100 (CET) Subject: [Rprotobuf-commits] r774 - papers/jss Message-ID: <20140114011902.3323F18628E@r-forge.r-project.org> Author: murray Date: 2014-01-14 02:19:01 +0100 (Tue, 14 Jan 2014) New Revision: 774 Modified: papers/jss/article.Rnw Log: Section 5 (types): Add a sentence noting that three types in particular need special attention, to transition to the three subsections on booleans, uint32s, and int64s. 
Section 6 (Converting R data structures to Protocol Buffers): Rewrite first two sentences to make the distinction between the previous sections describing methods for working with messages and a specific pre-defined schema, and the function in this section describing a universal schema that can be used on arbitrary R data structures. This seems like the more accurate distinction than calling it low-level vs high-level as was done previously. Also, add a missing comma in section 6. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 01:06:46 UTC (rev 773) +++ papers/jss/article.Rnw 2014-01-14 01:19:01 UTC (rev 774) @@ -1032,7 +1032,8 @@ Table~\ref{table-get-types} details the correspondence between the field type and the type of data that is retrieved by \verb|$| and \verb|[[| -extractors. +extractors. Three types in particular need further attention due to +specific differences in the R language. \begin{table}[h] \centering @@ -1187,10 +1188,10 @@ \label{sec:evaluation} The previous sections discussed functionality in the \pkg{RProtoBuf} package -for creating, manipulating, parsing and serializing Protocol Buffer messages. -In addition to these low-level methods, the package also has some high level -functionality for automatically converting R data structures into protocol -buffers and vice versa. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} +for creating, manipulating, parsing and serializing Protocol Buffer +messages of a specific pre-defined schema. The package also provides +methods for converting arbitrary R data structures into protocol +buffers and vice versa with a universal R object schema. 
The \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions serialize arbitrary R objects into a universal Protocol Buffer message: @@ -1201,7 +1202,7 @@ In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} schema used by \pkg{RHIPE} for exchanging R data with Hadoop \citep{rhipe}. This -schema, which we will refer to as \texttt{rexp.proto} is printed in appendix +schema, which we will refer to as \texttt{rexp.proto}, is printed in appendix \ref{rexp.proto}. The Protocol Buffer messages generated by \pkg{RProtoBuf} and \pkg{RHIPE} are naturally compatible between the two systems because they use the same schema. This shows the power of using a schema based cross-platform format such From noreply at r-forge.r-project.org Tue Jan 14 05:45:50 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 05:45:50 +0100 (CET) Subject: [Rprotobuf-commits] r775 - papers/jss Message-ID: <20140114044558.9BDF9186274@r-forge.r-project.org> Author: murray Date: 2014-01-14 05:45:43 +0100 (Tue, 14 Jan 2014) New Revision: 775 Added: papers/jss/hist.pb papers/jss/histogram.proto Log: Add an example serialized protocol buffer of a histogram created in python, and the histogram.proto file from HistogramTools. 
Added: papers/jss/hist.pb =================================================================== (Binary files differ) Property changes on: papers/jss/hist.pb ___________________________________________________________________ Added: svn:mime-type + application/octet-stream Added: papers/jss/histogram.proto =================================================================== --- papers/jss/histogram.proto (rev 0) +++ papers/jss/histogram.proto 2014-01-14 04:45:43 UTC (rev 775) @@ -0,0 +1,7 @@ +package HistogramTools; + +message HistogramState { + repeated double breaks = 1; + repeated int32 counts = 2; + optional string name = 3; +} From noreply at r-forge.r-project.org Tue Jan 14 05:49:03 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 05:49:03 +0100 (CET) Subject: [Rprotobuf-commits] r776 - papers/jss Message-ID: <20140114044903.B0F8B186509@r-forge.r-project.org> Author: murray Date: 2014-01-14 05:49:02 +0100 (Tue, 14 Jan 2014) New Revision: 776 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Move the MapReduce / Histogram example before the OpenCPU one and greatly improve it. Add a very simple python example of using the protoc and a few lines of python to store arrays of bin counts and breaks as a Histogram protocol buffer. Then, use HistogramTools to read in this histogram into R, convert it to a native R histogram object, and plot it. Add a reference to blocker's theoretical work on preprocessing, and a self-citation at the end of this example section to show a real application of the design pattern described here. TODO: All three steps - the python code, the R code, and the output histogram plot should be much more concisely typeset onto a single line. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 04:45:43 UTC (rev 775) +++ papers/jss/article.Rnw 2014-01-14 04:49:02 UTC (rev 776) @@ -1404,8 +1404,127 @@ % large number of protocol buffers, but is less user friendly for the % basic cases documented here. -%\section{Basic usage example - tutorial.Person} +\section{Application: Distributed Data Collection with MapReduce} +\label{sec:mapreduce} +Many large data sets in fields such as particle physics and information +processing are stored in binned or histogram form in order to reduce +the data storage requirements \citep{scott2009multivariate}. In the +last decade, the MapReduce programming model \citep{dean2008mapreduce} +has emerged as a popular design pattern that enables the processing of +very large data sets on large compute clusters. + +Many types of data analysis over large data sets may involve very rare +phenomenon or be dealing with highly skewed data sets or inflexible +raw data storage systems from which unbiased sampling is not feasible. +In such situations, MapReduce and binning may be combined as a +pre-processing step for a wide range of statistical and scientific +analyses \citep{blocker2013}. + +There are two common patterns for generating histograms of large data +sets in a single pass with MapReduce. In the first method, each +mapper task generates a histogram over a subset of the data that it +has been assigned, serializes this histogram and sends it to one or +more reducer tasks which merge the intermediate histograms from the +mappers. + +In the second method, illustrated in +Figure~\ref{fig:mr-histogram-pattern1}, each mapper rounds a data +point to a bucket width and outputs that bucket as a key and '1' as a +value. Reducers then sum up all of the values with the same key and +output to a data store. + +\begin{figure}[h!] 
+\begin{center} +\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} +\end{center} +\caption{Diagram of MapReduce Histogram Generation Pattern} +\label{fig:mr-histogram-pattern1} +\end{figure} + +In both methods, the mapper tasks must choose identical bucket +boundaries in advance if we are to construct the histogram in a single +pass, even though they are analyzing disjoint parts of the input set +that may cover different ranges. All distributed tasks involved in +the pre-processing as well as any downstream data analysis tasks must +share a schema of the histogram representation to coordinate +effectively. + +The \pkg{HistogramTools} package \citep{histogramtools} enhances +\pkg{RProtoBuf} by providing a concise schema for R histogram objects: + +\begin{example} +package HistogramTools; + +message HistogramState { + repeated double breaks = 1; + repeated int32 counts = 2; + optional string name = 3; +} +\end{example} + +This HistogramState message type is designed to be helpful if some of +the Map or Reduce tasks are written in R, or if those components are +written in other languages and only the resulting output histograms +need to be manipulated in R. For example, to create HistogramState +messages in Python for later consumption by R, we first compile the +\texttt{histogram.proto} descriptor into a Python module using the +\texttt{protoc} compiler: + +\begin{verbatim} + protoc histogram.proto --python_out=. +\end{verbatim} +This generates a Python module called \texttt{histogram\_pb2.py}, containing both the +descriptor information as well as methods to read and manipulate the R object +message. + +\begin{verbatim} +# Import modules +from histogram_pb2 import HistogramState; + +# Create empty Histogram message +hist = HistogramState() + +# Add breakpoints and binned data set. 
+hist.counts.extend([2, 6, 2, 4, 6]) +hist.breaks.extend(range(6)) +hist.name="Example Histogram Created in Python" + +# Output the histogram +outfile = open("/tmp/hist.pb", "wb") +outfile.write(hist.SerializeToString()) +outfile.close() +\end{verbatim} + +We can then read the histogram into R and plot it: + +\begin{verbatim} +library(RProtoBuf) +library(HistogramTools) + +# Read the Histogram schema +readProtoFiles(package="HistogramTools") + +# Read the serialized histogram file. +hist <- HistogramTools.HistogramState$read("/tmp/hist.pb") +hist +[1] "message of type 'HistogramTools.HistogramState' with 3 fields set" + +# Convert to native R histogram object and plot +plot(as.histogram(hist)) +\end{verbatim} + +<>= +require(RProtoBuf) +require(HistogramTools) +readProtoFiles(package="HistogramTools") +hist <- HistogramTools.HistogramState$read("/tmp/hist.pb") +plot(as.histogram(hist)) +@ + +One of the authors has used this design pattern for several large +scale studies of distributed filesystems \citep{janus}. + \section{Application: Data Interchange in Web Services} \label{sec:opencpu} @@ -1618,52 +1737,7 @@ print(msg.realValue); \end{verbatim} -\section{Application: Distributed Data Collection with MapReduce} -\label{sec:mapreduce} -Over the past years, the MapReduce programming model \citep{dean2008mapreduce} -has emerged as a poweful design pattern for processing large data -sets in parallel on large compute clusters. Protocol Buffers -provide a convenient mechanism to send structured data between tasks -in a MapReduce cluster. In particular, the large data sets in fields -such as particle physics and information processing are frequently -stored in binned or histogram form in order to reduce the data storage -requirements for later data analysis \citep{scott2009multivariate}. 
- -In such environments, analysts may be interested in very rare -phenomenon or be dealing with highly skewed data sets or inflexible -raw data storage systems from which unbiased sampling is not feasible. -There are two common patterns for generating histograms of large data -sets in a single pass with MapReduce. In the first method, each -mapper task generates a histogram over a subset of the data that it -has been assigned, serializes this histogram and sends it to one or -more reducer tasks which merge the intermediate histograms from the -mappers. - -In the second method, illustrated in -Figure~\ref{fig:mr-histogram-pattern1}, each mapper rounds a data -point to a bucket width and outputs that bucket as a key and '1' as a -value. Reducers then sum up all of the values with the same key and -output to a data store. - -In both methods, the mapper tasks must choose identical bucket -boundaries in advance if we are to construct the histogram in a single -pass, even though they are analyzing disjoint parts of the input set -that may cover different ranges. The \pkg{HistogramTools} package -\citep{histogramtools} enhances \pkg{RProtoBuf} by providing a concise -schema for R histogram objects. The histogram message type is -designed to be helpful if some of the Map or Reduce tasks are written -in R, or if those components are written in other languages and only -the resulting output histograms need to be manipulated in R. - -\begin{figure}[h!] -\begin{center} -\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} -\end{center} -\caption{Diagram of MapReduce Histogram Generation Pattern} -\label{fig:mr-histogram-pattern1} -\end{figure} - %\section{Application: Sending/receiving Interaction With Servers} % %Combined @@ -1759,4 +1833,3 @@ %% Note: If there is markup in \(sub)section, then it has to be escape as above. 
\end{document} - Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-14 04:45:43 UTC (rev 775) +++ papers/jss/article.bib 2014-01-14 04:49:02 UTC (rev 776) @@ -14,6 +14,29 @@ note = {R package version 1.1}, url = {http://CRAN.R-project.org/package=msgpackR}, } + at inproceedings{janus, +title = {Janus: Optimal Flash Provisioning for Cloud Storage Workloads}, +author = {Christoph Albrecht and Arif Merchant and Murray Stokely and Muhammad Waliji and Francois Labelle and Nathan Coehlo and Xudong Shi and Eric Schrock}, +year = 2013, +URL = {https://www.usenix.org/system/files/conference/atc13/atc13-albrecht.pdf}, +booktitle = {Proceedings of the USENIX Annual Technical Conference}, +pages = {91--102}, +address = {2560 Ninth Street, Suite 215, Berkeley, CA 94710, USA} +} + at article{blocker2013, +ajournal = "Bernoulli", +author = "Blocker, Alexander W. and Meng, Xiao-Li", +doi = "10.3150/13-BEJSP16", +journal = "Bernoulli", +month = "09", +number = "4", +pages = "1176--1211", +publisher = "Bernoulli Society for Mathematical Statistics and Probability", +title = "The potential and perils of preprocessing: Building new foundations", +url = "http://dx.doi.org/10.3150/13-BEJSP16", +volume = "19", +year = "2013" +} @Manual{rmongodb, title={rmongodb: R-MongoDB driver}, author={Gerald Lindsly}, From noreply at r-forge.r-project.org Tue Jan 14 06:04:06 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 06:04:06 +0100 (CET) Subject: [Rprotobuf-commits] r777 - pkg/inst Message-ID: <20140114050406.CB947186BAC@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-14 06:04:06 +0100 (Tue, 14 Jan 2014) New Revision: 777 Modified: pkg/inst/NEWS.Rd Log: Added section in NEWS for upcoming release Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-14 04:49:02 UTC (rev 776) +++ 
pkg/inst/NEWS.Rd 2014-01-14 05:04:06 UTC (rev 777) @@ -2,6 +2,17 @@ \title{News for Package \pkg{RProtoBuf}} \newcommand{\cpkg}{\href{http://CRAN.R-project.org/package=#1}{\pkg{#1}}} +\section{Changes in RProtoBuf version 0.4.0 (2014-01-14)}{ + \itemize{ + \item Added Jeroen Ooms as author + \item Changes to support CRAN builds for MS Windows. + \item Added functions \code{serialize_pb}, \code{unserialize_pb}, + and \code{can_serialize_pb} plus documentation. + \item Added unit tests for serializing and unserializing datasets + using \code{serialize_pb}. + } +} + \section{Changes in UNRELEASED RProtoBuf version 0.3.3 (2013-12-26)}{ \itemize{ \item Vignettes have been converted to the R 3.0.0 or later use of From noreply at r-forge.r-project.org Tue Jan 14 07:07:48 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 07:07:48 +0100 (CET) Subject: [Rprotobuf-commits] r778 - pkg/man Message-ID: <20140114060748.EE6BB186C52@r-forge.r-project.org> Author: murray Date: 2014-01-14 07:07:48 +0100 (Tue, 14 Jan 2014) New Revision: 778 Modified: pkg/man/ZeroCopyInputStream-class.Rd Log: Add references to new numeric arguments to ReadRaw and ReadString. 
Modified: pkg/man/ZeroCopyInputStream-class.Rd =================================================================== --- pkg/man/ZeroCopyInputStream-class.Rd 2014-01-14 05:04:06 UTC (rev 777) +++ pkg/man/ZeroCopyInputStream-class.Rd 2014-01-14 06:07:48 UTC (rev 778) @@ -12,10 +12,12 @@ \alias{ReadRaw} \alias{ReadRaw-methods} \alias{ReadRaw,ZeroCopyInputStream,integer-method} +\alias{ReadRaw,ZeroCopyInputStream,numeric-method} \alias{ReadString} \alias{ReadString-methods} \alias{ReadString,ZeroCopyInputStream,integer-method} +\alias{ReadString,ZeroCopyInputStream,numeric-method} \alias{ReadVarint32} \alias{ReadVarint32-methods} @@ -52,7 +54,9 @@ \item{BackUp}{\code{signature(object="ZeroCopyInputStream")}: Backs up a number of bytes, so that the next call to \code{Next} returns data again that was already returned by the last call to \code{Next}.} \item{ByteCount}{\code{signature(object="ZeroCopyInputStream")}: Returns the total number of bytes read since this object was created. } \item{ReadRaw}{\code{signature(object="ZeroCopyInputStream", size = "integer")}: read raw bytes from the stream} + \item{ReadRaw}{\code{signature(object="ZeroCopyInputStream", size = "numeric")}: read raw bytes from the stream} \item{ReadString}{\code{signature(object="ZeroCopyInputStream", size = "integer")}: same as \code{ReadRaw} but formats the result as a string} + \item{ReadString}{\code{signature(object="ZeroCopyInputStream", size = "numeric")}: same as \code{ReadRaw} but formats the result as a string} \item{ReadVarint32}{\code{signature(object="ZeroCopyInputStream")}: Read an unsigned integer with Varint encoding, truncating to 32 bits. } \item{ReadLittleEndian32}{\code{signature(object="ZeroCopyInputStream")}: Read a 32-bit little-endian integer. } \item{ReadLittleEndian64}{\code{signature(object="ZeroCopyInputStream")}: Read a 64-bit little-endian integer. 
In R the value is stored as a \code{double} which loses some precision (no other way) } @@ -69,4 +73,3 @@ TODO: add classes that extend } \keyword{classes} - From noreply at r-forge.r-project.org Tue Jan 14 07:15:32 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 07:15:32 +0100 (CET) Subject: [Rprotobuf-commits] r779 - pkg/inst Message-ID: <20140114061532.5C637186759@r-forge.r-project.org> Author: murray Date: 2014-01-14 07:15:32 +0100 (Tue, 14 Jan 2014) New Revision: 779 Modified: pkg/inst/NEWS.Rd Log: Collate all unreleased news items to a single entry for 0.4.0. Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-14 06:07:48 UTC (rev 778) +++ pkg/inst/NEWS.Rd 2014-01-14 06:15:32 UTC (rev 779) @@ -4,17 +4,12 @@ \section{Changes in RProtoBuf version 0.4.0 (2014-01-14)}{ \itemize{ - \item Added Jeroen Ooms as author \item Changes to support CRAN builds for MS Windows. \item Added functions \code{serialize_pb}, \code{unserialize_pb}, - and \code{can_serialize_pb} plus documentation. - \item Added unit tests for serializing and unserializing datasets + and \code{can_serialize_pb} plus documentation from Jeroen Ooms + RProtoBufUtils package. using \code{serialize_pb}. - } -} - -\section{Changes in UNRELEASED RProtoBuf version 0.3.3 (2013-12-26)}{ - \itemize{ + \item Added Jeroen Ooms as author.
\item Vignettes have been converted to the R 3.0.0 or later use of external vignette builders, no longer need a \code{Makefile} \item Added missing methods to dollar completion list for Message, From noreply at r-forge.r-project.org Tue Jan 14 07:16:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 07:16:40 +0100 (CET) Subject: [Rprotobuf-commits] r780 - pkg/inst Message-ID: <20140114061640.6E2341867B8@r-forge.r-project.org> Author: murray Date: 2014-01-14 07:16:40 +0100 (Tue, 14 Jan 2014) New Revision: 780 Modified: pkg/inst/NEWS.Rd Log: Ooops, remove fragment. Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-14 06:15:32 UTC (rev 779) +++ pkg/inst/NEWS.Rd 2014-01-14 06:16:40 UTC (rev 780) @@ -8,7 +8,6 @@ \item Added functions \code{serialize_pb}, \code{unserialize_pb}, and \code{can_serialize_pb} plus documentation from Jeroen Ooms RProtoBufUtils package. - using \code{serialize_pb}. \item Added Jeroen Ooms as author. \item Vignettes have been converted to the R 3.0.0 or later use of external vignette builders, no longer need a \code{Makefile} From noreply at r-forge.r-project.org Tue Jan 14 07:19:25 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 07:19:25 +0100 (CET) Subject: [Rprotobuf-commits] r781 - in pkg: . inst/unitTests src Message-ID: <20140114061925.D5281186845@r-forge.r-project.org> Author: murray Date: 2014-01-14 07:19:25 +0100 (Tue, 14 Jan 2014) New Revision: 781 Modified: pkg/ChangeLog pkg/DESCRIPTION pkg/inst/unitTests/runit.bool.R pkg/src/wrapper_ArrayInputStream.cpp Log: Increment version to 0.4 -- Dirk to review and push to CRAN. 
Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-14 06:16:40 UTC (rev 780) +++ pkg/ChangeLog 2014-01-14 06:19:25 UTC (rev 781) @@ -6,6 +6,7 @@ issues}): Hide 64-bit field accesses with if conditional to avoid running code that won't work on platforms without 64-bit long longs. + * DESCRIPTION (Version): Increment to 0.4.0 2014-01-09 Murray Stokely Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-14 06:16:40 UTC (rev 780) +++ pkg/DESCRIPTION 2014-01-14 06:19:25 UTC (rev 781) @@ -1,5 +1,5 @@ Package: RProtoBuf -Version: 0.3.2.5 +Version: 0.4 Date: $Date$ Author: Romain Francois, Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms Maintainer: Dirk Eddelbuettel Modified: pkg/inst/unitTests/runit.bool.R =================================================================== --- pkg/inst/unitTests/runit.bool.R 2014-01-14 06:16:40 UTC (rev 780) +++ pkg/inst/unitTests/runit.bool.R 2014-01-14 06:19:25 UTC (rev 781) @@ -22,7 +22,7 @@ package="RProtoBuf") readProtoFiles(file=unittest.proto.file) } - + a <- new(protobuf_unittest.TestAllTypes) a$optional_bool <- TRUE a$optional_bool <- FALSE Modified: pkg/src/wrapper_ArrayInputStream.cpp =================================================================== --- pkg/src/wrapper_ArrayInputStream.cpp 2014-01-14 06:16:40 UTC (rev 780) +++ pkg/src/wrapper_ArrayInputStream.cpp 2014-01-14 06:19:25 UTC (rev 781) @@ -2,11 +2,12 @@ #include "rprotobuf.h" #include "RcppMacros.h" - +#include namespace rprotobuf { RPB_FUNCTION_2(S4_ArrayInputStream, ArrayInputStream__new, Rcpp::RawVector payload, int block_size) { + printf("In ArrayInputStream__new, returning S4_ArrayInputStream(payload, blocksize)\n"); return S4_ArrayInputStream(payload, block_size); } } From noreply at r-forge.r-project.org Tue Jan 14 07:20:49 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 
2014 07:20:49 +0100 (CET) Subject: [Rprotobuf-commits] r782 - pkg/src Message-ID: <20140114062049.9CB5A186864@r-forge.r-project.org> Author: murray Date: 2014-01-14 07:20:49 +0100 (Tue, 14 Jan 2014) New Revision: 782 Modified: pkg/src/wrapper_ArrayInputStream.cpp Log: Oops, inadvertent debugging change submitted. Revert. Modified: pkg/src/wrapper_ArrayInputStream.cpp =================================================================== --- pkg/src/wrapper_ArrayInputStream.cpp 2014-01-14 06:19:25 UTC (rev 781) +++ pkg/src/wrapper_ArrayInputStream.cpp 2014-01-14 06:20:49 UTC (rev 782) @@ -2,12 +2,11 @@ #include "rprotobuf.h" #include "RcppMacros.h" -#include + namespace rprotobuf { RPB_FUNCTION_2(S4_ArrayInputStream, ArrayInputStream__new, Rcpp::RawVector payload, int block_size) { - printf("In ArrayInputStream__new, returning S4_ArrayInputStream(payload, blocksize)\n"); return S4_ArrayInputStream(payload, block_size); } } From noreply at r-forge.r-project.org Tue Jan 14 09:02:00 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 09:02:00 +0100 (CET) Subject: [Rprotobuf-commits] r783 - pkg/inst Message-ID: <20140114080200.96AA4186823@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-14 09:01:59 +0100 (Tue, 14 Jan 2014) New Revision: 783 Modified: pkg/inst/NEWS.Rd Log: add news item Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-14 06:20:49 UTC (rev 782) +++ pkg/inst/NEWS.Rd 2014-01-14 08:01:59 UTC (rev 783) @@ -8,6 +8,7 @@ \item Added functions \code{serialize_pb}, \code{unserialize_pb}, and \code{can_serialize_pb} plus documentation from Jeroen Ooms RProtoBufUtils package. + \item New dir \code{inst/python} with some Python examples. \item Added Jeroen Ooms as author.
\item Vignettes have been converted to the R 3.0.0 or later use of external vignette builders, no longer need a \code{Makefile} From noreply at r-forge.r-project.org Tue Jan 14 14:38:01 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 14:38:01 +0100 (CET) Subject: [Rprotobuf-commits] r784 - papers/jss Message-ID: <20140114133801.F39431867CA@r-forge.r-project.org> Author: murray Date: 2014-01-14 14:38:01 +0100 (Tue, 14 Jan 2014) New Revision: 784 Modified: papers/jss/article.Rnw Log: Dirk likes my tongue in cheek subtitle for the intro so change the section title to "Introduction: Friends don't let friends use CSV". Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 08:01:59 UTC (rev 783) +++ papers/jss/article.Rnw 2014-01-14 13:38:01 UTC (rev 784) @@ -113,7 +113,7 @@ %TODO(de) 'protocol buffers' or 'Protocol Buffers' ? -\section{Introduction} +\section{Introduction: Friends don't let friends use CSV} Modern data collection and analysis pipelines are increasingly being built using collections of components to better manage software @@ -1518,7 +1518,7 @@ require(RProtoBuf) require(HistogramTools) readProtoFiles(package="HistogramTools") -hist <- HistogramTools.HistogramState$read("/tmp/hist.pb") +hist <- HistogramTools.HistogramState$read("hist.pb") plot(as.histogram(hist)) @ From noreply at r-forge.r-project.org Tue Jan 14 19:31:13 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 19:31:13 +0100 (CET) Subject: [Rprotobuf-commits] r785 - pkg/inst/proto Message-ID: <20140114183114.0AE2618612B@r-forge.r-project.org> Author: edd Date: 2014-01-14 19:31:13 +0100 (Tue, 14 Jan 2014) New Revision: 785 Modified: pkg/inst/proto/rexp.proto Log: adding copyright header to rexp.proto as per email discussion Modified: pkg/inst/proto/rexp.proto 
=================================================================== --- pkg/inst/proto/rexp.proto 2014-01-14 13:38:01 UTC (rev 784) +++ pkg/inst/proto/rexp.proto 2014-01-14 18:31:13 UTC (rev 785) @@ -1,3 +1,6 @@ +// Originally written by Saptarshi Guha for RHIPE (http://www.rhipe.org) +// Released under Apache License 2.0, and reused with permission here + package rexp; message REXP { From noreply at r-forge.r-project.org Tue Jan 14 21:28:35 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 21:28:35 +0100 (CET) Subject: [Rprotobuf-commits] r786 - papers/jss Message-ID: <20140114202835.CD3DD185DDB@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-14 21:28:35 +0100 (Tue, 14 Jan 2014) New Revision: 786 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: intermediate commit cause i'm getting lunch Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 18:31:13 UTC (rev 785) +++ papers/jss/article.Rnw 2014-01-14 20:28:35 UTC (rev 786) @@ -115,60 +115,62 @@ \section{Introduction: Friends don't let friends use CSV} -Modern data collection and analysis pipelines are increasingly being -built using collections of components to better manage software -complexity through reusability, modularity, and fault -isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +Modern data collection and analysis pipelines involve collections +of components to enhance conrol of complex systems through +reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. % This is really a different pattern not connected well here. %Data analysis patterns such as Split-Apply-Combine %\citep{wickham2011split} explicitly break up large problems into manageable pieces.
-These pipelines are frequently built with -different programming languages used for the different phases of data -analysis -- collection, cleaning, modeling, analysis, post-processing, and +These pipelines are frequently built using different programming +languages for various phases of data analysis -- collection, +cleaning, modeling, analysis, post-processing, and presentation in order to take advantage of the unique combination of performance, speed of development, and library support offered by -different environments and languages. Each stage of such a data -analysis pipeline may involve storing intermediate results in a -file or sending them over the network. +each environment or language. Every stage of such a data +analysis pipeline may produce intermediate results that need to be +stored or sent over the network for further processing. +% JO Perhaps also mention that serialization is needed for distributed +% systems to make systems scale up? -Given these requirements, how do we safely and efficiently share intermediate results -between different applications, possibly written in different -languages, and possibly running on different computer system, possibly -spanning different operating systems? Programming -languages such as R, Julia, Java, and Python include built-in -serialization support, but these formats are tied to the specific -% DE: need to define serialization? -programming language in use and thus lock the user into a single +Such systems require reliable and efficient exchange of intermediate +results between the individual components, using formats that are +independent of platform, language, operating system or architecture. +Most technical computing languages such as R, Julia, Java, and Python +include built-in support for serialization, but the default formats +are usually language specific and thereby lock the user into a single environment. 
-\emph{Comma-separated values} (CSV) files can be read and written by many -applications and so are often used for exporting tabular data. However, CSV -files have a number of disadvantages, such as a limitation of exporting only -tabular datasets, lack of type-safety, inefficient text representation and -parsing, possibly limited precision and ambiguities in the format involving -special characters. \emph{JavaScript Object Notation} (JSON) is another -widely-supported format used mostly on the web that removes many of these -disadvantages, but it too suffers from being too slow to parse and also does -not provide strong typing between integers and floating point. Because the -schema information is not kept separately, multiple JSON messages of the same -type needlessly duplicate the field names with each message. Lastly, -\emph{Extensible Markup Language} (XML) is a well-established and widely-supported -protocol with the ability to define just about any arbitrarily complex -schema. However, it pays for this complexity with comparatively large and -verbose messages, and added complexities at the parsing side (which are -somewhat mitigated by the availability of mature libraries and -parsers). +Traditionally, scientists and statisticians often use character seperated +text formats such as \texttt{CSV} \citep{shafranovich2005common} to +export and import data. However, anyone who has ever used this will have +noticed that this method has many limitations: it is restricted to tabular +datasets, lacks type-safety, and has limited precision for numeric values. +Moreover, ambiguities in the format itself frequently cause problems. +For example the default characters used as seperator and decimal point +are different in various parts of the world. +\emph{Extensible Markup Language} (\texttt{XML}) is another text-based +well-established and widely-supported format with the ability to define +just about any arbitrarily complex schema \citep{nolan2013xml}. 
However, +it pays for this complexity with comparatively large and verbose messages, +and added complexities at the parsing side (which are somewhat mitigated +by the availability of mature libraries and parsers). +A more modern, widely used format is \emph{JavaScript Object Notation} +(\texttt{JSON}), which is derived from the object literals of +\proglang{JavaScript}. This format is text-based as well and used mostly +on the web. Several R packages implement functions to parse and generate +\texttt{JSON} data from R objects. A number of \texttt{JSON} dialects has +been proposed, such as \texttt{BSON} and \texttt{MessagePack} which both +add binary support. However, these derivatives are not compatible with +existing JSON software, and have not been widely adopted. + +\subsection{Why Protocol Buffers} -A number of binary formats based on JSON have been proposed that -reduce the parsing cost and improve the efficiency. MessagePack -and BSON both have R interfaces, but % \citep{msgpackR,rmongodb}, but -% DE Why do we cite these packages, but not the numerous JSON packages? -these formats lack a separate schema for the serialized data and thus -still duplicate field names with each message sent over the network or -stored in a file. Such formats also lack support for versioning when -data storage needs evolve over time, or when application logic and -requirement changes dictate update to the message format. +- This paper introduces another format: protocol buffers +- unique combination of features that make it very suitable for numerical computing: +- binary, schema, versioned, mature, high quality cross language implementations +- we argue that (complex) statistical applications will benefit from using this format +%we should probably explain what a schema is% Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an \emph{interface description language}, or \emph{IDL}. 
IDLs like Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-14 18:31:13 UTC (rev 785) +++ papers/jss/article.bib 2014-01-14 20:28:35 UTC (rev 786) @@ -315,3 +315,15 @@ note = {R package version 1.2.2}, url = {http://www.opencpu.org}, } +@article{shafranovich2005common, + title={Common format and mime type for comma-separated values (csv) files}, + author={Shafranovich, Yakov}, + year={2005}, + url={http://tools.ietf.org/html/rfc4180} +} +@book{nolan2013xml, + title={XML and Web Technologies for Data Sciences with R}, + author={Nolan, Deborah and Temple Lang, Duncan}, + year={2013}, + publisher={Springer} +} \ No newline at end of file From noreply at r-forge.r-project.org Tue Jan 14 21:52:15 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 14 Jan 2014 21:52:15 +0100 (CET) Subject: [Rprotobuf-commits] r787 - pkg/inst Message-ID: <20140114205215.89FF9185EB2@r-forge.r-project.org> Author: murray Date: 2014-01-14 21:52:14 +0100 (Tue, 14 Jan 2014) New Revision: 787 Modified: pkg/inst/NEWS.Rd Log: Add another news entry for a change. Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-14 20:28:35 UTC (rev 786) +++ pkg/inst/NEWS.Rd 2014-01-14 20:52:14 UTC (rev 787) @@ -50,6 +50,8 @@ \item Add better error messages when setting a repeated field of messages to inform the user which element index was of the wrong type and what the expected type was. + \item Add an optional 'partial' argument to readASCII allowing + uninitialized message fragments to be read in. \item (internal) Added const qualifiers in more places throughout the C++ code for type safety.
\item (internal) Standardize coding conventions of the C++ files and run them From noreply at r-forge.r-project.org Wed Jan 15 03:30:55 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 03:30:55 +0100 (CET) Subject: [Rprotobuf-commits] r788 - papers/jss Message-ID: <20140115023055.8189718627C@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-15 03:30:53 +0100 (Wed, 15 Jan 2014) New Revision: 788 Modified: papers/jss/article.Rnw Log: rewrite of section 1 Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-14 20:52:14 UTC (rev 787) +++ papers/jss/article.Rnw 2014-01-15 02:30:53 UTC (rev 788) @@ -116,8 +116,8 @@ \section{Introduction: Friends don't let friends use CSV} Modern data collection and analysis pipelines involve collections -of components to enhance conrol of complex systems through -reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. +of decoupled components in order to manage and control complexity +through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. % This is really a different pattern not connected well here. %Data analysis patterns such as Split-Apply-Combine %\citep{wickham2011split} explicitly break up large problems into manageable pieces. @@ -142,49 +142,56 @@ Traditionally, scientists and statisticians often use character seperated text formats such as \texttt{CSV} \citep{shafranovich2005common} to -export and import data. However, anyone who has ever used this will have +export and import data. However, anyone who has ever used \texttt{CSV} will have noticed that this method has many limitations: it is restricted to tabular -datasets, lacks type-safety, and has limited precision for numeric values. +data, lacks type-safety, and has limited precision for numeric values. 
Moreover, ambiguities in the format itself frequently cause problems. -For example the default characters used as seperator and decimal point -are different in various parts of the world. -\emph{Extensible Markup Language} (\texttt{XML}) is another text-based +For example, conventions on which characters used as seperator and decimal +point vary by country. +\emph{Extensible Markup Language} (\texttt{XML}) is another well-established and widely-supported format with the ability to define just about any arbitrarily complex schema \citep{nolan2013xml}. However, it pays for this complexity with comparatively large and verbose messages, -and added complexities at the parsing side (which are somewhat mitigated -by the availability of mature libraries and parsers). +and added complexitiy at the parsing side (which are somewhat mitigated +by the availability of mature libraries and parsers). Because \texttt{XML} +is text based and has no native notion of numeric types or arrays, it +usually not a very practical format to store numeric datasets as they appear +in statistical applications. A more modern, widely used format is \emph{JavaScript Object Notation} (\texttt{JSON}), which is derived from the object literals of -\proglang{JavaScript}. This format is text-based as well and used mostly -on the web. Several R packages implement functions to parse and generate -\texttt{JSON} data from R objects. A number of \texttt{JSON} dialects has -been proposed, such as \texttt{BSON} and \texttt{MessagePack} which both -add binary support. However, these derivatives are not compatible with -existing JSON software, and have not been widely adopted. +\proglang{JavaScript}, and used mostly on the web. \texttt{JSON} natively +supports arrays and distinguishes 4 primitive types: numbers, strings, +booleans and null. 
However, because it is a text-based format, numbers are +stored as human-readable decimal notation which is somewhat inefficient and +leads to loss of type (double vs integer) and precision. Several R packages +implement functions to parse and generate \texttt{JSON} data from R objects. +A number of \texttt{JSON} dialects has been proposed, such as \texttt{BSON} and +\texttt{MessagePack} which both add binary support. However, these derivatives +are not compatible with existing JSON software, and have not been widely adopted. \subsection{Why Protocol Buffers} -- This paper introduces another format: protocol buffers -- unique combination of features that make it very suitable for numerical computing: -- binary, schema, versioned, mature, high quality cross language implementations -- we argue that (complex) statistical applications will benefit from using this format +In 2008, Google released an open source version of Protocol Buffers: the data +interchange format that was designed and used for their internal infrastructure. +Google officially provides high quality parsing libraries for \texttt{Java}, +\texttt{C++} and \texttt{Python}, and community developed open source implementations +are available for many other languages. +Protocol Buffers take a quite different approach from many other popular formats. +They offer a unique combination of features, performance, and maturity that seems +particularly well suited for data-driven applications and numerical computing. +Protocol Buffers are a binary format that natively supports all common primitive types +found in modern programming languages. The advantage of this is that numeric values +are serialized exactly the same way as they are stored in memory. Therefore there is +no loss of precision, no overhead, and parsing messages is very efficient: the system can +simply copy bytes to memory without any further processing.
+But the most powerful feature of protocol buffers is that it decouples the content +from the structure using a schema, very similar to a database. This further increases +performance by eliminating redundancy, while at the same time providing foundations +for defining an \emph{Interface Description Language}, or \emph{IDL}. +Many sources compare data serialization formats and show Protocol Buffers compare +very favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} +for one such comparison. -%we should probably explain what a schema is% -Once the data serialization needs of an application become complex -enough, developers typically benefit from the use of an -\emph{interface description language}, or \emph{IDL}. IDLs like -Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact -well-documented schema for cross-language data structures and -efficient binary interchange formats. -Since the schema is provided separately from the encoded data, the data can be -efficiently encoded to minimize storage costs of the stored data when compared with simple -``schema-less'' binary interchange formats. Many sources compare data serialization formats -and show Protocol Buffers compare very favorably to the alternatives; see -\citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. -The schema can be used to generate classes for statically-typed programming languages -such as C++ and Java, or can be used with reflection for dynamically-typed programming -languages % The schema can be used to %generate model classes for statically-typed programming languages such @@ -229,6 +236,23 @@ \section{Protocol Buffers} \label{sec:protobuf} + +% JO: I'm not sure where to put this paragraph. I think it is too technical +% for the introduction section. Maybe start this section with some explanation +% of what a schema is and then continue with showing how PB implement this? 
+Once the data serialization needs of an application become complex +enough, developers typically benefit from the use of an +\emph{interface description language}, or \emph{IDL}. IDLs like +Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact +well-documented schema for cross-language data structures and +efficient binary interchange formats. +Since the schema is provided separately from the encoded data, the data can be +efficiently encoded to minimize storage costs of the stored data when compared with simple +``schema-less'' binary interchange formats. +The schema can be used to generate classes for statically-typed programming languages +such as C++ and Java, or can be used with reflection for dynamically-typed programming +languages. + %FIXME Introductory section which may include references in parentheses %\citep{R}, or cite a reference such as \citet{R} in the text. @@ -247,6 +271,9 @@ % that has a long list, and the name and year citation style seems % less conducive to long lists of marginal citations like blog posts % compared to say concise CS/math style citations [3,4,5,6]. Thoughts? + + + While traditional IDLs have at times been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records model that is comparatively flexible and simple to use.
From noreply at r-forge.r-project.org Wed Jan 15 05:30:14 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 05:30:14 +0100 (CET) Subject: [Rprotobuf-commits] r789 - papers/jss Message-ID: <20140115043014.1A2A1186561@r-forge.r-project.org> Author: edd Date: 2014-01-15 05:30:13 +0100 (Wed, 15 Jan 2014) New Revision: 789 Modified: papers/jss/article.Rnw Log: some tweaks to intro Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-15 02:30:53 UTC (rev 788) +++ papers/jss/article.Rnw 2014-01-15 04:30:13 UTC (rev 789) @@ -113,7 +113,7 @@ %TODO(de) 'protocol buffers' or 'Protocol Buffers' ? -\section{Introduction: Friends don't let friends use CSV} +\section{Introduction} % TODO(DE) More sober: Friends don't let friends use CSV} Modern data collection and analysis pipelines involve collections of decoupled components in order to manage and control complexity @@ -140,59 +140,59 @@ are usually language specific and thereby lock the user into a single environment. -Traditionally, scientists and statisticians often use character seperated -text formats such as \texttt{CSV} \citep{shafranovich2005common} to -export and import data. However, anyone who has ever used \texttt{CSV} will have -noticed that this method has many limitations: it is restricted to tabular -data, lacks type-safety, and has limited precision for numeric values. -Moreover, ambiguities in the format itself frequently cause problems. -For example, conventions on which characters used as seperator and decimal -point vary by country. -\emph{Extensible Markup Language} (\texttt{XML}) is another -well-established and widely-supported format with the ability to define -just about any arbitrarily complex schema \citep{nolan2013xml}. 
However, -it pays for this complexity with comparatively large and verbose messages, -and added complexitiy at the parsing side (which are somewhat mitigated -by the availability of mature libraries and parsers). Because \texttt{XML} -is text based and has no native notion of numeric types or arrays, it -usually not a very practical format to store numeric datasets as they appear -in statistical applications. -A more modern, widely used format is \emph{JavaScript Object Notation} -(\texttt{JSON}), which is derived from the object literals of -\proglang{JavaScript}, and used mostly on the web. \texttt{JSON} natively -supports arrays and distinguishes 4 primitive types: numbers, strings, -booleans and null. However, because it is a text-based format, numbers are +%\paragraph*{Friends don't let friends use CSV!} +Data analysts and researchers often use character seperated text formats such +as \texttt{CSV} \citep{shafranovich2005common} to export and import +data. However, anyone who has ever used \texttt{CSV} files will have noticed +that this method has many limitations: it is restricted to tabular data, +lacks type-safety, and has limited precision for numeric values. Moreover, +ambiguities in the format itself frequently cause problems. For example, +conventions on which characters is used as seperator or decimal point vary by +country. \emph{Extensible Markup Language} (\texttt{XML}) is another +well-established and widely-supported format with the ability to define just +about any arbitrarily complex schema \citep{nolan2013xml}. However, it pays +for this complexity with comparatively large and verbose messages, and added +complexitiy at the parsing side (which are somewhat mitigated by the +availability of mature libraries and parsers). Because \texttt{XML} is text +based and has no native notion of numeric types or arrays, it usually not a +very practical format to store numeric datasets as they appear in statistical +applications. 
A more modern, widely used format is \emph{JavaScript Object + Notation} (\texttt{JSON}), which is derived from the object literals of +\proglang{JavaScript}, and used increasingly on the world wide web. \texttt{JSON} natively +supports arrays and distinguishes 4 primitive types: numbers, strings, +booleans and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is somewhat inefficient and -leads to loss of type (double vs integer) and precision. Several R packages -implement functions to parse and generate \texttt{JSON} data from R objects. -A number of \texttt{JSON} dialects has been proposed, such as \texttt{BSON} and -\texttt{MessagePack} which both add binary support. However, these derivatives -are not compatible with existing JSON software, and have not been widely adopted. +leads to loss of type (double versus integer) and precision. Several R packages +implement functions to parse and generate \texttt{JSON} data from R objects. +A number of \texttt{JSON} variants has been proposed, such as \texttt{BSON} +and \texttt{MessagePack} which both add binary support. However, these +derivatives are not compatible with existing JSON software, and have not seen +wide adoption. -\subsection{Why Protocol Buffers} - -In 2008, Google released an open source version of Protocol Buffers: the data +%\paragraph*{Enter Protocol Buffers:} +In 2008, and following several years of internal use, Google released an open +source version of Protocol Buffers. It provides data interchange format that was designed and used for their internal infrastructure. -Google officially provides high quality parsing libraries for \texttt{Java}, -\texttt{C++} and \texttt{Python}, and community developed open source implementations +Google officially provides high-quality parsing libraries for \texttt{Java}, +\texttt{C++} and \texttt{Python}, and community-developed open source implementations are available for many other languages. 
Protocol Buffers take a quite different approach from many other popular formats. They offer a unique combination of features, performance, and maturity that seems particulary well suited for data-driven applications and numerical computing. Protocol Buffers are a binary format that natively supports all common primitive types -found in modern programming languages. The advantage of this is that numeric values -are serialized exactly the same way as they are stored in memory. Therefore there is +found in modern programming languages. A key advantage is that numeric values +are serialized exactly the same way as they are stored in memory. There is no loss of precision, no overhead, and parsing messages is very efficient: the system can simply copy bytes to memory without any further processing. -But the most powerful feature of protocol buffers is that it decouples the content +But the most powerful feature of Protocol Buffers is that it decouples the content from the structure using a schema, very similar to a database. This further increases performance by eliminating redundancy, while at the same time providing foundations for defining an \emph{Interface Description Language}, or \emph{IDL}. Many sources compare data serialization formats and show Protocol Buffers compare very favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. +% TODO(DE): Mention "future proof" forward compatibility of schemata - % The schema can be used to %generate model classes for statically-typed programming languages such %as C++ and Java, or can be used with reflection for dynamically-typed @@ -213,8 +213,8 @@ % in the middle (full class/method details) and interesting % applications at the end. -This paper describes an R interface to protocol buffers. -The rest of the paper is organized as follows. Section~\ref{sec:protobuf} +This paper describes an R interface to Protocol Buffer, +and is organized as follows. 
Section~\ref{sec:protobuf} provides a general overview of Protocol Buffers. Section~\ref{sec:rprotobuf-basic} describes the interactive R interface provided by \CRANpkg{RProtoBuf} and introduces the two main abstractions: From noreply at r-forge.r-project.org Wed Jan 15 06:03:21 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 06:03:21 +0100 (CET) Subject: [Rprotobuf-commits] r790 - papers/jss Message-ID: <20140115050322.0C5A8185FA8@r-forge.r-project.org> Author: murray Date: 2014-01-15 06:03:21 +0100 (Wed, 15 Jan 2014) New Revision: 790 Modified: papers/jss/article.Rnw Log: Move the first sentence of the introduction a little back towards the direction of my original version. Specifically, "manage and control complexity" is redundant, so move back to my "better manage software complexity". Also add back the "increasingly", as I mentioned in email wanting to make this point specifically -- the utility of this multi-platform pattern becomes more evident with scale and larger code bases, which happens increasingly often now as computer systems get larger. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-15 04:30:13 UTC (rev 789) +++ papers/jss/article.Rnw 2014-01-15 05:03:21 UTC (rev 790) @@ -115,8 +115,8 @@ \section{Introduction} % TODO(DE) More sober: Friends don't let friends use CSV} -Modern data collection and analysis pipelines involve collections -of decoupled components in order to manage and control complexity +Modern data collection and analysis pipelines increasingly involve collections +of decoupled components in order to better manage software complexity through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. % This is really a different pattern not connected well here. 
%Data analysis patterns such as Split-Apply-Combine From noreply at r-forge.r-project.org Wed Jan 15 06:12:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 06:12:27 +0100 (CET) Subject: [Rprotobuf-commits] r791 - papers/jss Message-ID: <20140115051227.BFD731812C1@r-forge.r-project.org> Author: murray Date: 2014-01-15 06:12:26 +0100 (Wed, 15 Jan 2014) New Revision: 791 Modified: papers/jss/article.Rnw Log: Add back the article 'the' in the second sentence of the introduction, and revert back to my use of 'different' rather than various. I believe this flows better but don't have a strong preference if someone can explain why they like the change. Adding the article 'the' makes it more concrete/definitive. Certainly others could come up with a different break down of the phases of data analysis, but that doesn't make our breakdown wrong, and thus I see no reason to avoid the article 'the' here, if that was indeed the reason for the change. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-15 05:03:21 UTC (rev 790) +++ papers/jss/article.Rnw 2014-01-15 05:12:26 UTC (rev 791) @@ -122,7 +122,7 @@ %Data analysis patterns such as Split-Apply-Combine %\citep{wickham2011split} explicitly break up large problems into manageable pieces. 
These pipelines are frequently built using different programming -languages for various phases of data analysis -- collection, +languages for the different phases of data analysis -- collection, cleaning, modeling, analysis, post-processing, and presentation in order to take advantage of the unique combination of performance, speed of development, and library support offered by From noreply at r-forge.r-project.org Wed Jan 15 06:18:26 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 06:18:26 +0100 (CET) Subject: [Rprotobuf-commits] r792 - papers/jss Message-ID: <20140115051826.904791846FC@r-forge.r-project.org> Author: murray Date: 2014-01-15 06:18:25 +0100 (Wed, 15 Jan 2014) New Revision: 792 Modified: papers/jss/article.Rnw Log: Each is more correct than every as it is used in this sentence, so revert back to my previous usage of 'each' here, but keep some of the other nice improvements that were made to this sentence. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-15 05:12:26 UTC (rev 791) +++ papers/jss/article.Rnw 2014-01-15 05:18:25 UTC (rev 792) @@ -126,9 +126,9 @@ cleaning, modeling, analysis, post-processing, and presentation in order to take advantage of the unique combination of performance, speed of development, and library support offered by -each environment or language. Every stage of such a data +different environments and languages. Each stage of such a data analysis pipeline may produce intermediate results that need to be -stored or sent over the network for further processing. +stored in a file or sent over the network for further processing. % JO Perhaps also mention that serialization is needed for distributed % systems to make systems scale up? 
From noreply at r-forge.r-project.org Wed Jan 15 06:32:09 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 15 Jan 2014 06:32:09 +0100 (CET) Subject: [Rprotobuf-commits] r793 - papers/jss Message-ID: <20140115053209.28B9E1862A0@r-forge.r-project.org> Author: murray Date: 2014-01-15 06:32:08 +0100 (Wed, 15 Jan 2014) New Revision: 793 Modified: papers/jss/article.Rnw Log: Revert back to my motivating question to start the second paragraph of the introduction "Given these requirements, how do we X, Y, Z?" This to me is much more engaging than the text that replaced it, and motivating questions like this are not uncommon in JSS introductions. The next sentence, "Most technical computing languages such as X, Y, Z, and W" does not seem as correct as the simpler "Programming languages X, Y, Z, and W" that I used previously. X, Y, Z, and W do not necessarily equate with "most technical computing languages", and "technical computing languages" is more cumbersome than just saying "programming languages". What is technical computing compared to non-technical computing, anyway? Everything except VB macros? We could change this to "programming langauges used for data analysis" or similar if someone really doesn't like the simple straightforward use of "programming languages" here. Otherwise keep some of the grammar improvements made to these two sentences. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-15 05:18:25 UTC (rev 792) +++ papers/jss/article.Rnw 2014-01-15 05:32:08 UTC (rev 793) @@ -131,12 +131,20 @@ stored in a file or sent over the network for further processing. % JO Perhaps also mention that serialization is needed for distributed % systems to make systems scale up? +% MS: yes perhaps somewhere near here we could define serialization +% and describe this. 
-Such systems require reliable and efficient exchange of intermediate -results between the individual components, using formats that are -independent of platform, language, operating system or architecture. -Most technical computing languages such as R, Julia, Java, and Python -include built-in support for serialization, but the default formats +Given these requirements, how do we safely and efficiently share intermediate results +between different applications, possibly written in different +languages, and possibly running on different computer systems? +% Reverted to my original above, because the replacement below puts me +% to sleep: +%Such systems require reliable and efficient exchange of intermediate +%results between the individual components, using formats that are +%independent of platform, language, operating system or architecture. +Programming +languages such as R, Julia, Java, and Python include built-in +support for serialization, but the default formats are usually language specific and thereby lock the user into a single environment. From noreply at r-forge.r-project.org Sat Jan 18 05:17:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 18 Jan 2014 05:17:19 +0100 (CET) Subject: [Rprotobuf-commits] r794 - in pkg: . 
src Message-ID: <20140118041719.617DB186A5A@r-forge.r-project.org> Author: edd Date: 2014-01-18 05:17:12 +0100 (Sat, 18 Jan 2014) New Revision: 794 Removed: pkg/configure.win.readme Modified: pkg/DESCRIPTION pkg/src/Makevars.win Log: set version to 0.4.0 and move content of configure.win.readme into src/Makevars.win Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-15 05:32:08 UTC (rev 793) +++ pkg/DESCRIPTION 2014-01-18 04:17:12 UTC (rev 794) @@ -1,5 +1,5 @@ Package: RProtoBuf -Version: 0.4 +Version: 0.4.0 Date: $Date$ Author: Romain Francois, Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms Maintainer: Dirk Eddelbuettel Deleted: pkg/configure.win.readme =================================================================== --- pkg/configure.win.readme 2014-01-15 05:32:08 UTC (rev 793) +++ pkg/configure.win.readme 2014-01-18 04:17:12 UTC (rev 794) @@ -1,7 +0,0 @@ -# Rename this file to configure.win to build on machines where the -# headers/lib are not installed on the machine already. 
Make sure to -# use R CMD INSTALL --force-biarch -# -"${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e ' -download.file("http://r-forge.r-project.org/scm/viewvc.php/*checkout*/windows/protobuf-2.5.0-windows.zip?root=rprotobuf", "lib.zip"); -unzip("lib.zip");' Modified: pkg/src/Makevars.win =================================================================== --- pkg/src/Makevars.win 2014-01-15 05:32:08 UTC (rev 793) +++ pkg/src/Makevars.win 2014-01-18 04:17:12 UTC (rev 794) @@ -12,3 +12,8 @@ RCPP_LDFLAGS= $(shell "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e "Rcpp:::LdFlags()") PKG_CPPFLAGS=-I${LIB_PROTOBUF}/include -I../protobuf-2.5.0/include PKG_LIBS=$(RCPP_LDFLAGS) -L${LIB_PROTOBUF}/lib${R_ARCH} -L../protobuf-2.5.0/lib${R_ARCH} -lprotobuf + +## In case the headers and library are not installed, execute the following bit +## of R code, and make sure to # use R CMD INSTALL --force-biarch +## +## "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" -e 'download.file("http://r-forge.r-project.org/scm/viewvc.php/*checkout*/windows/protobuf-2.5.0-windows.zip?root=rprotobuf", "lib.zip"); unzip("lib.zip");' From noreply at r-forge.r-project.org Sat Jan 18 05:36:41 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 18 Jan 2014 05:36:41 +0100 (CET) Subject: [Rprotobuf-commits] r795 - pkg Message-ID: <20140118043641.CFC87186D6F@r-forge.r-project.org> Author: edd Date: 2014-01-18 05:36:39 +0100 (Sat, 18 Jan 2014) New Revision: 795 Modified: pkg/DESCRIPTION pkg/NAMESPACE Log: for the next Rcpp version, we need to import something from Rcpp to instantiate internal Rcpp properly Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-18 04:17:12 UTC (rev 794) +++ pkg/DESCRIPTION 2014-01-18 04:36:39 UTC (rev 795) @@ -9,9 +9,9 @@ of its internal RPC protocols and file formats. 
Depends: R (>= 3.0.0), methods LinkingTo: Rcpp -Suggests: RUnit, highlight, Rcpp +Suggests: RUnit, highlight VignetteBuilder: highlight -Imports: utils, stats, tools, RCurl +Imports: utils, stats, tools, Rcpp, RCurl SystemRequirements: Protocol Buffer compiler (to create C++ header and source files from .proto descriptions) and library (version 2.2.0 or later) License: GPL-2 Modified: pkg/NAMESPACE =================================================================== --- pkg/NAMESPACE 2014-01-18 04:17:12 UTC (rev 794) +++ pkg/NAMESPACE 2014-01-18 04:36:39 UTC (rev 795) @@ -7,6 +7,7 @@ importFrom(stats, update) importFrom(tools, file_path_as_absolute) importFrom(RCurl, basicTextGatherer, curlPerform) +importFrom(Rcpp, evalCpp) exportClasses( From noreply at r-forge.r-project.org Sat Jan 18 05:38:43 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sat, 18 Jan 2014 05:38:43 +0100 (CET) Subject: [Rprotobuf-commits] r796 - pkg Message-ID: <20140118043843.5A4EB186D9D@r-forge.r-project.org> Author: edd Date: 2014-01-18 05:38:42 +0100 (Sat, 18 Jan 2014) New Revision: 796 Modified: pkg/DESCRIPTION Log: slight reordering Modified: pkg/DESCRIPTION =================================================================== --- pkg/DESCRIPTION 2014-01-18 04:36:39 UTC (rev 795) +++ pkg/DESCRIPTION 2014-01-18 04:38:42 UTC (rev 796) @@ -8,10 +8,10 @@ efficient yet extensible format. Google uses Protocol Buffers for almost all of its internal RPC protocols and file formats. 
Depends: R (>= 3.0.0), methods +Imports: utils, stats, tools, Rcpp, RCurl LinkingTo: Rcpp Suggests: RUnit, highlight VignetteBuilder: highlight -Imports: utils, stats, tools, Rcpp, RCurl SystemRequirements: Protocol Buffer compiler (to create C++ header and source files from .proto descriptions) and library (version 2.2.0 or later) License: GPL-2 From noreply at r-forge.r-project.org Sun Jan 19 23:14:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 19 Jan 2014 23:14:27 +0100 (CET) Subject: [Rprotobuf-commits] r797 - in pkg: . src Message-ID: <20140119221427.973D3185C3B@r-forge.r-project.org> Author: edd Date: 2014-01-19 23:14:27 +0100 (Sun, 19 Jan 2014) New Revision: 797 Modified: pkg/ChangeLog pkg/src/S4_classes.h Log: Adapt to S4 access function renaming in Rcpp 0.10.7 and later, with special thanks to Kevin Ushey Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-18 04:38:42 UTC (rev 796) +++ pkg/ChangeLog 2014-01-19 22:14:27 UTC (rev 797) @@ -1,3 +1,8 @@ +2014-01-19 Dirk Eddelbuettel + + * src/S4_classes.h: Adapt to S4 access function renaming in Rcpp + 0.10.7 and later, with special thanks to Kevin Ushey + 2014-01-13 Murray Stokely * inst/unitTests/runit.int64.R (test.int64): Skip this test with a Modified: pkg/src/S4_classes.h =================================================================== --- pkg/src/S4_classes.h 2014-01-18 04:38:42 UTC (rev 796) +++ pkg/src/S4_classes.h 2014-01-19 22:14:27 UTC (rev 797) @@ -1,6 +1,6 @@ // S4_classes.h: R/C++ interface class library // -// Copyright (C) 2010 - 2011 Dirk Eddelbuettel and Romain Francois +// Copyright (C) 2010 - 2014 Dirk Eddelbuettel and Romain Francois // // This file is part of RProtoBuf. // @@ -20,8 +20,23 @@ #ifndef RPROTOBUF_S4CLASSES_H #define RPROTOBUF_S4CLASSES_H +//1. 
Place using Rcpp::S4; somewhere in the top of S4_classes.h in the +//src directory (so S4 is looked up in the right place), +//2. Replace setSEXP with set__, and asSexp with get__. + namespace rprotobuf { +// deal with Rcpp API changes +#if defined(RCPP_VERSION) && RCPP_VERSION <= Rcpp_Version(0,10,6) + #define SetSexp setSEXP + #define AsSexp asSexp +#else + #define SetSexp set__ + #define AsSexp get__ +#endif + +using Rcpp::S4; + class ZeroCopyInputStreamWrapper; class ZeroCopyOutputStreamWrapper; @@ -35,13 +50,13 @@ slot("name") = d->name(); slot("full_name") = d->full_name(); } else { - setSEXP(R_NilValue); + SetSexp(R_NilValue); } } - S4_EnumValueDescriptor(const S4_EnumValueDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_EnumValueDescriptor(const S4_EnumValueDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_EnumValueDescriptor& operator=(const S4_EnumValueDescriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -57,9 +72,9 @@ } } - S4_Descriptor(const S4_Descriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_Descriptor(const S4_Descriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_Descriptor& operator=(const S4_Descriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -78,9 +93,9 @@ } } - S4_FileDescriptor(const S4_FileDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_FileDescriptor(const S4_FileDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_FileDescriptor& operator=(const S4_FileDescriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -95,9 +110,9 @@ slot("type") = d->containing_type()->full_name(); } - S4_FieldDescriptor(const S4_FieldDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_FieldDescriptor(const S4_FieldDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_FieldDescriptor& operator=(const S4_FieldDescriptor& other) { - setSEXP(other.asSexp()); + 
SetSexp(other.AsSexp()); return *this; } }; @@ -109,9 +124,9 @@ Rcpp::XPtr(const_cast(d), false); } - S4_ServiceDescriptor(const S4_ServiceDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_ServiceDescriptor(const S4_ServiceDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_ServiceDescriptor& operator=(const S4_ServiceDescriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -123,9 +138,9 @@ Rcpp::XPtr(const_cast(d), false); } - S4_MethodDescriptor(const S4_MethodDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_MethodDescriptor(const S4_MethodDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_MethodDescriptor& operator=(const S4_MethodDescriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -149,9 +164,9 @@ } } - S4_EnumDescriptor(const S4_EnumDescriptor& other) : S4() { setSEXP(other.asSexp()); } + S4_EnumDescriptor(const S4_EnumDescriptor& other) : S4() { SetSexp(other.AsSexp()); } S4_EnumDescriptor& operator=(const S4_EnumDescriptor& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -162,9 +177,9 @@ slot("pointer") = Rcpp::XPtr(const_cast(d), true); slot("type") = d->GetDescriptor()->full_name(); } - S4_Message(const S4_Message& other) : S4() { setSEXP(other.asSexp()); } + S4_Message(const S4_Message& other) : S4() { SetSexp(other.AsSexp()); } S4_Message& operator=(const S4_Message& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -183,9 +198,9 @@ slot("pointer") = wrapper; } - S4_ArrayOutputStream(const S4_ArrayOutputStream& other) { setSEXP(other.asSexp()); } + S4_ArrayOutputStream(const S4_ArrayOutputStream& other) { SetSexp(other.AsSexp()); } S4_ArrayOutputStream& operator=(const S4_ArrayOutputStream& other) { - setSEXP(other.asSexp()); + SetSexp(other.AsSexp()); return *this; } }; @@ -201,6 +216,9 @@ } }; + #undef SetSexp + #undef AsSexp + } // namespace rprotobuf #endif From 
noreply at r-forge.r-project.org Mon Jan 20 00:18:18 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 20 Jan 2014 00:18:18 +0100 (CET) Subject: [Rprotobuf-commits] r798 - in pkg: . inst Message-ID: <20140119231819.CB305186000@r-forge.r-project.org> Author: edd Date: 2014-01-20 00:18:14 +0100 (Mon, 20 Jan 2014) New Revision: 798 Modified: pkg/ChangeLog pkg/inst/NEWS.Rd Log: finalize as release 0.4.0 Modified: pkg/ChangeLog =================================================================== --- pkg/ChangeLog 2014-01-19 22:14:27 UTC (rev 797) +++ pkg/ChangeLog 2014-01-19 23:18:14 UTC (rev 798) @@ -1,5 +1,7 @@ 2014-01-19 Dirk Eddelbuettel + * DESCRIPTION: Finalize release 0.4.0 with initial Windows support + * src/S4_classes.h: Adapt to S4 access function renaming in Rcpp 0.10.7 and later, with special thanks to Kevin Ushey Modified: pkg/inst/NEWS.Rd =================================================================== --- pkg/inst/NEWS.Rd 2014-01-19 22:14:27 UTC (rev 797) +++ pkg/inst/NEWS.Rd 2014-01-19 23:18:14 UTC (rev 798) @@ -4,7 +4,7 @@ \section{Changes in RProtoBuf version 0.4.0 (2014-01-14)}{ \itemize{ - \item Changes to support CRAN builds for MS Windows. + \item Changes to support CRAN builds for MS Windows. \item Added functions \code{serialize_pb}, \code{unserialize_pb}, and \code{can_serialize_pb} plus documentation from Jeroen Ooms RProtoBufUtils package. @@ -54,10 +54,13 @@ uninitialized message fragments to be read in. \item (internal) Added const qualifiers in more places throughout the C++ code for type safety. - \item (internal) Standardize coding conventions of the C++ files and run them - through clang-format for consistency. A STYLE file has been submitted - to R-Forge with details about the coding standards and how they are - enforced with Emacs and clang-format. + \item (internal) Standardize coding conventions of the C++ files and + run them through clang-format for consistency. 
A STYLE file has + been submitted to R-Forge with details about the coding standards + and how they are enforced with Emacs and clang-format. + \item Applied changes suggested by Kevin Ushey to the \code{S4} + class handling to support both the currently released \cpkg{Rcpp} as + well as the currently pending next version. } } From noreply at r-forge.r-project.org Tue Jan 21 03:43:02 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 03:43:02 +0100 (CET) Subject: [Rprotobuf-commits] r799 - papers/jss Message-ID: <20140121024302.CA2A71856A7@r-forge.r-project.org> Author: murray Date: 2014-01-21 03:42:58 +0100 (Tue, 21 Jan 2014) New Revision: 799 Modified: papers/jss/article.Rnw Log: Switch order of mapreduce/web services sections in the last paragraph of the intro since we've moved the order of them in the text. For a figure in the mapreduce example section, make the histogram more vertically compact to not waste so much space. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-19 23:18:14 UTC (rev 798) +++ papers/jss/article.Rnw 2014-01-21 02:42:58 UTC (rev 799) @@ -231,9 +231,9 @@ package. Section~\ref{sec:types} describes the challenges of type coercion between R and other languages. Section~\ref{sec:evaluation} introduces a general R language schema for serializing arbitrary R objects and evaluates -it against R's built-in serialization. Sections~\ref{sec:opencpu} -and \ref{sec:mapreduce} provide real-world use cases of \CRANpkg{RProtoBuf} -in web service and MapReduce environments, respectively, before +it against R's built-in serialization. Sections~\ref{sec:mapreduce} +and \ref{sec:opencpu} provide real-world use cases of \CRANpkg{RProtoBuf} +in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. 
%This article describes the basics of Google's Protocol Buffers through @@ -1551,7 +1551,7 @@ plot(as.histogram(hist)) \end{verbatim} -<>= +<>= require(RProtoBuf) require(HistogramTools) readProtoFiles(package="HistogramTools") From noreply at r-forge.r-project.org Tue Jan 21 04:09:26 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 04:09:26 +0100 (CET) Subject: [Rprotobuf-commits] r800 - papers/jss Message-ID: <20140121030926.C1DE91808F5@r-forge.r-project.org> Author: murray Date: 2014-01-21 04:09:22 +0100 (Tue, 21 Jan 2014) New Revision: 800 Modified: papers/jss/article.Rnw Log: Revert Jereoen's two sentences about MessagePack and BSON back to my original three sentences, and add back the citations to the relevant R packages. Jereon's version is very dismissive of these formats as not being widely used or compatible with the almighty JSON. My version notes that these address real deficiencies of JSON for the application domain we are talking about in this paper, and points out the shortcomings they still have compared to protocol buffers. This introduction is a natural flow of alternatives each slightly better than the last one discussed: starting with CSV, then XML, then JSON, then binary JSON, then protocol buffers. For this application domain, binary JSON is strictly better than JSON and so dismissive comments if any should be oriented the other way towards traditional text JSON. The XML section Jereon and Dirk added is great, thanks. We may still need one more sentence in the first paragraph making it crystal clear what application domain / context is used for this discussion of the alternatives. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 02:42:58 UTC (rev 799) +++ papers/jss/article.Rnw 2014-01-21 03:09:22 UTC (rev 800) @@ -112,9 +112,15 @@ \maketitle %TODO(de) 'protocol buffers' or 'Protocol Buffers' ? 
+% MS: Lets standardize on 'Protocol Buffers'? \section{Introduction} % TODO(DE) More sober: Friends don't let friends use CSV} - +% NOTE(MS): I really do think we can use add back: +% \section{Introduction: Friends Don't Let Friends Use CSV} +% I didn't use proper Title Caps the first time around but really I +% think it makes the paper more readable to have a tl;dr intro title +% that is fun and engaging since this paper is still on the dry/boring +% side. Modern data collection and analysis pipelines increasingly involve collections of decoupled components in order to better manage software complexity through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. @@ -164,18 +170,30 @@ availability of mature libraries and parsers). Because \texttt{XML} is text based and has no native notion of numeric types or arrays, it usually not a very practical format to store numeric datasets as they appear in statistical -applications. A more modern, widely used format is \emph{JavaScript Object +applications. +% +A more modern, widely used format is \emph{JavaScript Object Notation} (\texttt{JSON}), which is derived from the object literals of \proglang{JavaScript}, and used increasingly on the world wide web. \texttt{JSON} natively supports arrays and distinguishes 4 primitive types: numbers, strings, booleans and null. However, as it too is a text-based format, numbers are -stored as human-readable decimal notation which is somewhat inefficient and +stored as human-readable decimal notation which is inefficient and leads to loss of type (double versus integer) and precision. Several R packages implement functions to parse and generate \texttt{JSON} data from R objects. -A number of \texttt{JSON} variants has been proposed, such as \texttt{BSON} -and \texttt{MessagePack} which both add binary support. However, these -derivatives are not compatible with existing JSON software, and have not seen -wide adoption. 
+ +A number of binary formats based on \texttt{JSON} have been proposed +that reduce the parsing cost and improve efficiency. \pkg{MessagePack} +\citep{msgpackR} and \pkg{BSON} \citep{rmongodb} both have R +interfaces, but these formats lack a separate schema for the serialized +data and thus still duplicate field names with each message sent over +the network or stored in a file. Such formats also lack support for +versioning when data storage needs evolve over time, or when +application logic and requirement changes dictate updates to the +message format. + +%and \texttt{MessagePack} which both add binary support. However, these +%derivatives are not compatible with existing JSON software, and have not seen +%wide adoption. %\paragraph*{Enter Protocol Buffers:} In 2008, and following several years of internal use, Google released an open From noreply at r-forge.r-project.org Tue Jan 21 04:18:25 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 04:18:25 +0100 (CET) Subject: [Rprotobuf-commits] r801 - papers/jss Message-ID: <20140121031825.2F0281860BA@r-forge.r-project.org> Author: murray Date: 2014-01-21 04:18:23 +0100 (Tue, 21 Jan 2014) New Revision: 801 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Add citations for rjson and RJSONIO in the JSON section to make this comparable to the other formats described in the introduction which include citations to R packages. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 03:09:22 UTC (rev 800) +++ papers/jss/article.Rnw 2014-01-21 03:18:23 UTC (rev 801) @@ -179,7 +179,8 @@ booleans and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is inefficient and leads to loss of type (double versus integer) and precision. Several R packages -implement functions to parse and generate \texttt{JSON} data from R objects. 
+implement functions to parse and generate \texttt{JSON} data from R +objects \citep{rjson,RJSONIO}. A number of binary formats based on \texttt{JSON} have been proposed that reduce the parsing cost and improve efficiency. \pkg{MessagePack} @@ -190,10 +191,6 @@ versioning when data storage needs evolve over time, or when application logic and requirement changes dictate updates to the message format. - -%and \texttt{MessagePack} which both add binary support. However, these -%derivatives are not compatible with existing JSON software, and have not seen -%wide adoption. %\paragraph*{Enter Protocol Buffers:} In 2008, and following several years of internal use, Google released an open Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-21 03:09:22 UTC (rev 800) +++ papers/jss/article.bib 2014-01-21 03:18:23 UTC (rev 801) @@ -37,6 +37,20 @@ volume = "19", year = "2013" } + at Manual{RJSONIO, + title = {RJSONIO: Serialize R objects to JSON, JavaScript Object Notation}, + author = {Duncan Temple Lang}, + year = {2011}, + note = {R package version 0.96-0}, + url = {http://CRAN.R-project.org/package=RJSONIO}, +} + at Manual{rjson, + title = {rjson: JSON for R}, + author = {Alex Couture-Beil}, + year = {2012}, + note = {R package version 0.2.10}, + url = {http://CRAN.R-project.org/package=rjson}, +} @Manual{rmongodb, title={rmongodb: R-MongoDB driver}, author={Gerald Lindsly}, From noreply at r-forge.r-project.org Tue Jan 21 05:00:54 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 05:00:54 +0100 (CET) Subject: [Rprotobuf-commits] r802 - papers/jss Message-ID: <20140121040054.53BB41867BE@r-forge.r-project.org> Author: murray Date: 2014-01-21 05:00:53 +0100 (Tue, 21 Jan 2014) New Revision: 802 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Add a short sentence to define serialization early in the intro, addressing a todo that 
multiple people had mentioned. Reference the C++ FAQ for lack of a better reference for now. Revert the second to last paragraph of the introduction to my earlier version that was instead moved to section 2. Remove one of the more technical sentences to address Jeroen's observation that it was a bit too technical for the intro (e.g. "reflection" and dynamic typed languages was a bit much) Most sentences of the deleted paragraph were false, as discussed in email. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 03:18:23 UTC (rev 801) +++ papers/jss/article.Rnw 2014-01-21 04:00:53 UTC (rev 802) @@ -35,6 +35,10 @@ \CRANpkg{RProtoBuf} package provides a complete interface between this library and the R environment for statistical computing. %TODO(ms) keep it less than 150 words. +% Maybe add Jeroen's sentence: +% They offer a unique combination of features, performance, and maturity that seems +% particularly well suited for data-driven applications and numerical +% computing. } \Keywords{r, protocol buffers, serialization, cross-platform} \Plainkeywords{r, protocol buffers, serialization, cross-platform} %% without formatting @@ -137,12 +141,14 @@ stored in a file or sent over the network for further processing. % JO Perhaps also mention that serialization is needed for distributed % systems to make systems scale up? -% MS: yes perhaps somewhere near here we could define serialization -% and describe this. Given these requirements, how do we safely and efficiently share intermediate results between different applications, possibly written in different languages, and possibly running on different computer systems? +In computer programming, \emph{serialization} is the process of +translating data structures, variables, and session state into a +format that can be stored or transmitted and then reconstructed in the +original form later \citep{clinec++}. 
% Reverted to my original above, because the replacement below puts me % to sleep: %Such systems require reliable and efficient exchange of intermediate @@ -191,52 +197,55 @@ versioning when data storage needs evolve over time, or when application logic and requirement changes dictate updates to the message format. + +Once the data serialization needs of an application become complex +enough, developers typically benefit from the use of an +\emph{interface description language}, or \emph{IDL}. IDLs like +Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro +provide a compact well-documented schema for cross-language data +structures and efficient binary interchange formats. Since the schema +is provided separately from the encoded data, the data can be +efficiently encoded to minimize storage costs when +compared with simple ``schema-less'' binary interchange formats. +Many sources compare data serialization formats +and show Protocol Buffers compare very favorably to the alternatives; see +\citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. + +% Too technical, move to section 2. +% The schema can be used to generate model classes for statically-typed programming languages +%such as C++ and Java, or can be used with reflection for dynamically-typed programming +%languages. + +% TODO(mstokely): Will need to define reflection if we use it here. +% Maybe in the next section since its not as key as 'serialization' +% which we already defined. %\paragraph*{Enter Protocol Buffers:} -In 2008, and following several years of internal use, Google released an open -source version of Protocol Buffers. It provides data -interchange format that was designed and used for their internal infrastructure. -Google officially provides high-quality parsing libraries for \texttt{Java}, -\texttt{C++} and \texttt{Python}, and community-developed open source implementations -are available for many other languages. 
-Protocol Buffers take a quite different approach from many other popular formats. -They offer a unique combination of features, performance, and maturity that seems -particulary well suited for data-driven applications and numerical computing. -Protocol Buffers are a binary format that natively supports all common primitive types -found in modern programming languages. A key advantage is that numeric values -are serialized exactly the same way as they are stored in memory. There is -no loss of precision, no overhead, and parsing messages is very efficient: the system can -simply copy bytes to memory without any further processing. -But the most powerful feature of Protocol Buffers is that it decouples the content -from the structure using a schema, very similar to a database. This further increases -performance by eliminating redundancy, while at the same time providing foundations -for defining an \emph{Interface Description Language}, or \emph{IDL}. -Many sources compare data serialization formats and show Protocol Buffers compare -very favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} -for one such comparison. + +% In 2008, and following several years of internal use, Google released an open +% source version of Protocol Buffers. It provides data +% interchange format that was designed and used for their internal infrastructure. +% Google officially provides high-quality parsing libraries for \texttt{Java}, +% \texttt{C++} and \texttt{Python}, and community-developed open source implementations +% are available for many other languages. +% Protocol Buffers take a quite different approach from many other popular formats. + +% TODO(mstokely): Good sentence from Jeroen, add it here or sec 2. +% They offer a unique combination of features, performance, and maturity that seems +% particulary well suited for data-driven applications and numerical +% computing. 
+ % TODO(DE): Mention "future proof" forward compatibility of schemata -% The schema can be used to -%generate model classes for statically-typed programming languages such -%as C++ and Java, or can be used with reflection for dynamically-typed -%programming languages. Since the schema is provided separately from -%the encoded data, the data can be efficiently encoded to minimize -%storage costs of the stored data when compared with simple -%``schema-less'' binary interchange formats. -%Many sources compare data serialization formats and show Protocol -%Buffers very - % TODO(mstokely): Take a more conversational tone here asking % questions and motivating protocol buffers? -% TODO(mstokely): If we go to JSS, include a larger paragraph here -% referencing each numbered section. I don't like these generally, -% but its useful for this paper I think because we have a boring bit -% in the middle (full class/method details) and interesting -% applications at the end. +% NOTE(mstokely): I don't like these roadmap paragraphs in general, +% but it seems useful here because we have a boring bit in the middle +% (full class/method details) and interesting applications at the end. -This paper describes an R interface to Protocol Buffer, +This paper describes an R interface to Protocol Buffers, and is organized as follows. Section~\ref{sec:protobuf} provides a general overview of Protocol Buffers. Section~\ref{sec:rprotobuf-basic} describes the interactive R interface Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-21 03:18:23 UTC (rev 801) +++ papers/jss/article.bib 2014-01-21 04:00:53 UTC (rev 802) @@ -37,6 +37,12 @@ volume = "19", year = "2013" } + at article{clinec++, + title={C++ faq}, + author={Cline, Marshall}, + journal={Also available as http://www. parashift. com/c++-faq-lite/index. 
html}, + year = "2013" +} @Manual{RJSONIO, title = {RJSONIO: Serialize R objects to JSON, JavaScript Object Notation}, author = {Duncan Temple Lang}, From noreply at r-forge.r-project.org Tue Jan 21 05:19:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 05:19:04 +0100 (CET) Subject: [Rprotobuf-commits] r803 - papers/jss Message-ID: <20140121041904.AB1D31862F6@r-forge.r-project.org> Author: edd Date: 2014-01-21 05:19:04 +0100 (Tue, 21 Jan 2014) New Revision: 803 Modified: papers/jss/article.bib Log: minor fixes Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-21 04:00:53 UTC (rev 802) +++ papers/jss/article.bib 2014-01-21 04:19:04 UTC (rev 803) @@ -25,7 +25,7 @@ } @article{blocker2013, ajournal = "Bernoulli", -author = "Blocker, Alexander W. and Meng, Xiao-Li", +author = "Alexander W. Blocker and Xiao-Li Meng", doi = "10.3150/13-BEJSP16", journal = "Bernoulli", month = "09", @@ -39,13 +39,13 @@ } @article{clinec++, title={C++ faq}, - author={Cline, Marshall}, + author={Marshall Cline}, journal={Also available as http://www. parashift. com/c++-faq-lite/index. 
html}, year = "2013" } @Manual{RJSONIO, title = {RJSONIO: Serialize R objects to JSON, JavaScript Object Notation}, - author = {Duncan Temple Lang}, + author = {Duncan {Temple Lang}}, year = {2011}, note = {R package version 0.96-0}, url = {http://CRAN.R-project.org/package=RJSONIO}, @@ -337,13 +337,13 @@ } @article{shafranovich2005common, title={Common format and mime type for comma-separated values (csv) files}, - author={Shafranovich, Yakov}, + author={Yakov Shafranovich}, year={2005}, url={http://tools.ietf.org/html/rfc4180} } @book{nolan2013xml, title={XML and Web Technologies for Data Sciences with R}, - author={Nolan, Deborah and Temple Lang, Duncan}, + author={Deborah Nolan and Duncan {Temple Lang}}, year={2013}, publisher={Springer} -} \ No newline at end of file +} From noreply at r-forge.r-project.org Tue Jan 21 05:21:13 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 05:21:13 +0100 (CET) Subject: [Rprotobuf-commits] r804 - papers/jss Message-ID: <20140121042113.9005F186553@r-forge.r-project.org> Author: edd Date: 2014-01-21 05:21:13 +0100 (Tue, 21 Jan 2014) New Revision: 804 Modified: papers/jss/article.bib Log: only whitespaced changed: I allowed Emacs to reformat (which is a menu entry if you use AucTeX) Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-21 04:19:04 UTC (rev 803) +++ papers/jss/article.bib 2014-01-21 04:21:13 UTC (rev 804) @@ -1,349 +1,433 @@ @article{eddelbuettel2011rcpp, - title={Rcpp: Seamless R and C++ integration}, - author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, - journal={Journal of Statistical Software}, - volume={40}, - number={8}, - pages={1--18}, - year={2011} + title = {Rcpp: Seamless R and C++ integration}, + author = {Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, + journal = {Journal of Statistical Software}, + volume = 40, + number = 8, + pages = {1--18}, + year = 2011 } + 
@Manual{msgpackR, - title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, - author = {Mikiya Tanizawa}, - year = {2013}, - note = {R package version 1.1}, - url = {http://CRAN.R-project.org/package=msgpackR}, + title = {msgpackR: A library to serialize or unserialize data + in MessagePack format}, + author = {Mikiya Tanizawa}, + year = 2013, + note = {R package version 1.1}, + url = {http://CRAN.R-project.org/package=msgpackR}, } + @inproceedings{janus, -title = {Janus: Optimal Flash Provisioning for Cloud Storage Workloads}, -author = {Christoph Albrecht and Arif Merchant and Murray Stokely and Muhammad Waliji and Francois Labelle and Nathan Coehlo and Xudong Shi and Eric Schrock}, -year = 2013, -URL = {https://www.usenix.org/system/files/conference/atc13/atc13-albrecht.pdf}, -booktitle = {Proceedings of the USENIX Annual Technical Conference}, -pages = {91--102}, -address = {2560 Ninth Street, Suite 215, Berkeley, CA 94710, USA} + title = {Janus: Optimal Flash Provisioning for Cloud Storage + Workloads}, + author = {Christoph Albrecht and Arif Merchant and Murray + Stokely and Muhammad Waliji and Francois Labelle and + Nathan Coehlo and Xudong Shi and Eric Schrock}, + year = 2013, + URL = + {https://www.usenix.org/system/files/conference/atc13/atc13-albrecht.pdf}, + booktitle = {Proceedings of the USENIX Annual Technical + Conference}, + pages = {91--102}, + address = {2560 Ninth Street, Suite 215, Berkeley, CA 94710, + USA} } + @article{blocker2013, -ajournal = "Bernoulli", -author = "Alexander W. Blocker and Xiao-Li Meng", -doi = "10.3150/13-BEJSP16", -journal = "Bernoulli", -month = "09", -number = "4", -pages = "1176--1211", -publisher = "Bernoulli Society for Mathematical Statistics and Probability", -title = "The potential and perils of preprocessing: Building new foundations", -url = "http://dx.doi.org/10.3150/13-BEJSP16", -volume = "19", -year = "2013" + ajournal = "Bernoulli", + author = "Alexander W. 
Blocker and Xiao-Li Meng", + doi = "10.3150/13-BEJSP16", + journal = "Bernoulli", + month = 09, + number = 4, + pages = "1176--1211", + publisher = "Bernoulli Society for Mathematical Statistics and + Probability", + title = "The potential and perils of preprocessing: Building + new foundations", + url = "http://dx.doi.org/10.3150/13-BEJSP16", + volume = 19, + year = 2013 } + @article{clinec++, - title={C++ faq}, - author={Marshall Cline}, - journal={Also available as http://www. parashift. com/c++-faq-lite/index. html}, - year = "2013" + title = {C++ faq}, + author = {Marshall Cline}, + journal = {Also available as + http://www. parashift. com/c++-faq-lite/index. html}, + year = 2013 } + @Manual{RJSONIO, - title = {RJSONIO: Serialize R objects to JSON, JavaScript Object Notation}, - author = {Duncan {Temple Lang}}, - year = {2011}, - note = {R package version 0.96-0}, - url = {http://CRAN.R-project.org/package=RJSONIO}, + title = {RJSONIO: Serialize R objects to JSON, JavaScript + Object Notation}, + author = {Duncan {Temple Lang}}, + year = 2011, + note = {R package version 0.96-0}, + url = {http://CRAN.R-project.org/package=RJSONIO}, } + @Manual{rjson, - title = {rjson: JSON for R}, - author = {Alex Couture-Beil}, - year = {2012}, - note = {R package version 0.2.10}, - url = {http://CRAN.R-project.org/package=rjson}, + title = {rjson: JSON for R}, + author = {Alex Couture-Beil}, + year = 2012, + note = {R package version 0.2.10}, + url = {http://CRAN.R-project.org/package=rjson}, } + @Manual{rmongodb, - title={rmongodb: R-MongoDB driver}, - author={Gerald Lindsly}, - year = {2013}, - note = {R package version 1.3.3}, - url = {http://CRAN.R-project.org/package=rmongodb}, + title = {rmongodb: R-MongoDB driver}, + author = {Gerald Lindsly}, + year = 2013, + note = {R package version 1.3.3}, + url = {http://CRAN.R-project.org/package=rmongodb}, } + @Manual{int64, - title = {int64: 64 bit integer types}, - author = {Romain Fran{\c{c}}ois}, - year = {2011}, - note = 
{R package version 1.1.2}, - url = {http://CRAN.R-project.org/package=int64}, + title = {int64: 64 bit integer types}, + author = {Romain Fran{\c{c}}ois}, + year = 2011, + note = {R package version 1.1.2}, + url = {http://CRAN.R-project.org/package=int64}, } + @Manual{bit64, - title = {bit64: A S3 class for vectors of 64bit integers}, - author = {Jens Oehlschl\"{a}gel}, - year = {2012}, - note = {R package version 0.9-3}, - url = {http://CRAN.R-project.org/package=bit64}, + title = {bit64: A S3 class for vectors of 64bit integers}, + author = {Jens Oehlschl\"{a}gel}, + year = 2012, + note = {R package version 0.9-3}, + url = {http://CRAN.R-project.org/package=bit64}, } + @book{eddelbuettel2013seamless, - title={Seamless R and C++ Integration with Rcpp}, - author={Dirk Eddelbuettel}, - year={2013}, - publisher={Springer} + title = {Seamless R and C++ Integration with Rcpp}, + author = {Dirk Eddelbuettel}, + year = 2013, + publisher = {Springer} } + @Manual{rhipe, - title = {RHIPE: A Distributed Environment for the Analysis of Large and Complex Datasets}, - author = {Saptarshi Guha}, - year = {2010}, - url = {http://www.stat.purdue.edu/~sguha/rhipe/}, + title = {RHIPE: A Distributed Environment for the Analysis of + Large and Complex Datasets}, + author = {Saptarshi Guha}, + year = 2010, + url = {http://www.stat.purdue.edu/~sguha/rhipe/}, } + @misc{serialization, -author= {Luke Tierney}, -title = {A New Serialization Mechanism for R}, -url = {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, -year = {2003}, + author = {Luke Tierney}, + title = {A New Serialization Mechanism for R}, + url = + {http://www.cs.uiowa.edu/~luke/R/serialize/serialize.ps}, + year = 2003, } + @manual{eddelbuettel2013exposing, - title={Exposing C++ functions and classes with Rcpp modules}, - author={Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, - year={2013}, - note={Vignette included in R package Rcpp}, - url = {http://CRAN.R-project.org/package=Rcpp}, + title = {Exposing C++ functions 
and classes with Rcpp + modules}, + author = {Dirk Eddelbuettel and Romain Fran{\c{c}}ois}, + year = 2013, + note = {Vignette included in R package Rcpp}, + url = {http://CRAN.R-project.org/package=Rcpp}, } + @inproceedings{cantrill2004dynamic, - title={Dynamic Instrumentation of Production Systems.}, - author={Bryan Cantrill and Michael W Shapiro and Adam H Leventhal and others}, - booktitle={USENIX Annual Technical Conference, General Track}, - pages={15--28}, - year={2004} + title = {Dynamic Instrumentation of Production Systems.}, + author = {Bryan Cantrill and Michael W Shapiro and Adam H + Leventhal and others}, + booktitle = {USENIX Annual Technical Conference, General Track}, + pages = {15--28}, + year = 2004 } + @article{swain1991color, - title={Color indexing}, - author={Michael J Swain and Dana H Ballard}, - journal={International journal of computer vision}, - volume={7}, - number={1}, - pages={11--32}, - year={1991}, - publisher={Springer} + title = {Color indexing}, + author = {Michael J Swain and Dana H Ballard}, + journal = {International journal of computer vision}, + volume = 7, + number = 1, + pages = {11--32}, + year = 1991, + publisher = {Springer} } + @article{rubner2000earth, - title={The earth mover's distance as a metric for image retrieval}, - author={Yossi Rubner and Carlo Tomasi and Leonidas J Guibas}, - journal={International Journal of Computer Vision}, - volume={40}, - number={2}, - pages={99--121}, - year={2000}, - publisher={Springer} + title = {The earth mover's distance as a metric for image + retrieval}, + author = {Yossi Rubner and Carlo Tomasi and Leonidas J Guibas}, + journal = {International Journal of Computer Vision}, + volume = 40, + number = 2, + pages = {99--121}, + year = 2000, + publisher = {Springer} } + @book{kullback1997information, - title={Information theory and statistics}, - author={Solomon Kullback}, - year={1997}, - publisher={Courier Dover Publications} + title = {Information theory and statistics}, + author 
= {Solomon Kullback}, + year = 1997, + publisher = {Courier Dover Publications} } + @inproceedings{puzicha1997non, - title={Non-parametric similarity measures for unsupervised texture segmentation and image retrieval}, - author={Jan Puzicha and Thomas Hofmann and Joachim M Buhmann}, - booktitle={Computer Vision and Pattern Recognition, 1997. Proceedings., 1997 IEEE Computer Society Conference on}, - pages={267--272}, - year={1997}, - organization={IEEE} + title = {Non-parametric similarity measures for unsupervised + texture segmentation and image retrieval}, + author = {Jan Puzicha and Thomas Hofmann and Joachim M + Buhmann}, + booktitle = {Computer Vision and Pattern Recognition, + 1997. Proceedings., 1997 IEEE Computer Society + Conference on}, + pages = {267--272}, + year = 1997, + organization = {IEEE} } + @inproceedings{fang1999computing, - title={Computing Iceberg Queries Efficiently.}, - author={Min Fang and Narayanan Shivakumar and Hector Garcia-Molina and Rajeev Motwani and Jeffrey D Ullman}, - booktitle={Internaational Conference on Very Large Databases (VLDB'98), New York, August 1998}, - year={1999}, - organization={Stanford InfoLab} + title = {Computing Iceberg Queries Efficiently.}, + author = {Min Fang and Narayanan Shivakumar and Hector + Garcia-Molina and Rajeev Motwani and Jeffrey D + Ullman}, + booktitle = {Internaational Conference on Very Large Databases + (VLDB'98), New York, August 1998}, + year = 1999, + organization = {Stanford InfoLab} } + @Manual{emdist, - title = {emdist: Earth Mover's Distance}, - author = {Simon Urbanek and Yossi Rubner}, - year = {2012}, - note = {R package version 0.3-1}, - url = {http://cran.r-project.org/package=emdist}, + title = {emdist: Earth Mover's Distance}, + author = {Simon Urbanek and Yossi Rubner}, + year = 2012, + note = {R package version 0.3-1}, + url = {http://cran.r-project.org/package=emdist}, } + @article{Wegiel:2010:CTT:1932682.1869479, - author = {Michal Wegiel and Chandra Krintz}, - title = 
{Cross-language, Type-safe, and Transparent Object Sharing for Co-located Managed Runtimes}, - journal = {SIGPLAN Not.}, - issue_date = {October 2010}, - volume = {45}, - number = {10}, - month = oct, - year = {2010}, - issn = {0362-1340}, - pages = {223--240}, - numpages = {18}, - url = {http://doi.acm.org/10.1145/1932682.1869479}, - doi = {10.1145/1932682.1869479}, - acmid = {1869479}, - publisher = {ACM}, - address = {New York, NY, USA}, - keywords = {collection, communication, cross-language, garbage, managed, memory, model, object, rpc, runtimes, shared, synchronization, transparent, type-safe}, + author = {Michal Wegiel and Chandra Krintz}, + title = {Cross-language, Type-safe, and Transparent Object + Sharing for Co-located Managed Runtimes}, + journal = {SIGPLAN Not.}, + issue_date = {October 2010}, + volume = 45, + number = 10, + month = oct, + year = 2010, + issn = {0362-1340}, + pages = {223--240}, + numpages = 18, + url = {http://doi.acm.org/10.1145/1932682.1869479}, + doi = {10.1145/1932682.1869479}, + acmid = 1869479, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {collection, communication, cross-language, garbage, + managed, memory, model, object, rpc, runtimes, + shared, synchronization, transparent, type-safe}, } + @article{wickham2011split, - title={The split-apply-combine strategy for data analysis}, - author={Hadley Wickham}, - journal={Journal of Statistical Software}, - volume={40}, - number={1}, - pages={1--29}, - year={2011}, - publisher={Citeseer} + title = {The split-apply-combine strategy for data analysis}, + author = {Hadley Wickham}, + journal = {Journal of Statistical Software}, + volume = 40, + number = 1, + pages = {1--29}, + year = 2011, + publisher = {Citeseer} } + @inproceedings{Sumaray:2012:CDS:2184751.2184810, - author = {Audie Sumaray and S. 
Kami Makki}, - title = {A Comparison of Data Serialization Formats for Optimal Efficiency on a Mobile Platform}, - booktitle = {Proceedings of the 6th International Conference on Ubiquitous Information Management and Communication}, - series = {ICUIMC '12}, - year = {2012}, - isbn = {978-1-4503-1172-4}, - location = {Kuala Lumpur, Malaysia}, - pages = {48:1--48:6}, - articleno = {48}, - numpages = {6}, - url = {http://doi.acm.org/10.1145/2184751.2184810}, - doi = {10.1145/2184751.2184810}, - acmid = {2184810}, - publisher = {ACM}, - address = {New York, NY, USA}, - keywords = {Android, Dalvik, JSON, ProtoBuf, XML, data serialization, thrift}, -} + author = {Audie Sumaray and S. Kami Makki}, + title = {A Comparison of Data Serialization Formats for + Optimal Efficiency on a Mobile Platform}, + booktitle = {Proceedings of the 6th International Conference on + Ubiquitous Information Management and Communication}, + series = {ICUIMC '12}, + year = 2012, + isbn = {978-1-4503-1172-4}, + location = {Kuala Lumpur, Malaysia}, + pages = {48:1--48:6}, + articleno = 48, + numpages = 6, + url = {http://doi.acm.org/10.1145/2184751.2184810}, + doi = {10.1145/2184751.2184810}, + acmid = 2184810, + publisher = {ACM}, + address = {New York, NY, USA}, + keywords = {Android, Dalvik, JSON, ProtoBuf, XML, data + serialization, thrift}, +} + @Manual{RObjectTables, - title = {User-Defined Tables in the R Search Path}, - author = {Duncan {Temple Lang}}, - year = {2012}, - url = {http://www.omegahat.org/RObjectTables/RObjectTables.pdf}, + title = {User-Defined Tables in the R Search Path}, + author = {Duncan {Temple Lang}}, + year = 2012, + url = + {http://www.omegahat.org/RObjectTables/RObjectTables.pdf}, } + @Manual{rprotobuf, - title = {RProtoBuf: R Interface to the Protocol Buffers API}, - author = {Romain Francois and Dirk Eddelbuettel and Murray Stokely}, - note = {R package version 0.3.2}, - year = {2013}, - url = {http://cran.r-project.org/web/packages/RProtoBuf/index.html}, + title 
= {RProtoBuf: R Interface to the Protocol Buffers API}, + author = {Romain Francois and Dirk Eddelbuettel and Murray + Stokely}, + note = {R package version 0.3.2}, + year = 2013, + url = + {http://cran.r-project.org/web/packages/RProtoBuf/index.html}, } + @Manual{r, - title = {R: A Language and Environment for Statistical Computing}, - author = {{R Core Team}}, - organization = {R Foundation for Statistical Computing}, - address = {Vienna, Austria}, - year = {2013}, - url = {http://www.R-project.org/}, - } + title = {R: A Language and Environment for Statistical + Computing}, + author = {{R Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = 2013, + url = {http://www.R-project.org/}, +} + @article{dean2008mapreduce, - title={MapReduce: simplified data processing on large clusters}, - author={Jeffrey Dean and Sanjay Ghemawat}, - journal={Communications of the ACM}, - volume={51}, - number={1}, - pages={107--113}, - year={2008}, - publisher={ACM} + title = {MapReduce: simplified data processing on large + clusters}, + author = {Jeffrey Dean and Sanjay Ghemawat}, + journal = {Communications of the ACM}, + volume = 51, + number = 1, + pages = {107--113}, + year = 2008, + publisher = {ACM} } + @article{bostock2011d3, - title={D$^3$ Data-Driven Documents}, - author={Michael Bostock and Vadim Ogievetsky and Jeffrey Heer}, - journal={Visualization and Computer Graphics, IEEE Transactions on}, - volume={17}, - number={12}, - pages={2301--2309}, - year={2011}, - publisher={IEEE} + title = {D$^3$ Data-Driven Documents}, + author = {Michael Bostock and Vadim Ogievetsky and Jeffrey + Heer}, + journal = {Visualization and Computer Graphics, IEEE + Transactions on}, + volume = 17, + number = 12, + pages = {2301--2309}, + year = 2011, + publisher = {IEEE} } % celebrated article in this field. Also see the parallel paragraph. 
+ @article{Manku:1998:AMO:276305.276342, - author = {Gurmeet Singh Manku and Sridhar Rajagopalan and Bruce G. Lindsay}, - title = {Approximate medians and other quantiles in one pass and with limited memory}, - journal = {SIGMOD Rec.}, - issue_date = {June 1998}, - volume = {27}, - number = {2}, - month = jun, - year = {1998}, - issn = {0163-5808}, - pages = {426--435}, - numpages = {10}, - url = {http://doi.acm.org/10.1145/276305.276342}, - doi = {10.1145/276305.276342}, - acmid = {276342}, - publisher = {ACM}, - address = {New York, NY, USA}, + author = {Gurmeet Singh Manku and Sridhar Rajagopalan and + Bruce G. Lindsay}, + title = {Approximate medians and other quantiles in one pass + and with limited memory}, + journal = {SIGMOD Rec.}, + issue_date = {June 1998}, + volume = 27, + number = 2, + month = jun, + year = 1998, + issn = {0163-5808}, + pages = {426--435}, + numpages = 10, + url = {http://doi.acm.org/10.1145/276305.276342}, + doi = {10.1145/276305.276342}, + acmid = 276342, + publisher = {ACM}, + address = {New York, NY, USA}, } % Has a section on protocol buffers + @article{Pike:2005:IDP:1239655.1239658, - author = {Rob Pike and Sean Dorward and Robert Griesemer and Sean Quinlan}, - title = {Interpreting the data: Parallel analysis with Sawzall}, - journal = {Sci. Program.}, - issue_date = {October 2005}, - volume = {13}, - number = {4}, - month = oct, - year = {2005}, - issn = {1058-9244}, - pages = {277--298}, - numpages = {22}, - acmid = {1239658}, - publisher = {IOS Press}, - address = {Amsterdam, The Netherlands, The Netherlands}, -} + author = {Rob Pike and Sean Dorward and Robert Griesemer and + Sean Quinlan}, + title = {Interpreting the data: Parallel analysis with + Sawzall}, + journal = {Sci. 
Program.}, + issue_date = {October 2005}, + volume = 13, + number = 4, + month = oct, + year = 2005, + issn = {1058-9244}, + pages = {277--298}, + numpages = 22, + acmid = 1239658, + publisher = {IOS Press}, + address = {Amsterdam, The Netherlands, The Netherlands}, +} + @Manual{protobuf, - title = {Protocol Buffers: Developer Guide}, - author = {Google}, - year = {2012}, - url = {http://code.google.com/apis/protocolbuffers/docs/overview.html} + title = {Protocol Buffers: Developer Guide}, + author = {Google}, + year = 2012, + url = + {http://code.google.com/apis/protocolbuffers/docs/overview.html} } + @article{sturges1926choice, - title={The choice of a class interval}, - author={Herbert A Sturges}, - journal={Journal of the American Statistical Association}, - volume={21}, - number={153}, - pages={65--66}, - year={1926} + title = {The choice of a class interval}, + author = {Herbert A Sturges}, + journal = {Journal of the American Statistical Association}, + volume = 21, + number = 153, + pages = {65--66}, + year = 1926 } + @Manual{histogramtools, - title = {HistogramTools: Utility Functions for R Histograms}, - author = {Murray Stokely}, - year = {2013}, - note = {R package version 0.3}, - url = {https://r-forge.r-project.org/projects/histogramtools/}, + title = {HistogramTools: Utility Functions for R Histograms}, + author = {Murray Stokely}, + year = 2013, + note = {R package version 0.3}, + url = + {https://r-forge.r-project.org/projects/histogramtools/}, } + @article{scott1979optimal, - title={On optimal and data-based histograms}, - author={David W Scott}, - journal={Biometrika}, - volume={66}, - number={3}, - pages={605--610}, - year={1979}, - publisher={Biometrika Trust} + title = {On optimal and data-based histograms}, + author = {David W Scott}, + journal = {Biometrika}, + volume = 66, + number = 3, + pages = {605--610}, + year = 1979, + publisher = {Biometrika Trust} } + @book{scott2009multivariate, - title={Multivariate density estimation: theory, 
practice, and visualization}, - author={David W Scott}, - volume={383}, - year={2009}, - publisher={Wiley. com} + title = {Multivariate density estimation: theory, practice, + and visualization}, + author = {David W Scott}, + volume = 383, + year = 2009, + publisher = {Wiley. com} } + @Manual{httr, - title = {httr: Tools for working with URLs and HTTP}, - author = {Hadley Wickham}, - year = {2012}, - note = {R package version 0.2}, - url = {http://CRAN.R-project.org/package=httr}, + title = {httr: Tools for working with URLs and HTTP}, + author = {Hadley Wickham}, + year = 2012, + note = {R package version 0.2}, + url = {http://CRAN.R-project.org/package=httr}, } + @Manual{opencpu, - title = {OpenCPU system for embedded statistical computation and reproducible research}, - author = {Jeroen Ooms}, - year = {2013}, - note = {R package version 1.2.2}, - url = {http://www.opencpu.org}, + title = {OpenCPU system for embedded statistical computation + and reproducible research}, + author = {Jeroen Ooms}, + year = 2013, + note = {R package version 1.2.2}, + url = {http://www.opencpu.org}, } + @article{shafranovich2005common, - title={Common format and mime type for comma-separated values (csv) files}, - author={Yakov Shafranovich}, - year={2005}, - url={http://tools.ietf.org/html/rfc4180} + title = {Common format and mime type for comma-separated + values (csv) files}, + author = {Yakov Shafranovich}, + year = 2005, + url = {http://tools.ietf.org/html/rfc4180} } + @book{nolan2013xml, - title={XML and Web Technologies for Data Sciences with R}, - author={Deborah Nolan and Duncan {Temple Lang}}, - year={2013}, - publisher={Springer} + title = {XML and Web Technologies for Data Sciences with R}, + author = {Deborah Nolan and Duncan {Temple Lang}}, + year = 2013, + publisher = {Springer} } From noreply at r-forge.r-project.org Tue Jan 21 06:37:30 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 06:37:30 +0100 (CET) Subject: 
[Rprotobuf-commits] r805 - papers/jss Message-ID: <20140121053730.AB444186926@r-forge.r-project.org> Author: murray Date: 2014-01-21 06:37:30 +0100 (Tue, 21 Jan 2014) New Revision: 805 Modified: papers/jss/article.Rnw Log: Mostly work on section 2 to address some flaws identified by Jeroen, Karl, and others. Move up the basic description of the protocol buffer schema from section 3 to section 2, including the example of how protocol buffers are manipulated with this package. Revert a regression -- fix the reference to BSON and MessagePack by putting the citations next to the text about the R interfaces rather than the formats themselves (re-apply fix from Dirk). Add an explicit transition from section 2 to section 3 as the last sentence of 2. Define payload at the beginning of section 3 just once, so we don't repeat ourselves later in the section. Add a sentence to section 6 that provides more context about when you use the basic RProtoBuf functionality with specific schemas -- "This is useful when there are pre-existing systems with defined schemas or significant software components written in other languages that need to be accessed from within R." before transitioning to talk about the universal r language schema. (This point suggested by Karl). Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 04:21:13 UTC (rev 804) +++ papers/jss/article.Rnw 2014-01-21 05:37:30 UTC (rev 805) @@ -190,8 +190,8 @@ A number of binary formats based on \texttt{JSON} have been proposed that reduce the parsing cost and improve efficiency. 
\pkg{MessagePack} -\citep{msgpackR} and \pkg{BSON} \citep{rmongodb} both have R -interfaces, but these formats lack a separate schema for the seralized +and \pkg{BSON} both have R +interfaces \citep{msgpackR,rmongodb}, but these formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. Such formats also lack support for versioning when data storage needs evolve over time, or when @@ -268,22 +268,10 @@ \section{Protocol Buffers} \label{sec:protobuf} - % JO: I'm not sure where to put this paragraph. I think it is too technical % for the introduction section. Maybe start this section with some explanation % of what a schema is and then continue with showing how PB implement this? -Once the data serialization needs of an application become complex -enough, developers typically benefit from the use of an -\emph{interface description language}, or \emph{IDL}. IDLs like -Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact -well-documented schema for cross-language data structures and -efficient binary interchange formats. -Since the schema is provided separately from the encoded data, the data can be -efficiently encoded to minimize storage costs of the stored data when compared with simple -``schema-less'' binary interchange formats. -The schema can be used to generate classes for statically-typed programming languages -such as C++ and Java, or can be used with reflection for dynamically-typed programming -languages. +% MS: Yes I agree, tried to address below. %FIXME Introductory section which may include references in parentheses %\citep{R}, or cite a reference such as \citet{R} in the text. @@ -293,26 +281,10 @@ %% TODO(de,ms) What follows is oooooold and was lifted from the webpage %% Rewrite? -Protocol Buffers can be described as a modern, language-neutral, platform-neutral, -extensible mechanism for sharing and storing structured data. 
Since their -introduction, Protocol Buffers have been widely adopted in industry with -applications as varied as %database-internal messaging (Drizzle), % DE: citation? -Sony Playstations, Twitter, Google Search, Hadoop, and Open Street Map. -% TODO(DE): This either needs a citation, or remove the name drop -% MS: These are mostly from blog posts, I can't find a good reference -% that has a long list, and the name and year citation style seems -% less conducive to long lists of marginal citations like blog posts -% compared to say concise CS/math style citations [3,4,5,6]. Thoughts? +Protocol Buffers are a modern, language-neutral, platform-neutral, +extensible mechanism for sharing and storing structured data. Some of +the key features provided by Protocol Buffers for data analysis include: - - -While traditional IDLs have at times been criticized for code bloat and -complexity, Protocol Buffers are based on a simple list and records -model that is compartively flexible and simple to use. - -Some of the key features provided by Protocol Buffers for data analysis -include: - \begin{itemize} \item \emph{Portable}: Enable users to send and receive data between applications as well as different computers or operating systems. @@ -324,6 +296,16 @@ decade. \end{itemize} +% Lets place this at the top of the page or the bottom, or on a float +% page, but not just here in the middle of the page. +\begin{figure}[tbp] +\begin{center} +\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} +\end{center} +\caption{Example protobuf usage} +\label{fig:protobuf-distributed-usecase} +\end{figure} + Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example communication workflow with Protocol Buffers and an interactive R session. Common use cases include populating a request remote-procedure call (RPC) @@ -334,6 +316,94 @@ the remote server may be implemented in any language, with no dependence on R. 
+While traditional IDLs have at times been criticized for code bloat and +complexity, Protocol Buffers are based on a simple list and records +model that is flexible and simple to use. The schema for structured +protocol buffer data is defined in \texttt{.proto} files which may +contain one or more message types. Each message type has one or more +fields. A field is specified with a unique number, a name, a value +type, and a field rule specifying whether the field is optional, +required, or repeated. The supported value types are numbers, +enumerations, booleans, strings, raw bytes, or other nested message +types. The \texttt{.proto} file syntax for defining the structure of protocol +buffer data is described comprehensively on Google Code\footnote{See +\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. +Table~\ref{tab:proto} shows an example \texttt{.proto} file which +defines the \texttt{tutorial.Person} type. The R code in the right +column shows an example of creating a new message of this type and +populating its fields. + +%% TODO(de) Can we make this not break the width of the page? 
+\noindent +\begin{table} +\begin{tabular}{p{.40\textwidth}p{0.55\textwidth}} +\toprule +Schema : \texttt{addressbook.proto} & Example R Session\\ +\cmidrule{1-2} +\begin{minipage}{.40\textwidth} +\vspace{2mm} +\begin{example} +package tutorial; +message Person { + required string name = 1; + required int32 id = 2; + optional string email = 3; + enum PhoneType { + MOBILE = 0; HOME = 1; + WORK = 2; + } + message PhoneNumber { + required string number = 1; + optional PhoneType type = 2; + } + repeated PhoneNumber phone = 4; +} +\end{example} +\vspace{2mm} +\end{minipage} & \begin{minipage}{.55\textwidth} +<>= +library(RProtoBuf) +p <- new(tutorial.Person,id=1,name="Dirk") +class(p) +p$name +p$name <- "Murray" +cat(as.character(p)) +serialize(p, NULL) +@ +\end{minipage} \\ +\bottomrule +\end{tabular} +\caption{The schema representation from a \texttt{.proto} file for the + \texttt{tutorial.Person} class (left) and simple R code for creating + an object of this class and accessing its fields (right).} +\label{tab:proto} +\end{table} + + +% The schema can be used to generate model classes for statically-typed programming languages +%such as C++ and Java, or can be used with reflection for dynamically-typed programming +%languages. + +% TODO(mstokely): Maybe find a place to add this? +% Since their +% introduction, Protocol Buffers have been widely adopted in industry with +% applications as varied as %database-internal messaging (Drizzle), % DE: citation? +% Sony Playstations, Twitter, Google Search, Hadoop, and Open Street +% Map. + +% TODO(DE): This either needs a citation, or remove the name drop +% MS: These are mostly from blog posts, I can't find a good reference +% that has a long list, and the name and year citation style seems +% less conducive to long lists of marginal citations like blog posts +% compared to say concise CS/math style citations [3,4,5,6]. Thoughts? 
+ + +% The schema can be used to generate classes for statically-typed programming languages +% such as C++ and Java, or can be used with reflection for dynamically-typed programming +% languages. + + + %Protocol buffers are a language-neutral, platform-neutral, extensible %way of serializing structured data for use in communications %protocols, data storage, and more. @@ -345,16 +415,6 @@ %buffers are also forward compatible: updates to the \texttt{proto} %files do not break programs built against the previous specification. -%While benchmarks are not available, Google states on the project page that in -%comparison to XML, protocol buffers are at the same time \textsl{simpler}, -%between three to ten times \textsl{smaller}, between twenty and one hundred -%times \textsl{faster}, as well as less ambiguous and easier to program. - -%The flexibility of the reflection-based API is particularly well -%suited for interactive data analysis. - -% XXX Design tradeoffs: reflection vs proto compiler - For added speed and efficiency, the C++, Java, and Python bindings to Protocol Buffers are used with a compiler that translates a Protocol Buffer schema description file (ending in \texttt{.proto}) into @@ -364,7 +424,8 @@ interactive data analysis. All messages in R have a single class structure, but different accessor methods are created at runtime based -on the named fields of the specified message type. +on the named fields of the specified message type, as described in the +next section. 
% In other words, given the 'proto' %description file, code is automatically generated for the chosen @@ -388,40 +449,19 @@ %languages to support protocol buffers is compiled as part of the %project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} -\begin{figure}[t] -\begin{center} -\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} -\end{center} -\caption{Example protobuf usage} -\label{fig:protobuf-distributed-usecase} -\end{figure} - \section{Basic Usage: Messages and Descriptors} \label{sec:rprotobuf-basic} This section describes how to use the R API to create and manipulate protocol buffer messages in R, and how to read and write the -binary \emph{payload} of the messages to files and arbitrary binary +binary representation of the message (often called the \emph{payload}) to files and arbitrary binary R connections. - The two fundamental building blocks of Protocol Buffers are \emph{Messages} and \emph{Descriptors}. Messages provide a common abstract encapsulation of structured data fields of the type specified in a Message Descriptor. Message Descriptors are defined in \texttt{.proto} files and define a schema for a particular named class of messages. -Table~\ref{tab:proto} shows an example \texttt{.proto} file which -defines the \texttt{tutorial.Person} type. The R code in the right -column shows an example of creating a new message of this type and -populating its fields. A \texttt{.proto} file may contain one or more -message types, and each message type has one or more fields. A field -is specified with a unique number, a name, a value type, and a field -rule specifying whether the field is optional, required, or repeated. -The supported value types are numbers, enumerations, booleans, -strings, raw bytes, or other nested message types. 
-The \texttt{.proto} file syntax for defining the structure of protocol -buffer data is described comprehensively on Google Code\footnote{See -\url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. % Commented out because we said this earlier. %This separation @@ -438,51 +478,6 @@ %languages. The definition -%% TODO(de) Can we make this not break the width of the page? -\noindent -\begin{table} -\begin{tabular}{p{.40\textwidth}p{0.55\textwidth}} -\toprule -Schema : \texttt{addressbook.proto} & Example R Session\\ -\cmidrule{1-2} -\begin{minipage}{.40\textwidth} -\vspace{2mm} -\begin{example} -package tutorial; -message Person { - required string name = 1; - required int32 id = 2; - optional string email = 3; - enum PhoneType { - MOBILE = 0; HOME = 1; - WORK = 2; - } - message PhoneNumber { - required string number = 1; - optional PhoneType type = 2; - } - repeated PhoneNumber phone = 4; -} -\end{example} -\vspace{2mm} -\end{minipage} & \begin{minipage}{.55\textwidth} -<>= -library(RProtoBuf) -p <- new(tutorial.Person,id=1,name="Dirk") -class(p) -p$name -p$name <- "Murray" -cat(as.character(p)) -serialize(p, NULL) -@ -\end{minipage} \\ -\bottomrule -\end{tabular} -\caption{The schema representation from a \texttt{.proto} file for the - \texttt{tutorial.Person} class (left) and simple R code for creating - an object of this class and accessing its fields (right).} -\label{tab:proto} -\end{table} %This section may contain a figure such as Figure~\ref{figure:rlogo}. % @@ -663,7 +658,7 @@ the human-readable ASCII output that is created with \code{as.character}. 
-The binary representation of the message (often called the payload) +The binary representation of the message does not contain information that can be used to dynamically infer the message type, so we have to provide this information to the \texttt{read} function in the form of a descriptor : @@ -1250,8 +1245,12 @@ The previous sections discussed functionality in the \pkg{RProtoBuf} package for creating, manipulating, parsing and serializing Protocol Buffer -messages of a specific pre-defined schema. The package also provides -methods for converting arbitrary R data structures into protocol +messages of a pre-defined schema. This is useful when there are +pre-existing systems with defined schemas or significant software +components written in other languages that need to be accessed from +within R. + +The package also provides methods for converting arbitrary R data structures into protocol buffers and vice versa with a universal R object schema. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions serialize arbitrary R objects into a universal Protocol Buffer message: From noreply at r-forge.r-project.org Tue Jan 21 06:42:32 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 06:42:32 +0100 (CET) Subject: [Rprotobuf-commits] r806 - papers/jss Message-ID: <20140121054232.1E7811854F2@r-forge.r-project.org> Author: murray Date: 2014-01-21 06:42:31 +0100 (Tue, 21 Jan 2014) New Revision: 806 Modified: papers/jss/article.Rnw Log: spellcheck. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 05:37:30 UTC (rev 805) +++ papers/jss/article.Rnw 2014-01-21 05:42:31 UTC (rev 806) @@ -161,18 +161,18 @@ environment. 
%\paragraph*{Friends don't let friends use CSV!} -Data analysts and researchers often use character seperated text formats such +Data analysts and researchers often use character separated text formats such as \texttt{CSV} \citep{shafranovich2005common} to export and import data. However, anyone who has ever used \texttt{CSV} files will have noticed that this method has many limitations: it is restricted to tabular data, lacks type-safety, and has limited precision for numeric values. Moreover, ambiguities in the format itself frequently cause problems. For example, -conventions on which characters is used as seperator or decimal point vary by +conventions on which characters is used as separator or decimal point vary by country. \emph{Extensible Markup Language} (\texttt{XML}) is another well-established and widely-supported format with the ability to define just about any arbitrarily complex schema \citep{nolan2013xml}. However, it pays for this complexity with comparatively large and verbose messages, and added -complexitiy at the parsing side (which are somewhat mitigated by the +complexity at the parsing side (which are somewhat mitigated by the availability of mature libraries and parsers). Because \texttt{XML} is text based and has no native notion of numeric types or arrays, it usually not a very practical format to store numeric datasets as they appear in statistical @@ -1266,7 +1266,7 @@ \ref{rexp.proto}. The Protocol Buffer messages generated by \pkg{RProtoBuf} and \pkg{RHIPE} are naturally compatible between the two systems because they use the same schema. This shows the power of using a schema based cross-platform format such -as Protocol Buffers: interoperability is archieved without effort or close coordination. +as Protocol Buffers: interoperability is achieved without effort or close coordination. The \texttt{rexp.proto} schema supports all main R storage types holding \emph{data}. 
These include \texttt{NULL}, \texttt{list} and vectors of type \texttt{logical}, From noreply at r-forge.r-project.org Tue Jan 21 07:05:20 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 07:05:20 +0100 (CET) Subject: [Rprotobuf-commits] r807 - papers/jss Message-ID: <20140121060520.83F9A186987@r-forge.r-project.org> Author: murray Date: 2014-01-21 07:05:20 +0100 (Tue, 21 Jan 2014) New Revision: 807 Added: papers/jss/serialization.csv Log: Check in a simple CSV of the big datasets table from section 5, to make it easier for one to create a summary row or add relative size change %s instead of absolute byte counts, etc. Added: papers/jss/serialization.csv =================================================================== --- papers/jss/serialization.csv (rev 0) +++ papers/jss/serialization.csv 2014-01-21 06:05:20 UTC (rev 807) @@ -0,0 +1,51 @@ +data.set,size,r.size,r.gz.size,protobuf.size,protobuf.gz.size +uspop,584,268,172,211,148 +Titanic,1960,633,257,481,249 +volcano,42656,42517,5226,42476,4232 +euro.cross,2728,1319,910,1207,891 +attenu,14568,8234,2165,7771,2336 +ToothGrowth,2568,1486,349,1239,391 +lynx,1344,1028,429,971,404 +nottem,2352,2036,627,1979,641 +sleep,2752,746,282,483,260 +co2,4176,3860,1473,3803,1453 +austres,1144,828,439,771,410 +ability.cov,1944,716,357,589,341 +EuStockMarkets,60664,59785,21232,59674,19882 +treering,64272,63956,17647,63900,17758 +freeny.x,1944,1445,1311,1372,1289 +Puromycin,2088,813,306,620,320 +warpbreaks,2768,1231,310,811,343 +BOD,1088,334,182,226,168 +sunspots,22992,22676,6482,22620,6742 +beaver2,4184,3423,751,3468,840 +anscombe,2424,991,375,884,352 +esoph,5624,3111,548,2240,665 +PlantGrowth,1680,646,303,459,314 +infert,15848,14328,1172,13197,1404 +BJsales,1632,1316,496,1259,465 +stackloss,1688,917,293,844,283 +crimtab,7936,4641,713,1655,576 +LifeCycleSavings,6048,3014,1420,2825,1407 +Harman74.cor,9144,6056,2045,5861,2070 +nhtemp,912,596,240,539,223 
+faithful,5136,4543,1339,4936,1776 +freeny,5296,2465,1518,2271,1507 +discoveries,1232,916,199,859,180 +state.x77,7168,4251,1754,4068,1756 +pressure,1096,498,277,427,273 +fdeaths,1008,692,291,635,272 +euro,976,264,186,202,161 +LakeHuron,1216,900,420,843,404 +mtcars,6736,3798,1204,3633,1206 +precip,4992,1793,813,1615,815 +state.area,440,422,246,405,235 +attitude,3024,1990,544,1920,561 +randu,10496,9794,8859,10441,9558 +state.name,3088,844,408,724,415 +airquality,5496,4551,1241,2874,1294 +airmiles,624,308,170,251,148 +quakes,33112,32246,9898,29063,11595 +islands,3496,1232,563,1098,561 +OrchardSprays,3600,2164,445,1897,483 +WWWusage,1232,916,274,859,251 From noreply at r-forge.r-project.org Tue Jan 21 07:17:53 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 07:17:53 +0100 (CET) Subject: [Rprotobuf-commits] r808 - papers/jss Message-ID: <20140121061753.2143E18653F@r-forge.r-project.org> Author: murray Date: 2014-01-21 07:17:52 +0100 (Tue, 21 Jan 2014) New Revision: 808 Modified: papers/jss/article.Rnw Log: Add back a reference to Tierney's serialization doc. Move my incomplete TODO performance summary subsection to the final conclusion/summary section, where I will likely delete most of it but may use a bit of those ideas in wrapping everything up in a conclusion. Ditto for the commented out other approaches section. Section 6 previously ended on a pessimistic note and a sentence fragment (the table shows protobuf doesn't do much better in compression size than normal R serialization). End on a more complete note that mentions that RProtoBuf is most beneficial when multiple languages are involved, and when a more concise application-specific schema is in place, and transition to the example in the next section by noting that both of those conditions hold for the MapReduce / histogram example.
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 06:05:20 UTC (rev 807) +++ papers/jss/article.Rnw 2014-01-21 06:17:52 UTC (rev 808) @@ -273,9 +273,6 @@ % of what a schema is and then continue with showing how PB implement this? % MS: Yes I agree, tried to address below. -%FIXME Introductory section which may include references in parentheses -%\citep{R}, or cite a reference such as \citet{R} in the text. - % This content is good. Maybe use and cite? % http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html @@ -1245,7 +1242,7 @@ The previous sections discussed functionality in the \pkg{RProtoBuf} package for creating, manipulating, parsing and serializing Protocol Buffer -messages of a pre-defined schema. This is useful when there are +messages of a defined schema. This is useful when there are pre-existing systems with defined schemas or significant software components written in other languages that need to be accessed from within R. @@ -1330,7 +1327,7 @@ using four different methods: \begin{itemize} -\item normal R serialization, +\item normal R serialization \citep{serialization}, \item R serialization followed by gzip, \item normal Protocol Buffer serialization, and \item Protocol Buffer serialization followed by gzip. @@ -1364,12 +1361,16 @@ %sample of 50 datasets is included on the next page. Sizes are comparable but Protocol Buffers provide simple getters and setters in multiple languages instead of requiring other programs to parse the R -serialization format.% \citep{serialization}. -One takeaway from this table is that RProtoBuf does not in general provide +serialization format. % \citep{serialization}. 
+One takeaway from this table is that the universal R object schema +included in RProtoBuf does not in general provide any significant saving in file size compared to the normal serialization -mechanism in R which is seen as equally compact. The benefit from RProtoBuf -comes from its interoperability with other environments, as well as its safe -versioning, +mechanism in R. +% redundant: which is seen as equally compact. +The benefits of RProtoBuf accrue more naturally in applications where +multiple programming languages are involved, or when a more concise +application-specific schema has been defined. The example in the next +section provides both of these conditions. % N.B. see table.Rnw for how this table is created. % @@ -1444,26 +1445,7 @@ \end{center} \end{table} -\subsection{Performance considerations} -TODO RProtoBuf is quite flexible and easy to use for interactive -analysis, but it is not designed for certain classes of operations one -might like to do with Protocol Buffers. For example, taking a list of -10,000 Protocol Buffers, extracting a named field from each one, and -computing a aggregate statistics on those values would be extremely -slow with RProtoBuf, and while this is a useful class of operations, -it is outside of the scope of RProtoBuf. We should be very clear -about this to clarify the goals and strengths of RProtoBuf and its -reflection and object mapping. - - -%\section{Other approaches} - -% Phillip Yelland wrote another implementation, currently proprietary, -% that has significant speed advantages when querying fields from a -% large number of protocol buffers, but is less user friendly for the -% basic cases documented here. 
- \section{Application: Distributed Data Collection with MapReduce} \label{sec:mapreduce} @@ -1812,6 +1794,25 @@ \section{Summary} \label{sec:summary} + +TODO RProtoBuf is quite flexible and easy to use for interactive +analysis, but it is not designed for certain classes of operations one +might like to do with Protocol Buffers. For example, taking a list of +10,000 Protocol Buffers, extracting a named field from each one, and +computing a aggregate statistics on those values would be extremely +slow with RProtoBuf, and while this is a useful class of operations, +it is outside of the scope of RProtoBuf. We should be very clear +about this to clarify the goals and strengths of RProtoBuf and its +reflection and object mapping. + + +%\section{Other approaches} + +% Phillip Yelland wrote another implementation, currently proprietary, +% that has significant speed advantages when querying fields from a +% large number of protocol buffers, but is less user friendly for the +% basic cases documented here. + % RProtoBuf has been used. %Its pretty useful. Murray to see if he can get approval to talk a From noreply at r-forge.r-project.org Tue Jan 21 07:44:05 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 07:44:05 +0100 (CET) Subject: [Rprotobuf-commits] r809 - papers/jss Message-ID: <20140121064405.152DC186210@r-forge.r-project.org> Author: murray Date: 2014-01-21 07:44:04 +0100 (Tue, 21 Jan 2014) New Revision: 809 Modified: papers/jss/article.Rnw Log: Comment out the OpenCPU section about generating R objects from within Python. This code could be added to OpenCPU docs somewhere, but it is confusing in an article about RProtoBuf because it doesn't at all look language neutral as we describe many times here. It was a good addition before we had any python code in the Histogram application section, but now we have a more concise and clear example of python <-> R protocol buffer usage, so we don't need this any more. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 06:17:52 UTC (rev 808) +++ papers/jss/article.Rnw 2014-01-21 06:44:04 UTC (rev 809) @@ -1732,65 +1732,54 @@ outputmsg <- serialize_pb(val) @ -OpenCPU also provides a lot of meta-functionality such as handling -of sessions, exceptions, security, and more. OpenCPU also makes it possible to store -output of a function call on the server, instead of directly retrieving it. Thereby -objects can be shared with other users or used as arguments in a subsequent -function call. But in its essence, the HTTP API provides a simple way to perform remote -R function calls over HTTPS. The same request can be performed in Python as demonstrated -below. The code is a bit verbose because to show how the REXP message is created from -scratch. In practice would probably write a function or small module construct a Protocol -Buffer message representing an R list from a Python dictionary object. - -\begin{verbatim} -import urllib2; -from rexp_pb2 import *; - -#create the post payload, i.e. list(n=42, mean=100) -payload = REXP( - rclass = 5, - rexpValue = [ - REXP(rclass = 2, realValue = [42]), - REXP(rclass = 2, realValue = [100]) - ], - attrName = [ - "names" - ], - attrValue = [ - REXP(rclass = 0, stringValue = [STRING(strval="n"), STRING(strval="mean")]) - ] -); - -#HTTP POST -req = urllib2.Request( - "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", - data = payload.SerializeToString(), - headers = { - 'Content-type': 'application/x-protobuf' - } -) -res = urllib2.urlopen(req); - -#parse output pb -msg = REXP(); -msg.ParseFromString(res.read()); - -#the return value is a double vector in this case -print(msg.realValue); -\end{verbatim} - - -%\section{Application: Sending/receiving Interaction With Servers} +% OpenCPU also provides a lot of meta-functionality such as handling +% of sessions, exceptions, security, and more. 
OpenCPU also makes it possible to store +% output of a function call on the server, instead of directly retrieving it. Thereby +% objects can be shared with other users or used as arguments in a subsequent +% function call. +% But in its essence, the HTTP API provides a simple way to perform remote +% R function calls over HTTPS. The same request can be performed in Python as demonstrated +% below. The code is a bit verbose because to show how the REXP message is created from +% scratch. In practice would probably write a function or small module construct a Protocol +% Buffer message representing an R list from a Python dictionary object. % -%Combined -%with an RPC system this means that one can interactively craft request -%messages, send the serialized message to a remote server, read back a -%response, and then parse the response protocol buffer interactively. +% \begin{verbatim} +% import urllib2; +% from rexp_pb2 import *; +% +% #create the post payload, i.e. list(n=42, mean=100) +% payload = REXP( +% rclass = 5, +% rexpValue = [ +% REXP(rclass = 2, realValue = [42]), +% REXP(rclass = 2, realValue = [100]) +% ], +% attrName = [ +% "names" +% ], +% attrValue = [ +% REXP(rclass = 0, stringValue = [STRING(strval="n"), STRING(strval="mean")]) +% ] +%); +% +%#HTTP POST +%req = urllib2.Request( +% "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", +% data = payload.SerializeToString(), +% headers = { +% 'Content-type': 'application/x-protobuf' +% } +%) +%res = urllib2.urlopen(req); +% +%#parse output pb +%msg = REXP(); +%msg.ParseFromString(res.read()); +% +%#the return value is a double vector in this case +%print(msg.realValue); +%\end{verbatim} -%TODO(mstokely): Talk about Jeroen Ooms OpenCPU, or talk about Andy -%Chu's Poly. 
- - \section{Summary} \label{sec:summary} From noreply at r-forge.r-project.org Tue Jan 21 08:38:20 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 21 Jan 2014 08:38:20 +0100 (CET) Subject: [Rprotobuf-commits] r810 - papers/jss Message-ID: <20140121073820.C4BA1186AB3@r-forge.r-project.org> Author: murray Date: 2014-01-21 08:38:20 +0100 (Tue, 21 Jan 2014) New Revision: 810 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Add an initial conclusions and commentary section. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 06:44:04 UTC (rev 809) +++ papers/jss/article.Rnw 2014-01-21 07:38:20 UTC (rev 810) @@ -1780,35 +1780,38 @@ %print(msg.realValue); %\end{verbatim} -\section{Summary} +\section{Conclusion and Commentary} \label{sec:summary} +% TODO(mstokely): Get cibona approval for these two sentences before +% publishing. +Schema-less text formats such as CSV and JSON will continue to be +widely used in many contexts, but we hope that the availability of +\pkg{RProtoBuf} makes it easy for many mixed-language data analysis +pipelines to embrace schemas such as Protocol Buffers for type-safe +and performant data serialization between applications. +\pkg{RProtoBuf} has been heavily used inside Google for the past three +years by statisticians and software engineers. At the time of this +writing there are more than XXX 30-day active users of RProtoBuf using +it to read data from and otherwise interact with other distributed +systems written in C++, Java, Python, and other languages. -TODO RProtoBuf is quite flexible and easy to use for interactive -analysis, but it is not designed for certain classes of operations one -might like to do with Protocol Buffers. 
For example, taking a list of -10,000 Protocol Buffers, extracting a named field from each one, and -computing a aggregate statistics on those values would be extremely -slow with RProtoBuf, and while this is a useful class of operations, -it is outside of the scope of RProtoBuf. We should be very clear -about this to clarify the goals and strengths of RProtoBuf and its -reflection and object mapping. +\paragraph*{Other Approaches} +\pkg{RProtoBuf} is quite flexible and easy to use for interactive use, +but it is not designed for efficient high-speed manipulation of large +numbers of protocol buffers once they have been read into R. For +example, taking a list of 100,000 Protocol Buffers, extracting a named +field from each one, and computing an aggregate statistic on those +values would be relatively slow with RProtoBuf. Instead for such a +use case, the current design of RProtoBuf relies on other database +systems to provide query and aggregation semantics before the +resulting protocol buffers are read into R. Such queries could be +supported in a future version of \pkg{RProtoBuf} by supporting a +vector of messages type such that \emph{slicing} operations over a +given field across a large number of messages could be done +efficiently in C++. -%\section{Other approaches} - -% Phillip Yelland wrote another implementation, currently proprietary, -% that has significant speed advantages when querying fields from a -% large number of protocol buffers, but is less user friendly for the -% basic cases documented here. - -% RProtoBuf has been used. - -%Its pretty useful. Murray to see if he can get approval to talk a -%tiny bit about how much its used at Google. - -%This file is only a basic article template. For full details of \emph{The R Journal} style and information on how to prepare your article for submission, see the \href{http://journal.r-project.org/latex/RJauthorguide.pdf}{Instructions for Authors}. 
- \section{Acknowledgement} The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-21 06:44:04 UTC (rev 809) +++ papers/jss/article.bib 2014-01-21 07:38:20 UTC (rev 810) @@ -7,7 +7,16 @@ pages = {1--18}, year = 2011 } - + at inproceedings{dremel, +title = {Dremel: Interactive Analysis of Web-Scale Datasets}, +author = {Sergey Melnik and Andrey Gubarev and Jing Jing Long and + Geoffrey Romer and Shiva Shivakumar and Matt Tolton + and Theo Vassilakis}, +year = 2010, +URL = {http://www.vldb2010.org/accept.htm}, +booktitle = {Proc. of the 36th Int'l Conf on Very Large Data Bases}, +pages = {330-339} +} @Manual{msgpackR, title = {msgpackR: A library to serialize or unserialize data in MessagePack format}, From noreply at r-forge.r-project.org Wed Jan 22 02:48:23 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 02:48:23 +0100 (CET) Subject: [Rprotobuf-commits] r811 - papers/jss Message-ID: <20140122014823.D04981867F7@r-forge.r-project.org> Author: murray Date: 2014-01-22 02:48:20 +0100 (Wed, 22 Jan 2014) New Revision: 811 Modified: papers/jss/article.Rnw Log: Reword the beginning of section 4 a bit, and move the \verb|method(..)| lines into an enumerated list so that we don't overflow into the margin. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-21 07:38:20 UTC (rev 810) +++ papers/jss/article.Rnw 2014-01-22 01:48:20 UTC (rev 811) @@ -704,15 +704,27 @@ The \CRANpkg{RProtoBuf} package uses the S4 system to store information about descriptors and messages. Using the S4 system -allows the \texttt{RProtoBuf} package to dispatch methods that are not +allows the package to dispatch methods that are not generic in the S3 sense, such as \texttt{new} and \texttt{serialize}. 
+Table~\ref{class-summary-table} lists the six +primary Message and Descriptor classes in RProtoBuf. Each R object +contains an external pointer to an object managed by the +\texttt{protobuf} C++ library, and the R objects make calls into more +than 100 C++ functions that provide the +glue code between the R language classes and the underlying C++ +classes. -Each R object stores an external pointer to an object managed by -the \texttt{protobuf} C++ library which implements the core Protocol Buffer -functionality. The \CRANpkg{Rcpp} package +The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to -facilitate the integration of the R and C++ code for these objects. +facilitate this integration of the R and C++ code for these objects. +Each method is wrapped individually which alllows us to add user +friendly custom error handling, type coercion, and performance +improvements at the cost of a more verbose implementation. +The RProtoBuf package in many ways motivated +the development of Rcpp Modules \citep{eddelbuettel2013exposing}, +which provide a more concise way of wrapping C++ functions and classes +in a single entity. % Message, Descriptor, FieldDescriptor, EnumDescriptor, % FileDescriptor, EnumValueDescriptor @@ -722,19 +734,15 @@ % grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l % 33 -There are over 100 C++ functions that provide the glue code between -the member functions of the 6 primary Message and Descriptor classes -in the protobuf library. Wrapping each method individually allows us -to add user friendly custom error handling, type coercion, and -performance improvements at the cost of a more verbose -implementation. The RProtoBuf implementation in many ways motivated -the development of Rcpp Modules \citep{eddelbuettel2013exposing}, -which provide a more concise way of wrapping C++ functions and classes -in a single entity. 
+The \texttt{RProtoBuf} package supports two forms for calling +functions with these S4 classes: +\begin{itemize} +\item The functional dispatch mechanism of the the form + \verb|method(object, arguments)| (common to R), and +\item The traditional object oriented notation + \verb|object$method(arguments)|. +\end{itemize} -The \texttt{RProtoBuf} package combines a functional dispatch mechanism -of the form \verb|method(object, arguments)| (common to R) and the more -traditional object oriented notation \verb|object$method(arguments)|. Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function (defined in the \texttt{utils} package) for all classes to enable tab completion. Completion possibilities include pseudo-method names for all @@ -758,7 +766,7 @@ EnumValueDescriptor & 3 & \phantom{1}6 & no\\ \bottomrule \end{tabular} -\caption{\label{Message-methods-table}Overview of Class, Slot, Method and +\caption{\label{class-summary-table}Overview of Class, Slot, Method and Dispatch Relationships} \end{table} From noreply at r-forge.r-project.org Wed Jan 22 03:11:28 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 03:11:28 +0100 (CET) Subject: [Rprotobuf-commits] r812 - papers/jss Message-ID: <20140122021128.2E429185CBE@r-forge.r-project.org> Author: murray Date: 2014-01-22 03:11:26 +0100 (Wed, 22 Jan 2014) New Revision: 812 Modified: papers/jss/article.Rnw Log: Add \shortcites{janus} so that the Google paper with ~8 authors just shows up as authorOne, et al, per the jss.pdf style guide. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 01:48:20 UTC (rev 811) +++ papers/jss/article.Rnw 2014-01-22 02:11:26 UTC (rev 812) @@ -13,7 +13,7 @@ \RequirePackage{fancyvrb} \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} - +\shortcites{janus} %% almost as usual \author{Dirk Eddelbuettel\\Debian Project \And Murray Stokely\\Google, Inc \And From noreply at r-forge.r-project.org Wed Jan 22 03:28:48 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 03:28:48 +0100 (CET) Subject: [Rprotobuf-commits] r813 - papers/jss Message-ID: <20140122022848.D3AC3186968@r-forge.r-project.org> Author: murray Date: 2014-01-22 03:28:46 +0100 (Wed, 22 Jan 2014) New Revision: 813 Modified: papers/jss/article.Rnw Log: Label the appendix simply as "Appendix: The rexp.proto schema descriptor" and refer to it in line as "the appendix" instead of adding a section "Appendices" with only a single appendix "A" in it. Suggested by Dirk. Also, add missing \proglang{} in the intro. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 02:11:26 UTC (rev 812) +++ papers/jss/article.Rnw 2014-01-22 02:28:46 UTC (rev 813) @@ -155,7 +155,7 @@ %results between the individual components, using formats that are %independent of platform, language, operating system or architecture. Programming -languages such as R, Julia, Java, and Python include built-in +languages such as \proglang{R}, \proglang{Julia}, \proglang{Java}, and \proglang{Python} include built-in support for serialization, but the default formats are usually language specific and thereby lock the user into a single environment. 
@@ -1267,8 +1267,10 @@ In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} schema used by \pkg{RHIPE} for exchanging R data with Hadoop \citep{rhipe}. This -schema, which we will refer to as \texttt{rexp.proto}, is printed in appendix -\ref{rexp.proto}. The Protocol Buffer messages generated by \pkg{RProtoBuf} and +schema, which we will refer to as \texttt{rexp.proto}, is printed in +%appendix \ref{rexp.proto}. +the appendix. +The Protocol Buffer messages generated by \pkg{RProtoBuf} and \pkg{RHIPE} are naturally compatible between the two systems because they use the same schema. This shows the power of using a schema based cross-platform format such as Protocol Buffers: interoperability is achieved without effort or close coordination. @@ -1820,7 +1822,7 @@ given field across a large number of messages could be done efficiently in C++. -\section{Acknowledgement} +\section*{Acknowledgments} The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. Very significant contributions, both in code and design, were made by @@ -1836,9 +1838,11 @@ %initial motivator. 
\newpage -\begin{appendices} +\appendix +\setcounter{secnumdepth}{0} +%\begin{appendices} -\section{The rexp.proto schema descriptor} +\section*{Appendix: The rexp.proto schema descriptor} \label{rexp.proto} Below a print of the \texttt{rexp.proto} schema (originally designed by \cite{rhipe}) @@ -1885,9 +1889,8 @@ required double imag = 2; } \end{verbatim} -\end{appendices} - - +% \end{appendices} +\newpage \bibliography{article} %\section[About Java]{About \proglang{Java}} From noreply at r-forge.r-project.org Wed Jan 22 03:40:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 03:40:34 +0100 (CET) Subject: [Rprotobuf-commits] r814 - papers/jss Message-ID: <20140122024034.D92E7183FD8@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-22 03:40:34 +0100 (Wed, 22 Jan 2014) New Revision: 814 Modified: papers/jss/article.Rnw Log: copy/paste typo Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 02:28:46 UTC (rev 813) +++ papers/jss/article.Rnw 2014-01-22 02:40:34 UTC (rev 814) @@ -1527,8 +1527,8 @@ protoc histogram.proto --python_out=. \end{verbatim} This generates Python module called \texttt{histogram\_pb2.py}, containing both the -descriptor information as well as methods to read and manipulate the R object -message. +descriptor information as well as methods to read and manipulate the histogram +message data. 
\begin{verbatim} # Import modules From noreply at r-forge.r-project.org Wed Jan 22 03:46:50 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 03:46:50 +0100 (CET) Subject: [Rprotobuf-commits] r815 - papers/jss Message-ID: <20140122024650.8414718677A@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-22 03:46:49 +0100 (Wed, 22 Jan 2014) New Revision: 815 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Add reference to jsonlite Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 02:40:34 UTC (rev 814) +++ papers/jss/article.Rnw 2014-01-22 02:46:49 UTC (rev 815) @@ -186,7 +186,7 @@ stored as human-readable decimal notation which is inefficient and leads to loss of type (double versus integer) and precision. Several R packages implement functions to parse and generate \texttt{JSON} data from R -objects \citep{rjson,RJSONIO}. +objects \citep{rjson,RJSONIO,jsonlite}. A number of binary formats based on \texttt{JSON} have been proposed that reduce the parsing cost and improve efficiency. 
\pkg{MessagePack} Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-22 02:40:34 UTC (rev 814) +++ papers/jss/article.bib 2014-01-22 02:46:49 UTC (rev 815) @@ -84,6 +84,14 @@ url = {http://CRAN.R-project.org/package=rjson}, } + at Manual{jsonlite, + title = {jsonlite: A smarter JSON encoder/decoder for R}, + author = {Jeroen Ooms}, + year = 2014, + note = {R package version 0.9.4}, + url = {http://github.com/jeroenooms/jsonlite#readme}, +} + @Manual{rmongodb, title = {rmongodb: R-MongoDB driver}, author = {Gerald Lindsly}, From noreply at r-forge.r-project.org Wed Jan 22 04:02:14 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 04:02:14 +0100 (CET) Subject: [Rprotobuf-commits] r816 - papers/jss Message-ID: <20140122030214.D43371869E6@r-forge.r-project.org> Author: murray Date: 2014-01-22 04:02:13 +0100 (Wed, 22 Jan 2014) New Revision: 816 Modified: papers/jss/article.Rnw Log: Update the other approaches section of conclusion with some suggestions by Phillip Yelland. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 02:46:49 UTC (rev 815) +++ papers/jss/article.Rnw 2014-01-22 03:02:13 UTC (rev 816) @@ -13,7 +13,7 @@ \RequirePackage{fancyvrb} \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} -\shortcites{janus} +\shortcites{janus,dremel} %% almost as usual \author{Dirk Eddelbuettel\\Debian Project \And Murray Stokely\\Google, Inc \And @@ -1790,7 +1790,7 @@ %print(msg.realValue); %\end{verbatim} -\section{Conclusion and Commentary} +\section{Concluding remarks} \label{sec:summary} % TODO(mstokely): Get cibona approval for these two sentences before % publishing. 
@@ -1805,23 +1805,30 @@ writing there are more than XXX 30-day active users of RProtoBuf using it to read data from and otherwise interact with other distributed systems written in C++, Java, Python, and other languages. +\\ -\paragraph*{Other Approaches} +\emph{Other Approaches} +\\ \pkg{RProtoBuf} is quite flexible and easy to use for interactive use, but it is not designed for efficient high-speed manipulation of large numbers of protocol buffers once they have been read into R. For example, taking a list of 100,000 Protocol Buffers, extracting a named field from each one, and computing an aggregate statistic on those -values would be relatively slow with RProtoBuf. Instead for such a -use case, the current design of RProtoBuf relies on other database -systems to provide query and aggregation semantics before the -resulting protocol buffers are read into R. Such queries could be -supported in a future version of \pkg{RProtoBuf} by supporting a -vector of messages type such that \emph{slicing} operations over a -given field across a large number of messages could be done -efficiently in C++. +values would be relatively slow with RProtoBuf. Mechanisms to address +such use cases are under investigation for possible incorporation into +future releases of RProtoBuf, but currently, the package relies on +other database systems to provide query and aggregation semantics +before the resulting protocol buffers are read into R. Inside Google, +the Dremel query system \citep{dremel} is often employed in this role +in conjunction with \pkg{RProtoBuf}. +% Such queries could be +%supported in a future version of \pkg{RProtoBuf} by supporting a +%vector of messages type such that \emph{slicing} operations over a +%given field across a large number of messages could be done +%efficiently in C++. + \section*{Acknowledgments} The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. 
From noreply at r-forge.r-project.org Wed Jan 22 06:28:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 06:28:40 +0100 (CET) Subject: [Rprotobuf-commits] r817 - papers/jss Message-ID: <20140122052841.0AC391867BB@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-22 06:28:39 +0100 (Wed, 22 Jan 2014) New Revision: 817 Modified: papers/jss/article.Rnw Log: update conclusion Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 03:02:13 UTC (rev 816) +++ papers/jss/article.Rnw 2014-01-22 05:28:39 UTC (rev 817) @@ -36,6 +36,7 @@ library and the R environment for statistical computing. %TODO(ms) keep it less than 150 words. % Maybe add Jeroen's sentence: +% JO: added this sentence to the conclustion, but could use it in abstract as well. % They offer a unique combination of features, performance, and maturity that seems % particulary well suited for data-driven applications and numerical % computing. @@ -1793,23 +1794,36 @@ \section{Concluding remarks} \label{sec:summary} % TODO(mstokely): Get cibona approval for these two sentences before -% publishing. -Schema-less text formats such as CSV and JSON will continue to be -widely used in many contexts, but we hope that the availability of -\pkg{RProtoBuf} makes it easy for many mixed-language data analysis -pipelines to embrace schemas such as Protocol Buffers for type-safe -and performant data serialization between applications. +% publishing +Over the past decade, many formats have become available for interoperable +data exchange, each with their unique features, strengths and weaknesses. +Text based formats such as CSV and JSON are easy to use and will likely +remain popular among statisticians for many years to come. 
However, in the +context of increasingly complex stacks and applications involving +distributed computing and mixed language analysis pipelines, choosing a more +sophisticated data interchange format will bring many benefits. +Protocol Buffers offer a unique combination of features, performance, +and maturity that seems particulary well suited for data-driven +applications and numerical computing. -\pkg{RProtoBuf} has been heavily used inside Google for the past three -years by statisticians and software engineers. At the time of this -writing there are more than XXX 30-day active users of RProtoBuf using -it to read data from and otherwise interact with other distributed -systems written in C++, Java, Python, and other languages. +The \pkg{RProtoBuf} package implements functionality to generate, +parse and manipulate Protocol Buffer messages in R. We hope that +this package will make Protocol Buffers more accessible to the R +community, and contributes towards better integration of R with +other software. \pkg{RProtoBuf} has been heavily used inside Google +for the past three years by statisticians and software engineers. +At the time of this writing there are more than XXX 30-day active +users of RProtoBuf using it to read data from and otherwise interact +with other distributed systems written in C++, Java, Python, and +other languages. \\ \emph{Other Approaches} -\\ +== JO: I don't really like this section here, it gives the entire paper a bit of a +sour aftertaste. Perhaps we can mention performance caveats in the technical +sections? I think it's nicer to leave it at the above paragraphs.== + \pkg{RProtoBuf} is quite flexible and easy to use for interactive use, but it is not designed for efficient high-speed manipulation of large numbers of protocol buffers once they have been read into R. 
For From noreply at r-forge.r-project.org Wed Jan 22 16:00:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 16:00:34 +0100 (CET) Subject: [Rprotobuf-commits] r818 - papers/jss Message-ID: <20140122150034.998C7186A0B@r-forge.r-project.org> Author: edd Date: 2014-01-22 16:00:34 +0100 (Wed, 22 Jan 2014) New Revision: 818 Modified: papers/jss/article.Rnw Log: quick round over Summary Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 05:28:39 UTC (rev 817) +++ papers/jss/article.Rnw 2014-01-22 15:00:34 UTC (rev 818) @@ -1791,51 +1791,58 @@ %print(msg.realValue); %\end{verbatim} -\section{Concluding remarks} +\section{Summary} % DE Simpler title \label{sec:summary} % TODO(mstokely): Get cibona approval for these two sentences before % publishing -Over the past decade, many formats have become available for interoperable -data exchange, each with their unique features, strengths and weaknesses. -Text based formats such as CSV and JSON are easy to use and will likely +Over the past decade, many formats for interoperable +data exchange have become available, each with their unique features, +strengths and weaknesses. +Text based formats such as CSV and JSON are easy to use, and will likely remain popular among statisticians for many years to come. However, in the -context of increasingly complex stacks and applications involving -distributed computing and mixed language analysis pipelines, choosing a more -sophisticated data interchange format will bring many benefits. -Protocol Buffers offer a unique combination of features, performance, -and maturity that seems particulary well suited for data-driven +context of increasingly complex analysis stacks and applications involving +distributed computing as well as mixed language analysis pipelines, choosing a more +sophisticated data interchange format may reap considerable benefits. 
+The Protocol Buffers protocol and librart offers a unique combination of features, performance, +maturity, and forward-compatibility that seems particulary well suited for data-driven applications and numerical computing. -The \pkg{RProtoBuf} package implements functionality to generate, -parse and manipulate Protocol Buffer messages in R. We hope that -this package will make Protocol Buffers more accessible to the R -community, and contributes towards better integration of R with -other software. \pkg{RProtoBuf} has been heavily used inside Google -for the past three years by statisticians and software engineers. +%% DE Re-ordering so that we end on RProtoBuf +The \pkg{RProtoBuf} package builds on the Protocol Buffers library, and +extends the R system with the ability to create, read and write Protocol +Buffer message. \pkg{RProtoBuf} has been used extensively inside Google +for the past three years by statisticians, analysts and software engineers. At the time of this writing there are more than XXX 30-day active -users of RProtoBuf using it to read data from and otherwise interact +users of \pkg{RProtoBuf} using it to read data from and otherwise interact with other distributed systems written in C++, Java, Python, and other languages. -\\ -\emph{Other Approaches} +As the \pkg{RProtoBuf} package provides users with the ability to generate, +parse and manipulate Protocol Buffer messages in R, it is our hope that this +package will make Protocol Buffers more accessible to the R community, and +thereby makes a small contribution towards better integration between R and +other software systems and applications. -== JO: I don't really like this section here, it gives the entire paper a bit of a -sour aftertaste. Perhaps we can mention performance caveats in the technical -sections? 
I think it's nicer to leave it at the above paragraphs.== +%\emph{Other Approaches} +% +%== JO: I don't really like this section here, it gives the entire paper a bit of a +%sour aftertaste. Perhaps we can mention performance caveats in the technical +%sections? I think it's nicer to leave it at the above paragraphs.== +% +% DE: Agreed -- commenting out -\pkg{RProtoBuf} is quite flexible and easy to use for interactive use, -but it is not designed for efficient high-speed manipulation of large -numbers of protocol buffers once they have been read into R. For -example, taking a list of 100,000 Protocol Buffers, extracting a named -field from each one, and computing an aggregate statistic on those -values would be relatively slow with RProtoBuf. Mechanisms to address -such use cases are under investigation for possible incorporation into -future releases of RProtoBuf, but currently, the package relies on -other database systems to provide query and aggregation semantics -before the resulting protocol buffers are read into R. Inside Google, -the Dremel query system \citep{dremel} is often employed in this role -in conjunction with \pkg{RProtoBuf}. +%% \pkg{RProtoBuf} is quite flexible and easy to use for interactive use, +%% but it is not designed for efficient high-speed manipulation of large +%% numbers of protocol buffers once they have been read into R. For +%% example, taking a list of 100,000 Protocol Buffers, extracting a named +%% field from each one, and computing an aggregate statistic on those +%% values would be relatively slow with RProtoBuf. Mechanisms to address +%% such use cases are under investigation for possible incorporation into +%% future releases of RProtoBuf, but currently, the package relies on +%% other database systems to provide query and aggregation semantics +%% before the resulting protocol buffers are read into R. 
Inside Google, +%% the Dremel query system \citep{dremel} is often employed in this role +%% in conjunction with \pkg{RProtoBuf}. % Such queries could be %supported in a future version of \pkg{RProtoBuf} by supporting a @@ -1843,13 +1850,15 @@ %given field across a large number of messages could be done %efficiently in C++. + + \section*{Acknowledgments} The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. Very significant contributions, both in code and design, were made by Romain Fran\c{c}ois whose continued influence on design and code is -greatly appreciated. Several features of the package are influenced -by the design of the \CRANpkg{rJava} package by Simon Urbanek +greatly appreciated. Several features of the package reflect +the design of the \CRANpkg{rJava} package by Simon Urbanek The user-defined table mechanism, implemented by Duncan Temple Lang for the purpose of the \pkg{RObjectTables} package, allows for the dynamic symbol lookup. Kenton Varda was generous with his time in reviewing code and explaining From noreply at r-forge.r-project.org Wed Jan 22 21:47:26 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 21:47:26 +0100 (CET) Subject: [Rprotobuf-commits] r819 - papers/jss Message-ID: <20140122204726.96DCF1864B6@r-forge.r-project.org> Author: murray Date: 2014-01-22 21:47:26 +0100 (Wed, 22 Jan 2014) New Revision: 819 Modified: papers/jss/article.Rnw Log: Fix a typo and then replace XXX with 300 for now. This summary looks pretty good to me, thanks Jeroen and Dirk for greatly improving it. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 15:00:34 UTC (rev 818) +++ papers/jss/article.Rnw 2014-01-22 20:47:26 UTC (rev 819) @@ -1803,7 +1803,7 @@ context of increasingly complex analysis stacks and applications involving distributed computing as well as mixed language analysis pipelines, choosing a more sophisticated data interchange format may reap considerable benefits. -The Protocol Buffers protocol and librart offers a unique combination of features, performance, +The Protocol Buffers protocol and library offers a unique combination of features, performance, maturity, and forward-compatibility that seems particulary well suited for data-driven applications and numerical computing. @@ -1812,7 +1812,7 @@ extends the R system with the ability to create, read and write Protocol Buffer message. \pkg{RProtoBuf} has been used extensively inside Google for the past three years by statisticians, analysts and software engineers. -At the time of this writing there are more than XXX 30-day active +At the time of this writing there are more than 300 30-day active users of \pkg{RProtoBuf} using it to read data from and otherwise interact with other distributed systems written in C++, Java, Python, and other languages. From noreply at r-forge.r-project.org Wed Jan 22 22:24:28 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 22:24:28 +0100 (CET) Subject: [Rprotobuf-commits] r820 - papers/jss Message-ID: <20140122212428.E697A1864C3@r-forge.r-project.org> Author: murray Date: 2014-01-22 22:24:28 +0100 (Wed, 22 Jan 2014) New Revision: 820 Modified: papers/jss/article.Rnw Log: Remove the wooden 30-day qualifier about active users. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 20:47:26 UTC (rev 819) +++ papers/jss/article.Rnw 2014-01-22 21:24:28 UTC (rev 820) @@ -1812,7 +1812,7 @@ extends the R system with the ability to create, read and write Protocol Buffer message. \pkg{RProtoBuf} has been used extensively inside Google for the past three years by statisticians, analysts and software engineers. -At the time of this writing there are more than 300 30-day active +At the time of this writing there are more than 300 active users of \pkg{RProtoBuf} using it to read data from and otherwise interact with other distributed systems written in C++, Java, Python, and other languages. From noreply at r-forge.r-project.org Wed Jan 22 23:23:58 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 23:23:58 +0100 (CET) Subject: [Rprotobuf-commits] r821 - papers/jss Message-ID: <20140122222358.4D468186321@r-forge.r-project.org> Author: murray Date: 2014-01-22 23:23:57 +0100 (Wed, 22 Jan 2014) New Revision: 821 Modified: papers/jss/article.Rnw Log: Add a missing article in the mapreduce/histogram application section and try using \begin{Code} instead of \begin{verbatim} in that section (seems to be typeset the same, I may try more changes to improve the typesetting of this example). Steve Scott noticed that one sentence in the conclusion had the word 'other' in it 3 times -- remove one superfluous word as he suggested. Split another run-on sentence in the conclusion into two sentences as suggested by Steve as well. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 21:24:28 UTC (rev 820) +++ papers/jss/article.Rnw 2014-01-22 22:23:57 UTC (rev 821) @@ -1527,11 +1527,11 @@ \begin{verbatim} protoc histogram.proto --python_out=. 
\end{verbatim} -This generates Python module called \texttt{histogram\_pb2.py}, containing both the +This generates a Python module called \texttt{histogram\_pb2.py}, containing both the descriptor information as well as methods to read and manipulate the histogram message data. -\begin{verbatim} +\begin{Code} # Import modules from histogram_pb2 import HistogramState; @@ -1547,7 +1547,7 @@ outfile = open("/tmp/hist.pb", "wb") outfile.write(hist.SerializeToString()) outfile.close() -\end{verbatim} +\end{Code} We can then read in the histogram into R and plot it with : @@ -1814,11 +1814,11 @@ for the past three years by statisticians, analysts and software engineers. At the time of this writing there are more than 300 active users of \pkg{RProtoBuf} using it to read data from and otherwise interact -with other distributed systems written in C++, Java, Python, and +with distributed systems written in C++, Java, Python, and other languages. -As the \pkg{RProtoBuf} package provides users with the ability to generate, -parse and manipulate Protocol Buffer messages in R, it is our hope that this +The \pkg{RProtoBuf} package provides users with the ability to generate, +parse and manipulate Protocol Buffer messages in R. It is our hope that this package will make Protocol Buffers more accessible to the R community, and thereby makes a small contribution towards better integration between R and other software systems and applications. From noreply at r-forge.r-project.org Wed Jan 22 23:30:33 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 22 Jan 2014 23:30:33 +0100 (CET) Subject: [Rprotobuf-commits] r822 - papers/jss Message-ID: <20140122223033.A40B31865B2@r-forge.r-project.org> Author: murray Date: 2014-01-22 23:30:33 +0100 (Wed, 22 Jan 2014) New Revision: 822 Modified: papers/jss/article.Rnw Log: John Wilkes feels we should end the session in table1 with class(p) instead of beginning with it. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 22:23:57 UTC (rev 821) +++ papers/jss/article.Rnw 2014-01-22 22:30:33 UTC (rev 822) @@ -362,11 +362,11 @@ <>= library(RProtoBuf) p <- new(tutorial.Person,id=1,name="Dirk") -class(p) p$name p$name <- "Murray" cat(as.character(p)) serialize(p, NULL) +class(p) @ \end{minipage} \\ \bottomrule From noreply at r-forge.r-project.org Thu Jan 23 00:39:33 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 00:39:33 +0100 (CET) Subject: [Rprotobuf-commits] r823 - papers/jss Message-ID: <20140122233933.649E918074F@r-forge.r-project.org> Author: murray Date: 2014-01-23 00:39:33 +0100 (Thu, 23 Jan 2014) New Revision: 823 Modified: papers/jss/article.Rnw Log: Use \proglang and \pkg in Keywords list for proper typesetting, add Rcpp keyword. Improve the table float placement significantly in section 4 by letting latex do its own thing with [tbp] instead of trying to place them [h] which never works. With the default [tbp] it gets most of the floats placed inside section 4 so section 5 is much more readable now without being interspersed with floats from the previous section. Improve the histogram / mr example slightly. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 22:30:33 UTC (rev 822) +++ papers/jss/article.Rnw 2014-01-22 23:39:33 UTC (rev 823) @@ -41,8 +41,8 @@ % particulary well suited for data-driven applications and numerical % computing. 
} -\Keywords{r, protocol buffers, serialization, cross-platform} -\Plainkeywords{r, protocol buffers, serialization, cross-platform} %% without formatting +\Keywords{\proglang{R}, \pkg{Rcpp}, protocol buffers, serialization, cross-platform} +\Plainkeywords{r, Rcpp, protocol buffers, serialization, cross-platform} %% without formatting %% at least one keyword must be supplied %% publication information @@ -716,6 +716,29 @@ glue code between the R language classes and the underlying C++ classes. +% MS: I think this looks better at the bottom of the page. +% so it appears after the new section starts where it is referenced. +\begin{table}[bp] +\centering +\begin{tabular}{lccl} +\toprule +\textbf{Class} & + \textbf{Slots} & + \textbf{Methods} & + \textbf{Dynamic Dispatch}\\ +\cmidrule{1-4} +Message & 2 & 20 & yes (field names)\\ +Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ +FieldDescriptor & 4 & 18 & no\\ +EnumDescriptor & 4 & 11 & yes (enum constant names)\\ +FileDescriptor & 3 & \phantom{1}6 & yes (message/field definitions)\\ +EnumValueDescriptor & 3 & \phantom{1}6 & no\\ +\bottomrule +\end{tabular} +\caption{\label{class-summary-table}Overview of Class, Slot, Method and + Dispatch Relationships} +\end{table} + The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to facilitate this integration of the R and C++ code for these objects. @@ -744,33 +767,13 @@ \verb|object$method(arguments)|. \end{itemize} -Additionally, \texttt{RProtoBuf} implements the \texttt{.DollarNames} S3 generic function -(defined in the \texttt{utils} package) for all classes to enable tab -completion. Completion possibilities include pseudo-method names for all -classes, plus dynamic dispatch on names or types specific to a given object. +Additionally, \pkg{RProtoBuf} supports tab completion for all +classes. 
Completion possibilities include pseudo-method names for all +classes, plus dynamic dispatch on names or types specific to a given +object. This functionality is implemented with the +\texttt{.DollarNames} S3 generic function defined in the \pkg{utils} +package. -% TODO(ms): Add column check box for doing dynamic dispatch based on type. -\begin{table}[h] -\centering -\begin{tabular}{lccl} -\toprule -\textbf{Class} & - \textbf{Slots} & - \textbf{Methods} & - \textbf{Dynamic Dispatch}\\ -\cmidrule{1-4} -Message & 2 & 20 & yes (field names)\\ -Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ -FieldDescriptor & 4 & 18 & no\\ -EnumDescriptor & 4 & 11 & yes (enum constant names)\\ -FileDescriptor & 3 & \phantom{1}6 & yes (message/field definitions)\\ -EnumValueDescriptor & 3 & \phantom{1}6 & no\\ -\bottomrule -\end{tabular} -\caption{\label{class-summary-table}Overview of Class, Slot, Method and - Dispatch Relationships} -\end{table} - \subsection{Messages} The \texttt{Message} S4 class represents Protocol Buffer Messages and @@ -840,14 +843,12 @@ then available on the search path. <<>>= -# field descriptor -tutorial.Person$email +tutorial.Person$email # field descriptor -# enum descriptor -tutorial.Person$PhoneType +tutorial.Person$PhoneType # enum descriptor -# nested type descriptor -tutorial.Person$PhoneNumber + +tutorial.Person$PhoneNumber # nested type descriptor # same as tutorial.Person.PhoneNumber @ @@ -855,7 +856,7 @@ Table~\ref{Descriptor-methods-table} provides a complete list of the slots and available methods for Descriptors. -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -901,12 +902,12 @@ \label{subsec-field-descriptor} The class \emph{FieldDescriptor} represents field -descriptor in R. This is a wrapper S4 class around the +descriptors in R. This is a wrapper S4 class around the \texttt{google::protobuf::FieldDescriptor} C++ class. 
Table~\ref{fielddescriptor-methods-table} describes the methods defined for the \texttt{FieldDescriptor} class. -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -952,8 +953,9 @@ \subsection{Enum Descriptors} \label{subsec-enum-descriptor} -The class \emph{EnumDescriptor} is an R wrapper -class around the C++ class \texttt{google::protobuf::EnumDescriptor}. +The class \emph{EnumDescriptor} represents enum descriptors in R. +This is a wrapper S4 class around the +\texttt{google::protobuf::EnumDescriptor} C++ class. Table~\ref{enumdescriptor-methods-table} describes the methods defined for the \texttt{EnumDescriptor} class. @@ -966,7 +968,7 @@ tutorial.Person$PhoneType$WORK @ -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -1002,8 +1004,9 @@ \subsection{File Descriptors} \label{subsec-file-descriptor} -The class \emph{FileDescriptor} is an R wrapper -class around the C++ class \texttt{google::protobuf::FileDescriptor}. +The class \emph{FileDescriptor} represents file descriptors in R. +This is a wrapper S4 class around the +\texttt{google::protobuf::FileDescriptor} C++ class. Table~\ref{filedescriptor-methods-table} describes the methods defined for the \texttt{FileDescriptor} class. @@ -1016,7 +1019,7 @@ f$Person @ -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -1045,8 +1048,9 @@ \subsection{Enum Value Descriptors} \label{subsec-enumvalue-descriptor} -The class \emph{EnumValueDescriptor} is an R wrapper -class around the C++ class \texttt{google::protobuf::EnumValueDescriptor}. +The class \emph{EnumValueDescriptor} represents enumeration value +descriptors in R. This is a wrapper S4 class around the +\texttt{google::protobuf::EnumValueDescriptor} C++ class. Table~\ref{EnumValueDescriptor-methods-table} describes the methods defined for the \texttt{EnumValueDescriptor} class. 
@@ -1058,7 +1062,7 @@ tutorial.Person$PhoneType$value(number=1) @ -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -1468,7 +1472,7 @@ very large data sets on large compute clusters. Many types of data analysis over large data sets may involve very rare -phenomenon or be dealing with highly skewed data sets or inflexible +phenomenon or deal with highly skewed data sets or inflexible raw data storage systems from which unbiased sampling is not feasible. In such situations, MapReduce and binning may be combined as a pre-processing step for a wide range of statistical and scientific @@ -1529,10 +1533,11 @@ \end{verbatim} This generates a Python module called \texttt{histogram\_pb2.py}, containing both the descriptor information as well as methods to read and manipulate the histogram -message data. +message data. The following simple Python script uses this generated +module to create a histogram and write out the protocol buffer +representation to a file: \begin{Code} -# Import modules from histogram_pb2 import HistogramState; # Create empty Histogram message @@ -1549,9 +1554,10 @@ outfile.close() \end{Code} -We can then read in the histogram into R and plot it with : +The protocol buffer can then be read into R and converted to a native +R histogram object for plotting: -\begin{verbatim} +\begin{Code} library(RProtoBuf) library(HistogramTools) @@ -1565,7 +1571,7 @@ # Convert to native R histogram object and plot plot(as.histogram(hist)) -\end{verbatim} +\end{Code} <>= require(RProtoBuf) From noreply at r-forge.r-project.org Thu Jan 23 01:08:03 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 01:08:03 +0100 (CET) Subject: [Rprotobuf-commits] r824 - papers/jss Message-ID: <20140123000803.896211869DB@r-forge.r-project.org> Author: murray Date: 2014-01-23 01:08:03 +0100 (Thu, 23 Jan 2014) New Revision: 824 Modified: papers/jss/article.Rnw Log: Improve the typesetting for the 
section 5 type coercion examples. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-22 23:39:33 UTC (rev 823) +++ papers/jss/article.Rnw 2014-01-23 00:08:03 UTC (rev 824) @@ -1157,15 +1157,18 @@ } @ -<<>>= -a <- new(protobuf_unittest.TestAllTypes) -a$optional_bool <- TRUE -a$optional_bool <- FALSE -<>= -a$optional_bool <- NA -<>= -try(a$optional_bool <- NA,silent=TRUE) -@ +% We want a cleaner error message here. +\begin{CodeChunk} +\begin{CodeInput} +R> a <- new(protobuf_unittest.TestAllTypes) +R> a$optional_bool <- TRUE +R> a$optional_bool <- FALSE +R> a$optional_bool <- NA +\end{CodeInput} +\begin{CodeOutput} +Error: NA boolean values can not be stored in bool protocol buffer fields +\end{CodeOutput} +\end{CodeChunk} \subsection{Unsigned Integers} @@ -1175,11 +1178,7 @@ <<>>= as.integer(2^31-1) -<>= as.integer(2^31 - 1) + as.integer(1) -<>= -try(as.integer(2^31 - 1) + as.integer(1)) - 2^31 class(2^31) @ @@ -1202,16 +1201,6 @@ RProtoBuf allows users to get and set 64-bit integer values by specifying them as character strings. -<>= -if (!exists("protobuf_unittest.TestAllTypes", - "RProtoBuf:DescriptorPool")) { - unittest.proto.file <- system.file("unitTests", "data", - "unittest.proto", - package="RProtoBuf") - readProtoFiles(file=unittest.proto.file) -} -@ - If we try to set an int64 field in R to double values, we lose precision: From noreply at r-forge.r-project.org Thu Jan 23 01:28:32 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 01:28:32 +0100 (CET) Subject: [Rprotobuf-commits] r825 - papers/jss Message-ID: <20140123002832.EF452186809@r-forge.r-project.org> Author: murray Date: 2014-01-23 01:28:32 +0100 (Thu, 23 Jan 2014) New Revision: 825 Modified: papers/jss/article.Rnw Log: Liberal sprinkling of \proglang{} anywhere we mention R, C++, Python, or Java. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 00:08:03 UTC (rev 824) +++ papers/jss/article.Rnw 2014-01-23 00:28:32 UTC (rev 825) @@ -33,7 +33,7 @@ method of serializing structured data between applications---while remaining independent of programming languages or operating system. The \CRANpkg{RProtoBuf} package provides a complete interface between this -library and the R environment for statistical computing. +library and the \proglang{R} environment for statistical computing. %TODO(ms) keep it less than 150 words. % Maybe add Jeroen's sentence: % JO: added this sentence to the conclustion, but could use it in abstract as well. @@ -42,7 +42,7 @@ % computing. } \Keywords{\proglang{R}, \pkg{Rcpp}, protocol buffers, serialization, cross-platform} -\Plainkeywords{r, Rcpp, protocol buffers, serialization, cross-platform} %% without formatting +\Plainkeywords{R, Rcpp, protocol buffers, serialization, cross-platform} %% without formatting %% at least one keyword must be supplied %% publication information @@ -185,8 +185,8 @@ supports arrays and distinguishes 4 primitive types: numbers, strings, booleans and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is inefficient and -leads to loss of type (double versus integer) and precision. Several R packages -implement functions to parse and generate \texttt{JSON} data from R +leads to loss of type (double versus integer) and precision. Several \proglang{R} packages +implement functions to parse and generate \texttt{JSON} data from \proglang{R} objects \citep{rjson,RJSONIO,jsonlite}. A number of binary formats based on \texttt{JSON} have been proposed @@ -246,23 +246,23 @@ % but it seems ueful here because we have a boring bit in the middle % (full class/method details) and interesting applications at the end. 
-This paper describes an R interface to Protocol Buffers, +This paper describes an \proglang{R} interface to Protocol Buffers, and is organized as follows. Section~\ref{sec:protobuf} provides a general overview of Protocol Buffers. -Section~\ref{sec:rprotobuf-basic} describes the interactive R interface +Section~\ref{sec:rprotobuf-basic} describes the interactive \proglang{R} interface provided by \CRANpkg{RProtoBuf} and introduces the two main abstractions: \emph{Messages} and \emph{Descriptors}. Section~\ref{sec:rprotobuf-classes} describes the implementation details of the main S4 classes making up this package. Section~\ref{sec:types} describes the challenges of type coercion -between R and other languages. Section~\ref{sec:evaluation} introduces a -general R language schema for serializing arbitrary R objects and evaluates +between \proglang{R} and other languages. Section~\ref{sec:evaluation} introduces a +general \proglang{R} language schema for serializing arbitrary \proglang{R} objects and evaluates it against R's built-in serialization. Sections~\ref{sec:mapreduce} and \ref{sec:opencpu} provide real-world use cases of \CRANpkg{RProtoBuf} in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. %This article describes the basics of Google's Protocol Buffers through -%an easy to use R package, \CRANpkg{RProtoBuf}. After describing the +%an easy to use \proglang{R} package, \CRANpkg{RProtoBuf}. After describing the %basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate %several common use cases for protocol buffers in data analysis. @@ -305,9 +305,9 @@ \end{figure} Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example -communication workflow with Protocol Buffers and an interactive R session. +communication workflow with Protocol Buffers and an interactive \proglang{R} session. 
Common use cases include populating a request remote-procedure call (RPC) -Protocol Buffer in R that is then serialized and sent over the network to a +Protocol Buffer in \proglang{R} that is then serialized and sent over the network to a remote server. The server would then deserialize the message, act on the request, and respond with a new Protocol Buffer over the network. The key difference to, say, a request to an Rserve instance is that @@ -327,7 +327,7 @@ buffer data is described comprehensively on Google Code\footnote{See \url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. Table~\ref{tab:proto} shows an example \texttt{.proto} file which -defines the \texttt{tutorial.Person} type. The R code in the right +defines the \texttt{tutorial.Person} type. The \proglang{R} code in the right column shows an example of creating a new message of this type and populating its fields. @@ -336,7 +336,7 @@ \begin{table} \begin{tabular}{p{.40\textwidth}p{0.55\textwidth}} \toprule -Schema : \texttt{addressbook.proto} & Example R Session\\ +Schema : \texttt{addressbook.proto} & Example \proglang{R} Session\\ \cmidrule{1-2} \begin{minipage}{.40\textwidth} \vspace{2mm} @@ -372,7 +372,7 @@ \bottomrule \end{tabular} \caption{The schema representation from a \texttt{.proto} file for the - \texttt{tutorial.Person} class (left) and simple R code for creating + \texttt{tutorial.Person} class (left) and simple \proglang{R} code for creating an object of this class and accessing its fields (right).} \label{tab:proto} \end{table} @@ -413,14 +413,15 @@ %buffers are also forward compatible: updates to the \texttt{proto} %files do not break programs built against the previous specification. 
-For added speed and efficiency, the C++, Java, and Python bindings to +For added speed and efficiency, the \proglang{C++}, \proglang{Java}, +and \proglang{Python} bindings to Protocol Buffers are used with a compiler that translates a Protocol Buffer schema description file (ending in \texttt{.proto}) into language-specific classes that can be used to create, read, write and -manipulate Protocol Buffer messages. The R interface, in contrast, +manipulate Protocol Buffer messages. The \proglang{R} interface, in contrast, uses a reflection-based API that is particularly well-suited for interactive data analysis. -All messages in R have a single class +All messages in \proglang{R} have a single class structure, but different accessor methods are created at runtime based on the named fields of the specified message type, as described in the next section. @@ -450,8 +451,8 @@ \section{Basic Usage: Messages and Descriptors} \label{sec:rprotobuf-basic} -This section describes how to use the R API to create and manipulate -protocol buffer messages in R, and how to read and write the +This section describes how to use the \proglang{R} API to create and manipulate +protocol buffer messages in \proglang{R}, and how to read and write the binary representation of the message (often called the \emph{payload}) to files and arbitrary binary R connections. The two fundamental building blocks of Protocol Buffers are \emph{Messages} @@ -486,7 +487,7 @@ % \label{figure:rlogo} %\end{figure} -\subsection{Importing Message Descriptors from .proto files} +\subsection[Importing Message Descriptors from .proto files]{Importing Message Descriptors from \texttt{.proto} files} %The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, %which encapsulate a data structure, Descriptors, which define the @@ -497,11 +498,11 @@ the message type specification from a \texttt{.proto} file. 
The \texttt{.proto} files are imported using the \code{readProtoFiles} function, which can either import a single file, all files in a directory, -or every \texttt{.proto} file provided by a particular R package. +or every \texttt{.proto} file provided by a particular \proglang{R} package. After importing proto files, the corresponding message descriptors are available from the \texttt{RProtoBuf:DescriptorPool} environment in -the R search path. This environment is implemented with the user +the \proglang{R} search path. This environment is implemented with the user defined tables framework from the \pkg{RObjectTables} package available from the OmegaHat project \citep{RObjectTables}. Instead of being associated with a static hash table, this environment @@ -521,10 +522,10 @@ %from the OmegaHat project \citep{RObjectTables}. % %The feature allows \texttt{RProtoBuf} to install the -%special environment \emph{RProtoBuf:DescriptorPool} in the R search path. +%special environment \emph{RProtoBuf:DescriptorPool} in the \proglang{R} search path. %The environment is special in that, instead of being associated with a -%static hash table, it is dynamically queried by R as part of R's usual -%variable lookup. In other words, it means that when the R interpreter +%static hash table, it is dynamically queried by \proglang{R} as part of R's usual +%variable lookup. In other words, it means that when the \proglang{R} interpreter %looks for a binding to a symbol (foo) in its search path, %it asks to our package if it knows the binding "foo", this is then %implemented by the \texttt{RProtoBuf} package by calling an internal @@ -536,7 +537,7 @@ %package does not rely on the \texttt{protoc} compiler (with the exception of %the two functions discussed in the previous section). This means that no %initial step of statically compiling the proto file into C++ code that is -%then accessed by R code is necessary. 
Instead, \texttt{proto} files are +%then accessed by \proglang{R} code is necessary. Instead, \texttt{proto} files are %parsed and processed \textsl{at runtime} by the protobuf C++ library---which %is much more appropriate for a dynamic language. @@ -568,7 +569,7 @@ p$email <- "murray at stokely.org" @ -However, as opposed to R lists, no partial matching is performed +However, as opposed to \proglang{R} lists, no partial matching is performed and the name must be given entirely. The \verb|[[| operator can also be used to query and set fields of a messages, supplying either their name or their tag number : @@ -579,7 +580,7 @@ p[[ "email" ]] @ -Protocol Buffers include a 64-bit integer type, but R lacks native +Protocol Buffers include a 64-bit integer type, but \proglang{R} lacks native 64-bit integer support. A workaround is available and described in Section~\ref{sec:int64} for working with large integer values. @@ -610,7 +611,7 @@ of bytes. The \texttt{serialize} method is implemented for Protocol Buffer messages to serialize a message into a sequence of bytes that represents the message. -%(raw vector in R speech) that represents the message. +%(raw vector in \proglang{R} speech) that represents the message. <<>>= serialize(p, NULL) @@ -667,7 +668,7 @@ @ The \texttt{input} argument of \texttt{read} can also be a binary -readable R connection, such as a binary file connection: +readable \proglang{R} connection, such as a binary file connection: <<>>= con <- file(tf2, open = "rb") @@ -709,11 +710,11 @@ generic in the S3 sense, such as \texttt{new} and \texttt{serialize}. Table~\ref{class-summary-table} lists the six -primary Message and Descriptor classes in RProtoBuf. Each R object +primary Message and Descriptor classes in RProtoBuf. 
Each \proglang{R} object contains an external pointer to an object managed by the -\texttt{protobuf} C++ library, and the R objects make calls into more -than 100 C++ functions that provide the -glue code between the R language classes and the underlying C++ +\texttt{protobuf} \proglang{C++} library, and the \proglang{R} objects make calls into more +than 100 \proglang{C++} functions that provide the +glue code between the \proglang{R} language classes and the underlying \proglang{C++} classes. % MS: I think this looks better at the bottom of the page. @@ -741,13 +742,13 @@ The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to -facilitate this integration of the R and C++ code for these objects. +facilitate this integration of the \proglang{R} and \proglang{C++} code for these objects. Each method is wrapped individually which alllows us to add user friendly custom error handling, type coercion, and performance improvements at the cost of a more verbose implementation. -The RProtoBuf package in many ways motivated -the development of Rcpp Modules \citep{eddelbuettel2013exposing}, -which provide a more concise way of wrapping C++ functions and classes +The \pkg{RProtoBuf} package in many ways motivated +the development of \pkg{Rcpp} Modules \citep{eddelbuettel2013exposing}, +which provide a more concise way of wrapping \proglang{C++} functions and classes in a single entity. % Message, Descriptor, FieldDescriptor, EnumDescriptor, @@ -791,7 +792,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{Message} object of the C++ protobuf library. Documentation for the +\texttt{pointer} & External pointer to the \texttt{Message} object of the \proglang{C++} protobuf library. Documentation for the \texttt{Message} class is available from the Protocol Buffer project page. 
\\ %(\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.message.html#Message}) \\ \texttt{type} & Fully qualified name of the message. For example a \texttt{Person} message @@ -813,10 +814,10 @@ \texttt{getExtension} & get the value of an extension of a message\\ \texttt{add} & add elements to a repeated field \\[3mm] % -\texttt{str} & the R structure of the message\\ +\texttt{str} & the \proglang{R} structure of the message\\ \texttt{as.character} & character representation of a message\\ \texttt{toString} & character representation of a message (same as \texttt{as.character}) \\ -\texttt{as.list} & converts message to a named R list\\ +\texttt{as.list} & converts message to a named \proglang{R} list\\ \texttt{update} & updates several fields of a message at once\\ \texttt{descriptor} & get the descriptor of the message type of this message\\ \texttt{fileDescriptor} & get the file descriptor of this message's descriptor\\ @@ -830,7 +831,7 @@ Descriptors describe the type of a Message. This includes what fields a message contains and what the types of those fields are. Message -descriptors are represented in R with the \emph{Descriptor} S4 +descriptors are represented in \proglang{R} with the \emph{Descriptor} S4 class. The class contains the slots \texttt{pointer} and \texttt{type}. Similarly to messages, the \verb|$| operator can be used to retrieve descriptors that are contained in the descriptor, or @@ -863,7 +864,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{Descriptor} object of the C++ proto library. Documentation for the +\texttt{pointer} & External pointer to the \texttt{Descriptor} object of the \proglang{C++} proto library. 
Documentation for the \texttt{Descriptor} class is available from the Protocol Buffer project page.\\ %\url{http://code.google.com/apis/protocolbuffers/docs/reference/cpp/google.protobuf.descriptor.html#Descriptor} \\ \texttt{type} & Fully qualified path of the message type. \\[.3cm] @@ -903,7 +904,7 @@ The class \emph{FieldDescriptor} represents field descriptors in R. This is a wrapper S4 class around the -\texttt{google::protobuf::FieldDescriptor} C++ class. +\texttt{google::protobuf::FieldDescriptor} \proglang{C++} class. Table~\ref{fielddescriptor-methods-table} describes the methods defined for the \texttt{FieldDescriptor} class. @@ -914,7 +915,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{FieldDescriptor} C++ variable \\ +\texttt{pointer} & External pointer to the \texttt{FieldDescriptor} \proglang{C++} variable \\ \texttt{name} & Simple name of the field \\ \texttt{full\_name} & Fully qualified name of the field \\ \texttt{type} & Name of the message type where the field is declared \\[.3cm] @@ -930,7 +931,7 @@ \texttt{is\_extension} & Return TRUE if this field is an extension.\\ \texttt{number} & Gets the declared tag number of the field.\\ \texttt{type} & Gets the type of the field.\\ -\texttt{cpp\_type} & Gets the C++ type of the field.\\ +\texttt{cpp\_type} & Gets the \proglang{C++} type of the field.\\ \texttt{label} & Gets the label of a field (optional, required, or repeated).\\ \texttt{is\_repeated} & Return TRUE if this field is repeated.\\ \texttt{is\_required} & Return TRUE if this field is required.\\ @@ -955,7 +956,7 @@ The class \emph{EnumDescriptor} represents enum descriptors in R. This is a wrapper S4 class around the -\texttt{google::protobuf::EnumDescriptor} C++ class. +\texttt{google::protobuf::EnumDescriptor} \proglang{C++} class. Table~\ref{enumdescriptor-methods-table} describes the methods defined for the \texttt{EnumDescriptor} class. 
@@ -975,7 +976,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{EnumDescriptor} C++ variable \\ +\texttt{pointer} & External pointer to the \texttt{EnumDescriptor} \proglang{C++} variable \\ \texttt{name} & Simple name of the enum \\ \texttt{full\_name} & Fully qualified name of the enum \\ \texttt{type} & Name of the message type where the enum is declared \\[.3cm] @@ -1006,7 +1007,7 @@ The class \emph{FileDescriptor} represents file descriptors in R. This is a wrapper S4 class around the -\texttt{google::protobuf::FileDescriptor} C++ class. +\texttt{google::protobuf::FileDescriptor} \proglang{C++} class. Table~\ref{filedescriptor-methods-table} describes the methods defined for the \texttt{FileDescriptor} class. @@ -1026,7 +1027,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the C++ proto library. Documentation for the +\texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the \proglang{C++} proto library. Documentation for the \texttt{FileDescriptor} class is available from the protocol buffer project page: \url{http://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.descriptor.html#FileDescriptor} \\ \texttt{filename} & fully qualified pathname of the \texttt{.proto} file.\\ @@ -1050,7 +1051,7 @@ The class \emph{EnumValueDescriptor} represents enumeration value descriptors in R. This is a wrapper S4 class around the -\texttt{google::protobuf::EnumValueDescriptor} C++ class. +\texttt{google::protobuf::EnumValueDescriptor} \proglang{C++} class. Table~\ref{EnumValueDescriptor-methods-table} describes the methods defined for the \texttt{EnumValueDescriptor} class. 
@@ -1069,7 +1070,7 @@ \toprule \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} -\texttt{pointer} & External pointer to the \texttt{EnumValueDescriptor} C++ variable \\ +\texttt{pointer} & External pointer to the \texttt{EnumValueDescriptor} \proglang{C++} variable \\ \texttt{name} & simple name of the enum value \\ \texttt{full\_name} & fully qualified name of the enum value \\[.3cm] % @@ -1099,14 +1100,14 @@ Table~\ref{table-get-types} details the correspondence between the field type and the type of data that is retrieved by \verb|$| and \verb|[[| extractors. Three types in particular need further attention due to -specific differences in the R language. +specific differences in the \proglang{R} language. \begin{table}[h] \centering \begin{small} \begin{tabular}{lp{5cm}p{5cm}} \toprule -Field type & R type (non repeated) & R type (repeated) \\ +Field type & \proglang{R} type (non repeated) & \proglang{R} type (repeated) \\ \cmidrule(r){2-3} double & \texttt{double} vector & \texttt{double} vector \\ float & \texttt{double} vector & \texttt{double} vector \\[3mm] @@ -1130,7 +1131,7 @@ \end{tabular} \end{small} \caption{\label{table-get-types}Correspondence between field type and - R type retrieved by the extractors. Note that R lacks native + \proglang{R} type retrieved by the extractors. Note that \proglang{R} lacks native 64-bit integers, so the \texttt{RProtoBuf.int64AsString} option is available to return large integers as characters to avoid losing precision. This option is described in Section~\ref{sec:int64}.} @@ -1141,7 +1142,7 @@ R booleans can accept three values: \texttt{TRUE}, \texttt{FALSE}, and \texttt{NA}. However, most other languages, including the Protocol Buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. This means -that we simply can not store R logical vectors that include all three +that we simply can not store \proglang{R} logical vectors that include all three possible values as booleans. 
The library will refuse to store \texttt{NA}s in protocol buffer boolean fields, and users must instead choose another type (such as enum or integer) capable of storing three @@ -1188,7 +1189,7 @@ R also does not support the native 64-bit integer type. Numeric vectors with values $\geq 2^{31}$ can only be stored as doubles, which have -limited precision. Thereby R loses the ability to distinguish some +limited precision. Thereby \proglang{R} loses the ability to distinguish some distinct integers: <<>>= @@ -1201,7 +1202,7 @@ RProtoBuf allows users to get and set 64-bit integer values by specifying them as character strings. -If we try to set an int64 field in R to double values, we lose +If we try to set an int64 field in \proglang{R} to double values, we lose precision: <<>>= @@ -1239,7 +1240,7 @@ options("RProtoBuf.int64AsString" = FALSE) @ -\section{Converting R Data Structures into Protocol Buffers} +\section[Converting R Data Structures into Protocol Buffers]{Converting \proglang{R} Data Structures into Protocol Buffers} \label{sec:evaluation} The previous sections discussed functionality in the \pkg{RProtoBuf} package @@ -1249,9 +1250,9 @@ components written in other languages that need to be accessed from within R. -The package also provides methods for converting arbitrary R data structures into protocol -buffers and vice versa with a universal R object schema. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} -functions serialize arbitrary R objects into a universal Protocol Buffer +The package also provides methods for converting arbitrary \proglang{R} data structures into protocol +buffers and vice versa with a universal \proglang{R} object schema. 
The \texttt{serialize\_pb} and \texttt{unserialize\_pb} +functions serialize arbitrary \proglang{R} objects into a universal Protocol Buffer message: <<>>= @@ -1260,7 +1261,7 @@ @ In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} -schema used by \pkg{RHIPE} for exchanging R data with Hadoop \citep{rhipe}. This +schema used by \pkg{RHIPE} for exchanging \proglang{R} data with Hadoop \citep{rhipe}. This schema, which we will refer to as \texttt{rexp.proto}, is printed in %appendix \ref{rexp.proto}. the appendix. @@ -1269,23 +1270,23 @@ same schema. This shows the power of using a schema based cross-platform format such as Protocol Buffers: interoperability is achieved without effort or close coordination. -The \texttt{rexp.proto} schema supports all main R storage types holding \emph{data}. +The \texttt{rexp.proto} schema supports all main \proglang{R} storage types holding \emph{data}. These include \texttt{NULL}, \texttt{list} and vectors of type \texttt{logical}, \texttt{character}, \texttt{double}, \texttt{integer} and \texttt{complex}. In addition, every type can contain a named set of attributes, as is the case in R. The \texttt{rexp.proto} -schema does not support some of the special R specific storage types, such as \texttt{function}, +schema does not support some of the special \proglang{R} specific storage types, such as \texttt{function}, \texttt{language} or \texttt{environment}. Such objects have no native equivalent type in Protocol Buffers, and have little meaning outside the context of R. -When serializing R objects using \texttt{serialize\_pb}, values or attributes of +When serializing \proglang{R} objects using \texttt{serialize\_pb}, values or attributes of unsupported types are skipped with a warning. If the user really wishes to serialize these objects, they need to be converted into a supported type. 
For example, they can use \texttt{deparse} to convert functions or language objects into strings, or \texttt{as.list} for environments. -\subsection{Evaluation: Converting R Data Sets} +\subsection[Evaluation: Converting R Data Sets]{Evaluation: Converting \proglang{R} Data Sets} To illustrate how this method works, we attempt to convert all of the built-in -datasets from R into this serialized Protocol Buffer representation. +datasets from \proglang{R} into this serialized Protocol Buffer representation. <>= datasets <- as.data.frame(data(package="datasets")$results) @@ -1309,7 +1310,7 @@ inspection, all other datasets are objects of class \texttt{nfnGroupedData}. This class represents a special type of data frame that has some additional attributes used by the \pkg{nlme} package, among which a \emph{formula} object. -Because formulas are R \emph{language} objects, they have little meaning to +Because formulas are \proglang{R} \emph{language} objects, they have little meaning to other systems, and are not supported by the \texttt{rexp.proto} descriptor. When \texttt{serialize\_pb} is used on objects of this class, it will serialize the data frame and all attributes, except for the formula. @@ -1331,8 +1332,8 @@ using four different methods: \begin{itemize} -\item normal R serialization \citep{serialization}, -\item R serialization followed by gzip, +\item normal \proglang{R} serialization \citep{serialization}, +\item \proglang{R} serialization followed by gzip, \item normal Protocol Buffer serialization, and \item Protocol Buffer serialization followed by gzip. \end{itemize} @@ -1359,14 +1360,14 @@ check.names=FALSE) @ -Table~\ref{tab:compression} shows the sizes of 50 sample R datasets as +Table~\ref{tab:compression} shows the sizes of 50 sample \proglang{R} datasets as returned by object.size() compared to the serialized sizes. %The summary compression sizes are listed below, and a full table for a %sample of 50 datasets is included on the next page.
Sizes are comparable but Protocol Buffers provide simple getters and setters in multiple languages instead of requiring other programs to parse the R serialization format. % \citep{serialization}. -One takeaway from this table is that the universal R object schema +One takeaway from this table is that the universal \proglang{R} object schema included in RProtoBuf does not in general provide any significant saving in file size compared to the normal serialization mechanism in R. @@ -1378,7 +1379,7 @@ % N.B. see table.Rnw for how this table is created. % -% latex table generated in R 3.0.2 by xtable 1.7-0 package +% latex table generated in \proglang{R} 3.0.2 by xtable 1.7-0 package % Fri Dec 27 17:00:03 2013 \begin{table}[h!] \begin{center} @@ -1443,8 +1444,8 @@ \bottomrule \end{tabular} } -\caption{Serialization sizes for default serialization in R and - RProtoBuf for 50 R datasets.} +\caption{Serialization sizes for default serialization in \proglang{R} and + RProtoBuf for 50 \proglang{R} datasets.} \label{tab:compression} \end{center} \end{table} @@ -1497,7 +1498,7 @@ effectively. 
The \pkg{HistogramTools} package \citep{histogramtools} enhances -\pkg{RProtoBuf} by providing a concise schema for R histogram objects: +\pkg{RProtoBuf} by providing a concise schema for \proglang{R} histogram objects: \begin{example} package HistogramTools; @@ -1543,7 +1544,7 @@ outfile.close() \end{Code} -The protocol buffer can then be read into R and converted to a native +The protocol buffer can then be read into \proglang{R} and converted to a native R histogram object for plotting: \begin{Code} @@ -1558,7 +1559,7 @@ hist [1] "message of type 'HistogramTools.HistogramState' with 3 fields set" -# Convert to native R histogram object and plot +# Convert to native \proglang{R} histogram object and plot plot(as.histogram(hist)) \end{Code} @@ -1598,22 +1599,22 @@ generate protobuf messages are available for many programming languages, making it relatively straightforward to implement clients and servers. -\subsection{Interacting with R through HTTPS and Protocol Buffers} +\subsection[Interacting with R through HTTPS and Protocol Buffers]{Interacting with \proglang{R} through HTTPS and Protocol Buffers} One example of a system that supports Protocol Buffers to interact -with R is OpenCPU \citep{opencpu}. OpenCPU is a framework for embedded statistical -computation and reproducible research based on R and \LaTeX. It exposes a -HTTP(S) API to access and manipulate R objects and allows for performing -remote R function calls. Clients do not need to understand -or generate any R code: HTTP requests are automatically mapped to +with \proglang{R} is OpenCPU \citep{opencpu}. OpenCPU is a framework for embedded statistical +computation and reproducible research based on \proglang{R} and \LaTeX. It exposes a +HTTP(S) API to access and manipulate \proglang{R} objects and allows for performing +remote \proglang{R} function calls. 
Clients do not need to understand +or generate any \proglang{R} code: HTTP requests are automatically mapped to function calls, and arguments/return values can be posted/retrieved using several data interchange formats, such as protocol buffers. OpenCPU uses the \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions -from the \texttt{RProtoBuf} package to convert between R objects and protobuf +from the \texttt{RProtoBuf} package to convert between \proglang{R} objects and protobuf messages. Therefore, clients need the \texttt{rexp.proto} descriptor mentioned earlier to parse and generate protobuf messages when interacting with OpenCPU. -\subsection{HTTP GET: Retrieving an R object} +\subsection[HTTP GET: Retrieving an R object]{HTTP GET: Retrieving an \proglang{R} object} The \texttt{HTTP GET} method is used to read a resource from OpenCPU. For example, to access the dataset \texttt{Animals} from the package \texttt{MASS}, a @@ -1632,10 +1633,10 @@ Because both HTTP and Protocol Buffers have libraries available for many languages, clients can be implemented in just a few lines of code. Below -is example code for both R and Python that retrieves a dataset from R with +is example code for both \proglang{R} and Python that retrieves a dataset from \proglang{R} with OpenCPU using a protobuf message. In R, we use the HTTP client from the \texttt{httr} package \citep{httr}. In this example we -download a dataset which is part of the base R distribution, so we can +download a dataset which is part of the base \proglang{R} distribution, so we can verify that the object was transferred without loss of information. <>= @@ -1651,9 +1652,9 @@ identical(output, MASS::Animals) @ -This code suggests a method for exchanging objects between R servers, however this might as +This code suggests a method for exchanging objects between \proglang{R} servers, however this might as well be done without Protocol Buffers. 
The main advantage of using an inter-operable format -is that we can actually access R objects from within another +is that we can actually access \proglang{R} objects from within another programming language. For example, in a very similar fashion we can retrieve the same dataset in a Python client. To parse messages in Python, we first compile the \texttt{rexp.proto} descriptor into a python module using the \texttt{protoc} compiler: @@ -1662,7 +1663,7 @@ protoc rexp.proto --python_out=. \end{verbatim} This generates Python module called \texttt{rexp\_pb2.py}, containing both the -descriptor information as well as methods to read and manipulate the R object +descriptor information as well as methods to read and manipulate the \proglang{R} object message. In the example below we use the HTTP client from the \texttt{urllib2} module. @@ -1684,26 +1685,26 @@ can easily extract the desired fields for further use in Python. -\subsection{HTTP POST: Calling an R function} +\subsection[HTTP POST: Calling an R function]{HTTP POST: Calling an \proglang{R} function} The example above shows how the \texttt{HTTP GET} method retrieves a -resource from OpenCPU, for example an R object. The \texttt{HTTP POST} +resource from OpenCPU, for example an \proglang{R} object. The \texttt{HTTP POST} method on the other hand is used for calling functions and running scripts, which is the primary purpose of the framework. As before, the \texttt{/pb} postfix requests to retrieve the output as a protobuf message, in this case the function return value. However, OpenCPU allows us to supply the arguments of the function call in the form of protobuf messages as well. 
This is a bit more work, because clients needs to both generate messages [TRUNCATED] To get the complete diff run: svnlook diff /svnroot/rprotobuf -r 825 From noreply at r-forge.r-project.org Thu Jan 23 01:33:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 01:33:34 +0100 (CET) Subject: [Rprotobuf-commits] r826 - papers/jss Message-ID: <20140123003334.E320E1869EB@r-forge.r-project.org> Author: edd Date: 2014-01-23 01:33:34 +0100 (Thu, 23 Jan 2014) New Revision: 826 Modified: papers/jss/article.Rnw Log: mix of minor fixes from the train ride Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 00:28:32 UTC (rev 825) +++ papers/jss/article.Rnw 2014-01-23 00:33:34 UTC (rev 826) @@ -66,14 +66,14 @@ Murray Stokely\\ Google, Inc.\\ 1600 Amphitheatre Parkway\\ - Mountain View, CA 94040\\ - USA\\ + Mountain View, CA, USA\\ E-mail: \email{mstokely at google.com}\\ URL: \url{http://www.stokely.org/}\\ \\ Jeroen Ooms\\ UCLA Department of Statistics\\ University of California\\ + Los Angeles, CA, USA\\ E-mail: \email{jeroen.ooms at stat.ucla.edu}\\ URL: \url{http://jeroenooms.github.io} } @@ -248,15 +248,17 @@ This paper describes an \proglang{R} interface to Protocol Buffers, and is organized as follows. Section~\ref{sec:protobuf} -provides a general overview of Protocol Buffers. +provides a general high-level overview of Protocol Buffers as well as a basic +motivation for their use. Section~\ref{sec:rprotobuf-basic} describes the interactive \proglang{R} interface -provided by \CRANpkg{RProtoBuf} and introduces the two main abstractions: +provided by the \CRANpkg{RProtoBuf} package, and introduces the two main abstractions: \emph{Messages} and \emph{Descriptors}. 
Section~\ref{sec:rprotobuf-classes} -describes the implementation details of the main S4 classes making up this +details the implementation details of the main S4 classes and methods +contained in this package. Section~\ref{sec:types} describes the challenges of type coercion between \proglang{R} and other languages. Section~\ref{sec:evaluation} introduces a general \proglang{R} language schema for serializing arbitrary \proglang{R} objects and evaluates -it against R's built-in serialization. Sections~\ref{sec:mapreduce} +it against the serialization capbilities built directly into R. Sections~\ref{sec:mapreduce} and \ref{sec:opencpu} provide real-world use cases of \CRANpkg{RProtoBuf} in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. @@ -1309,7 +1311,7 @@ (\Sexpr{format(100*m/n,digits=1)}\%) without loss of information. Upon closer inspection, all other datasets are objects of class \texttt{nfnGroupedData}. This class represents a special type of data frame that has some additional -attributes used by the \pkg{nlme} package, among which a \emph{formula} object. +attributes (such as a \emph{formula} object) used by the \pkg{nlme} package. Because formulas are \proglang{R} \emph{language} objects, they have little meaning to other systems, and are not supported by the \texttt{rexp.proto} descriptor. When \texttt{serialize\_pb} is used on objects of this class, it will serialize @@ -1368,14 +1370,14 @@ in multiple languages instead of requiring other programs to parse the R serialization format. % \citep{serialization}. One takeaway from this table is that the universal \proglang{R} object schema -included in RProtoBuf does not in general provide +included in \pkg{RProtoBuf} does not in general provide any significant saving in file size compared to the normal serialization mechanism in R. % redundant: which is seen as equally compact. 
-The benefits of RProtoBuf accrue more naturally in applications where +The benefits of \pkg{RProtoBuf} accrue more naturally in applications where multiple programming languages are involved, or when a more concise application-specific schema has been defined. The example in the next -section provides both of these conditions. +section satisfies both of these conditions. % N.B. see table.Rnw for how this table is created. % @@ -1563,6 +1565,7 @@ plot(as.histogram(hist)) \end{Code} +\begin{center} <>= require(RProtoBuf) require(HistogramTools) @@ -1570,6 +1573,7 @@ hist <- HistogramTools.HistogramState$read("hist.pb") plot(as.histogram(hist)) @ +\end{center} One of the authors has used this design pattern for several large scale studies of distributed filesystems \citep{janus}. @@ -1584,7 +1588,7 @@ As described earlier, the primary application of Protocol Buffers is data interchange in the context of inter-system communications. Network protocols -such as HTTP provide mechanisms for client-server communication, i.e. how to +such as HTTP provide mechanisms for client-server communication, i.e.~how to initiate requests, authenticate, send messages, etc. However, network protocols generally do not regulate the \emph{content} of messages: they allow transfer of any media type, such as web pages, static files or @@ -1919,7 +1923,5 @@ \newpage \bibliography{article} -%\section[About Java]{About \proglang{Java}} -%% Note: If there is markup in \(sub)section, then it has to be escape as above. 
\end{document} From noreply at r-forge.r-project.org Thu Jan 23 01:39:21 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 01:39:21 +0100 (CET) Subject: [Rprotobuf-commits] r827 - papers/jss Message-ID: <20140123003921.70E971810B1@r-forge.r-project.org> Author: murray Date: 2014-01-23 01:39:21 +0100 (Thu, 23 Jan 2014) New Revision: 827 Modified: papers/jss/article.Rnw Log: Remove duplicate \title line, add a few missing \proglangs, and emphasize dynamic dispatch on first use since its a rather technical term which we only partially define. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 00:33:34 UTC (rev 826) +++ papers/jss/article.Rnw 2014-01-23 00:39:21 UTC (rev 827) @@ -13,17 +13,17 @@ \RequirePackage{fancyvrb} \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} +% Articles with many authors we should shorten to FirstAuthor, et al. 
\shortcites{janus,dremel} -%% almost as usual \author{Dirk Eddelbuettel\\Debian Project \And Murray Stokely\\Google, Inc \And Jeroen Ooms\\UCLA} -\title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in R} +\title{\pkg{RProtoBuf}: Efficient Cross-Language Data Serialization in \proglang{R}} %% for pretty printing and a nice hypersummary also set: \Plainauthor{Dirk Eddelbuettel, Murray Stokely, Jeroen Ooms} %% comma-separated \Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in R} %% a short title (if necessary) +\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in \proglang{R}} %% a short title (if necessary) %% an abstract and keywords \Abstract{ @@ -105,8 +105,8 @@ \fvset{listparameters={\setlength{\topsep}{0pt}}} \renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} -\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\author{by Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms} +%\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} +%\author{by Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms} %% DE: I tend to have wider option(width=...) so this %% guarantees better line breaks @@ -772,7 +772,7 @@ Additionally, \pkg{RProtoBuf} supports tab completion for all classes. Completion possibilities include pseudo-method names for all -classes, plus dynamic dispatch on names or types specific to a given +classes, plus \emph{dynamic dispatch} on names or types specific to a given object. This functionality is implemented with the \texttt{.DollarNames} S3 generic function defined in the \pkg{utils} package. 
From noreply at r-forge.r-project.org Thu Jan 23 01:46:41 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 01:46:41 +0100 (CET) Subject: [Rprotobuf-commits] r828 - papers/jss Message-ID: <20140123004641.ECB8A186923@r-forge.r-project.org> Author: murray Date: 2014-01-23 01:46:41 +0100 (Thu, 23 Jan 2014) New Revision: 828 Modified: papers/jss/article.Rnw Log: Add more \proglangs, we now have \proglang{R} at least 119 times in this document, which might be a bit much. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 00:39:21 UTC (rev 827) +++ papers/jss/article.Rnw 2014-01-23 00:46:41 UTC (rev 828) @@ -191,7 +191,7 @@ A number of binary formats based on \texttt{JSON} have been proposed that reduce the parsing cost and improve efficiency. \pkg{MessagePack} -and \pkg{BSON} both have R +and \pkg{BSON} both have \proglang{R} interfaces \citep{msgpackR,rmongodb}, but these formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. Such formats also lack support for @@ -258,7 +258,7 @@ package. Section~\ref{sec:types} describes the challenges of type coercion between \proglang{R} and other languages. Section~\ref{sec:evaluation} introduces a general \proglang{R} language schema for serializing arbitrary \proglang{R} objects and evaluates -it against the serialization capbilities built directly into R. Sections~\ref{sec:mapreduce} +it against the serialization capbilities built directly into \proglang{R}. Sections~\ref{sec:mapreduce} and \ref{sec:opencpu} provide real-world use cases of \CRANpkg{RProtoBuf} in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. @@ -312,9 +312,9 @@ Protocol Buffer in \proglang{R} that is then serialized and sent over the network to a remote server. 
The server would then deserialize the message, act on the request, and respond with a new Protocol Buffer over the network. -The key difference to, say, a request to an Rserve instance is that +The key difference to, say, a request to an \pkg{Rserve} instance is that the remote server may be implemented in any language, with no -dependence on R. +dependence on \proglang{R}. While traditional IDLs have at times been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records @@ -456,7 +456,7 @@ This section describes how to use the \proglang{R} API to create and manipulate protocol buffer messages in \proglang{R}, and how to read and write the binary representation of the message (often called the \emph{payload}) to files and arbitrary binary -R connections. +\proglang{R} connections. The two fundamental building blocks of Protocol Buffers are \emph{Messages} and \emph{Descriptors}. Messages provide a common abstract encapsulation of structured data fields of the type specified in a Message Descriptor. @@ -479,16 +479,6 @@ %languages. The definition - -%This section may contain a figure such as Figure~\ref{figure:rlogo}. -% -%\begin{figure}[htbp] -% \centering -% \includegraphics{Rlogo} -% \caption{The logo of R.} -% \label{figure:rlogo} -%\end{figure} - \subsection[Importing Message Descriptors from .proto files]{Importing Message Descriptors from \texttt{.proto} files} %The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, @@ -562,7 +552,7 @@ \subsection{Access and modify fields of a message} Once the message is created, its fields can be queried -and modified using the dollar operator of R, making protocol +and modified using the dollar operator of \proglang{R}, making protocol buffer messages seem like lists. <<>>= @@ -712,7 +702,7 @@ generic in the S3 sense, such as \texttt{new} and \texttt{serialize}. Table~\ref{class-summary-table} lists the six -primary Message and Descriptor classes in RProtoBuf. 
Each \proglang{R} object +contains an external pointer to an object managed by the \texttt{protobuf} \proglang{C++} library, and the \proglang{R} objects make calls into more than 100 \proglang{C++} functions that provide the @@ -765,7 +755,7 @@ functions with these S4 classes: \begin{itemize} \item The functional dispatch mechanism of the form - \verb|method(object, arguments)| (common to R), and + \verb|method(object, arguments)| (common to \proglang{R}), and \item The traditional object oriented notation \verb|object$method(arguments)|. \end{itemize} @@ -905,7 +895,7 @@ \label{subsec-field-descriptor} The class \emph{FieldDescriptor} represents field -descriptors in R. This is a wrapper S4 class around the +descriptors in \proglang{R}. This is a wrapper S4 class around the \texttt{google::protobuf::FieldDescriptor} \proglang{C++} class. Table~\ref{fielddescriptor-methods-table} describes the methods defined for the \texttt{FieldDescriptor} class. @@ -956,7 +946,7 @@ \subsection{Enum Descriptors} \label{subsec-enum-descriptor} -The class \emph{EnumDescriptor} represents enum descriptors in R. +The class \emph{EnumDescriptor} represents enum descriptors in \proglang{R}. This is a wrapper S4 class around the \texttt{google::protobuf::EnumDescriptor} \proglang{C++} class. Table~\ref{enumdescriptor-methods-table} describes the methods @@ -1007,7 +997,7 @@ \subsection{File Descriptors} \label{subsec-file-descriptor} -The class \emph{FileDescriptor} represents file descriptors in R. +The class \emph{FileDescriptor} represents file descriptors in \proglang{R}. This is a wrapper S4 class around the \texttt{google::protobuf::FileDescriptor} \proglang{C++} class. Table~\ref{filedescriptor-methods-table} describes the methods @@ -1052,7 +1042,7 @@ \label{subsec-enumvalue-descriptor} The class \emph{EnumValueDescriptor} represents enumeration value -descriptors in R.
This is a wrapper S4 class around the +descriptors in \proglang{R}. This is a wrapper S4 class around the \texttt{google::protobuf::EnumValueDescriptor} \proglang{C++} class. Table~\ref{EnumValueDescriptor-methods-table} describes the methods defined for the \texttt{EnumValueDescriptor} class. @@ -1141,7 +1131,7 @@ \subsection{Booleans} -R booleans can accept three values: \texttt{TRUE}, \texttt{FALSE}, and +\proglang{R} booleans can accept three values: \texttt{TRUE}, \texttt{FALSE}, and \texttt{NA}. However, most other languages, including the Protocol Buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. This means that we simply can not store \proglang{R} logical vectors that include all three @@ -1175,9 +1165,9 @@ \subsection{Unsigned Integers} -R lacks a native unsigned integer type. Values between $2^{31}$ and +\proglang{R} lacks a native unsigned integer type. Values between $2^{31}$ and $2^{32} - 1$ read from unsigned into Protocol Buffer fields must be -stored as doubles in R. +stored as doubles in \proglang{R}. <<>>= as.integer(2^31-1) @@ -1189,7 +1179,7 @@ \subsection{64-bit integers} \label{sec:int64} -R also does not support the native 64-bit integer type. Numeric vectors +\proglang{R} also does not support the native 64-bit integer type. Numeric vectors with values $\geq 2^{31}$ can only be stored as doubles, which have limited precision. Thereby \proglang{R} loses the ability to distinguish some distinct integers: @@ -1199,9 +1189,9 @@ @ However, most modern languages do have support for 64-bit integers, -which becomes problematic when \pkg{RProtoBuf} is used to exchange data +which becomes problematic when \CRANpkg{RProtoBuf} is used to exchange data with a system that requires this integer type. To work around this, -RProtoBuf allows users to get and set 64-bit integer values by specifying +\CRANpkg{RProtoBuf} allows users to get and set 64-bit integer values by specifying them as character strings. 
If we try to set an int64 field in \proglang{R} to double values, we lose @@ -1213,7 +1203,7 @@ length(unique(test$repeated_int64)) @ -But when the values are specified as character strings, RProtoBuf +But when the values are specified as character strings, \CRANpkg{RProtoBuf} will automatically coerce them into true 64-bit integer types before storing them in the Protocol Buffer message: @@ -1221,13 +1211,13 @@ test$repeated_int64 <- c("9007199254740992", "9007199254740993") @ -When reading the value back into R, numeric types are returned by +When reading the value back into \proglang{R}, numeric types are returned by default, but when the full precision is required a character value will be returned if the \texttt{RProtoBuf.int64AsString} option is set to \texttt{TRUE}. The character values are useful because they can -accurately be used as unique identifiers and can easily be passed to R +accurately be used as unique identifiers and can easily be passed to \proglang{R} packages such as \CRANpkg{int64} \citep{int64} or \CRANpkg{bit64} -\citep{bit64} which represent 64-bit integers in R. +\citep{bit64} which represent 64-bit integers in \proglang{R}. <<>>= options("RProtoBuf.int64AsString" = FALSE) @@ -1250,7 +1240,7 @@ messages of a defined schema. This is useful when there are pre-existing systems with defined schemas or significant software components written in other languages that need to be accessed from -within R. +within \proglang{R}. The package also provides methods for converting arbitrary \proglang{R} data structures into protocol buffers and vice versa with a universal \proglang{R} object schema. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} @@ -1275,10 +1265,10 @@ The \texttt{rexp.proto} schema supports all main \proglang{R} storage types holding \emph{data}. These include \texttt{NULL}, \texttt{list} and vectors of type \texttt{logical}, \texttt{character}, \texttt{double}, \texttt{integer} and \texttt{complex}.
In addition, -every type can contain a named set of attributes, as is the case in R. The \texttt{rexp.proto} +every type can contain a named set of attributes, as is the case in \proglang{R}. The \texttt{rexp.proto} schema does not support some of the special \proglang{R} specific storage types, such as \texttt{function}, \texttt{language} or \texttt{environment}. Such objects have no native equivalent -type in Protocol Buffers, and have little meaning outside the context of R. +type in Protocol Buffers, and have little meaning outside the context of \proglang{R}. When serializing \proglang{R} objects using \texttt{serialize\_pb}, values or attributes of unsupported types are skipped with a warning. If the user really wishes to serialize these objects, they need to be converted into a supported type. For example, they can use @@ -1367,12 +1357,12 @@ %The summary compression sizes are listed below, and a full table for a %sample of 50 datasets is included on the next page. Sizes are comparable but Protocol Buffers provide simple getters and setters -in multiple languages instead of requiring other programs to parse the R +in multiple languages instead of requiring other programs to parse the \proglang{R} serialization format. % \citep{serialization}. One takeaway from this table is that the universal \proglang{R} object schema included in \pkg{RProtoBuf} does not in general provide any significant saving in file size compared to the normal serialization -mechanism in R. +mechanism in \proglang{R}. % redundant: which is seen as equally compact.
The benefits of \pkg{RProtoBuf} accrue more naturally in applications where multiple programming languages are involved, or when a more concise @@ -1389,7 +1379,7 @@ \scalebox{0.9}{ \begin{tabular}{lrrrrr} \toprule - Data Set & object.size & \multicolumn{2}{c}{R Serialization} & + Data Set & object.size & \multicolumn{2}{c}{\proglang{R} Serialization} & \multicolumn{2}{c}{RProtoBuf Serial.} \\ & & default & gzipped & default & gzipped \\ \cmidrule(r){2-6} @@ -1513,10 +1503,10 @@ \end{example} This HistogramState message type is designed to be helpful if some of -the Map or Reduce tasks are written in R, or if those components are +the Map or Reduce tasks are written in \proglang{R}, or if those components are written in other languages and only the resulting output histograms -need to be manipulated in R. For example, to create HistogramState -messages in Python for later consumption by R, we first compile the +need to be manipulated in \proglang{R}. For example, to create HistogramState +messages in Python for later consumption by \proglang{R}, we first compile the \texttt{histogram.proto} descriptor into a python module using the \texttt{protoc} compiler: @@ -1547,7 +1537,7 @@ \end{Code} The protocol buffer can then be read into \proglang{R} and converted to a native -R histogram object for plotting: +\proglang{R} histogram object for plotting: \begin{Code} library(RProtoBuf) @@ -1638,7 +1628,7 @@ Because both HTTP and Protocol Buffers have libraries available for many languages, clients can be implemented in just a few lines of code. Below is example code for both \proglang{R} and Python that retrieves a dataset from \proglang{R} with -OpenCPU using a protobuf message. In R, we use the HTTP client from +OpenCPU using a protobuf message. In \proglang{R}, we use the HTTP client from the \texttt{httr} package \citep{httr}. 
In this example we download a dataset which is part of the base \proglang{R} distribution, so we can verify that the object was transferred without loss of information. @@ -1712,7 +1702,7 @@ \texttt{stats::rnorm(n=42, mean=100)}. The function arguments (in this case \texttt{n} and \texttt{mean}) as well as the return value (a vector with 42 random numbers) are transferred using a protobuf message. RPC in -OpenCPU works like the \texttt{do.call} function in R, hence all arguments +OpenCPU works like the \texttt{do.call} function in \proglang{R}, hence all arguments are contained within a list. <>= @@ -1818,7 +1808,7 @@ other languages. The \pkg{RProtoBuf} package provides users with the ability to generate, -parse and manipulate Protocol Buffer messages in R. It is our hope that this +parse and manipulate Protocol Buffer messages in \proglang{R}. It is our hope that this package will make Protocol Buffers more accessible to the \proglang{R} community, and thereby makes a small contribution towards better integration between \proglang{R} and other software systems and applications. From noreply at r-forge.r-project.org Thu Jan 23 02:03:57 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 02:03:57 +0100 (CET) Subject: [Rprotobuf-commits] r829 - papers/jss Message-ID: <20140123010357.94FF5186B01@r-forge.r-project.org> Author: murray Date: 2014-01-23 02:03:57 +0100 (Thu, 23 Jan 2014) New Revision: 829 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: Add second reference where the mapreduce histogram pattern is used since we note we are aware of multiple applications. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 00:46:41 UTC (rev 828) +++ papers/jss/article.Rnw 2014-01-23 01:03:57 UTC (rev 829) @@ -14,7 +14,7 @@ \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} % Articles with many authors we should shorten to FirstAuthor, et al. -\shortcites{janus,dremel} +\shortcites{sciencecloud,janus,dremel} \author{Dirk Eddelbuettel\\Debian Project \And Murray Stokely\\Google, Inc \And Jeroen Ooms\\UCLA} @@ -1566,7 +1566,7 @@ \end{center} One of the authors has used this design pattern for several large -scale studies of distributed filesystems \citep{janus}. +scale studies of distributed storage systems \citep{sciencecloud,janus}. \section{Application: Data Interchange in Web Services} \label{sec:opencpu} Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-23 00:46:41 UTC (rev 828) +++ papers/jss/article.bib 2014-01-23 01:03:57 UTC (rev 829) @@ -25,7 +25,13 @@ note = {R package version 1.1}, url = {http://CRAN.R-project.org/package=msgpackR}, } - + at inproceedings{sciencecloud, +title = {Projecting Disk Usage Based on Historical Trends in a Cloud Environment}, +author = {Murray Stokely and Amaan Mehrabian and Christoph Albrecht and Francois Labelle and Arif Merchant}, +year = 2012, +booktitle = {ScienceCloud 2012 Proceedings of the 3rd International Workshop on Scientific Cloud Computing}, +pages = {63--70} +} @inproceedings{janus, title = {Janus: Optimal Flash Provisioning for Cloud Storage Workloads}, From noreply at r-forge.r-project.org Thu Jan 23 02:14:47 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 02:14:47 +0100 (CET) Subject: [Rprotobuf-commits] r830 - papers/jss Message-ID: <20140123011447.3E50D1864FC@r-forge.r-project.org> Author: murray Date: 2014-01-23 02:14:47 +0100 (Thu, 23 
Jan 2014) New Revision: 830 Modified: papers/jss/article.Rnw Log: Standardize on using the oxford/serial comma with lists of 3 or more. This seemed the far more prevalent convention in the file but there were 5 instances where it was not used. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 01:03:57 UTC (rev 829) +++ papers/jss/article.Rnw 2014-01-23 01:14:47 UTC (rev 830) @@ -183,7 +183,7 @@ Notation} (\texttt{JSON}), which is derived from the object literals of \proglang{JavaScript}, and used increasingly on the world wide web. \texttt{JSON} natively supports arrays and distinguishes 4 primitive types: numbers, strings, -booleans and null. However, as it too is a text-based format, numbers are +booleans, and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is inefficient and leads to loss of type (double versus integer) and precision. Several \proglang{R} packages implement functions to parse and generate \texttt{JSON} data from \proglang{R} @@ -419,7 +419,7 @@ and \proglang{Python} bindings to Protocol Buffers are used with a compiler that translates a Protocol Buffer schema description file (ending in \texttt{.proto}) into -language-specific classes that can be used to create, read, write and +language-specific classes that can be used to create, read, write, and manipulate Protocol Buffer messages. The \proglang{R} interface, in contrast, uses a reflection-based API that is particularly well-suited for interactive data analysis. @@ -1264,7 +1264,7 @@ The \texttt{rexp.proto} schema supports all main \proglang{R} storage types holding \emph{data}. These include \texttt{NULL}, \texttt{list} and vectors of type \texttt{logical}, -\texttt{character}, \texttt{double}, \texttt{integer} and \texttt{complex}. In addition, +\texttt{character}, \texttt{double}, \texttt{integer}, and \texttt{complex}. 
In addition, every type can contain a named set of attributes, as is the case in \proglang{R}. The \texttt{rexp.proto} schema does not support some of the special \proglang{R} specific storage types, such as \texttt{function}, \texttt{language} or \texttt{environment}. Such objects have no native equivalent @@ -1799,16 +1799,16 @@ %% DE Re-ordering so that we end on RProtoBuf The \pkg{RProtoBuf} package builds on the Protocol Buffers library, and -extends the \proglang{R} system with the ability to create, read and write Protocol -Buffer message. \pkg{RProtoBuf} has been used extensively inside Google -for the past three years by statisticians, analysts and software engineers. +extends the \proglang{R} system with the ability to create, read, and write Protocol +Buffer messages. \pkg{RProtoBuf} has been used extensively inside Google +for the past three years by statisticians, analysts, and software engineers. At the time of this writing there are more than 300 active users of \pkg{RProtoBuf} using it to read data from and otherwise interact with distributed systems written in \proglang{C++}, \proglang{Java}, \proglang{Python}, and other languages. The \pkg{RProtoBuf} package provides users with the ability to generate, -parse and manipulate Protocol Buffer messages in \proglang{R}. It is our hope that this +parse, and manipulate Protocol Buffer messages in \proglang{R}. It is our hope that this package will make Protocol Buffers more accessible to the \proglang{R} community, and thereby makes a small contribution towards better integration between \proglang{R} and other software systems and applications. 
From noreply at r-forge.r-project.org Thu Jan 23 02:33:37 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 02:33:37 +0100 (CET) Subject: [Rprotobuf-commits] r831 - papers/jss Message-ID: <20140123013337.3FDEB18683C@r-forge.r-project.org> Author: murray Date: 2014-01-23 02:33:36 +0100 (Thu, 23 Jan 2014) New Revision: 831 Modified: papers/jss/article.Rnw Log: Minor grammar improvements suggested by my office mate Arif Merchant. Mostly involving that/which. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 01:14:47 UTC (rev 830) +++ papers/jss/article.Rnw 2014-01-23 01:33:36 UTC (rev 831) @@ -318,8 +318,8 @@ While traditional IDLs have at times been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records -model that is flexible and simple to use. The schema for structured -protocol buffer data is defined in \texttt{.proto} files which may +model that is flexible and easy to use. The schema for structured +protocol buffer data is defined in \texttt{.proto} files, which may contain one or more message types. Each message type has one or more fields. A field is specified with a unique number, a name, a value type, and a field rule specifying whether the field is optional, @@ -328,7 +328,7 @@ types. The \texttt{.proto} file syntax for defining the structure of protocol buffer data is described comprehensively on Google Code\footnote{See \url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. -Table~\ref{tab:proto} shows an example \texttt{.proto} file which +Table~\ref{tab:proto} shows an example \texttt{.proto} file that defines the \texttt{tutorial.Person} type. The \proglang{R} code in the right column shows an example of creating a new message of this type and populating its fields. 
@@ -561,7 +561,7 @@ p$email <- "murray at stokely.org" @ -However, as opposed to \proglang{R} lists, no partial matching is performed +As opposed to \proglang{R} lists, no partial matching is performed and the name must be given entirely. The \verb|[[| operator can also be used to query and set fields of a messages, supplying either their name or their tag number : From noreply at r-forge.r-project.org Thu Jan 23 02:57:51 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 02:57:51 +0100 (CET) Subject: [Rprotobuf-commits] r832 - papers/jss Message-ID: <20140123015752.00BCB186A0B@r-forge.r-project.org> Author: murray Date: 2014-01-23 02:57:51 +0100 (Thu, 23 Jan 2014) New Revision: 832 Modified: papers/jss/article.Rnw Log: Add two minor text improvements suggested by Tim Hesterberg to sections 5 and 6. The first sentence of each of the last two paragraphs of the summary was saying the same thing -- merge the second duplicate sentence into the first, which then leaves the third paragraph of the summary as just a single sentence. Add it to the end of the second sentence and we now have I think a slightly crisper summary in just two paragraphs. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 01:33:36 UTC (rev 831) +++ papers/jss/article.Rnw 2014-01-23 01:57:51 UTC (rev 832) @@ -1092,7 +1092,8 @@ Table~\ref{table-get-types} details the correspondence between the field type and the type of data that is retrieved by \verb|$| and \verb|[[| extractors. Three types in particular need further attention due to -specific differences in the \proglang{R} language. +specific differences in the \proglang{R} language: booleans, unsigned +integers, and 64-bit integers. \begin{table}[h] \centering @@ -1298,7 +1299,7 @@ @ \Sexpr{m} data sets can be converted to Protocol Buffers -(\Sexpr{format(100*m/n,digits=1)}\%) without loss of information. 
Upon closer +without loss of information (\Sexpr{format(100*m/n,digits=1)}\%). Upon closer inspection, all other datasets are objects of class \texttt{nfnGroupedData}. This class represents a special type of data frame that has some additional attributes (such as a \emph{formula} object) used by the \pkg{nlme} package. @@ -1783,8 +1784,6 @@ \section{Summary} % DE Simpler title \label{sec:summary} -% TODO(mstokely): Get cibona approval for these two sentences before -% publishing Over the past decade, many formats for interoperable data exchange have become available, each with their unique features, strengths and weaknesses. @@ -1799,16 +1798,14 @@ %% DE Re-ordering so that we end on RProtoBuf The \pkg{RProtoBuf} package builds on the Protocol Buffers library, and -extends the \proglang{R} system with the ability to create, read, and write Protocol +extends the \proglang{R} system with the ability to create, read, +write, parse, and manipulate Protocol Buffer messages. \pkg{RProtoBuf} has been used extensively inside Google for the past three years by statisticians, analysts, and software engineers. At the time of this writing there are more than 300 active users of \pkg{RProtoBuf} using it to read data from and otherwise interact with distributed systems written in \proglang{C++}, \proglang{Java}, \proglang{Python}, and -other languages. - -The \pkg{RProtoBuf} package provides users with the ability to generate, -parse, and manipulate Protocol Buffer messages in \proglang{R}. It is our hope that this +other languages. It is our hope that this package will make Protocol Buffers more accessible to the \proglang{R} community, and thereby makes a small contribution towards better integration between \proglang{R} and other software systems and applications. 
From noreply at r-forge.r-project.org Thu Jan 23 03:06:41 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 03:06:41 +0100 (CET) Subject: [Rprotobuf-commits] r833 - papers/jss Message-ID: <20140123020641.2E616186A84@r-forge.r-project.org> Author: murray Date: 2014-01-23 03:06:39 +0100 (Thu, 23 Jan 2014) New Revision: 833 Modified: papers/jss/article.Rnw Log: large scale -> large-scale protocol buffers -> Protocol Buffers And remove some old commented out sections now. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 01:57:51 UTC (rev 832) +++ papers/jss/article.Rnw 2014-01-23 02:06:39 UTC (rev 833) @@ -2,6 +2,10 @@ \usepackage{booktabs} \usepackage[toc,page]{appendix} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 +% Spelling Standardization: +% Protocol Buffers, not protocol buffers + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -105,9 +109,6 @@ \fvset{listparameters={\setlength{\topsep}{0pt}}} \renewenvironment{Schunk}{\vspace{\topsep}}{\vspace{\topsep}} -%\title{RProtoBuf: Efficient Cross-Language Data Serialization in R} -%\author{by Dirk Eddelbuettel, Murray Stokely and Jeroen Ooms} - %% DE: I tend to have wider option(width=...) so this %% guarantees better line breaks <>= @@ -116,9 +117,6 @@ \maketitle -%TODO(de) 'protocol buffers' or 'Protocol Buffers' ? -% MS: Lets standardize on 'Protocol Buffers'? - \section{Introduction} % TODO(DE) More sober: Friends don't let friends use CSV} % NOTE(MS): I really do think we can use add back: % \section{Introduction: Friends Don't Let Friends Use CSV} @@ -463,7 +461,6 @@ Message Descriptors are defined in \texttt{.proto} files and define a schema for a particular named class of messages. - % Commented out because we said this earlier. 
%This separation %between schema and the message objects is in contrast to @@ -478,7 +475,6 @@ %from a variety of data streams using a variety of different %languages. The definition - \subsection[Importing Message Descriptors from .proto files]{Importing Message Descriptors from \texttt{.proto} files} %The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, @@ -501,28 +497,10 @@ dynamically queries the in-memory database of loaded descriptors during normal variable lookup. -%JO: can we just move the section 7 to here? It's only one paragraph% -%MS: I replaced section 7 with 2 sentences above. - <<>>= ls("RProtoBuf:DescriptorPool") @ -% The old section 7 in entirety: -%The \texttt{RProtoBuf} package uses the user defined tables framework -%that is defined as part of the \texttt{RObjectTables} package available -%from the OmegaHat project \citep{RObjectTables}. -% -%The feature allows \texttt{RProtoBuf} to install the -%special environment \emph{RProtoBuf:DescriptorPool} in the \proglang{R} search path. -%The environment is special in that, instead of being associated with a -%static hash table, it is dynamically queried by \proglang{R} as part of R's usual -%variable lookup. In other words, it means that when the \proglang{R} interpreter -%looks for a binding to a symbol (foo) in its search path, -%it asks to our package if it knows the binding "foo", this is then -%implemented by the \texttt{RProtoBuf} package by calling an internal -%method of the \texttt{protobuf} C++ library. - %\subsection{Importing proto files} %In contrast to the other languages (Java, C++, Python) that are officially %supported by Google, the implementation used by the \texttt{RProtoBuf} @@ -1566,8 +1544,9 @@ @ \end{center} -One of the authors has used this design pattern for several large -scale studies of distributed storage systems \citep{sciencecloud,janus}. 
+One of the authors has used this design pattern for several +large-scale studies of distributed storage systems +\citep{sciencecloud,janus}. \section{Application: Data Interchange in Web Services} \label{sec:opencpu} @@ -1603,7 +1582,7 @@ remote \proglang{R} function calls. Clients do not need to understand or generate any \proglang{R} code: HTTP requests are automatically mapped to function calls, and arguments/return values can be posted/retrieved -using several data interchange formats, such as protocol buffers. +using several data interchange formats, such as Protocol Buffers. OpenCPU uses the \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions from the \texttt{RProtoBuf} package to convert between \proglang{R} objects and protobuf messages. Therefore, clients need the \texttt{rexp.proto} descriptor mentioned @@ -1694,7 +1673,7 @@ protobuf messages returned by the server. Using Protocol Buffers to post function arguments is not required, and for simple (scalar) arguments the standard \texttt{application/x-www-form-urlencoded} format might be sufficient. -However, with protocol buffers the client can perform function calls with +However, with Protocol Buffers the client can perform function calls with more complex arguments such as \proglang{R} vectors or lists. The result is a complete RPC system to do arbitrary \proglang{R} function calls from within any programming language. @@ -1734,54 +1713,6 @@ outputmsg <- serialize_pb(val) @ -% OpenCPU also provides a lot of meta-functionality such as handling -% of sessions, exceptions, security, and more. OpenCPU also makes it possible to store -% output of a function call on the server, instead of directly retrieving it. Thereby -% objects can be shared with other users or used as arguments in a subsequent -% function call. -% But in its essence, the HTTP API provides a simple way to perform remote -% \proglang{R} function calls over HTTPS. 
The same request can be performed in Python as demonstrated -% below. The code is a bit verbose because to show how the REXP message is created from -% scratch. In practice would probably write a function or small module construct a Protocol -% Buffer message representing an \proglang{R} list from a Python dictionary object. -% -% \begin{verbatim} -% import urllib2; -% from rexp_pb2 import *; -% -% #create the post payload, i.e. list(n=42, mean=100) -% payload = REXP( -% rclass = 5, -% rexpValue = [ -% REXP(rclass = 2, realValue = [42]), -% REXP(rclass = 2, realValue = [100]) -% ], -% attrName = [ -% "names" -% ], -% attrValue = [ -% REXP(rclass = 0, stringValue = [STRING(strval="n"), STRING(strval="mean")]) -% ] -%); -% -%#HTTP POST -%req = urllib2.Request( -% "https://public.opencpu.org/ocpu/library/stats/R/rnorm/pb", -% data = payload.SerializeToString(), -% headers = { -% 'Content-type': 'application/x-protobuf' -% } -%) -%res = urllib2.urlopen(req); -% -%#parse output pb -%msg = REXP(); -%msg.ParseFromString(res.read()); -% -%#the return value is a double vector in this case -%print(msg.realValue); -%\end{verbatim} - \section{Summary} % DE Simpler title \label{sec:summary} Over the past decade, many formats for interoperable From noreply at r-forge.r-project.org Thu Jan 23 03:45:44 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 03:45:44 +0100 (CET) Subject: [Rprotobuf-commits] r834 - papers/jss Message-ID: <20140123024544.2976E186935@r-forge.r-project.org> Author: murray Date: 2014-01-23 03:45:35 +0100 (Thu, 23 Jan 2014) New Revision: 834 Modified: papers/jss/article.Rnw Log: Go back to dissing MessagePack and BSON since Jeroen pointed out some limitations in the messagepack R package. Remove citations to both, and call them "not widely supported". 
I think this wording is a fair compromise in that it still makes the point I wanted to make that they address a shortcoming of text based JSON for certain classes of applications by providing more efficient binary encoding, but they make Jeroen's point that they still suck due to lack of adoption and weak implementations for R packages so shouldn't really be held up on the same level as JSON. Add a paragraph break before we start talking about JSON, and merge the binary json sentences into this new paragraph instead of being a paragraph on their own. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 02:06:39 UTC (rev 833) +++ papers/jss/article.Rnw 2014-01-23 02:45:35 UTC (rev 834) @@ -5,6 +5,7 @@ %%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 % Spelling Standardization: % Protocol Buffers, not protocol buffers +% large-scale, not large scale %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -156,7 +157,7 @@ Programming languages such as \proglang{R}, \proglang{Julia}, \proglang{Java}, and \proglang{Python} include built-in support for serialization, but the default formats -are usually language specific and thereby lock the user into a single +are usually language-specific and thereby lock the user into a single environment. %\paragraph*{Friends don't let friends use CSV!} @@ -177,6 +178,7 @@ very practical format to store numeric datasets as they appear in statistical applications. % + A more modern, widely used format is \emph{JavaScript Object Notation} (\texttt{JSON}), which is derived from the object literals of \proglang{JavaScript}, and used increasingly on the world wide web. \texttt{JSON} natively @@ -186,16 +188,21 @@ leads to loss of type (double versus integer) and precision. Several \proglang{R} packages implement functions to parse and generate \texttt{JSON} data from \proglang{R} objects \citep{rjson,RJSONIO,jsonlite}. 
- A number of binary formats based on \texttt{JSON} have been proposed -that reduce the parsing cost and improve efficiency. \pkg{MessagePack} -and \pkg{BSON} both have \proglang{R} -interfaces \citep{msgpackR,rmongodb}, but these formats lack a separate schema for the serialized -data and thus still duplicate field names with each message sent over -the network or stored in a file. Such formats also lack support for -versioning when data storage needs evolve over time, or when -application logic and requirement changes dictate updates to the -message format. +that reduce the parsing cost and improve efficiency, but these formats +are not widely supported. Furthermore, such formats lack a separate +schema for the serialized data and thus still duplicate field names +with each message sent over the network or stored in a file. +% and still must send duplicate field names +% with each message since there is no separate schema. +% \pkg{MessagePack} +% and \pkg{BSON} both have \proglang{R} +% interfaces \citep{msgpackR,rmongodb}, but these formats lack a separate schema for the serialized +% data and thus still duplicate field names with each message sent over +% the network or stored in a file. Such formats also lack support for +% versioning when data storage needs evolve over time, or when +% application logic and requirement changes dictate updates to the +%message format. 
Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an From noreply at r-forge.r-project.org Thu Jan 23 04:02:00 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 04:02:00 +0100 (CET) Subject: [Rprotobuf-commits] r835 - papers/jss Message-ID: <20140123030200.D3A40186A16@r-forge.r-project.org> Author: murray Date: 2014-01-23 04:01:59 +0100 (Thu, 23 Jan 2014) New Revision: 835 Modified: papers/jss/article.Rnw Log: Note at the end of section 2 that the reflection-based API is slightly slower for some operations, but that it is much more convenient for interactive data analysis. I want to tone down the implication that protocol buffers are the best/fastest thing out there, and I always thought the 'particularly well-suited' was a tad vague here. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 02:45:35 UTC (rev 834) +++ papers/jss/article.Rnw 2014-01-23 03:01:59 UTC (rev 835) @@ -426,8 +426,10 @@ Buffer schema description file (ending in \texttt{.proto}) into language-specific classes that can be used to create, read, write, and manipulate Protocol Buffer messages. The \proglang{R} interface, in contrast, -uses a reflection-based API that is particularly well-suited for -interactive data analysis. +uses a reflection-based API that makes some operations slightly +slower but which is much more convenient for interactive data analysis. +%particularly well-suited for +%interactive data analysis. 
All messages in \proglang{R} have a single class structure, but different accessor methods are created at runtime based on the named fields of the specified message type, as described in the @@ -554,7 +556,7 @@ <<>>= p[["name"]] <- "Murray Stokely" p[[ 2 ]] <- 3 -p[[ "email" ]] +p[["email"]] @ Protocol Buffers include a 64-bit integer type, but \proglang{R} lacks native From noreply at r-forge.r-project.org Thu Jan 23 04:09:11 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 04:09:11 +0100 (CET) Subject: [Rprotobuf-commits] r836 - papers/jss Message-ID: <20140123030911.9FB8418010D@r-forge.r-project.org> Author: murray Date: 2014-01-23 04:09:10 +0100 (Thu, 23 Jan 2014) New Revision: 836 Modified: papers/jss/article.Rnw Log: Improve the text about serializing messages. Don't start the section with "However". Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 03:01:59 UTC (rev 835) +++ papers/jss/article.Rnw 2014-01-23 03:09:10 UTC (rev 836) @@ -585,12 +585,13 @@ \subsection{Serializing messages} -However, the main focus of Protocol Buffer messages is -efficiency. Therefore, messages are transported as a sequence -of bytes. The \texttt{serialize} method is implemented for +One of the primary benefits of Protocol Buffers is the efficient +binary wire-format representation. +The \texttt{serialize} method is implemented for Protocol Buffer messages to serialize a message into a sequence of -bytes that represents the message. -%(raw vector in \proglang{R} speech) that represents the message. +bytes (raw vector) that represents the message. +The raw bytes can then be parsed back into the original message safely +as long as the message type is known and its descriptor is available. 
<<>>= serialize(p, NULL) From noreply at r-forge.r-project.org Thu Jan 23 04:17:07 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 04:17:07 +0100 (CET) Subject: [Rprotobuf-commits] r837 - papers/jss Message-ID: <20140123031707.B1ABF186209@r-forge.r-project.org> Author: murray Date: 2014-01-23 04:17:06 +0100 (Thu, 23 Jan 2014) New Revision: 837 Modified: papers/jss/article.Rnw Log: Standardize on no space before ':' in the text. Use tbp instead of h for the Message table float placement, and correct a typo in the type coercion section. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 03:09:10 UTC (rev 836) +++ papers/jss/article.Rnw 2014-01-23 03:17:06 UTC (rev 837) @@ -551,7 +551,7 @@ As opposed to \proglang{R} lists, no partial matching is performed and the name must be given entirely. The \verb|[[| operator can also be used to query and set fields -of a messages, supplying either their name or their tag number : +of a messages, supplying either their name or their tag number: <<>>= p[["name"]] <- "Murray Stokely" @@ -569,7 +569,7 @@ \subsection{Display messages} Protocol Buffer messages and descriptors implement \texttt{show} -methods that provide basic information about the message : +methods that provide basic information about the message: <<>>= p @@ -597,7 +597,7 @@ serialize(p, NULL) @ -The same method can be used to serialize messages to files : +The same method can be used to serialize messages to files: <<>>= tf1 <- tempfile() @@ -640,7 +640,7 @@ The binary representation of the message does not contain information that can be used to dynamically infer the message type, so we have to provide this information -to the \texttt{read} function in the form of a descriptor : +to the \texttt{read} function in the form of a descriptor: <<>>= msg <- read(tutorial.Person, tf1) @@ -657,7 +657,7 @@ 
writeLines(as.character(message)) @ -Finally, the payload of the message can be used : +Finally, the payload of the message can be used: <<>>= # reading the raw vector payload of the message @@ -667,7 +667,7 @@ \texttt{read} can also be used as a pseudo-method of the descriptor -object : +object: <<>>= # reading from a file @@ -723,7 +723,7 @@ The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to facilitate this integration of the \proglang{R} and \proglang{C++} code for these objects. -Each method is wrapped individually which alllows us to add user +Each method is wrapped individually which allows us to add user friendly custom error handling, type coercion, and performance improvements at the cost of a more verbose implementation. The \pkg{RProtoBuf} package in many ways motivated @@ -765,7 +765,7 @@ complete list of the slots and methods for \texttt{Messages} is available in Table~\ref{Message-methods-table}. -\begin{table}[h] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -1155,7 +1155,7 @@ \subsection{Unsigned Integers} \proglang{R} lacks a native unsigned integer type. Values between $2^{31}$ and -$2^{32} - 1$ read from unsigned into Protocol Buffer fields must be +$2^{32} - 1$ read from unsigned integer Protocol Buffer fields must be stored as doubles in \proglang{R}. <<>>= From noreply at r-forge.r-project.org Thu Jan 23 04:21:11 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 04:21:11 +0100 (CET) Subject: [Rprotobuf-commits] r838 - papers/jss Message-ID: <20140123032112.045D2186550@r-forge.r-project.org> Author: murray Date: 2014-01-23 04:21:10 +0100 (Thu, 23 Jan 2014) New Revision: 838 Modified: papers/jss/article.Rnw Log: Section 6: Add the oxford comma somewhere I missed it earlier. Use \code{} instead of \texttt{} for a function name. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 03:17:06 UTC (rev 837) +++ papers/jss/article.Rnw 2014-01-23 03:21:10 UTC (rev 838) @@ -1225,14 +1225,13 @@ \label{sec:evaluation} The previous sections discussed functionality in the \pkg{RProtoBuf} package -for creating, manipulating, parsing and serializing Protocol Buffer +for creating, manipulating, parsing, and serializing Protocol Buffer messages of a defined schema. This is useful when there are pre-existing systems with defined schemas or significant software components written in other languages that need to be accessed from within \proglang{R}. - The package also provides methods for converting arbitrary \proglang{R} data structures into protocol -buffers and vice versa with a universal \proglang{R} object schema. The \texttt{serialize\_pb} and \texttt{unserialize\_pb} +buffers and vice versa with a universal \proglang{R} object schema. The \code{serialize\_pb} and \code{unserialize\_pb} functions serialize arbitrary \proglang{R} objects into a universal Protocol Buffer message: From noreply at r-forge.r-project.org Thu Jan 23 04:42:26 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 04:42:26 +0100 (CET) Subject: [Rprotobuf-commits] r839 - papers/jss Message-ID: <20140123034226.DA50C184180@r-forge.r-project.org> Author: murray Date: 2014-01-23 04:42:24 +0100 (Thu, 23 Jan 2014) New Revision: 839 Modified: papers/jss/article.Rnw Log: Add new, better ack for saptarshi. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 03:21:10 UTC (rev 838) +++ papers/jss/article.Rnw 2014-01-23 03:42:24 UTC (rev 839) @@ -1790,9 +1790,11 @@ purpose of the \pkg{RObjectTables} package, allows for the dynamic symbol lookup. 
Kenton Varda was generous with his time in reviewing code and explaining obscure protocol buffer semantics. Karl Millar was very -helpful in reviewing code and offering suggestions. -%The contemporaneous work by Saptarshi Guha on \pkg{RHIPE} was a strong -%initial motivator. +helpful in reviewing code and offering suggestions. Saptarshi Guha's +work on RHIPE and implementation of a universal message type for \proglang{R} +language objects allowed us to add the \code{serialize_pb} and \code{unserialize\_pb} +methods for turning arbitrary R objects into protocol buffers without +a specialized pre-defined schema. \newpage \appendix From noreply at r-forge.r-project.org Thu Jan 23 05:09:38 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 05:09:38 +0100 (CET) Subject: [Rprotobuf-commits] r840 - papers/jss Message-ID: <20140123040938.E6B86183CF5@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-23 05:09:38 +0100 (Thu, 23 Jan 2014) New Revision: 840 Modified: papers/jss/article.Rnw Log: Switch around a sentence to get a better flow Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 03:42:24 UTC (rev 839) +++ papers/jss/article.Rnw 2014-01-23 04:09:38 UTC (rev 840) @@ -179,15 +179,15 @@ applications. % -A more modern, widely used format is \emph{JavaScript Object - Notation} (\texttt{JSON}), which is derived from the object literals of -\proglang{JavaScript}, and used increasingly on the world wide web. \texttt{JSON} natively -supports arrays and distinguishes 4 primitive types: numbers, strings, +A more modern, widely used format is \emph{JavaScript ObjectNotation} +(\texttt{JSON}), which is derived from the object literals of +\proglang{JavaScript}, and used increasingly on the world wide web. 
+Several \proglang{R} packages implement functions to parse and generate +\texttt{JSON} data from \proglang{R} objects \citep{rjson,RJSONIO,jsonlite}. +\texttt{JSON} natively supports arrays and 4 primitive types: numbers, strings, booleans, and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is inefficient and -leads to loss of type (double versus integer) and precision. Several \proglang{R} packages -implement functions to parse and generate \texttt{JSON} data from \proglang{R} -objects \citep{rjson,RJSONIO,jsonlite}. +leads to loss of type (double versus integer) and precision. A number of binary formats based on \texttt{JSON} have been proposed that reduce the parsing cost and improve efficiency, but these formats are not widely supported. Furthermore, such formats lack a separate From noreply at r-forge.r-project.org Thu Jan 23 05:50:13 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 05:50:13 +0100 (CET) Subject: [Rprotobuf-commits] r841 - papers/jss Message-ID: <20140123045013.9665F18699F@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-23 05:50:12 +0100 (Thu, 23 Jan 2014) New Revision: 841 Modified: papers/jss/article.Rnw Log: another pass at the summary Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 04:09:38 UTC (rev 840) +++ papers/jss/article.Rnw 2014-01-23 04:50:12 UTC (rev 841) @@ -1727,28 +1727,30 @@ Over the past decade, many formats for interoperable data exchange have become available, each with their unique features, strengths and weaknesses. -Text based formats such as CSV and JSON are easy to use, and will likely +Text based formats such as \texttt{CSV} and \texttt{JSON} are easy to use, and will likely remain popular among statisticians for many years to come. 
However, in the context of increasingly complex analysis stacks and applications involving distributed computing as well as mixed language analysis pipelines, choosing a more sophisticated data interchange format may reap considerable benefits. -The Protocol Buffers protocol and library offers a unique combination of features, performance, -maturity, and forward-compatibility that seems particulary well suited for data-driven +%Protocol Buffers is itself not a protocol. +%Forward-compatibility is one of the features. No need to re-iterate those +The Protocol Buffers standard and library offer a unique combination of features, +performance, and maturity, that seems particulary well suited for data-driven applications and numerical computing. %% DE Re-ordering so that we end on RProtoBuf -The \pkg{RProtoBuf} package builds on the Protocol Buffers library, and -extends the \proglang{R} system with the ability to create, read, +The \pkg{RProtoBuf} package builds on the Protocol Buffers \proglang{C++} library, +and extends the \proglang{R} system with the ability to create, read, write, parse, and manipulate Protocol Buffer messages. \pkg{RProtoBuf} has been used extensively inside Google for the past three years by statisticians, analysts, and software engineers. -At the time of this writing there are more than 300 active +At the time of this writing there are over 300 active users of \pkg{RProtoBuf} using it to read data from and otherwise interact with distributed systems written in \proglang{C++}, \proglang{Java}, \proglang{Python}, and -other languages. It is our hope that this -package will make Protocol Buffers more accessible to the \proglang{R} community, and -thereby makes a small contribution towards better integration between \proglang{R} and -other software systems and applications. +other languages. 
We hope that making Protocol Buffers available to the +\proglang{R} community will contribute towards better software integration +and allow for building even more advanced applications and analysis pipelines +with \proglang{R}. %\emph{Other Approaches} % From noreply at r-forge.r-project.org Thu Jan 23 05:54:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 05:54:40 +0100 (CET) Subject: [Rprotobuf-commits] r842 - papers/jss Message-ID: <20140123045440.E3AD31869DE@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-23 05:54:40 +0100 (Thu, 23 Jan 2014) New Revision: 842 Modified: papers/jss/article.Rnw Log: a missing dot Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 04:50:12 UTC (rev 841) +++ papers/jss/article.Rnw 2014-01-23 04:54:40 UTC (rev 842) @@ -1787,7 +1787,7 @@ Very significant contributions, both in code and design, were made by Romain Fran\c{c}ois whose continued influence on design and code is greatly appreciated. Several features of the package reflect -the design of the \CRANpkg{rJava} package by Simon Urbanek +the design of the \CRANpkg{rJava} package by Simon Urbanek. The user-defined table mechanism, implemented by Duncan Temple Lang for the purpose of the \pkg{RObjectTables} package, allows for the dynamic symbol lookup. Kenton Varda was generous with his time in reviewing code and explaining From noreply at r-forge.r-project.org Thu Jan 23 06:30:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 06:30:34 +0100 (CET) Subject: [Rprotobuf-commits] r843 - papers/jss Message-ID: <20140123053034.2F2FD1867AE@r-forge.r-project.org> Author: murray Date: 2014-01-23 06:30:33 +0100 (Thu, 23 Jan 2014) New Revision: 843 Modified: papers/jss/article.Rnw Log: Add a few improvements to the abstract suggested by Arif Merchant to make it more self contained. 
After the first sentence explains that pipelines of applications need to communicate with each other, briefly describe the problem with common solutions such as CSV files in a new sentence, before transitioning to the third sentence to talk about protocol buffers. Then after this, uncomment Jeroen's sentence from elsewhere in the paper which I had moved here which describes how they offer a unique combination of ... After this change, section 2 starts at the top of page 3 instead of the last line of page 2, which is a nice change in the layout. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 04:54:40 UTC (rev 842) +++ papers/jss/article.Rnw 2014-01-23 05:30:33 UTC (rev 843) @@ -2,11 +2,13 @@ \usepackage{booktabs} \usepackage[toc,page]{appendix} -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%5 +%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Spelling Standardization: % Protocol Buffers, not protocol buffers % large-scale, not large scale +% Oxford comma: foo, bar, and baz. + %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %% declarations for jss.cls %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -34,11 +36,20 @@ \Abstract{ Modern data collection and analysis pipelines often involve a sophisticated mix of applications written in general purpose and -specialized programming languages. Protocol Buffers are a popular +specialized programming languages. +There are many formats used to import and export data between +different programs or systems, such as CSV, but they are usually +verbose, inefficient, not type-safe, or tied to a specific programming language. +Protocol Buffers are a popular method of serializing structured data between applications---while remaining -independent of programming languages or operating system. The -\CRANpkg{RProtoBuf} package provides a complete interface between this -library and the \proglang{R} environment for statistical computing. 
+independent of programming languages or operating system. +They offer a unique combination of features, performance, and maturity that seems +particulary well suited for data-driven applications and numerical +computing. +The +\CRANpkg{RProtoBuf} package provides a complete interface to Protocol +Buffers from the +\proglang{R} environment for statistical computing. %TODO(ms) keep it less than 150 words. % Maybe add Jeroen's sentence: % JO: added this sentence to the conclustion, but could use it in abstract as well. From noreply at r-forge.r-project.org Thu Jan 23 06:55:47 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 06:55:47 +0100 (CET) Subject: [Rprotobuf-commits] r844 - papers/jss Message-ID: <20140123055548.0756B186A1D@r-forge.r-project.org> Author: murray Date: 2014-01-23 06:55:47 +0100 (Thu, 23 Jan 2014) New Revision: 844 Modified: papers/jss/article.Rnw Log: Add one more sentence to end the abstract by describing what the paper is about: "This paper outlines the general class of data serialization requirements for statistical computing, describes the implementation of the \CRANpkg{RProtoBuf} package, and illustrates its use with examples applications in large-scale data collection pipelines and web services." We still have the nice page break between pages 2 and 3 where section 2 begins at the very top of page 3. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 05:30:33 UTC (rev 843) +++ papers/jss/article.Rnw 2014-01-23 05:55:47 UTC (rev 844) @@ -50,12 +50,13 @@ \CRANpkg{RProtoBuf} package provides a complete interface to Protocol Buffers from the \proglang{R} environment for statistical computing. -%TODO(ms) keep it less than 150 words. -% Maybe add Jeroen's sentence: -% JO: added this sentence to the conclustion, but could use it in abstract as well. 
-% They offer a unique combination of features, performance, and maturity that seems -% particulary well suited for data-driven applications and numerical -% computing. +This paper outlines the general class of data serialization +requirements for statistical computing, describes the implementation +of the \CRANpkg{RProtoBuf} package, and illustrates its use with +examples applications in large-scale data collection pipelines and web +services. +%TODO(ms) keep it less than 150 words. -- I think this may be 154, +%depending how emacs is counting. } \Keywords{\proglang{R}, \pkg{Rcpp}, protocol buffers, serialization, cross-platform} \Plainkeywords{R, Rcpp, protocol buffers, serialization, cross-platform} %% without formatting From noreply at r-forge.r-project.org Thu Jan 23 15:27:20 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 15:27:20 +0100 (CET) Subject: [Rprotobuf-commits] r845 - papers/jss Message-ID: <20140123142720.476031869DD@r-forge.r-project.org> Author: edd Date: 2014-01-23 15:27:19 +0100 (Thu, 23 Jan 2014) New Revision: 845 Modified: papers/jss/article.Rnw Log: a few micro-edits in Section 1 plural s in two places, hyphenating, minor rewording Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 05:55:47 UTC (rev 844) +++ papers/jss/article.Rnw 2014-01-23 14:27:19 UTC (rev 845) @@ -42,7 +42,7 @@ verbose, inefficient, not type-safe, or tied to a specific programming language. Protocol Buffers are a popular method of serializing structured data between applications---while remaining -independent of programming languages or operating system. +independent of programming languages or operating systems. They offer a unique combination of features, performance, and maturity that seems particulary well suited for data-driven applications and numerical computing. 
@@ -53,7 +53,7 @@ This paper outlines the general class of data serialization requirements for statistical computing, describes the implementation of the \CRANpkg{RProtoBuf} package, and illustrates its use with -examples applications in large-scale data collection pipelines and web +example applications in large-scale data collection pipelines and web services. %TODO(ms) keep it less than 150 words. -- I think this may be 154, %depending how emacs is counting. @@ -146,11 +146,11 @@ These pipelines are frequently built using different programming languages for the different phases of data analysis -- collection, cleaning, modeling, analysis, post-processing, and -presentation in order to take advantage of the unique combination of +presentation -- in order to take advantage of the unique combination of performance, speed of development, and library support offered by different environments and languages. Each stage of such a data analysis pipeline may produce intermediate results that need to be -stored in a file or sent over the network for further processing. +stored in a file, or sent over the network for further processing. % JO Perhaps also mention that serialization is needed for distributed % systems to make systems scale up? @@ -173,7 +173,7 @@ environment. %\paragraph*{Friends don't let friends use CSV!} -Data analysts and researchers often use character separated text formats such +Data analysts and researchers often use character-separated text formats such as \texttt{CSV} \citep{shafranovich2005common} to export and import data. However, anyone who has ever used \texttt{CSV} files will have noticed that this method has many limitations: it is restricted to tabular data, @@ -185,18 +185,18 @@ about any arbitrarily complex schema \citep{nolan2013xml}. 
However, it pays for this complexity with comparatively large and verbose messages, and added complexity at the parsing side (which are somewhat mitigated by the -availability of mature libraries and parsers). Because \texttt{XML} is text -based and has no native notion of numeric types or arrays, it usually not a +availability of mature libraries and parsers). Because \texttt{XML} is +text-based and has no native notion of numeric types or arrays, it usually not a very practical format to store numeric datasets as they appear in statistical applications. % -A more modern, widely used format is \emph{JavaScript ObjectNotation} +A more modern format is \emph{JavaScript ObjectNotation} (\texttt{JSON}), which is derived from the object literals of -\proglang{JavaScript}, and used increasingly on the world wide web. +\proglang{JavaScript}, and already widely-used on the world wide web. Several \proglang{R} packages implement functions to parse and generate \texttt{JSON} data from \proglang{R} objects \citep{rjson,RJSONIO,jsonlite}. -\texttt{JSON} natively supports arrays and 4 primitive types: numbers, strings, +\texttt{JSON} natively supports arrays and four primitive types: numbers, strings, booleans, and null. However, as it too is a text-based format, numbers are stored as human-readable decimal notation which is inefficient and leads to loss of type (double versus integer) and precision. @@ -222,7 +222,7 @@ Protocol Buffers \citep{protobuf}, Apache Thrift, and Apache Avro provide a compact well-documented schema for cross-language data structures and efficient binary interchange formats. Since the schema -is provided separately from the encoded data, the data can be +is provided separately from the data, the data can be efficiently encoded to minimize storage costs when compared with simple ``schema-less'' binary interchange formats. 
Many sources compare data serialization formats @@ -270,9 +270,8 @@ Section~\ref{sec:rprotobuf-basic} describes the interactive \proglang{R} interface provided by the \CRANpkg{RProtoBuf} package, and introduces the two main abstractions: \emph{Messages} and \emph{Descriptors}. Section~\ref{sec:rprotobuf-classes} -details the implementation details of the main S4 classes and methods -contained in this -package. Section~\ref{sec:types} describes the challenges of type coercion +details the implementation details of the main S4 classes and methods. +Section~\ref{sec:types} describes the challenges of type coercion between \proglang{R} and other languages. Section~\ref{sec:evaluation} introduces a general \proglang{R} language schema for serializing arbitrary \proglang{R} objects and evaluates it against the serialization capbilities built directly into \proglang{R}. Sections~\ref{sec:mapreduce} From noreply at r-forge.r-project.org Thu Jan 23 18:47:32 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 18:47:32 +0100 (CET) Subject: [Rprotobuf-commits] r846 - papers/jss Message-ID: <20140123174732.ADAD5186248@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-23 18:47:32 +0100 (Thu, 23 Jan 2014) New Revision: 846 Modified: papers/jss/article.Rnw Log: concisification Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 14:27:19 UTC (rev 845) +++ papers/jss/article.Rnw 2014-01-23 17:47:32 UTC (rev 846) @@ -37,8 +37,8 @@ Modern data collection and analysis pipelines often involve a sophisticated mix of applications written in general purpose and specialized programming languages. 
-There are many formats used to import and export data between -different programs or systems, such as CSV, but they are usually +Many formats commonly used to import and export data between +different programs or systems, such as \texttt{CSV} or \texttt{JSON}, are verbose, inefficient, not type-safe, or tied to a specific programming language. Protocol Buffers are a popular method of serializing structured data between applications---while remaining From noreply at r-forge.r-project.org Thu Jan 23 19:59:09 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 19:59:09 +0100 (CET) Subject: [Rprotobuf-commits] r847 - papers/jss Message-ID: <20140123185909.395011865B3@r-forge.r-project.org> Author: murray Date: 2014-01-23 19:59:08 +0100 (Thu, 23 Jan 2014) New Revision: 847 Modified: papers/jss/article.Rnw Log: Consistently use \CRANpkg for RProtoBuf and also use \code more. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 17:47:32 UTC (rev 846) +++ papers/jss/article.Rnw 2014-01-23 18:59:08 UTC (rev 847) @@ -30,7 +30,7 @@ %% for pretty printing and a nice hypersummary also set: \Plainauthor{Dirk Eddelbuettel, Murray Stokely, Jeroen Ooms} %% comma-separated \Plaintitle{RProtoBuf: Efficient Cross-Language Data Serialization in R} -\Shorttitle{\pkg{RProtoBuf}: Protocol Buffers in \proglang{R}} %% a short title (if necessary) +\Shorttitle{\CRANpkg{RProtoBuf}: Protocol Buffers in \proglang{R}} %% a short title (if necessary) %% an abstract and keywords \Abstract{ @@ -509,7 +509,7 @@ or every \texttt{.proto} file provided by a particular \proglang{R} package. After importing proto files, the corresponding message descriptors are -available from the \texttt{RProtoBuf:DescriptorPool} environment in +available from the \code{RProtoBuf:DescriptorPool} environment in the \proglang{R} search path. 
This environment is implemented with the user defined tables framework from the \pkg{RObjectTables} package available from the OmegaHat project \citep{RObjectTables}. Instead of @@ -641,17 +641,17 @@ \subsection{Parsing messages} -The \texttt{RProtoBuf} package defines the \texttt{read} and -\texttt{readASCII} functions to read messages from files, raw vectors, -or arbitrary connections. \texttt{read} expects to read the message -payload from binary files or connections and \texttt{readASCII} parses +The \CRANpkg{RProtoBuf} package defines the \code{read} and +\code{readASCII} functions to read messages from files, raw vectors, +or arbitrary connections. \code{read} expects to read the message +payload from binary files or connections and \code{readASCII} parses the human-readable ASCII output that is created with \code{as.character}. The binary representation of the message does not contain information that can be used to dynamically infer the message type, so we have to provide this information -to the \texttt{read} function in the form of a descriptor: +to the \code{read} function in the form of a descriptor: <<>>= msg <- read(tutorial.Person, tf1) @@ -737,8 +737,8 @@ Each method is wrapped individually which allows us to add user friendly custom error handling, type coercion, and performance improvements at the cost of a more verbose implementation. -The \pkg{RProtoBuf} package in many ways motivated -the development of \pkg{Rcpp} Modules \citep{eddelbuettel2013exposing}, +The \CRANpkg{RProtoBuf} package in many ways motivated +the development of \CRANpkg{Rcpp} Modules \citep{eddelbuettel2013exposing}, which provide a more concise way of wrapping \proglang{C++} functions and classes in a single entity. 
@@ -750,7 +750,7 @@ % grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l % 33 -The \texttt{RProtoBuf} package supports two forms for calling +The \CRANpkg{RProtoBuf} package supports two forms for calling functions with these S4 classes: \begin{itemize} \item The functional dispatch mechanism of the the form @@ -759,7 +759,7 @@ \verb|object$method(arguments)|. \end{itemize} -Additionally, \pkg{RProtoBuf} supports tab completion for all +Additionally, \CRANpkg{RProtoBuf} supports tab completion for all classes. Completion possibilities include pseudo-method names for all classes, plus \emph{dynamic dispatch} on names or types specific to a given object. This functionality is implemented with the @@ -1124,7 +1124,7 @@ \end{small} \caption{\label{table-get-types}Correspondence between field type and \proglang{R} type retrieved by the extractors. Note that \proglang{R} lacks native - 64-bit integers, so the \texttt{RProtoBuf.int64AsString} option is + 64-bit integers, so the \code{RProtoBuf.int64AsString} option is available to return large integers as characters to avoid losing precision. This option is described in Section~\ref{sec:int64}.} \end{table} @@ -1213,7 +1213,7 @@ When reading the value back into \proglang{R}, numeric types are returned by default, but when the full precision is required a character value -will be returned if the \texttt{RProtoBuf.int64AsString} option is set +will be returned if the \code{RProtoBuf.int64AsString} option is set to \texttt{TRUE}. 
The character values are useful because they can accurately be used as unique identifiers and can easily be passed to \proglang{R} packages such as \CRANpkg{int64} \citep{int64} or \CRANpkg{bit64} @@ -1235,7 +1235,7 @@ \section[Converting R Data Structures into Protocol Buffers]{Converting \proglang{R} Data Structures into Protocol Buffers} \label{sec:evaluation} -The previous sections discussed functionality in the \pkg{RProtoBuf} package +The previous sections discussed functionality in the \CRANpkg{RProtoBuf} package for creating, manipulating, parsing, and serializing Protocol Buffer messages of a defined schema. This is useful when there are pre-existing systems with defined schemas or significant software @@ -1251,12 +1251,12 @@ identical(iris, unserialize_pb(msg)) @ -In order to accomplish this, \pkg{RProtoBuf} uses the same catch-all \texttt{proto} +In order to accomplish this, \CRANpkg{RProtoBuf} uses the same catch-all \texttt{proto} schema used by \pkg{RHIPE} for exchanging \proglang{R} data with Hadoop \citep{rhipe}. This schema, which we will refer to as \texttt{rexp.proto}, is printed in %appendix \ref{rexp.proto}. the appendix. -The Protocol Buffer messages generated by \pkg{RProtoBuf} and +The Protocol Buffer messages generated by \CRANpkg{RProtoBuf} and \pkg{RHIPE} are naturally compatible between the two systems because they use the same schema. This shows the power of using a schema based cross-platform format such as Protocol Buffers: interoperability is achieved without effort or close coordination. @@ -1359,11 +1359,11 @@ in multiple languages instead of requiring other programs to parse the \proglang{R} serialization format. % \citep{serialization}. One takeaway from this table is that the universal \proglang{R} object schema -included in \pkg{RProtoBuf} does not in general provide +included in \CRANpkg{RProtoBuf} does not in general provide any significant saving in file size compared to the normal serialization mechanism in \proglang{R}. 
% redundant: which is seen as equally compact. -The benefits of \pkg{RProtoBuf} accrue more naturally in applications where +The benefits of \CRANpkg{RProtoBuf} accrue more naturally in applications where multiple programming languages are involved, or when a more concise application-specific schema has been defined. The example in the next section satisfies both of these conditions. @@ -1436,7 +1436,7 @@ \end{tabular} } \caption{Serialization sizes for default serialization in \proglang{R} and - RProtoBuf for 50 \proglang{R} datasets.} + \CRANpkg{RProtoBuf} for 50 \proglang{R} datasets.} \label{tab:compression} \end{center} \end{table} @@ -1488,8 +1488,8 @@ share a schema of the histogram representation to coordinate effectively. -The \pkg{HistogramTools} package \citep{histogramtools} enhances -\pkg{RProtoBuf} by providing a concise schema for \proglang{R} histogram objects: +The \CRANpkg{HistogramTools} package \citep{histogramtools} enhances +\CRANpkg{RProtoBuf} by providing a concise schema for \proglang{R} histogram objects: \begin{example} package HistogramTools; @@ -1550,7 +1550,7 @@ hist [1] "message of type 'HistogramTools.HistogramState' with 3 fields set" -# Convert to native \proglang{R} histogram object and plot +# Convert to native R histogram object and plot plot(as.histogram(hist)) \end{Code} @@ -1604,7 +1604,7 @@ function calls, and arguments/return values can be posted/retrieved using several data interchange formats, such as Protocol Buffers. OpenCPU uses the \texttt{serialize\_pb} and \texttt{unserialize\_pb} functions -from the \texttt{RProtoBuf} package to convert between \proglang{R} objects and protobuf +from the \CRANpkg{RProtoBuf} package to convert between \proglang{R} objects and protobuf messages. Therefore, clients need the \texttt{rexp.proto} descriptor mentioned earlier to parse and generate protobuf messages when interacting with OpenCPU. @@ -1750,13 +1750,13 @@ applications and numerical computing. 
%% DE Re-ordering so that we end on RProtoBuf -The \pkg{RProtoBuf} package builds on the Protocol Buffers \proglang{C++} library, +The \CRANpkg{RProtoBuf} package builds on the Protocol Buffers \proglang{C++} library, and extends the \proglang{R} system with the ability to create, read, write, parse, and manipulate Protocol -Buffer messages. \pkg{RProtoBuf} has been used extensively inside Google +Buffer messages. \CRANpkg{RProtoBuf} has been used extensively inside Google for the past three years by statisticians, analysts, and software engineers. At the time of this writing there are over 300 active -users of \pkg{RProtoBuf} using it to read data from and otherwise interact +users of \CRANpkg{RProtoBuf} using it to read data from and otherwise interact with distributed systems written in \proglang{C++}, \proglang{Java}, \proglang{Python}, and other languages. We hope that making Protocol Buffers available to the \proglang{R} community will contribute towards better software integration @@ -1805,7 +1805,7 @@ obscure protocol buffer semantics. Karl Millar was very helpful in reviewing code and offering suggestions. Saptarshi Guha's work on RHIPE and implementation of a universal message type for \proglang{R} -language objects allowed us to add the \code{serialize_pb} and \code{unserialize\_pb} +language objects allowed us to add the \code{serialize_pb} and \code{unserialize_pb} methods for turning arbitrary R objects into protocol buffers without a specialized pre-defined schema. @@ -1818,7 +1818,7 @@ \label{rexp.proto} Below a print of the \texttt{rexp.proto} schema (originally designed by \cite{rhipe}) -that is included with the \pkg{RProtoBuf} package and used by \texttt{serialize\_pb} and +that is included with the \CRANpkg{RProtoBuf} package and used by \texttt{serialize\_pb} and \texttt{unserialize\_pb}. 
\begin{verbatim} From noreply at r-forge.r-project.org Thu Jan 23 20:13:05 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Thu, 23 Jan 2014 20:13:05 +0100 (CET) Subject: [Rprotobuf-commits] r848 - papers/jss Message-ID: <20140123191305.EF955183ECF@r-forge.r-project.org> Author: murray Date: 2014-01-23 20:13:04 +0100 (Thu, 23 Jan 2014) New Revision: 848 Modified: papers/jss/article.Rnw Log: Add a total summary row giving the relative size reduction for different serialization formats, and reference the finding more specifically in the text. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 18:59:08 UTC (rev 847) +++ papers/jss/article.Rnw 2014-01-23 19:13:04 UTC (rev 848) @@ -1355,9 +1355,12 @@ returned by object.size() compared to the serialized sizes. %The summary compression sizes are listed below, and a full table for a %sample of 50 datasets is included on the next page. -Sizes are comparable but Protocol Buffers provide simple getters and setters -in multiple languages instead of requiring other programs to parse the \proglang{R} -serialization format. % \citep{serialization}. +Note that protocol buffer serialization results in slightly +smaller byte streams compared to native \proglang{R} serialization in most cases, +but this difference disappears if the results are compressed with gzip. +%Sizes are comparable but Protocol Buffers provide simple getters and setters +%in multiple languages instead of requiring other programs to parse the \proglang{R} +%serialization format. % \citep{serialization}. 
One takeaway from this table is that the universal \proglang{R} object schema included in \CRANpkg{RProtoBuf} does not in general provide any significant saving in file size compared to the normal serialization @@ -1433,6 +1436,9 @@ OrchardSprays & 3600 & 2164 & 445 & 1897 & 483 \\ WWWusage & 1232 & 916 & 274 & 859 & 251 \\ \bottomrule +% Total & 391176 & 327537 & 99161 & 313456 & 100308 \\ + Relative Size & 100\% & 83.7\% & 25.3\% & 80.1\% & 25.6\%\\ + \bottomrule \end{tabular} } \caption{Serialization sizes for default serialization in \proglang{R} and From noreply at r-forge.r-project.org Fri Jan 24 02:02:21 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Fri, 24 Jan 2014 02:02:21 +0100 (CET) Subject: [Rprotobuf-commits] r849 - papers/jss Message-ID: <20140124010221.622841869D0@r-forge.r-project.org> Author: edd Date: 2014-01-24 02:02:18 +0100 (Fri, 24 Jan 2014) New Revision: 849 Modified: papers/jss/article.Rnw Log: more hyphen for adverbial use, more Prot. Buf capitalization Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-23 19:13:04 UTC (rev 848) +++ papers/jss/article.Rnw 2014-01-24 01:02:18 UTC (rev 849) @@ -58,8 +58,8 @@ %TODO(ms) keep it less than 150 words. -- I think this may be 154, %depending how emacs is counting. 
} -\Keywords{\proglang{R}, \pkg{Rcpp}, protocol buffers, serialization, cross-platform} -\Plainkeywords{R, Rcpp, protocol buffers, serialization, cross-platform} %% without formatting +\Keywords{\proglang{R}, \pkg{Rcpp}, Protocol Buffers, serialization, cross-platform} +\Plainkeywords{R, Rcpp, Protocol Buffers, serialization, cross-platform} %% without formatting %% at least one keyword must be supplied %% publication information @@ -335,14 +335,14 @@ While traditional IDLs have at times been criticized for code bloat and complexity, Protocol Buffers are based on a simple list and records model that is flexible and easy to use. The schema for structured -protocol buffer data is defined in \texttt{.proto} files, which may +Protocol Buffer data is defined in \texttt{.proto} files, which may contain one or more message types. Each message type has one or more fields. A field is specified with a unique number, a name, a value type, and a field rule specifying whether the field is optional, required, or repeated. The supported value types are numbers, enumerations, booleans, strings, raw bytes, or other nested message -types. The \texttt{.proto} file syntax for defining the structure of protocol -buffer data is described comprehensively on Google Code\footnote{See +types. The \texttt{.proto} file syntax for defining the structure of Protocol +Buffer data is described comprehensively on Google Code\footnote{See \url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. Table~\ref{tab:proto} shows an example \texttt{.proto} file that defines the \texttt{tutorial.Person} type. 
The \proglang{R} code in the right @@ -472,7 +472,7 @@ \label{sec:rprotobuf-basic} This section describes how to use the \proglang{R} API to create and manipulate -protocol buffer messages in \proglang{R}, and how to read and write the +Protocol Buffer messages in \proglang{R}, and how to read and write the binary representation of the message (often called the \emph{payload}) to files and arbitrary binary \proglang{R} connections. The two fundamental building blocks of Protocol Buffers are \emph{Messages} @@ -510,8 +510,8 @@ After importing proto files, the corresponding message descriptors are available from the \code{RProtoBuf:DescriptorPool} environment in -the \proglang{R} search path. This environment is implemented with the user -defined tables framework from the \pkg{RObjectTables} package +the \proglang{R} search path. This environment is implemented with the +user-defined tables framework from the \pkg{RObjectTables} package available from the OmegaHat project \citep{RObjectTables}. Instead of being associated with a static hash table, this environment dynamically queries the in-memory database of loaded descriptors @@ -550,8 +550,8 @@ \subsection{Access and modify fields of a message} Once the message is created, its fields can be queried -and modified using the dollar operator of \proglang{R}, making protocol -buffer messages seem like lists. +and modified using the dollar operator of \proglang{R}, making Protocol +Buffer messages seem like lists. <<>>= p$name @@ -734,8 +734,8 @@ The \CRANpkg{Rcpp} package \citep{eddelbuettel2011rcpp,eddelbuettel2013seamless} is used to facilitate this integration of the \proglang{R} and \proglang{C++} code for these objects. 
-Each method is wrapped individually which allows us to add user -friendly custom error handling, type coercion, and performance +Each method is wrapped individually which allows us to add +user-friendly custom error handling, type coercion, and performance improvements at the cost of a more verbose implementation. The \CRANpkg{RProtoBuf} package in many ways motivated the development of \CRANpkg{Rcpp} Modules \citep{eddelbuettel2013exposing}, @@ -1019,7 +1019,7 @@ \textbf{Slot} & \textbf{Description} \\ \cmidrule(r){2-2} \texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the \proglang{C++} proto library. Documentation for the -\texttt{FileDescriptor} class is available from the protocol buffer project page: +\texttt{FileDescriptor} class is available from the Protocol Buffer project page: \url{http://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.descriptor.html#FileDescriptor} \\ \texttt{filename} & fully qualified pathname of the \texttt{.proto} file.\\ \texttt{package} & package name defined in this \texttt{.proto} file.\\[.3cm] @@ -1136,7 +1136,7 @@ Buffer schema, only accept \texttt{TRUE} or \texttt{FALSE}. This means that we simply can not store \proglang{R} logical vectors that include all three possible values as booleans. The library will refuse to store -\texttt{NA}s in protocol buffer boolean fields, and users must instead +\texttt{NA}s in Protocol Buffer boolean fields, and users must instead choose another type (such as enum or integer) capable of storing three distinct values. 
@@ -1159,7 +1159,7 @@ R> a$optional_bool <- NA \end{CodeInput} \begin{CodeOutput} -Error: NA boolean values can not be stored in bool protocol buffer fields +Error: NA boolean values can not be stored in bool Protocol Buffer fields \end{CodeOutput} \end{CodeChunk} @@ -1241,8 +1241,8 @@ pre-existing systems with defined schemas or significant software components written in other languages that need to be accessed from within \proglang{R}. -The package also provides methods for converting arbitrary \proglang{R} data structures into protocol -buffers and vice versa with a universal \proglang{R} object schema. The \code{serialize\_pb} and \code{unserialize\_pb} +The package also provides methods for converting arbitrary \proglang{R} data structures into Protocol +Buffers and vice versa with a universal \proglang{R} object schema. The \code{serialize\_pb} and \code{unserialize\_pb} functions serialize arbitrary \proglang{R} objects into a universal Protocol Buffer message: @@ -1355,7 +1355,7 @@ returned by object.size() compared to the serialized sizes. %The summary compression sizes are listed below, and a full table for a %sample of 50 datasets is included on the next page. -Note that protocol buffer serialization results in slightly +Note that Protocol Buffer serialization results in slightly smaller byte streams compared to native \proglang{R} serialization in most cases, but this difference disappears if the results are compressed with gzip. %Sizes are comparable but Protocol Buffers provide simple getters and setters @@ -1521,7 +1521,7 @@ This generates a Python module called \texttt{histogram\_pb2.py}, containing both the descriptor information as well as methods to read and manipulate the histogram message data. 
The following simple Python script uses this generated -module to create a histogram and write out the protocol buffer +module to create a histogram and write out the Protocol Buffer representation to a file: \begin{Code} @@ -1541,7 +1541,7 @@ outfile.close() \end{Code} -The protocol buffer can then be read into \proglang{R} and converted to a native +The Protocol Buffer can then be read into \proglang{R} and converted to a native \proglang{R} histogram object for plotting: \begin{Code} @@ -1808,11 +1808,11 @@ The user-defined table mechanism, implemented by Duncan Temple Lang for the purpose of the \pkg{RObjectTables} package, allows for the dynamic symbol lookup. Kenton Varda was generous with his time in reviewing code and explaining -obscure protocol buffer semantics. Karl Millar was very +obscure Protocol Buffer semantics. Karl Millar was very helpful in reviewing code and offering suggestions. Saptarshi Guha's work on RHIPE and implementation of a universal message type for \proglang{R} language objects allowed us to add the \code{serialize_pb} and \code{unserialize_pb} -methods for turning arbitrary R objects into protocol buffers without +methods for turning arbitrary R objects into Protocol Buffers without a specialized pre-defined schema. \newpage From noreply at r-forge.r-project.org Sun Jan 26 22:03:10 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 22:03:10 +0100 (CET) Subject: [Rprotobuf-commits] r850 - in papers/jss: . 
data figures Message-ID: <20140126210310.F066E18694D@r-forge.r-project.org> Author: edd Date: 2014-01-26 22:03:10 +0100 (Sun, 26 Jan 2014) New Revision: 850 Added: papers/jss/data/ papers/jss/data/serialization.csv papers/jss/figures/ papers/jss/figures/histogram-mapreduce-diag1.pdf papers/jss/figures/protobuf-distributed-system-crop.pdf Removed: papers/jss/histogram-mapreduce-diag1.pdf papers/jss/protobuf-distributed-system-crop.pdf papers/jss/serialization.csv Modified: papers/jss/article.Rnw Log: make the directory structure a little more organized Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-24 01:02:18 UTC (rev 849) +++ papers/jss/article.Rnw 2014-01-26 21:03:10 UTC (rev 850) @@ -106,7 +106,7 @@ \begin{document} -\SweaveOpts{concordance=FALSE} +\SweaveOpts{concordance=FALSE,prefix.string=figures/fig} %% include your article here, just as usual @@ -316,7 +316,7 @@ % page, but not just here in the middle of the page. \begin{figure}[tbp] \begin{center} -\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} +\includegraphics[width=\textwidth]{figures/protobuf-distributed-system-crop.pdf} \end{center} \caption{Example protobuf usage} \label{fig:protobuf-distributed-usecase} @@ -1480,7 +1480,7 @@ \begin{figure}[h!] 
\begin{center} -\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} +\includegraphics[width=\textwidth]{figures/histogram-mapreduce-diag1.pdf} \end{center} \caption{Diagram of MapReduce Histogram Generation Pattern} \label{fig:mr-histogram-pattern1} Copied: papers/jss/data/serialization.csv (from rev 849, papers/jss/serialization.csv) =================================================================== --- papers/jss/data/serialization.csv (rev 0) +++ papers/jss/data/serialization.csv 2014-01-26 21:03:10 UTC (rev 850) @@ -0,0 +1,51 @@ +data.set,size,r.size,r.gz.size,protobuf.size,protobuf.gz.size +uspop,584,268,172,211,148 +Titanic,1960,633,257,481,249 +volcano,42656,42517,5226,42476,4232 +euro.cross,2728,1319,910,1207,891 +attenu,14568,8234,2165,7771,2336 +ToothGrowth,2568,1486,349,1239,391 +lynx,1344,1028,429,971,404 +nottem,2352,2036,627,1979,641 +sleep,2752,746,282,483,260 +co2,4176,3860,1473,3803,1453 +austres,1144,828,439,771,410 +ability.cov,1944,716,357,589,341 +EuStockMarkets,60664,59785,21232,59674,19882 +treering,64272,63956,17647,63900,17758 +freeny.x,1944,1445,1311,1372,1289 +Puromycin,2088,813,306,620,320 +warpbreaks,2768,1231,310,811,343 +BOD,1088,334,182,226,168 +sunspots,22992,22676,6482,22620,6742 +beaver2,4184,3423,751,3468,840 +anscombe,2424,991,375,884,352 +esoph,5624,3111,548,2240,665 +PlantGrowth,1680,646,303,459,314 +infert,15848,14328,1172,13197,1404 +BJsales,1632,1316,496,1259,465 +stackloss,1688,917,293,844,283 +crimtab,7936,4641,713,1655,576 +LifeCycleSavings,6048,3014,1420,2825,1407 +Harman74.cor,9144,6056,2045,5861,2070 +nhtemp,912,596,240,539,223 +faithful,5136,4543,1339,4936,1776 +freeny,5296,2465,1518,2271,1507 +discoveries,1232,916,199,859,180 +state.x77,7168,4251,1754,4068,1756 +pressure,1096,498,277,427,273 +fdeaths,1008,692,291,635,272 +euro,976,264,186,202,161 +LakeHuron,1216,900,420,843,404 +mtcars,6736,3798,1204,3633,1206 +precip,4992,1793,813,1615,815 +state.area,440,422,246,405,235 
+attitude,3024,1990,544,1920,561 +randu,10496,9794,8859,10441,9558 +state.name,3088,844,408,724,415 +airquality,5496,4551,1241,2874,1294 +airmiles,624,308,170,251,148 +quakes,33112,32246,9898,29063,11595 +islands,3496,1232,563,1098,561 +OrchardSprays,3600,2164,445,1897,483 +WWWusage,1232,916,274,859,251 Copied: papers/jss/figures/histogram-mapreduce-diag1.pdf (from rev 849, papers/jss/histogram-mapreduce-diag1.pdf) =================================================================== (Binary files differ) Copied: papers/jss/figures/protobuf-distributed-system-crop.pdf (from rev 849, papers/jss/protobuf-distributed-system-crop.pdf) =================================================================== (Binary files differ) Deleted: papers/jss/histogram-mapreduce-diag1.pdf =================================================================== (Binary files differ) Deleted: papers/jss/protobuf-distributed-system-crop.pdf =================================================================== (Binary files differ) Deleted: papers/jss/serialization.csv =================================================================== --- papers/jss/serialization.csv 2014-01-24 01:02:18 UTC (rev 849) +++ papers/jss/serialization.csv 2014-01-26 21:03:10 UTC (rev 850) @@ -1,51 +0,0 @@ -data.set,size,r.size,r.gz.size,protobuf.size,protobuf.gz.size -uspop,584,268,172,211,148 -Titanic,1960,633,257,481,249 -volcano,42656,42517,5226,42476,4232 -euro.cross,2728,1319,910,1207,891 -attenu,14568,8234,2165,7771,2336 -ToothGrowth,2568,1486,349,1239,391 -lynx,1344,1028,429,971,404 -nottem,2352,2036,627,1979,641 -sleep,2752,746,282,483,260 -co2,4176,3860,1473,3803,1453 -austres,1144,828,439,771,410 -ability.cov,1944,716,357,589,341 -EuStockMarkets,60664,59785,21232,59674,19882 -treering,64272,63956,17647,63900,17758 -freeny.x,1944,1445,1311,1372,1289 -Puromycin,2088,813,306,620,320 -warpbreaks,2768,1231,310,811,343 -BOD,1088,334,182,226,168 -sunspots,22992,22676,6482,22620,6742 -beaver2,4184,3423,751,3468,840 
-anscombe,2424,991,375,884,352 -esoph,5624,3111,548,2240,665 -PlantGrowth,1680,646,303,459,314 -infert,15848,14328,1172,13197,1404 -BJsales,1632,1316,496,1259,465 -stackloss,1688,917,293,844,283 -crimtab,7936,4641,713,1655,576 -LifeCycleSavings,6048,3014,1420,2825,1407 -Harman74.cor,9144,6056,2045,5861,2070 -nhtemp,912,596,240,539,223 -faithful,5136,4543,1339,4936,1776 -freeny,5296,2465,1518,2271,1507 -discoveries,1232,916,199,859,180 -state.x77,7168,4251,1754,4068,1756 -pressure,1096,498,277,427,273 -fdeaths,1008,692,291,635,272 -euro,976,264,186,202,161 -LakeHuron,1216,900,420,843,404 -mtcars,6736,3798,1204,3633,1206 -precip,4992,1793,813,1615,815 -state.area,440,422,246,405,235 -attitude,3024,1990,544,1920,561 -randu,10496,9794,8859,10441,9558 -state.name,3088,844,408,724,415 -airquality,5496,4551,1241,2874,1294 -airmiles,624,308,170,251,148 -quakes,33112,32246,9898,29063,11595 -islands,3496,1232,563,1098,561 -OrchardSprays,3600,2164,445,1897,483 -WWWusage,1232,916,274,859,251 From mstokely at google.com Sun Jan 26 22:09:11 2014 From: mstokely at google.com (Murray Stokely) Date: Sun, 26 Jan 2014 13:09:11 -0800 Subject: [Rprotobuf-commits] r850 - in papers/jss: . data figures In-Reply-To: <20140126210310.F066E18694D@r-forge.r-project.org> References: <20140126210310.F066E18694D@r-forge.r-project.org> Message-ID: Thanks. I'm on my way back to California. Do you have a remaining todo list? Do you think we can send this out on Monday? 
Murray On Jan 26, 2014 4:03 PM, wrote: > Author: edd > Date: 2014-01-26 22:03:10 +0100 (Sun, 26 Jan 2014) > New Revision: 850 > > Added: > papers/jss/data/ > papers/jss/data/serialization.csv > papers/jss/figures/ > papers/jss/figures/histogram-mapreduce-diag1.pdf > papers/jss/figures/protobuf-distributed-system-crop.pdf > Removed: > papers/jss/histogram-mapreduce-diag1.pdf > papers/jss/protobuf-distributed-system-crop.pdf > papers/jss/serialization.csv > Modified: > papers/jss/article.Rnw > Log: > make the directory structure a little more organized > > > Modified: papers/jss/article.Rnw > =================================================================== > --- papers/jss/article.Rnw 2014-01-24 01:02:18 UTC (rev 849) > +++ papers/jss/article.Rnw 2014-01-26 21:03:10 UTC (rev 850) > @@ -106,7 +106,7 @@ > > > \begin{document} > -\SweaveOpts{concordance=FALSE} > +\SweaveOpts{concordance=FALSE,prefix.string=figures/fig} > > > %% include your article here, just as usual > @@ -316,7 +316,7 @@ > % page, but not just here in the middle of the page. > \begin{figure}[tbp] > \begin{center} > -\includegraphics[width=\textwidth]{protobuf-distributed-system-crop.pdf} > > +\includegraphics[width=\textwidth]{figures/protobuf-distributed-system-crop.pdf} > \end{center} > \caption{Example protobuf usage} > \label{fig:protobuf-distributed-usecase} > @@ -1480,7 +1480,7 @@ > > \begin{figure}[h!] 
> \begin{center} > -\includegraphics[width=\textwidth]{histogram-mapreduce-diag1.pdf} > +\includegraphics[width=\textwidth]{figures/histogram-mapreduce-diag1.pdf} > \end{center} > \caption{Diagram of MapReduce Histogram Generation Pattern} > \label{fig:mr-histogram-pattern1} > > Copied: papers/jss/data/serialization.csv (from rev 849, > papers/jss/serialization.csv) > =================================================================== > --- papers/jss/data/serialization.csv (rev 0) > +++ papers/jss/data/serialization.csv 2014-01-26 21:03:10 UTC (rev 850) > @@ -0,0 +1,51 @@ > +data.set,size,r.size,r.gz.size,protobuf.size,protobuf.gz.size > +uspop,584,268,172,211,148 > +Titanic,1960,633,257,481,249 > +volcano,42656,42517,5226,42476,4232 > +euro.cross,2728,1319,910,1207,891 > +attenu,14568,8234,2165,7771,2336 > +ToothGrowth,2568,1486,349,1239,391 > +lynx,1344,1028,429,971,404 > +nottem,2352,2036,627,1979,641 > +sleep,2752,746,282,483,260 > +co2,4176,3860,1473,3803,1453 > +austres,1144,828,439,771,410 > +ability.cov,1944,716,357,589,341 > +EuStockMarkets,60664,59785,21232,59674,19882 > +treering,64272,63956,17647,63900,17758 > +freeny.x,1944,1445,1311,1372,1289 > +Puromycin,2088,813,306,620,320 > +warpbreaks,2768,1231,310,811,343 > +BOD,1088,334,182,226,168 > +sunspots,22992,22676,6482,22620,6742 > +beaver2,4184,3423,751,3468,840 > +anscombe,2424,991,375,884,352 > +esoph,5624,3111,548,2240,665 > +PlantGrowth,1680,646,303,459,314 > +infert,15848,14328,1172,13197,1404 > +BJsales,1632,1316,496,1259,465 > +stackloss,1688,917,293,844,283 > +crimtab,7936,4641,713,1655,576 > +LifeCycleSavings,6048,3014,1420,2825,1407 > +Harman74.cor,9144,6056,2045,5861,2070 > +nhtemp,912,596,240,539,223 > +faithful,5136,4543,1339,4936,1776 > +freeny,5296,2465,1518,2271,1507 > +discoveries,1232,916,199,859,180 > +state.x77,7168,4251,1754,4068,1756 > +pressure,1096,498,277,427,273 > +fdeaths,1008,692,291,635,272 > +euro,976,264,186,202,161 > +LakeHuron,1216,900,420,843,404 > 
+mtcars,6736,3798,1204,3633,1206 > +precip,4992,1793,813,1615,815 > +state.area,440,422,246,405,235 > +attitude,3024,1990,544,1920,561 > +randu,10496,9794,8859,10441,9558 > +state.name,3088,844,408,724,415 > +airquality,5496,4551,1241,2874,1294 > +airmiles,624,308,170,251,148 > +quakes,33112,32246,9898,29063,11595 > +islands,3496,1232,563,1098,561 > +OrchardSprays,3600,2164,445,1897,483 > +WWWusage,1232,916,274,859,251 > > Copied: papers/jss/figures/histogram-mapreduce-diag1.pdf (from rev 849, > papers/jss/histogram-mapreduce-diag1.pdf) > =================================================================== > (Binary files differ) > > Copied: papers/jss/figures/protobuf-distributed-system-crop.pdf (from rev > 849, papers/jss/protobuf-distributed-system-crop.pdf) > =================================================================== > (Binary files differ) > > Deleted: papers/jss/histogram-mapreduce-diag1.pdf > =================================================================== > (Binary files differ) > > Deleted: papers/jss/protobuf-distributed-system-crop.pdf > =================================================================== > (Binary files differ) > > Deleted: papers/jss/serialization.csv > =================================================================== > --- papers/jss/serialization.csv 2014-01-24 01:02:18 UTC (rev 849) > +++ papers/jss/serialization.csv 2014-01-26 21:03:10 UTC (rev 850) > @@ -1,51 +0,0 @@ > -data.set,size,r.size,r.gz.size,protobuf.size,protobuf.gz.size > -uspop,584,268,172,211,148 > -Titanic,1960,633,257,481,249 > -volcano,42656,42517,5226,42476,4232 > -euro.cross,2728,1319,910,1207,891 > -attenu,14568,8234,2165,7771,2336 > -ToothGrowth,2568,1486,349,1239,391 > -lynx,1344,1028,429,971,404 > -nottem,2352,2036,627,1979,641 > -sleep,2752,746,282,483,260 > -co2,4176,3860,1473,3803,1453 > -austres,1144,828,439,771,410 > -ability.cov,1944,716,357,589,341 > -EuStockMarkets,60664,59785,21232,59674,19882 > -treering,64272,63956,17647,63900,17758 > 
-freeny.x,1944,1445,1311,1372,1289 > -Puromycin,2088,813,306,620,320 > -warpbreaks,2768,1231,310,811,343 > -BOD,1088,334,182,226,168 > -sunspots,22992,22676,6482,22620,6742 > -beaver2,4184,3423,751,3468,840 > -anscombe,2424,991,375,884,352 > -esoph,5624,3111,548,2240,665 > -PlantGrowth,1680,646,303,459,314 > -infert,15848,14328,1172,13197,1404 > -BJsales,1632,1316,496,1259,465 > -stackloss,1688,917,293,844,283 > -crimtab,7936,4641,713,1655,576 > -LifeCycleSavings,6048,3014,1420,2825,1407 > -Harman74.cor,9144,6056,2045,5861,2070 > -nhtemp,912,596,240,539,223 > -faithful,5136,4543,1339,4936,1776 > -freeny,5296,2465,1518,2271,1507 > -discoveries,1232,916,199,859,180 > -state.x77,7168,4251,1754,4068,1756 > -pressure,1096,498,277,427,273 > -fdeaths,1008,692,291,635,272 > -euro,976,264,186,202,161 > -LakeHuron,1216,900,420,843,404 > -mtcars,6736,3798,1204,3633,1206 > -precip,4992,1793,813,1615,815 > -state.area,440,422,246,405,235 > -attitude,3024,1990,544,1920,561 > -randu,10496,9794,8859,10441,9558 > -state.name,3088,844,408,724,415 > -airquality,5496,4551,1241,2874,1294 > -airmiles,624,308,170,251,148 > -quakes,33112,32246,9898,29063,11595 > -islands,3496,1232,563,1098,561 > -OrchardSprays,3600,2164,445,1897,483 > -WWWusage,1232,916,274,859,251 > > _______________________________________________ > Rprotobuf-commits mailing list > Rprotobuf-commits at lists.r-forge.r-project.org > > https://lists.r-forge.r-project.org/cgi-bin/mailman/listinfo/rprotobuf-commits > -------------- next part -------------- An HTML attachment was scrubbed... 
URL: From noreply at r-forge.r-project.org Sun Jan 26 22:17:54 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 22:17:54 +0100 (CET) Subject: [Rprotobuf-commits] r851 - papers/jss Message-ID: <20140126211754.9BA75185B79@r-forge.r-project.org> Author: edd Date: 2014-01-26 22:17:54 +0100 (Sun, 26 Jan 2014) New Revision: 851 Modified: papers/jss/article.Rnw Log: section and subsection in sentence style Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 21:03:10 UTC (rev 850) +++ papers/jss/article.Rnw 2014-01-26 21:17:54 UTC (rev 851) @@ -468,7 +468,7 @@ %languages to support protocol buffers is compiled as part of the %project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} -\section{Basic Usage: Messages and Descriptors} +\section{Basic Usage: Messages and descriptors} \label{sec:rprotobuf-basic} This section describes how to use the \proglang{R} API to create and manipulate @@ -495,7 +495,7 @@ %from a variety of data streams using a variety of different %languages. 
The definition -\subsection[Importing Message Descriptors from .proto files]{Importing Message Descriptors from \texttt{.proto} files} +\subsection[Importing message descriptors from .proto files]{Importing message descriptors from \texttt{.proto} files} %The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, %which encapsulate a data structure, Descriptors, which define the @@ -692,7 +692,7 @@ @ -\section{Under the hood: S4 Classes, Methods, and Pseudo Methods} +\section{Under the hood: S4 classes, methods, and pseudo methods} \label{sec:rprotobuf-classes} The \CRANpkg{RProtoBuf} package uses the S4 system to store @@ -890,7 +890,7 @@ \caption{\label{Descriptor-methods-table}Description of slots and methods for the \texttt{Descriptor} S4 class} \end{table} -\subsection{Field Descriptors} +\subsection{Field descriptors} \label{subsec-field-descriptor} The class \emph{FieldDescriptor} represents field @@ -942,7 +942,7 @@ % separate '$' dispatch like Messages, Descriptors, and % EnumDescriptors do. Should it? -\subsection{Enum Descriptors} +\subsection{Enum descriptors} \label{subsec-enum-descriptor} The class \emph{EnumDescriptor} represents enum descriptors in \proglang{R}. @@ -993,7 +993,7 @@ \caption{\label{enumdescriptor-methods-table}Description of slots and methods for the \texttt{EnumDescriptor} S4 class} \end{table} -\subsection{File Descriptors} +\subsection{File descriptors} \label{subsec-file-descriptor} The class \emph{FileDescriptor} represents file descriptors in \proglang{R}. 
@@ -1037,7 +1037,7 @@ \caption{\label{filedescriptor-methods-table}Description of slots and methods for the \texttt{FileDescriptor} S4 class} \end{table} -\subsection{Enum Value Descriptors} +\subsection{Enum value descriptors} \label{subsec-enumvalue-descriptor} The class \emph{EnumValueDescriptor} represents enumeration value @@ -1080,7 +1080,7 @@ and methods for the \texttt{EnumValueDescriptor} S4 class} \end{table} -\section{Type Coercion} +\section{Type coercion} \label{sec:types} One of the benefits of using an Interface Definition Language (IDL) @@ -1163,7 +1163,7 @@ \end{CodeOutput} \end{CodeChunk} -\subsection{Unsigned Integers} +\subsection{Unsigned integers} \proglang{R} lacks a native unsigned integer type. Values between $2^{31}$ and $2^{32} - 1$ read from unsigned integer Protocol Buffer fields must be @@ -1232,7 +1232,7 @@ options("RProtoBuf.int64AsString" = FALSE) @ -\section[Converting R Data Structures into Protocol Buffers]{Converting \proglang{R} Data Structures into Protocol Buffers} +\section[Converting R data structures into Protocol Buffers]{Converting \proglang{R} data structures into Protocol Buffers} \label{sec:evaluation} The previous sections discussed functionality in the \CRANpkg{RProtoBuf} package @@ -1274,7 +1274,7 @@ \texttt{deparse} to convert functions or language objects into strings, or \texttt{as.list} for environments. -\subsection[Evaluation: Converting R Data Sets]{Evaluation: Converting \proglang{R} Data Sets} +\subsection[Evaluation: Converting R data sets]{Evaluation: Converting \proglang{R} data sets} To illustrate how this method works, we attempt to convert all of the built-in datasets from \proglang{R} into this serialized Protocol Buffer representation. 
@@ -1316,7 +1316,7 @@ attr(object, "formula") @ -\subsection{Compression Performance} +\subsection{Compression performance} \label{sec:compression} This section compares how many bytes are used to store data sets @@ -1448,7 +1448,7 @@ \end{table} -\section{Application: Distributed Data Collection with MapReduce} +\section{Application: Distributed data collection with MapReduce} \label{sec:mapreduce} Many large data sets in fields such as particle physics and information @@ -1574,7 +1574,7 @@ large-scale studies of distributed storage systems \citep{sciencecloud,janus}. -\section{Application: Data Interchange in Web Services} +\section{Application: Data Interchange in web Services} \label{sec:opencpu} % TODO(jeroen): I think maybe some of this should go earlier in the From noreply at r-forge.r-project.org Sun Jan 26 22:20:19 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 22:20:19 +0100 (CET) Subject: [Rprotobuf-commits] r852 - papers/jss Message-ID: <20140126212019.1344A185E63@r-forge.r-project.org> Author: edd Date: 2014-01-26 22:20:18 +0100 (Sun, 26 Jan 2014) New Revision: 852 Modified: papers/jss/article.Rnw Log: sentence style in captions Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 21:17:54 UTC (rev 851) +++ papers/jss/article.Rnw 2014-01-26 21:20:18 UTC (rev 852) @@ -727,8 +727,8 @@ EnumValueDescriptor & 3 & \phantom{1}6 & no\\ \bottomrule \end{tabular} -\caption{\label{class-summary-table}Overview of Class, Slot, Method and - Dispatch Relationships} +\caption{\label{class-summary-table}Overview of class, slot, method and + dispatch relationships} \end{table} The \CRANpkg{Rcpp} package @@ -1482,7 +1482,7 @@ \begin{center} \includegraphics[width=\textwidth]{figures/histogram-mapreduce-diag1.pdf} \end{center} -\caption{Diagram of MapReduce Histogram Generation Pattern} +\caption{Diagram of MapReduce histogram 
generation pattern} \label{fig:mr-histogram-pattern1} \end{figure} From noreply at r-forge.r-project.org Sun Jan 26 22:28:13 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 22:28:13 +0100 (CET) Subject: [Rprotobuf-commits] r853 - papers/jss Message-ID: <20140126212814.09B4018614D@r-forge.r-project.org> Author: edd Date: 2014-01-26 22:28:13 +0100 (Sun, 26 Jan 2014) New Revision: 853 Modified: papers/jss/article.Rnw Log: i.e. and e.g. should have comma (see Style Guide, Misc) Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 21:20:18 UTC (rev 852) +++ papers/jss/article.Rnw 2014-01-26 21:28:13 UTC (rev 853) @@ -1584,7 +1584,7 @@ As described earlier, the primary application of Protocol Buffers is data interchange in the context of inter-system communications. Network protocols -such as HTTP provide mechanisms for client-server communication, i.e.~how to +such as HTTP provide mechanisms for client-server communication, i.e., how to initiate requests, authenticate, send messages, etc. 
However, network protocols generally do not regulate the \emph{content} of messages: they allow transfer of any media type, such as web pages, static files or From noreply at r-forge.r-project.org Sun Jan 26 22:47:30 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 22:47:30 +0100 (CET) Subject: [Rprotobuf-commits] r854 - papers/jss Message-ID: <20140126214730.B6A0E18675F@r-forge.r-project.org> Author: edd Date: 2014-01-26 22:47:30 +0100 (Sun, 26 Jan 2014) New Revision: 854 Modified: papers/jss/article.Rnw Log: MASSIVE killing of comments which were becoming ballast Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 21:28:13 UTC (rev 853) +++ papers/jss/article.Rnw 2014-01-26 21:47:30 UTC (rev 854) @@ -130,19 +130,11 @@ \maketitle -\section{Introduction} % TODO(DE) More sober: Friends don't let friends use CSV} -% NOTE(MS): I really do think we can use add back: -% \section{Introduction: Friends Don't Let Friends Use CSV} -% I didn't use proper Title Caps the first time around but really I -% think it makes the paper more readable to have a tl;dr intro title -% that is fun and engaging since this paper is still on the dry/boring -% side. +\section{Introduction} + Modern data collection and analysis pipelines increasingly involve collections of decoupled components in order to better manage software complexity through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. -% This is really a different pattern not connected well here. -%Data analysis patterns such as Split-Apply-Combine -%\citep{wickham2011split} explicitly break up large problems into manageable pieces. 
These pipelines are frequently built using different programming languages for the different phases of data analysis -- collection, cleaning, modeling, analysis, post-processing, and @@ -151,8 +143,6 @@ different environments and languages. Each stage of such a data analysis pipeline may produce intermediate results that need to be stored in a file, or sent over the network for further processing. -% JO Perhaps also mention that serialization is needed for distributed -% systems to make systems scale up? Given these requirements, how do we safely and efficiently share intermediate results between different applications, possibly written in different @@ -161,18 +151,12 @@ translating data structures, variables, and session state into a format that can be stored or transmitted and then reconstructed in the original form later \citep{clinec++}. -% Reverted to my original above, because the replacement below puts me -% to sleep: -%Such systems require reliable and efficient exchange of intermediate -%results between the individual components, using formats that are -%independent of platform, language, operating system or architecture. Programming languages such as \proglang{R}, \proglang{Julia}, \proglang{Java}, and \proglang{Python} include built-in support for serialization, but the default formats are usually language-specific and thereby lock the user into a single environment. -%\paragraph*{Friends don't let friends use CSV!} Data analysts and researchers often use character-separated text formats such as \texttt{CSV} \citep{shafranovich2005common} to export and import data. However, anyone who has ever used \texttt{CSV} files will have noticed @@ -189,8 +173,8 @@ text-based and has no native notion of numeric types or arrays, it usually not a very practical format to store numeric datasets as they appear in statistical applications. 
-% + A more modern format is \emph{JavaScript ObjectNotation} (\texttt{JSON}), which is derived from the object literals of \proglang{JavaScript}, and already widely-used on the world wide web. @@ -205,16 +189,6 @@ are not widely supported. Furthermore, such formats lack a separate schema for the serialized data and thus still duplicate field names with each message sent over the network or stored in a file. -% and still must send duplicate field names -% with each message since there is no separate schema. -% \pkg{MessagePack} -% and \pkg{BSON} both have \proglang{R} -% interfaces \citep{msgpackR,rmongodb}, but these formats lack a separate schema for the serialized -% data and thus still duplicate field names with each message sent over -% the network or stored in a file. Such formats also lack support for -% versioning when data storage needs evolve over time, or when -% application logic and requirement changes dictate updates to the -%message format. Once the data serialization needs of an application become complex enough, developers typically benefit from the use of an @@ -229,40 +203,6 @@ and show Protocol Buffers compare very favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. -% Too technical, move to section 2. -% The schema can be used to generate model classes for statically-typed programming languages -%such as C++ and Java, or can be used with reflection for dynamically-typed programming -%languages. - -% TODO(mstokely): Will need to define reflection if we use it here. -% Maybe in the next section since its not as key as 'serialization' -% which we already defined. - -%\paragraph*{Enter Protocol Buffers:} - -% In 2008, and following several years of internal use, Google released an open -% source version of Protocol Buffers. It provides data -% interchange format that was designed and used for their internal infrastructure. 
-% Google officially provides high-quality parsing libraries for \texttt{Java}, -% \texttt{C++} and \texttt{Python}, and community-developed open source implementations -% are available for many other languages. -% Protocol Buffers take a quite different approach from many other popular formats. - -% TODO(mstokely): Good sentence from Jeroen, add it here or sec 2. -% They offer a unique combination of features, performance, and maturity that seems -% particulary well suited for data-driven applications and numerical -% computing. - -% TODO(DE): Mention "future proof" forward compatibility of schemata - - -% TODO(mstokely): Take a more conversational tone here asking -% questions and motivating protocol buffers? - -% NOTE(mstokely): I don't like these roadmap paragraphs in general, -% but it seems ueful here because we have a boring bit in the middle -% (full class/method details) and interesting applications at the end. - This paper describes an \proglang{R} interface to Protocol Buffers, and is organized as follows. Section~\ref{sec:protobuf} provides a general high-level overview of Protocol Buffers as well as a basic @@ -279,24 +219,9 @@ in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. -%This article describes the basics of Google's Protocol Buffers through -%an easy to use \proglang{R} package, \CRANpkg{RProtoBuf}. After describing the -%basics of protocol buffers and \CRANpkg{RProtoBuf}, we illustrate -%several common use cases for protocol buffers in data analysis. - \section{Protocol Buffers} \label{sec:protobuf} -% JO: I'm not sure where to put this paragraph. I think it is too technical -% for the introduction section. Maybe start this section with some explanation -% of what a schema is and then continue with showing how PB implement this? -% MS: Yes I agree, tried to address below. - -% This content is good. Maybe use and cite? 
-% http://martin.kleppmann.com/2012/12/05/schema-evolution-in-avro-protocol-buffers-thrift.html - -%% TODO(de,ms) What follows is oooooold and was lifted from the webpage -%% Rewrite? Protocol Buffers are a modern, language-neutral, platform-neutral, extensible mechanism for sharing and storing structured data. Some of the key features provided by Protocol Buffers for data analysis include: @@ -312,8 +237,6 @@ decade. \end{itemize} -% Lets place this at the top of the page or the bottom, or on a float -% page, but not just here in the middle of the page. \begin{figure}[tbp] \begin{center} \includegraphics[width=\textwidth]{figures/protobuf-distributed-system-crop.pdf} @@ -349,7 +272,6 @@ column shows an example of creating a new message of this type and populating its fields. -%% TODO(de) Can we make this not break the width of the page? \noindent \begin{table} \begin{tabular}{p{.40\textwidth}p{0.55\textwidth}} @@ -396,41 +318,6 @@ \end{table} -% The schema can be used to generate model classes for statically-typed programming languages -%such as C++ and Java, or can be used with reflection for dynamically-typed programming -%languages. - -% TODO(mstokely): Maybe find a place to add this? -% Since their -% introduction, Protocol Buffers have been widely adopted in industry with -% applications as varied as %database-internal messaging (Drizzle), % DE: citation? -% Sony Playstations, Twitter, Google Search, Hadoop, and Open Street -% Map. - -% TODO(DE): This either needs a citation, or remove the name drop -% MS: These are mostly from blog posts, I can't find a good reference -% that has a long list, and the name and year citation style seems -% less conducive to long lists of marginal citations like blog posts -% compared to say concise CS/math style citations [3,4,5,6]. Thoughts? 
- - -% The schema can be used to generate classes for statically-typed programming languages -% such as C++ and Java, or can be used with reflection for dynamically-typed programming -% languages. - - - -%Protocol buffers are a language-neutral, platform-neutral, extensible -%way of serializing structured data for use in communications -%protocols, data storage, and more. - -%Protocol Buffers offer key features such as an efficient data interchange -%format that is both language- and operating system-agnostic yet uses a -%lightweight and highly performant encoding, object serialization and -%de-serialization as well data and configuration management. Protocol -%buffers are also forward compatible: updates to the \texttt{proto} -%files do not break programs built against the previous specification. - For added speed and efficiency, the \proglang{C++}, \proglang{Java}, and \proglang{Python} bindings to Protocol Buffers are used with a compiler that translates a Protocol @@ -439,35 +326,11 @@ manipulate Protocol Buffer messages. The \proglang{R} interface, in contrast, uses a reflection-based API that makes some operations slightly slower but which is much more convenient for interactive data analysis. -%particularly well-suited for -%interactive data analysis. All messages in \proglang{R} have a single class structure, but different accessor methods are created at runtime based on the named fields of the specified message type, as described in the next section. -% In other words, given the 'proto' -%description file, code is automatically generated for the chosen -%target language(s). The project page contains a tutorial for each of -%these officially supported languages: -%\url{http://code.google.com/apis/protocolbuffers/docs/tutorials.html} - -%The protocol buffers code is released under an open-source (BSD) license. 
The -%protocol buffer project (\url{http://code.google.com/p/protobuf/}) -%contains a C++ library and a set of runtime libraries and compilers for -%C++, Java and Python. - -%With these languages, the workflow follows standard practice of so-called -%Interface Description Languages (IDL) -%(c.f. \href{http://en.wikipedia.org/wiki/Interface_description_language}{Wikipedia -% on IDL}). This consists of compiling a protocol buffer description file -%(ending in \texttt{.proto}) into language specific classes that can be used - -%Besides the officially supported C++, Java and Python implementations, several projects have been -%created to support protocol buffers for many languages. The list of known -%languages to support protocol buffers is compiled as part of the -%project page: \url{http://code.google.com/p/protobuf/wiki/ThirdPartyAddOns} - \section{Basic Usage: Messages and descriptors} \label{sec:rprotobuf-basic} @@ -481,27 +344,8 @@ Message Descriptors are defined in \texttt{.proto} files and define a schema for a particular named class of messages. -% Commented out because we said this earlier. -%This separation -%between schema and the message objects is in contrast to -%more verbose formats like JSON, and when combined with the efficient -%binary representation of any Message object explains a large part of -%the performance and storage-space advantage offered by Protocol -%Buffers. TODO(ms): we already said some of this above. clean up. - -% lifted from protobuf page: -%With Protocol Buffers you define how you want your data to be -%structured once, and then you can read or write structured data to and -%from a variety of data streams using a variety of different -%languages. 
The definition - \subsection[Importing message descriptors from .proto files]{Importing message descriptors from \texttt{.proto} files} -%The three basic abstractions of \CRANpkg{RProtoBuf} are Messages, -%which encapsulate a data structure, Descriptors, which define the -%schema used by one or more messages, and DescriptorPools, which -%provide access to descriptors. - To create or parse a Protocol Buffer Message, one must first read in the message type specification from a \texttt{.proto} file. The \texttt{.proto} files are imported using the \code{readProtoFiles} @@ -521,16 +365,6 @@ ls("RProtoBuf:DescriptorPool") @ -%\subsection{Importing proto files} -%In contrast to the other languages (Java, C++, Python) that are officially -%supported by Google, the implementation used by the \texttt{RProtoBuf} -%package does not rely on the \texttt{protoc} compiler (with the exception of -%the two functions discussed in the previous section). This means that no -%initial step of statically compiling the proto file into C++ code that is -%then accessed by \proglang{R} code is necessary. Instead, \texttt{proto} files are -%parsed and processed \textsl{at runtime} by the protobuf C++ library---which -%is much more appropriate for a dynamic language. - \subsection{Creating a message} New messages are created with the \texttt{new} function which accepts @@ -574,8 +408,6 @@ 64-bit integer support. A workaround is available and described in Section~\ref{sec:int64} for working with large integer values. -% TODO(mstokely): Document extensions here. -% There are none in addressbook.proto though. \subsection{Display messages} @@ -708,8 +540,6 @@ glue code between the \proglang{R} language classes and the underlying \proglang{C++} classes. -% MS: I think this looks better at the bottom of the page. -% so it appears after the new section starts where it is referenced. 
\begin{table}[bp] \centering \begin{tabular}{lccl} @@ -742,13 +572,6 @@ which provide a more concise way of wrapping \proglang{C++} functions and classes in a single entity. -% Message, Descriptor, FieldDescriptor, EnumDescriptor, -% FileDescriptor, EnumValueDescriptor -% -% grep RPB_FUNC * | grep -v define|wc -l -% 84 -% grep RPB_ * | grep -v RPB_FUNCTION | grep METHOD|wc -l -% 33 The \CRANpkg{RProtoBuf} package supports two forms for calling functions with these S4 classes: @@ -938,9 +761,6 @@ methods for the \texttt{FieldDescriptor} S4 class} \end{table} -% TODO(ms): Useful distinction to make -- FieldDescriptor does not do -% separate '$' dispatch like Messages, Descriptors, and -% EnumDescriptors do. Should it? \subsection{Enum descriptors} \label{subsec-enum-descriptor} @@ -1371,8 +1191,6 @@ application-specific schema has been defined. The example in the next section satisfies both of these conditions. -% N.B. see table.Rnw for how this table is created. -% % latex table generated in \proglang{R} 3.0.2 by xtable 1.7-0 package % Fri Dec 27 17:00:03 2013 \begin{table}[h!] @@ -1577,11 +1395,6 @@ \section{Application: Data Interchange in web Services} \label{sec:opencpu} -% TODO(jeroen): I think maybe some of this should go earlier in the -% paper, so this part can focus only on introducing the application, -% Can you integrate some of this text earlier, maybe into the the -% introduction? - As described earlier, the primary application of Protocol Buffers is data interchange in the context of inter-system communications. 
Network protocols such as HTTP provide mechanisms for client-server communication, i.e., how to @@ -1739,7 +1552,7 @@ outputmsg <- serialize_pb(val) @ -\section{Summary} % DE Simpler title +\section{Summary} \label{sec:summary} Over the past decade, many formats for interoperable data exchange have become available, each with their unique features, @@ -1755,7 +1568,6 @@ performance, and maturity, that seems particulary well suited for data-driven applications and numerical computing. -%% DE Re-ordering so that we end on RProtoBuf The \CRANpkg{RProtoBuf} package builds on the Protocol Buffers \proglang{C++} library, and extends the \proglang{R} system with the ability to create, read, write, parse, and manipulate Protocol @@ -1769,35 +1581,8 @@ and allow for building even more advanced applications and analysis pipelines with \proglang{R}. -%\emph{Other Approaches} -% -%== JO: I don't really like this section here, it gives the entire paper a bit of a -%sour aftertaste. Perhaps we can mention performance caveats in the technical -%sections? I think it's nicer to leave it at the above paragraphs.== -% -% DE: Agreed -- commenting out -%% \pkg{RProtoBuf} is quite flexible and easy to use for interactive use, -%% but it is not designed for efficient high-speed manipulation of large -%% numbers of protocol buffers once they have been read into R. For -%% example, taking a list of 100,000 Protocol Buffers, extracting a named -%% field from each one, and computing an aggregate statistic on those -%% values would be relatively slow with RProtoBuf. Mechanisms to address -%% such use cases are under investigation for possible incorporation into -%% future releases of RProtoBuf, but currently, the package relies on -%% other database systems to provide query and aggregation semantics -%% before the resulting protocol buffers are read into R. Inside Google, -%% the Dremel query system \citep{dremel} is often employed in this role -%% in conjunction with \pkg{RProtoBuf}. 
-% Such queries could be -%supported in a future version of \pkg{RProtoBuf} by supporting a -%vector of messages type such that \emph{slicing} operations over a -%given field across a large number of messages could be done -%efficiently in \proglang{C++}. - - - \section*{Acknowledgments} The first versions of \CRANpkg{RProtoBuf} were written during 2009-2010. From noreply at r-forge.r-project.org Sun Jan 26 23:29:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Sun, 26 Jan 2014 23:29:04 +0100 (CET) Subject: [Rprotobuf-commits] r855 - papers/jss Message-ID: <20140126222904.0D7B6185F47@r-forge.r-project.org> Author: edd Date: 2014-01-26 23:29:03 +0100 (Sun, 26 Jan 2014) New Revision: 855 Modified: papers/jss/article.Rnw Log: one spell checks, lots of 'data set' instead of dataset, one 'work flow' Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 21:47:30 UTC (rev 854) +++ papers/jss/article.Rnw 2014-01-26 22:29:03 UTC (rev 855) @@ -136,9 +136,9 @@ of decoupled components in order to better manage software complexity through reusability, modularity, and fault isolation \citep{Wegiel:2010:CTT:1932682.1869479}. These pipelines are frequently built using different programming -languages for the different phases of data analysis -- collection, +languages for the different phases of data analysis --- collection, cleaning, modeling, analysis, post-processing, and -presentation -- in order to take advantage of the unique combination of +presentation --- in order to take advantage of the unique combination of performance, speed of development, and library support offered by different environments and languages. Each stage of such a data analysis pipeline may produce intermediate results that need to be @@ -171,7 +171,7 @@ complexity at the parsing side (which are somewhat mitigated by the availability of mature libraries and parsers). 
Because \texttt{XML} is text-based and has no native notion of numeric types or arrays, it usually not a -very practical format to store numeric datasets as they appear in statistical +very practical format to store numeric data sets as they appear in statistical applications. @@ -214,7 +214,7 @@ Section~\ref{sec:types} describes the challenges of type coercion between \proglang{R} and other languages. Section~\ref{sec:evaluation} introduces a general \proglang{R} language schema for serializing arbitrary \proglang{R} objects and evaluates -it against the serialization capbilities built directly into \proglang{R}. Sections~\ref{sec:mapreduce} +it against the serialization capabilities built directly into \proglang{R}. Sections~\ref{sec:mapreduce} and \ref{sec:opencpu} provide real-world use cases of \CRANpkg{RProtoBuf} in MapReduce and web service environments, respectively, before Section~\ref{sec:summary} concludes. @@ -231,7 +231,7 @@ applications as well as different computers or operating systems. \item \emph{Efficient}: Data is serialized into a compact binary representation for transmission or storage. -\item \emph{Extensible}: New fields can be added to Protocol Buffer Schemas +\item \emph{Extensible}: New fields can be added to Protocol Buffer schemas in a forward-compatible way that does not break older applications. \item \emph{Stable}: Protocol Buffers have been in wide use for over a decade. @@ -246,7 +246,7 @@ \end{figure} Figure~\ref{fig:protobuf-distributed-usecase} illustrates an example -communication workflow with Protocol Buffers and an interactive \proglang{R} session. +communication work flow with Protocol Buffers and an interactive \proglang{R} session. Common use cases include populating a request remote-procedure call (RPC) Protocol Buffer in \proglang{R} that is then serialized and sent over the network to a remote server. 
The server would then deserialize the message, act on the @@ -1097,7 +1097,7 @@ \subsection[Evaluation: Converting R data sets]{Evaluation: Converting \proglang{R} data sets} To illustrate how this method works, we attempt to convert all of the built-in -datasets from \proglang{R} into this serialized Protocol Buffer representation. +data sets from \proglang{R} into this serialized Protocol Buffer representation. <>= datasets <- as.data.frame(data(package="datasets")$results) @@ -1105,8 +1105,8 @@ n <- nrow(datasets) @ -There are \Sexpr{n} standard data sets included in the base-r \pkg{datasets} -package. These datasets include data frames, matrices, time series, tables lists, +There are \Sexpr{n} standard data sets included in the \pkg{datasets} +package included with \proglang{R}. These data sets include data frames, matrices, time series, tables lists, and some more exotic data classes. The \texttt{can\_serialize\_pb} method is used to determine which of those can fully be converted to the \texttt{rexp.proto} Protocol Buffer representation. This method simply checks if any of the values or @@ -1118,7 +1118,7 @@ \Sexpr{m} data sets can be converted to Protocol Buffers without loss of information (\Sexpr{format(100*m/n,digits=1)}\%). Upon closer -inspection, all other datasets are objects of class \texttt{nfnGroupedData}. +inspection, all other data sets are objects of class \texttt{nfnGroupedData}. This class represents a special type of data frame that has some additional attributes (such as a \emph{formula} object) used by the \pkg{nlme} package. Because formulas are \proglang{R} \emph{language} objects, they have little meaning to @@ -1171,10 +1171,10 @@ check.names=FALSE) @ -Table~\ref{tab:compression} shows the sizes of 50 sample \proglang{R} datasets as +Table~\ref{tab:compression} shows the sizes of 50 sample \proglang{R} data sets as returned by object.size() compared to the serialized sizes. 
%The summary compression sizes are listed below, and a full table for a -%sample of 50 datasets is included on the next page. +%sample of 50 data sets is included on the next page. Note that Protocol Buffer serialization results in slightly smaller byte streams compared to native \proglang{R} serialization in most cases, but this difference disappears if the results are compressed with gzip. @@ -1260,7 +1260,7 @@ \end{tabular} } \caption{Serialization sizes for default serialization in \proglang{R} and - \CRANpkg{RProtoBuf} for 50 \proglang{R} datasets.} + \CRANpkg{RProtoBuf} for 50 \proglang{R} data sets.} \label{tab:compression} \end{center} \end{table} @@ -1430,7 +1430,7 @@ \subsection[HTTP GET: Retrieving an R object]{HTTP GET: Retrieving an \proglang{R} object} The \texttt{HTTP GET} method is used to read a resource from OpenCPU. For example, -to access the dataset \texttt{Animals} from the package \texttt{MASS}, a +to access the data set \texttt{Animals} from the package \texttt{MASS}, a client performs the following HTTP request: \begin{verbatim} @@ -1446,10 +1446,10 @@ Because both HTTP and Protocol Buffers have libraries available for many languages, clients can be implemented in just a few lines of code. Below -is example code for both \proglang{R} and Python that retrieves a dataset from \proglang{R} with +is example code for both \proglang{R} and Python that retrieves a data set from \proglang{R} with OpenCPU using a protobuf message. In \proglang{R}, we use the HTTP client from the \texttt{httr} package \citep{httr}. In this example we -download a dataset which is part of the base \proglang{R} distribution, so we can +download a data set which is part of the base \proglang{R} distribution, so we can verify that the object was transferred without loss of information. <>= @@ -1469,7 +1469,7 @@ well be done without Protocol Buffers. 
The main advantage of using an inter-operable format is that we can actually access \proglang{R} objects from within another programming language. For example, in a very similar fashion we can retrieve the same -dataset in a Python client. To parse messages in Python, we first compile the +data set in a Python client. To parse messages in Python, we first compile the \texttt{rexp.proto} descriptor into a python module using the \texttt{protoc} compiler: \begin{verbatim} @@ -1494,7 +1494,7 @@ msg.ParseFromString(res.read()) print(msg) \end{verbatim} -The \texttt{msg} object contains all data from the Animals dataset. From here we +The \texttt{msg} object contains all data from the Animals data set. From here we can easily extract the desired fields for further use in Python. @@ -1565,7 +1565,7 @@ %Protocol Buffers is itself not a protocol. %Forward-compatibility is one of the features. No need to re-iterate those The Protocol Buffers standard and library offer a unique combination of features, -performance, and maturity, that seems particulary well suited for data-driven +performance, and maturity, that seems particularly well suited for data-driven applications and numerical computing. 
The \CRANpkg{RProtoBuf} package builds on the Protocol Buffers \proglang{C++} library, From noreply at r-forge.r-project.org Mon Jan 27 01:01:40 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 01:01:40 +0100 (CET) Subject: [Rprotobuf-commits] r856 - papers/jss Message-ID: <20140127000140.AE5351867E4@r-forge.r-project.org> Author: edd Date: 2014-01-27 01:01:40 +0100 (Mon, 27 Jan 2014) New Revision: 856 Modified: papers/jss/article.Rnw papers/jss/article.bib Log: add nlme citation as we have to cite software anyway Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-26 22:29:03 UTC (rev 855) +++ papers/jss/article.Rnw 2014-01-27 00:01:40 UTC (rev 856) @@ -21,7 +21,7 @@ \RequirePackage{alltt} \DefineVerbatimEnvironment{example}{Verbatim}{} % Articles with many authors we should shorten to FirstAuthor, et al. -\shortcites{sciencecloud,janus,dremel} +\shortcites{sciencecloud,janus,dremel,nlme} \author{Dirk Eddelbuettel\\Debian Project \And Murray Stokely\\Google, Inc \And Jeroen Ooms\\UCLA} @@ -1120,7 +1120,7 @@ without loss of information (\Sexpr{format(100*m/n,digits=1)}\%). Upon closer inspection, all other data sets are objects of class \texttt{nfnGroupedData}. This class represents a special type of data frame that has some additional -attributes (such as a \emph{formula} object) used by the \pkg{nlme} package. +attributes (such as a \emph{formula} object) used by the \pkg{nlme} package \citep{nlme}. Because formulas are \proglang{R} \emph{language} objects, they have little meaning to other systems, and are not supported by the \texttt{rexp.proto} descriptor. 
When \texttt{serialize\_pb} is used on objects of this class, it will serialize Modified: papers/jss/article.bib =================================================================== --- papers/jss/article.bib 2014-01-26 22:29:03 UTC (rev 855) +++ papers/jss/article.bib 2014-01-27 00:01:40 UTC (rev 856) @@ -454,3 +454,11 @@ year = 2013, publisher = {Springer} } + +@Manual{nlme, + title = {nlme: Linear and Nonlinear Mixed Effects Models}, + author = {Jos\'{e} Pinheiro and Douglas Bates and Saikat DebRoy and Deepayan Sarkar and {EISPACK authors} and {R Core}}, + year = 2013, + note = {R package version 3.1-113}, + url = {http://CRAN.R-project.org/package=nlme}, +} From noreply at r-forge.r-project.org Mon Jan 27 02:18:06 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 02:18:06 +0100 (CET) Subject: [Rprotobuf-commits] r857 - papers/jss Message-ID: <20140127011807.1E49F185ECD@r-forge.r-project.org> Author: edd Date: 2014-01-27 02:18:06 +0100 (Mon, 27 Jan 2014) New Revision: 857 Modified: papers/jss/article.Rnw Log: another hyphen between nouns Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 00:01:40 UTC (rev 856) +++ papers/jss/article.Rnw 2014-01-27 01:18:06 UTC (rev 857) @@ -1078,7 +1078,7 @@ the appendix. The Protocol Buffer messages generated by \CRANpkg{RProtoBuf} and \pkg{RHIPE} are naturally compatible between the two systems because they use the -same schema. This shows the power of using a schema based cross-platform format such +same schema. This shows the power of using a schema-based cross-platform format such as Protocol Buffers: interoperability is achieved without effort or close coordination. The \texttt{rexp.proto} schema supports all main \proglang{R} storage types holding \emph{data}. 
From noreply at r-forge.r-project.org Mon Jan 27 03:04:33 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 03:04:33 +0100 (CET) Subject: [Rprotobuf-commits] r858 - papers/jss Message-ID: <20140127020433.F28C9185E46@r-forge.r-project.org> Author: edd Date: 2014-01-27 03:04:28 +0100 (Mon, 27 Jan 2014) New Revision: 858 Modified: papers/jss/article.Rnw Log: preferred options() specified at jstatsoft.org Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 01:18:06 UTC (rev 857) +++ papers/jss/article.Rnw 2014-01-27 02:04:28 UTC (rev 858) @@ -125,7 +125,8 @@ %% DE: I tend to have wider option(width=...) so this %% guarantees better line breaks <>= -options(width=65, prompt="R> ", digits=4) +## cf http://www.jstatsoft.org/style#q12 +options(prompt = "R> ", continue = "+ ", width = 70, useFancyQuotes = FALSE, digits=4) @ \maketitle From noreply at r-forge.r-project.org Mon Jan 27 22:37:34 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 22:37:34 +0100 (CET) Subject: [Rprotobuf-commits] r859 - papers/jss Message-ID: <20140127213734.E3BB0186CA1@r-forge.r-project.org> Author: murray Date: 2014-01-27 22:37:34 +0100 (Mon, 27 Jan 2014) New Revision: 859 Modified: papers/jss/article.Rnw Log: Replace the second use of 'compare' in a sentence with another more specific and less duplicative word ('perform') Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 02:04:28 UTC (rev 858) +++ papers/jss/article.Rnw 2014-01-27 21:37:34 UTC (rev 859) @@ -201,7 +201,7 @@ efficiently encoded to minimize storage costs when compared with simple ``schema-less'' binary interchange formats. 
Many sources compare data serialization formats -and show Protocol Buffers compare very favorably to the alternatives; see +and show Protocol Buffers perform favorably to the alternatives; see \citet{Sumaray:2012:CDS:2184751.2184810} for one such comparison. This paper describes an \proglang{R} interface to Protocol Buffers, From noreply at r-forge.r-project.org Mon Jan 27 22:42:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 22:42:27 +0100 (CET) Subject: [Rprotobuf-commits] r860 - papers/jss Message-ID: <20140127214227.1D91018419D@r-forge.r-project.org> Author: murray Date: 2014-01-27 22:42:26 +0100 (Mon, 27 Jan 2014) New Revision: 860 Modified: papers/jss/article.Rnw Log: Move figure1 float to [bp] from [tbp] so it is placed at the bottom of the page. This looks better so that section 2 with its title header start at the top of the page, instead of the previous layout of starting with a not-yet-referenced figure and then a section title header beneath it. I think this looks a bit better, but feel free to revert this change if you disagree. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 21:37:34 UTC (rev 859) +++ papers/jss/article.Rnw 2014-01-27 21:42:26 UTC (rev 860) @@ -238,7 +238,7 @@ decade. 
\end{itemize} -\begin{figure}[tbp] +\begin{figure}[bp] \begin{center} \includegraphics[width=\textwidth]{figures/protobuf-distributed-system-crop.pdf} \end{center} From noreply at r-forge.r-project.org Mon Jan 27 23:02:10 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 23:02:10 +0100 (CET) Subject: [Rprotobuf-commits] r861 - papers/jss Message-ID: <20140127220210.33B2C1868A6@r-forge.r-project.org> Author: murray Date: 2014-01-27 23:02:09 +0100 (Mon, 27 Jan 2014) New Revision: 861 Modified: papers/jss/article.Rnw Log: Add a short example to the messages section again illustrating how these objects are created, and modify a table float later on to ensure that all of the many tables in section 4 are placed in section 4 before we start section 5. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 21:42:26 UTC (rev 860) +++ papers/jss/article.Rnw 2014-01-27 22:02:09 UTC (rev 861) @@ -600,6 +600,10 @@ complete list of the slots and methods for \texttt{Messages} is available in Table~\ref{Message-methods-table}. +<<>>= +new(tutorial.Person) +@ + \begin{table}[tbp] \centering \begin{small} @@ -672,7 +676,7 @@ Table~\ref{Descriptor-methods-table} provides a complete list of the slots and available methods for Descriptors. -\begin{table}[tbp] +\begin{table}[p] \centering \begin{small} \begin{tabular}{lp{10cm}} From noreply at r-forge.r-project.org Mon Jan 27 23:20:27 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 23:20:27 +0100 (CET) Subject: [Rprotobuf-commits] r862 - papers/jss Message-ID: <20140127222027.AD449186891@r-forge.r-project.org> Author: murray Date: 2014-01-27 23:20:27 +0100 (Mon, 27 Jan 2014) New Revision: 862 Modified: papers/jss/article.Rnw Log: Address a few comments made by Phillip Yelland. 
Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 22:02:09 UTC (rev 861) +++ papers/jss/article.Rnw 2014-01-27 22:20:27 UTC (rev 862) @@ -261,7 +261,7 @@ model that is flexible and easy to use. The schema for structured Protocol Buffer data is defined in \texttt{.proto} files, which may contain one or more message types. Each message type has one or more -fields. A field is specified with a unique number, a name, a value +fields. A field is specified with a unique number (called a \emph{tag number}), a name, a value type, and a field rule specifying whether the field is optional, required, or repeated. The supported value types are numbers, enumerations, booleans, strings, raw bytes, or other nested message @@ -269,7 +269,10 @@ Buffer data is described comprehensively on Google Code\footnote{See \url{http://code.google.com/apis/protocolbuffers/docs/proto.html}.}. Table~\ref{tab:proto} shows an example \texttt{.proto} file that -defines the \texttt{tutorial.Person} type. The \proglang{R} code in the right +defines the \texttt{tutorial.Person} type\footnote{The compound name + \texttt{tutorial.Person} in R is derived from the name of the + message and the name of the package defined at the top of the + \texttt{.proto} file in which it is defined.}. The \proglang{R} code in the right column shows an example of creating a new message of this type and populating its fields. @@ -650,7 +653,7 @@ Descriptors describe the type of a Message. This includes what fields a message contains and what the types of those fields are. Message -descriptors are represented in \proglang{R} with the \emph{Descriptor} S4 +descriptors are represented in \proglang{R} by the \emph{Descriptor} S4 class. The class contains the slots \texttt{pointer} and \texttt{type}. 
Similarly to messages, the \verb|$| operator can be used to retrieve descriptors that are contained in the descriptor, or @@ -667,7 +670,6 @@ tutorial.Person$PhoneType # enum descriptor - tutorial.Person$PhoneNumber # nested type descriptor # same as tutorial.Person.PhoneNumber @@ -780,6 +782,10 @@ constants contained in the EnumDescriptor, or to invoke pseudo-methods. +The \texttt{EnumDescriptor} contains information about what values this type +defines, while the \texttt{EnumValueDescriptor} describes an +individual enum constant of a particular type. + <<>>= tutorial.Person$PhoneType tutorial.Person$PhoneType$WORK From noreply at r-forge.r-project.org Mon Jan 27 23:34:24 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 23:34:24 +0100 (CET) Subject: [Rprotobuf-commits] r863 - papers/jss Message-ID: <20140127223424.C3D0C186BDA@r-forge.r-project.org> Author: murray Date: 2014-01-27 23:34:24 +0100 (Mon, 27 Jan 2014) New Revision: 863 Modified: papers/jss/article.Rnw Log: Switch the order of the FileDescriptor and EnumValueDescriptor subsections so that EnumDescriptors and EnumValueDescriptors are described in adjacent sections. Also manually move table definition earlier and adjust float placement options so that all the tables appear in section 4. Section 5 now begins at top of a page, and includes no floats from section 4, which is desirable. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 22:20:27 UTC (rev 862) +++ papers/jss/article.Rnw 2014-01-27 22:34:24 UTC (rev 863) @@ -271,8 +271,8 @@ Table~\ref{tab:proto} shows an example \texttt{.proto} file that defines the \texttt{tutorial.Person} type\footnote{The compound name \texttt{tutorial.Person} in R is derived from the name of the - message and the name of the package defined at the top of the - \texttt{.proto} file in which it is defined.}. 
The \proglang{R} code in the right + message (\emph{Person}) and the name of the package defined at the top of the + \texttt{.proto} file in which it is defined (\emph{tutorial}).}. The \proglang{R} code in the right column shows an example of creating a new message of this type and populating its fields. @@ -557,8 +557,8 @@ Descriptor & 2 & 16 & yes (field names, enum types, nested types)\\ FieldDescriptor & 4 & 18 & no\\ EnumDescriptor & 4 & 11 & yes (enum constant names)\\ +EnumValueDescriptor & 3 & \phantom{1}6 & no\\ FileDescriptor & 3 & \phantom{1}6 & yes (message/field definitions)\\ -EnumValueDescriptor & 3 & \phantom{1}6 & no\\ \bottomrule \end{tabular} \caption{\label{class-summary-table}Overview of class, slot, method and @@ -678,7 +678,7 @@ Table~\ref{Descriptor-methods-table} provides a complete list of the slots and available methods for Descriptors. -\begin{table}[p] +\begin{table}[tbp] \centering \begin{small} \begin{tabular}{lp{10cm}} @@ -824,50 +824,6 @@ \caption{\label{enumdescriptor-methods-table}Description of slots and methods for the \texttt{EnumDescriptor} S4 class} \end{table} -\subsection{File descriptors} -\label{subsec-file-descriptor} - -The class \emph{FileDescriptor} represents file descriptors in \proglang{R}. -This is a wrapper S4 class around the -\texttt{google::protobuf::FileDescriptor} \proglang{C++} class. -Table~\ref{filedescriptor-methods-table} describes the methods -defined for the \texttt{FileDescriptor} class. - -The \verb|$| operator can be used to retrieve named fields defined in -the FileDescriptor, or to invoke pseudo-methods. - -<<>>= -f <- tutorial.Person$fileDescriptor() -f -f$Person -@ - -\begin{table}[tbp] -\centering -\begin{small} -\begin{tabular}{lp{10cm}} -\toprule -\textbf{Slot} & \textbf{Description} \\ -\cmidrule(r){2-2} -\texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the \proglang{C++} proto library. 
Documentation for the -\texttt{FileDescriptor} class is available from the Protocol Buffer project page: -\url{http://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.descriptor.html#FileDescriptor} \\ -\texttt{filename} & fully qualified pathname of the \texttt{.proto} file.\\ -\texttt{package} & package name defined in this \texttt{.proto} file.\\[.3cm] -\textbf{Method} & \textbf{Description} \\ -\cmidrule(r){2-2} -\texttt{name} & Return the filename for this FileDescriptorProto.\\ -\texttt{package} & Return the file-level package name specified in this FileDescriptorProto.\\ -\texttt{as.character} & character representation of a descriptor. \\ -\texttt{toString} & character representation of a descriptor (same as \texttt{as.character}). \\ -\texttt{asMessage} & return FileDescriptorProto message. \\ -\texttt{as.list} & return named list of descriptors defined in this file descriptor.\\ -\bottomrule -\end{tabular} -\end{small} -\caption{\label{filedescriptor-methods-table}Description of slots and methods for the \texttt{FileDescriptor} S4 class} -\end{table} - \subsection{Enum value descriptors} \label{subsec-enumvalue-descriptor} @@ -911,6 +867,50 @@ and methods for the \texttt{EnumValueDescriptor} S4 class} \end{table} +\subsection{File descriptors} +\label{subsec-file-descriptor} + +\begin{table}[tbp] +\centering +\begin{small} +\begin{tabular}{lp{10cm}} +\toprule +\textbf{Slot} & \textbf{Description} \\ +\cmidrule(r){2-2} +\texttt{pointer} & external pointer to the \texttt{FileDescriptor} object of the \proglang{C++} proto library. 
Documentation for the +\texttt{FileDescriptor} class is available from the Protocol Buffer project page: +\url{http://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.descriptor.html#FileDescriptor} \\ +\texttt{filename} & fully qualified pathname of the \texttt{.proto} file.\\ +\texttt{package} & package name defined in this \texttt{.proto} file.\\[.3cm] +\textbf{Method} & \textbf{Description} \\ +\cmidrule(r){2-2} +\texttt{name} & Return the filename for this FileDescriptorProto.\\ +\texttt{package} & Return the file-level package name specified in this FileDescriptorProto.\\ +\texttt{as.character} & character representation of a descriptor. \\ +\texttt{toString} & character representation of a descriptor (same as \texttt{as.character}). \\ +\texttt{asMessage} & return FileDescriptorProto message. \\ +\texttt{as.list} & return named list of descriptors defined in this file descriptor.\\ +\bottomrule +\end{tabular} +\end{small} +\caption{\label{filedescriptor-methods-table}Description of slots and methods for the \texttt{FileDescriptor} S4 class} +\end{table} + +The class \emph{FileDescriptor} represents file descriptors in \proglang{R}. +This is a wrapper S4 class around the +\texttt{google::protobuf::FileDescriptor} \proglang{C++} class. +Table~\ref{filedescriptor-methods-table} describes the methods +defined for the \texttt{FileDescriptor} class. + +The \verb|$| operator can be used to retrieve named fields defined in +the FileDescriptor, or to invoke pseudo-methods. 
+ +<<>>= +f <- tutorial.Person$fileDescriptor() +f +f$Person +@ + \section{Type coercion} \label{sec:types} From noreply at r-forge.r-project.org Mon Jan 27 23:45:11 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 23:45:11 +0100 (CET) Subject: [Rprotobuf-commits] r864 - papers/jss Message-ID: <20140127224511.40B66184706@r-forge.r-project.org> Author: murray Date: 2014-01-27 23:45:10 +0100 (Mon, 27 Jan 2014) New Revision: 864 Modified: papers/jss/article.Rnw Log: Add two words suggested by Phillip for clarity. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 22:34:24 UTC (rev 863) +++ papers/jss/article.Rnw 2014-01-27 22:45:10 UTC (rev 864) @@ -357,7 +357,7 @@ or every \texttt{.proto} file provided by a particular \proglang{R} package. After importing proto files, the corresponding message descriptors are -available from the \code{RProtoBuf:DescriptorPool} environment in +available by name from the \code{RProtoBuf:DescriptorPool} environment in the \proglang{R} search path. This environment is implemented with the user-defined tables framework from the \pkg{RObjectTables} package available from the OmegaHat project \citep{RObjectTables}. Instead of From noreply at r-forge.r-project.org Mon Jan 27 23:54:10 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Mon, 27 Jan 2014 23:54:10 +0100 (CET) Subject: [Rprotobuf-commits] r865 - papers/jss Message-ID: <20140127225410.3E684186094@r-forge.r-project.org> Author: murray Date: 2014-01-27 23:54:09 +0100 (Mon, 27 Jan 2014) New Revision: 865 Modified: papers/jss/article.Rnw Log: One more comment made by Phillip that I had missed. 
Be more specific about how readProtoFiles is automatically called to read in addressbook.proto on package load, and add a footnote pointing out that this is why we didn't have to explicitly call readprotofiles in the example in table1. Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-27 22:45:10 UTC (rev 864) +++ papers/jss/article.Rnw 2014-01-27 22:54:09 UTC (rev 865) @@ -660,10 +660,12 @@ invoke pseudo-methods. When \CRANpkg{RProtoBuf} is first loaded it calls -\texttt{readProtoFiles} to read in an example \texttt{.proto} file +\texttt{readProtoFiles} to read in the example \texttt{addressbook.proto} file included with the package. The \texttt{tutorial.Person} descriptor -and any other descriptors defined in loaded \texttt{.proto} files are -then available on the search path. +and all other descriptors defined in the loaded \texttt{.proto} files are +then available on the search path\footnote{This explains why the example in +Table~\ref{tab:proto} lacked an explicit call to +\texttt{readProtoFiles}.}. 
<<>>= tutorial.Person$email # field descriptor From noreply at r-forge.r-project.org Tue Jan 28 04:02:30 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 28 Jan 2014 04:02:30 +0100 (CET) Subject: [Rprotobuf-commits] r866 - papers/jss Message-ID: <20140128030231.254161864A4@r-forge.r-project.org> Author: edd Date: 2014-01-28 04:02:26 +0100 (Tue, 28 Jan 2014) New Revision: 866 Modified: papers/jss/Makefile Log: small tweaks to Makefile remove ephemeral pdf figures add zip entry to prepare upload to jss Modified: papers/jss/Makefile =================================================================== --- papers/jss/Makefile 2014-01-27 22:54:09 UTC (rev 865) +++ papers/jss/Makefile 2014-01-28 03:02:26 UTC (rev 866) @@ -2,7 +2,7 @@ clean: rm -fr article.pdf article.out article.aux article.log article.bbl \ - article.blg article.brf + article.blg article.brf figures/fig-0??.pdf article.pdf: article.Rnw R CMD Sweave article.Rnw @@ -10,3 +10,6 @@ bibtex article pdflatex article.tex pdflatex article.tex + +jssarchive: + (cd .. 
&& zip -r jssarchive.zip jss/) From noreply at r-forge.r-project.org Tue Jan 28 04:03:53 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Tue, 28 Jan 2014 04:03:53 +0100 (CET) Subject: [Rprotobuf-commits] r867 - papers/jss Message-ID: <20140128030353.A571F186893@r-forge.r-project.org> Author: edd Date: 2014-01-28 04:03:52 +0100 (Tue, 28 Jan 2014) New Revision: 867 Modified: papers/jss/Makefile Log: do not remove article when running clean Modified: papers/jss/Makefile =================================================================== --- papers/jss/Makefile 2014-01-28 03:02:26 UTC (rev 866) +++ papers/jss/Makefile 2014-01-28 03:03:52 UTC (rev 867) @@ -1,7 +1,7 @@ all: clean article.pdf clean: - rm -fr article.pdf article.out article.aux article.log article.bbl \ + rm -fr article.out article.aux article.log article.bbl \ article.blg article.brf figures/fig-0??.pdf article.pdf: article.Rnw From noreply at r-forge.r-project.org Wed Jan 29 02:18:04 2014 From: noreply at r-forge.r-project.org (noreply at r-forge.r-project.org) Date: Wed, 29 Jan 2014 02:18:04 +0100 (CET) Subject: [Rprotobuf-commits] r868 - papers/jss Message-ID: <20140129011804.E1604186061@r-forge.r-project.org> Author: jeroenooms Date: 2014-01-29 02:18:04 +0100 (Wed, 29 Jan 2014) New Revision: 868 Modified: papers/jss/article.Rnw Log: typo Modified: papers/jss/article.Rnw =================================================================== --- papers/jss/article.Rnw 2014-01-28 03:03:52 UTC (rev 867) +++ papers/jss/article.Rnw 2014-01-29 01:18:04 UTC (rev 868) @@ -44,7 +44,7 @@ method of serializing structured data between applications---while remaining independent of programming languages or operating systems. They offer a unique combination of features, performance, and maturity that seems -particulary well suited for data-driven applications and numerical +particularly well suited for data-driven applications and numerical computing. 
The \CRANpkg{RProtoBuf} package provides a complete interface to Protocol