[Rprotobuf-yada] Writing R data.frames out to serialized lists of protocol buffers
Murray Stokely
murray at stokely.org
Wed Jun 1 22:32:22 CEST 2011
I have an application where I need to save R data.frames in serialized
protocol buffers. I think there might be some existing code that does
this, but I couldn't find it so I wrote a few functions to :
1) create an ephemeral protocol buffer that defines a schema for the
provided data.frame
and
2) converts the column-oriented R data.frame into a row-oriented list
of protocol buffers using this ephemeral schema.
Step (2) is done in R code right now but should really be done in C++
to get much better performance. Is there any existing code that does
this? The approach I am using is basically :
# Map from R types to protocol buffer types
kTypeConversions <- list(numeric="double",
integer="int32",
character="string",
factor="string",
logical="int32")
kTypeConversionFunctions <- list(numeric=as.numeric,
integer=as.integer,
character=as.character,
factor=as.character,
logical=as.integer)
.RToProtoBufIdentifier <- function(identifier) {
# Convert an R identifier name to a name suitable for use in a Protocol Buffer
#
# Args:
# identifier: A character vector of names used in R.
# Returns:
# A character vector of identifiers suitable for using in a protocol buffer.
# First make sure the column names are at least valid R identifiers.
ret.val <- make.names(identifier, unique=TRUE)
# R names may start with '.', but protobuf identifiers can't start with '_'.
ret.val <- gsub("^\\.", "hidden_", ret.val, fixed=FALSE)
# Now convert any remaining '.'s to '_' since '.' is not valid in protobufs.
return(gsub(".", "_", ret.val, fixed=TRUE))
}
.WriteProtobufDescriptionFile <- function(x, file,
msg.name="Rdataframe_proto") {
# Write out a Protocol Buffer Description for a Dataframe
#
# Args:
# x: An R data.frame
# msg.name: The name to use for this protocol message.
# file: A connection or a character filename for writing proto def.
field.names <- .RToProtoBufIdentifier(names(x))
stopifnot(anyDuplicated(field.names) == 0)
if (is.character(file)) {
file <- file(file, "wt")
on.exit(close(file))
}
if (!isOpen(file)) {
open(file, "wt")
on.exit(close(file))
}
cat('syntax = "proto2";\n', file=file)
cat("message", msg.name, "{\n", file=file)
num <- 1
# TODO(mstokely): Vectorize this if performance becomes an issue.
for (column in x) {
field.type <- kTypeConversions[[class(column)]]
stopifnot(!is.null(field.type))
cat(" optional", field.type, field.names[num], "=", num, ";\n",
file=file)
num <- num + 1
}
cat("}\n", file=file)
return(invisible())
}
.DataFrameToProtoBufs <- function(x, proto.file, proto.class) {
# Returns a list of RProtoBuf protocol messages corresponding to rows of x.
#
# Then names of the columns in the provided data frames must match the
# fields of the provided protocol buffer exactly.
#
# Args:
# x: An R data.frame
# proto.file: The .proto filename containing the proto buffer definition.
# proto.class: An RProtoBuf protocol descriptor describing each row of x.
# Returns:
# A list of RProtoBuf protocol messages.
names(x) <- .RToProtoBufIdentifier(names(x))
proto.descriptor <- P(proto.class, proto.file)
# Convert column types as needed.
for (col in 1:ncol(x)) {
type.converter <- kTypeConversionFunctions[[class(x[[col]])]]
x[[col]] <- type.converter(x[[col]])
}
# TODO(mstokely): Consider rewriting in C++ for performance.
# TODO(mstokely): Or, combine these two functions to avoid overhead of
# building up long intermediate list of rows.
ExtractRow <- function(row.num, x) {
# Extracts a row from a data frame (a column-oriented structure).
# For each column of x, return row.num element.
return(lapply(x, '[', row.num))
# We use this to provide a named list structure as opposed to a data.frame
# subset with return(x[row.num,]) which would require changes below.
}
CreateProtoBufFromRow <- function(row, proto.descriptor) {
# Calls new(proto.descriptor, column1=value1, ...) with
# non-NA named arguments from the row used to set the fields of the
# protocol buffer (NA arguments omitted, as all are optional.)
return (do.call(new, c(list(quote(proto.descriptor)),
row[!is.na(row)])))
}
# Convert data frame (column-oriented) to a list of rows, where each
# row is a named list of columns.
data.by.rows <- lapply(seq_len(nrow(x)), ExtractRow, x)
# For each row, create a protocol buffer.
return(lapply(data.by.rows, CreateProtoBufFromRow, proto.descriptor))
}
More information about the Rprotobuf-yada
mailing list