[H5r-commits] r12 - R inst inst/h5_files src

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Thu Apr 22 03:28:39 CEST 2010


Author: extemporaneousb
Date: 2010-04-22 03:28:39 +0200 (Thu, 22 Apr 2010)
New Revision: 12

Added:
   inst/testSlab.R
Modified:
   R/h5R.R
   inst/.Rhistory
   inst/h5_files/ex_1.h5
   inst/h5_files/makeH5.py
   inst/tests.R
   src/h5_wrap.c
Log:
Large commit which gets support for hyperslab selection in. This selection is limited to contiguous regions for now (and maybe forever). 




Modified: R/h5R.R
===================================================================
--- R/h5R.R	2010-04-21 01:36:54 UTC (rev 11)
+++ R/h5R.R	2010-04-22 01:28:39 UTC (rev 12)
@@ -11,9 +11,11 @@
 setClass("H5File", contains = "H5Obj", representation(fileName = "character"))
 setClass("H5Group", contains = "H5Obj", representation(name = "character"))
 
+
+setClassUnion("envOrNULL", c("environment", "NULL"))
 setClass("H5DataContainer", contains = "H5Obj",
          representation(name = "character", dims = "integer",
-                        h5Type = "integer", .data = "environment"))
+                        h5Type = "integer", .data = "envOrNULL"))
 setClass("H5Dataset", contains = "H5DataContainer")
 setClass("H5Attribute", contains = "H5DataContainer")
 
@@ -48,6 +50,10 @@
   get(".data", h5DataContainer at .data)
 }
 
+.inMemory <- function(h5Dataset) {
+  return(! is.null(h5Dataset at .data))
+}
+
 setGeneric("getH5Group", function(h5Obj, groupName, ...) {
   standardGeneric("getH5Group")
 })
@@ -86,13 +92,13 @@
   ## call this on *class* instanteation time. 
   if (missing(fileName))
     return(.Object)
-
+  
   .Object at ePtr <- .Call("h5R_open", fileName, package = "h5R")
   .Object at fileName <- fileName
   return(.Object)
 })
 
-.initH5DataContainer <- function(o, name) {
+.initH5DataContainer <- function(o, name, inMemory = TRUE) {
   o at name   <- name
   o at h5Type <- getH5Type(o)
 
@@ -108,19 +114,22 @@
   else
     o at dims <- d ## I store the length of the vector here, note that in
                 ## dim I take this into account.
-  
+
   ## This caches the data. At some point, we'll want to
   ## move away from this and just grab things from disk
   ## and provide a mechanism to cache.
-  o at .data <- new.env(parent = emptyenv(), hash = TRUE)
-
+  if (! inMemory) {
+    o at .data <- NULL
+  } else {
+    o at .data <- new.env(parent = emptyenv(), hash = TRUE)
+  }
   return(o)
 }
 
-setMethod("getH5Dataset", c("H5Obj", "character"), function(h5Obj, datasetName) {
+setMethod("getH5Dataset", c("H5Obj", "character"), function(h5Obj, datasetName, inMemory = FALSE) {
   o <- new("H5Dataset")
   o at ePtr <- .Call("h5R_get_dataset", .ePtr(h5Obj), datasetName, PACKAGE = 'h5r')
-  return(.initH5DataContainer(o, datasetName))
+  return(.initH5DataContainer(o, datasetName, inMemory))
 })
 
 setMethod("getH5Attribute", c("H5Obj", "character"), function(h5Obj, attrName) {
@@ -129,7 +138,8 @@
   return(.initH5DataContainer(o, attrName))
 })
 
-setMethod("[", "H5DataContainer", function(x, i, j, ..., drop = FALSE) {
+
+.internalSlice <- function(x, i, j, ..., drop = TRUE) {
   if (!.hasData(x)) {
     .putData(x, .loadDataset(x))
   }
@@ -141,10 +151,81 @@
     d[i]
   }
   else {
-    drop(d[i, j, ..., drop = drop])
+    d[i, j, ..., drop = drop]
   }
+}
+
+setMethod("[", "H5DataContainer", .internalSlice)
+setMethod("[", "H5Dataset", function(x, i, j, ..., drop = TRUE) {
+  if (.inMemory(x)) {
+    callNextMethod()
+  }
+
+  ##
+  ## Currently, This supports only a limited range of slicing.
+  ## contiguous chunks
+  ##  
+  else {
+    if (is.null(dim(x))) {
+      if (! missing(j))
+        stop("incorrect number of dimensions")
+      if (! missing(i))
+        dta <- readSlab(x, min(i), max(i) - min(i) + 1)
+      else
+        dta <- readSlab(x, 1, length(x))
+    }
+    else {
+      ## need to specify the dim(x) offset, dim.
+      sel <- matrix(NA, nrow = length(dim(x)), ncol = 2)
+      if (! missing(i))
+        sel[1, ] <- range(i)
+      else
+        sel[1, ] <- c(1, dim(x)[1])
+
+      if (! missing(j))
+        sel[2, ] <- range(j)
+      else
+        sel[2, ] <- c(1, dim(x)[2])
+
+      ##
+      ## Not quite sure why this results in a strange state
+      ## of both missing and !missing(.)
+      ##
+      l <- tryCatch(list(...), simpleError = function(e) {
+        return(list())
+      })
+      if (nrow(sel) > 2) {
+        for (k in 3:nrow(sel)) {
+          if (length(l) >= k) 
+            sel[k, ] <- range(l[[k]])
+          else
+            sel[k, ] <- c(1, dim(x)[k])
+        }
+      }
+
+      ext <- sel[,2] - sel[,1] + 1
+
+      if (nrow(sel) == 2) {
+        dta <- readSlab(x, sel[,1], ext)
+        dta <- matrix(dta, nrow = ext[1], ncol = ext[2], byrow = TRUE)
+        dta <- if (drop) drop(dta) else dta
+      } else {
+        dta <- readSlab(x, rev(sel[,1]), rev(ext))
+        dim(dta) <- rev(ext)
+      }
+    }
+    return(dta)
+  }
 })
 
+
+readSlab <- function(h5Dataset, offsets, dims) {
+###   stopifnot(length(offsets) == length(dims))
+###   stopifnot(all((offsets + dims - 1) <=
+###                 (if (is.null(dim(h5Dataset))) length(h5Dataset) else dim(h5Dataset))))
+  .Call("h5R_read_slab", .ePtr(h5Dataset), as.integer(offsets - 1), as.integer(dims))
+}
+
 setGeneric("readDataAsVector", function(h5Obj, ...) {
   standardGeneric("readDataAsVector")
 })

Modified: inst/.Rhistory
===================================================================
--- inst/.Rhistory	2010-04-21 01:36:54 UTC (rev 11)
+++ inst/.Rhistory	2010-04-22 01:28:39 UTC (rev 12)
@@ -1,150 +1,150 @@
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-.Call("__h5R_get_dataset_str", d)
-replicate(100, .Call("__h5R_get_dataset_str", d))
+diLong
+readSlab(diLong, 1, 100)
+diLong[1:100]
+seq.int(10000, 10)
 gc()
-x = replicate(10000, .Call("__h5R_get_dataset_str", d))
 gc()
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("_h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("_h5R_get_dataset_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("_h5R_get_dataset_str", d)
-1
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("_h5R_get_dataset_str", d)
-.Call("_h5R_read_dataset_str", d)
-.Call("_h5R_read_vlen_str", d)
-.Call("_h5R_read_vlen_str", d)
-.Call("_h5R_read_vlen_str", d)
-.Call("_h5R_read_vlen_str", d)
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("h5R_get_dims", d)
-1
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
+time()
+date()
+?date
+Sys.time()
+?system.time
+proc.time()
+d3
+d2
+q()
+n
+d2
+nrow(d)
+debug("readSlab")
+n
+offsets + dims
+offsets + dims - 1
+Q
+d2
+gc()
+emptyenv()
+length(emptyenv())
+?callNextMethod
+?emptyenv
+setUnion
+setClassUnion
+?setClassUnion
+?setClassUnion
+readSlab
+ls()
 d
-1
-d = .Call("h5R_get_dataset", g at ePtr, "ds_1")
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
+d2
+readSlab(d2, c(1,1), c(1000, 10))
+length(readSlab(d2, c(1,1), c(1000, 10)))
+length(readSlab(d2, c(5,5), c(1000, 10)))
+length(readSlab(d2, c(1,5), c(1000, 5)))
 d
-d = .Call("h5R_get_dataset", g at ePtr, "ds_1")
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
+d[1:10]
+d[]
 d
+d[1:10]
+d2[1:10]
+d2[,] = d[,]
+c(1,2,3,4)[]
 d
-d = .Call("h5R_get_dataset", g at ePtr, "ds_2")
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
-.Call("h5R_get_dims", d)
-.Call("h5R_read_dataset", d)
-.Call("h5R_read_dataset", d)
-d
-d
-d[,]
-dim(d[,])
-head(d[,])
-head(d[,])
-head(d[,])
-head(d[,])
-head(d[,])
-d2
-d2[]
-dim(d2)
-d
-d[,]
-length(d[,])
-length(d)
-length(d2)
-d2
-d2[]
-getGenerics("[")
-getGeneric("[")
-showMethods("[")
-d2
-d2[1]
-d2[1:3]
-d2[1,2]
-length(d2)
-dim(d2)
-dim(d2)
-dim(d)
-a
-q
-dim
-a
+d at .data
+d2 at .data
+d2 at .data$.data
 b
-matrix(c(1,2,3,4), 2, 2)
-matrix(c(1,2,3,4), 2, 2)[1,2]
-dim(matrix(c(1,2,3,4), 2, 2)[1,2])
-dim(d)
-c(1,23,4)[1,1]
+dM
+dMM
+dM
+dM[] == dMM
+dM[] == dMM[]
+dM[] == t(dMM[])
+dim(dM[])
+dim(dMM[])
+dM
+dM[1:5, 2]
+dMM[1:5, 2]
+dM[1,2]
+dM[1:4,2]
+dMM[1:10, 2]
+dM[1:10, 2]
+dM[1:10, 2] == dMM[1:10, 2]
+dM[1:5, 2] == dMM[1:999, 2]
+dMM[1:999, 2]
+dM[1:999, 2]
+dim(d3)
+dim(d3M)
 d3
+d3[1,,]
+debug(.internalSlice)
+debug(h5r:::.internalSlice)
+debug(h5r:::.internalSlice)
+h5R:::.inMemory(d3)
 d3
-dim(d3)
-d3[,,]
-dim(d3[,,])
-drop
-?drop
-d[1,1]
-d3[1,1,]
+d3 at .data
+d3M at .data
+d3M at .data$.data
+d3
+d3M
+traceback()
+d3M at .data$data[,,]
+d3M at .data$data
+debug(h5r:::.internalSlice)
+traceback()
+c
+n
+dim(x)
+n
+i
 d
-.Call("h5R_read_slab", d at ePtr, as.integer(0,0), as.integer(5,5))
-4
-.Call("h5R_read_slab", d at ePtr, as.integer(0,0), as.integer(5,5))
-1
-.Call("h5R_read_slab", d at ePtr, as.integer(0,0), as.integer(5,5))
-1
-.Call("h5R_read_slab", d at ePtr, as.integer(0,0), as.integer(5,5))
-1
-d3
-d3[1,,]
-dint
-.Call("h5R_read_slab", dint at ePtr, as.integer(0,0), as.integer(5,5))
-1
-dint[,]
-1
+c
+n
+x at .data$.data
+n
+class(d)
+ drop
+d[i, j, ..., drop = FALSE]
+dim(d)
+Q
+?callNextMethod
+traceback()
+a = function(a, ...) { list(...) }
+a(1)
+c
+x
+c
+n
+n
+n
+n
+...
+missing(...)
+list(...)
+class(...)
+mode(...)
+length(...)
+?missing
+substitute(...)
+length(substitute(...))
+class(substitute(...))
+Q
+?promise
+?substitue
+?substitute
+n
+n
+ext
+n
+n
+sel[,1]
+sel
+dim(x)
+l
+sel
+Q
+n
+readSlab(x, rev(sel[, 1]), rev(ext))
+Q
+n
+n
+n
+readSlab(x, rev(sel[, 1]), rev(ext))

Modified: inst/h5_files/ex_1.h5
===================================================================
(Binary files differ)

Modified: inst/h5_files/makeH5.py
===================================================================
--- inst/h5_files/makeH5.py	2010-04-21 01:36:54 UTC (rev 11)
+++ inst/h5_files/makeH5.py	2010-04-22 01:28:39 UTC (rev 12)
@@ -25,8 +25,28 @@
 
 g.create_dataset("ds_5", data = a.reshape(21, 9))
 
+
+g.create_dataset("ds_6", data = random.randint(0, int(1e4), int(1e5)), dtype = "uint32")
+
+
 f.close()
 
+##
+## some reading and testing.
+##
 f = h5py.File("ex_1.h5")
-f['group_1']['ds_2'].attrs['z']
+g = f["group_1"]
+d = g["ds_6"]
 
+import time
+
+s = time.time()
+k = 100000
+n = 100
+
+for i in xrange(0, n):
+    b    = random.randint(0, d.shape[0] - k)
+    e    = b + k
+    print len(d[b:e])
+
+print "total time %d" % int(time.time() - s)

Added: inst/testSlab.R
===================================================================
--- inst/testSlab.R	                        (rev 0)
+++ inst/testSlab.R	2010-04-22 01:28:39 UTC (rev 12)
@@ -0,0 +1,56 @@
+##
+## Used to test hyperslab reading.
+##
+require(h5r)
+
+files <- list.files("h5_files", full.names = TRUE)
+
+## ex_1
+f <- H5File(files[1])
+g <- getH5Group(f, "group_1")
+d <- getH5Dataset(g, "ds_6", inMemory = FALSE)
+d2 <- getH5Dataset(g, "ds_6", inMemory = TRUE)
+
+all(d[,] == d2[,])
+all(d[] == d2[])
+all(d[2:length(d)] == d2[2:length(d2)])
+
+timeMe <- function(d) {
+  k <- 1000
+  n <- 1000
+  system.time({
+    for (i in seq.int(1, n)) {
+      b <- runif(1, 1, nrow(d) - k)
+      length(readSlab(d, b, k))
+    }
+  })
+}
+timeMe(d)
+timeMe(d2)
+
+dM <- getH5Dataset(g, "ds_1", inMemory = FALSE)
+dMM <- getH5Dataset(g, "ds_1", inMemory = TRUE)
+
+all(dM[] == dMM[])
+all(dM[,] == dMM[,])
+all(dM[1:5, 1:5] == dMM[1:5, 1:5])
+all(dM[1:5, 2] == dMM[1:5, 2])
+
+length(dM[1:5, 2]) == length(dMM[1:5, 2])
+
+all(dim(dM[1:5, 2:3]) == dim(dMM[1:5, 2:3]))
+
+
+##
+## Three dimensional 
+##
+d3M <- getH5Dataset(g, "ds_3", inMemory = TRUE)
+d3 <- getH5Dataset(g, "ds_3", inMemory = FALSE)
+
+d3M[,,]
+d3[,,]
+
+d3[1,,]
+d3[1,1,1]
+d3[1,,]
+d3[,,1]

Modified: inst/tests.R
===================================================================
--- inst/tests.R	2010-04-21 01:36:54 UTC (rev 11)
+++ inst/tests.R	2010-04-22 01:28:39 UTC (rev 12)
@@ -9,9 +9,6 @@
 f <- H5File(files[1])
 g <- getH5Group(f, "group_1")
 
-dint <- getH5Dataset(g, "ds_5")
-.Call("h5R_read_slab", dint at ePtr, as.integer(c(0,0)), as.integer(c(5,5)), as.integer(c(5,5)))
-
 d <- getH5Dataset(g, "ds_1")
 d[1:10, 1:10]
 d[1:10,]

Modified: src/h5_wrap.c
===================================================================
--- src/h5_wrap.c	2010-04-21 01:36:54 UTC (rev 11)
+++ src/h5_wrap.c	2010-04-22 01:28:39 UTC (rev 12)
@@ -199,46 +199,55 @@
     return(dta);
 }
 
-SEXP h5R_read_slab(SEXP h5_dataset, SEXP _offsets, SEXP _counts, SEXP _orank) {
+SEXP h5R_read_slab(SEXP h5_dataset, SEXP _offsets, SEXP _counts) {
     SEXP dta = R_NilValue;
-    hid_t space, memspace;
-    int ndims, i; 
+    hid_t space, memspace, memtype;
+    void* buf; 
+    int i; 
 
-    int* offsets = INTEGER(_offsets);
-    int* counts  = INTEGER(_counts);
-    int* orank   = INTEGER(_orank);  
-   
-    hsize_t sel[] = {0, 0};
-   
+    int* offsets  = INTEGER(_offsets);
+    int* counts   = INTEGER(_counts);
+
+    /** I'm surprised I have to do this, but it seems to be necessary. **/
+    hsize_t* _h_offsets = (hsize_t*) Calloc(length(_counts), hsize_t);
+    hsize_t* _h_counts  = (hsize_t*) Calloc(length(_counts), hsize_t);
+    
     int v = 1;
-    for (i = 0; i < length(_orank); i++) {
-	v *= orank[i];
+    for (i = 0; i < length(_counts); i++) {
+    	v *= counts[i];
+	_h_offsets[i] = offsets[i];
+	_h_counts[i]  = counts[i];
     }
 
     switch (INTEGER(h5R_get_type(h5_dataset))[0]) {
     case H5T_INTEGER: 
-	Rprintf("In H5T_INTEGER with v = %d\n", v);
-
-	space = _h5R_get_space(h5_dataset);
-	H5Sselect_hyperslab(space, H5S_SELECT_SET, (hsize_t*) offsets, NULL, NULL, (hsize_t*) counts);
-
-	/* memspace = H5Screate_simple(length(_orank), (hsize_t*) orank, NULL); */
-	/* H5Sselect_hyperslab(memspace, H5S_SELECT_SET, sel, NULL,  (hsize_t*) counts, NULL); */
-
 	PROTECT(dta = allocVector(INTSXP, v));
-	Rprintf("About to read dataset, v = %d\n", v);
-	Rprintf("offset: %d, %d\n", offsets[0], offsets[1]);
-	Rprintf("counts: %d, %d\n", counts[0], counts[1]);
-
-	H5Dread(HID(h5_dataset), H5T_NATIVE_INT, H5S_ALL, space, H5P_DEFAULT, (void *) INTEGER(dta));
+	memtype = H5T_NATIVE_INT;
+	buf = INTEGER(dta);
 	break;
+    case H5T_FLOAT:
+	PROTECT(dta = allocVector(REALSXP, v));
+	memtype = H5T_NATIVE_DOUBLE;
+	buf = REAL(dta);
+	break;
+    case H5T_STRING:
+	error("String slab selection not yet supported.\n");
     default:
 	error("Unsupported class in %s\n", __func__);
-	
     }
+    
+    space = _h5R_get_space(h5_dataset);
+    H5Sselect_hyperslab(space, H5S_SELECT_SET, _h_offsets, NULL, _h_counts, NULL);
+    memspace = H5Screate_simple(length(_counts), _h_counts, NULL);
+    H5Dread(HID(h5_dataset), memtype, memspace, space, 
+	    H5P_DEFAULT, buf);
 
+    /** clean up. **/
+    Free(_h_offsets);
+    Free(_h_counts);
+    H5Sclose(memspace);
     UNPROTECT(1);
-
+    
     return dta;
 }
 



More information about the H5r-commits mailing list