[Blotter-commits] r923 - pkg/FinancialInstrument/inst/parser

Sun Feb 12 20:13:04 CET 2012

Author: gsee
Date: 2012-02-12 20:13:03 +0100 (Sun, 12 Feb 2012)
New Revision: 923

Modified:
   pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R
Log:
 - temp dir creation takes place in split_csv instead of in configureTRTH
 - unlink temp dir on.exit from split_csv
 - make some convenience functions available outside of configureTRTH
 - revert to not using no-clobber when copying headers
 - If *.csv.gz is in csv_dir, do not split and copy csv to csv_dir; 
   *.csv.gz will be unzipped and used instead, unless overwrite==TRUE


Modified: pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R
===================================================================

--- pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R	2012-02-12 00:15:36 UTC (rev 922)
+++ pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R	2012-02-12 19:13:03 UTC (rev 923)
@@ -89,7 +89,16 @@
     unlink(to.remove, force=TRUE)
 }
 
+## Some convenience functions
+addslash <- function(x) {
+    if (substr(x, nchar(x), nchar(x)) != '/') paste(x, "/", sep="")
+    else x   
+}
+makeDir <- function(x) { #if directory does not exist, create it
+     dir.create(x, showWarnings=FALSE, recursive=TRUE, mode="0775") #why not mode="0664" ???
+}
 
+
 configureTRTH <- function(config.file, path.output='~/TRTH/', ...) {
     ## Create environment to hold variables that more than one function needs to access    
     if (!exists('.TRTH', .GlobalEnv)) {
@@ -103,15 +112,6 @@
     require(doMC)
     #require(sendmailR) # for email on failure
 
-    ## Some convenience functions
-    addslash <- function(x) {
-        if (substr(x, nchar(x), nchar(x)) != '/') paste(x, "/", sep="")
-        else x   
-    }
-    makeDir <- function(x) { #if directory does not exist, create it
-        dir.create(x, showWarnings=FALSE, recursive=TRUE, mode="0775") #why not mode="0664" ???
-    }
-
     ## Source the config_file -- this will be overwritten by any arguments in dots
     if (!missing(config.file)) source(config.file)
 
@@ -139,16 +139,6 @@
     makeDir(.TRTH$tick_dir)
     makeDir(.TRTH$sec_dir)
 
-    # make a temp dir to use for splitting so that (fingers crossed)
-    # more than one instance can be run at a time in separate R sessions.
-    tmp <- list()
-    tmp$path.output <- addslash(tempdir())
-    dir.create(tmp$archive_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
-    dir.create(tmp$csv_dir <- paste(tmp$path.output, "csv/", sep=""), showWarnings=FALSE, mode='0775')
-    dir.create(tmp$tick_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
-    dir.create(tmp$sec_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
-    .TRTH$tmp <- tmp
-    
     pickArg <- function(x, default=NULL) {
         # if argument "x" was passed through dots, use that
         # otherwise, if it was in config_file, use that
@@ -309,6 +299,11 @@
     #FIXME: respect overwrite argument
     if (missing(.TRTH) && !exists(".TRTH")) stop("Run configureTRTH function first")
     
+    # make a temp dir to use for splitting so that (fingers crossed)
+    # more than one instance can be run at a time in separate R sessions.
+    dir.create(.TRTH$tmp_archive_dir <- addslash(tempdir()), showWarnings=FALSE, mode='0775')
+    on.exit(unlink(.TRTH$tmp_archive_dir, recursive=TRUE))
+        
     if (substr(.TRTH$path.output, nchar(.TRTH$path.output), nchar(.TRTH$path.output)) != "/") {
         .TRTH$path.output <- paste(.TRTH$path.output, "/", sep="")
     }
@@ -338,14 +333,14 @@
 	    #unzip the file
 	    print(paste("unzipping ",filename.gz, sep=""))
         #system(paste("gzip -d -f ",archive_dir,filename.gz,sep=""))
-        system(paste("gunzip -f < ", .TRTH$archive_dir, filename.gz, " > ", .TRTH$tmp$archive_dir, filename.csv, sep=""))
+        system(paste("gunzip -f < ", .TRTH$archive_dir, filename.gz, " > ", .TRTH$tmp_archive_dir, filename.csv, sep=""))
     }
     ignored.csvs <- NULL #this will hold the names of CSVs that already have a header    
-    setwd(.TRTH$tmp$archive) #this directory contains the big CSVs that were unzipped
-    .TRTH$files.csv <- list.files(.TRTH$tmp$archive)
-    for (i in 1:length(.TRTH$files.csv)) 
+    setwd(.TRTH$tmp_archive_dir) #this directory contains the big CSVs that were unzipped
+    .TRTH$big.csv <- list.files(.TRTH$tmp_archive_dir)
+    for (i in 1:length(.TRTH$big.csv)) 
     {
-        filename.csv <- .TRTH$files.csv[i]
+        filename.csv <- .TRTH$big.csv[i]
         # Use awk to split the big CSV into daily CSVs.  Each CSV will have a single
         # row which we will then overwrite with the column headers.  Then we'll
         # use awk again to put the data into the split files
@@ -355,7 +350,7 @@
         print(paste('Making headers from', filename.csv))
         system(paste('awk -v f2="" -F "," '," '",'{f1 = $1"."$2".csv";if(f1 != f2) { print >> f1; close(f2); f2=f1; } }',"' ",filename.csv, sep=""))
         
-        tmpfiles <- list.files(.TRTH$tmp$archive_dir)
+        tmpfiles <- list.files(.TRTH$tmp_archive_dir)
         files.header <- tmpfiles[grep("RIC",tmpfiles)]
 
         big.files <- tmpfiles[grep("@", tmpfiles)] #Big zipped CSVs from Reuters have e-mail address in name
@@ -378,9 +373,7 @@
         # mv header.csv "RIC.Date[G].csv"
 
         for (fl in tmp.files.csv) { # make files with header that awk will later populate
-            # -n means don't overwrite files; useful if e.g. part001.csv.gz ends halfway through the day
-            # and part002.csv.gz has a different number of columns (I'm not sure if that ever happens, though)
-            system(paste('cp -n "', files.header, '" ', paste(.TRTH$tmp$archive_dir, fl, sep=""), sep=""))
+            system(paste('cp "', files.header, '" ', paste(.TRTH$tmp_archive_dir, fl, sep=""), sep=""))
             #cp "#RIC.Date[G].csv" /home/garrett/TRTH/archive/GEM1-U1.01-APR-2008.csv
         }
         # after we've put a header in a file, we need to ignore that file the
@@ -389,14 +382,16 @@
 
         # If all of the files that awk just created already exist in csv_dir and overwrite==FALSE, then
         # there is no need to split this csv because we're not going to move any to csv_dir anyway.
-        # a file in csv_dir might be "2012.02.10.AAPL.O.csv", but we have "AAPL.O.10-FEB-2012.csv"
+        # a file in csv_dir might be "2012.02.10.AAPL.O.csv", (or *.csv.gz) but we have "AAPL.O.10-FEB-2012.csv"
         tmp <- gsub("\\.csv", "", tmp.files.csv)
         new.names <- do.call(c, lapply(strsplit(tmp, "\\."), function(x) {
             day <- gsub("-", ".", as.Date(x[length(x)], format='%d-%b-%Y'))
             fl <- make.names(paste(x[-length(x)], collapse="."))
             paste(day, "/", day, ".", fl, sep="")
         }))
-        if (!all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv", sep=""))) || isTRUE(.TRTH$overwrite)) {
+        if (!all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv", sep=""))) || 
+            !all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv.gz", sep=""))) || 
+            isTRUE(.TRTH$overwrite)) {
             ## Split the Files 
             print(paste("Splitting ",filename.csv,sep=""))
             # The following awk will put data in our CSV files which currently only have column headers;
@@ -408,9 +403,9 @@
             print(paste('Done splitting ', filename.csv, sep=""))
         } else print('All CSVs created by awk already exist. Not re-splitting')
         # remove header file
-        invisible(file.remove(paste(.TRTH$tmp$archive_dir, files.header, sep="")))
+        invisible(file.remove(paste(.TRTH$tmp_archive_dir, files.header, sep="")))
         # remove unzipped csv
-        invisible(file.remove(paste(.TRTH$tmp$archive_dir, filename.csv, sep="")))
+        invisible(file.remove(paste(.TRTH$tmp_archive_dir, filename.csv, sep="")))
 	    ## Zip the File
         # print(paste("zipping ",filename.csv,sep=""))
         # system(paste("gzip -f ",archive_dir,filename.csv,sep=""))
@@ -441,12 +436,11 @@
         dir.create(paste(.TRTH$csv_dir, date.format, "/", sep=""), showWarnings=FALSE, recursive=TRUE, mode='0775') #mode='0664'
 
         ## Move files to appropriate place
-        #system(paste("mv -vf ", path.output,"Archives/",name.csv, " ", path.output,date.format,"/",date.format,".",name.new,".csv", sep=""))
         if (isTRUE(.TRTH$overwrite)) {
             system(paste("mv -fv ", name.csv, " ", .TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv", sep=""))
-        } else {
+        } else if (!file.exists(paste(.TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv.gz", sep=""))) {
             system(paste("mv -nv ", name.csv, " ", .TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv", sep=""))
-        }
+        } else print(paste(date.format, ".", name.new, ".csv.gz not overwritten.", sep=""))
         #print(paste(date.format, name.new, "moved", sep=" "))
         files.xts <- rbind(files.xts,as.data.frame(cbind(name.new,date.format),stringsAsFactors=FALSE))
     }
@@ -454,6 +448,7 @@
 
     .TRTH$files.xts <- files.xts
     assign('.TRTH', .TRTH, pos=.GlobalEnv)
+    setwd(.TRTH$archive_dir)
  
     if (isTRUE(.TRTH$use.instrument)) {
         missing_i <- NULL