[Blotter-commits] r923 - pkg/FinancialInstrument/inst/parser
noreply at r-forge.r-project.org
noreply at r-forge.r-project.org
Sun Feb 12 20:13:04 CET 2012
Author: gsee
Date: 2012-02-12 20:13:03 +0100 (Sun, 12 Feb 2012)
New Revision: 923
Modified:
pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R
Log:
- temp dir creation takes place in split_csv instead of in configureTRTH
- unlink temp dir on.exit from split_csv
- make some convenience functions available outside of configureTRTH
- revert to not using no-clobber when copying headers
- If *.csv.gz is in csv_dir, do not split and copy csv to csv_dir;
*.csv.gz will be unzipped and used instead, unless overwrite==TRUE
Modified: pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R
===================================================================
--- pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R 2012-02-12 00:15:36 UTC (rev 922)
+++ pkg/FinancialInstrument/inst/parser/TRTH_BackFill.R 2012-02-12 19:13:03 UTC (rev 923)
@@ -89,7 +89,16 @@
unlink(to.remove, force=TRUE)
}
+## Some convenience functions
+addslash <- function(x) {
+ if (substr(x, nchar(x), nchar(x)) != '/') paste(x, "/", sep="")
+ else x
+}
+makeDir <- function(x) { #if directory does not exist, create it
+ dir.create(x, showWarnings=FALSE, recursive=TRUE, mode="0775") #why not mode="0664" ???
+}
+
configureTRTH <- function(config.file, path.output='~/TRTH/', ...) {
## Create environment to hold variables that more than one function needs to access
if (!exists('.TRTH', .GlobalEnv)) {
@@ -103,15 +112,6 @@
require(doMC)
#require(sendmailR) # for email on failure
- ## Some convenience functions
- addslash <- function(x) {
- if (substr(x, nchar(x), nchar(x)) != '/') paste(x, "/", sep="")
- else x
- }
- makeDir <- function(x) { #if directory does not exist, create it
- dir.create(x, showWarnings=FALSE, recursive=TRUE, mode="0775") #why not mode="0664" ???
- }
-
## Source the config_file -- this will be overwritten by any arguments in dots
if (!missing(config.file)) source(config.file)
@@ -139,16 +139,6 @@
makeDir(.TRTH$tick_dir)
makeDir(.TRTH$sec_dir)
- # make a temp dir to use for splitting so that (fingers crossed)
- # more than one instance can be run at a time in separate R sessions.
- tmp <- list()
- tmp$path.output <- addslash(tempdir())
- dir.create(tmp$archive_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
- dir.create(tmp$csv_dir <- paste(tmp$path.output, "csv/", sep=""), showWarnings=FALSE, mode='0775')
- dir.create(tmp$tick_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
- dir.create(tmp$sec_dir <- paste(tmp$path.output, "archive/", sep=""), showWarnings=FALSE, mode='0775')
- .TRTH$tmp <- tmp
-
pickArg <- function(x, default=NULL) {
# if argument "x" was passed through dots, use that
# otherwise, if it was in config_file, use that
@@ -309,6 +299,11 @@
#FIXME: respect overwrite argument
if (missing(.TRTH) && !exists(".TRTH")) stop("Run configureTRTH function first")
+ # make a temp dir to use for splitting so that (fingers crossed)
+ # more than one instance can be run at a time in separate R sessions.
+ dir.create(.TRTH$tmp_archive_dir <- addslash(tempdir()), showWarnings=FALSE, mode='0775')
+ on.exit(unlink(.TRTH$tmp_archive_dir, recursive=TRUE))
+
if (substr(.TRTH$path.output, nchar(.TRTH$path.output), nchar(.TRTH$path.output)) != "/") {
.TRTH$path.output <- paste(.TRTH$path.output, "/", sep="")
}
@@ -338,14 +333,14 @@
#unzip the file
print(paste("unzipping ",filename.gz, sep=""))
#system(paste("gzip -d -f ",archive_dir,filename.gz,sep=""))
- system(paste("gunzip -f < ", .TRTH$archive_dir, filename.gz, " > ", .TRTH$tmp$archive_dir, filename.csv, sep=""))
+ system(paste("gunzip -f < ", .TRTH$archive_dir, filename.gz, " > ", .TRTH$tmp_archive_dir, filename.csv, sep=""))
}
ignored.csvs <- NULL #this will hold the names of CSVs that already have a header
- setwd(.TRTH$tmp$archive) #this directory contains the big CSVs that were unzipped
- .TRTH$files.csv <- list.files(.TRTH$tmp$archive)
- for (i in 1:length(.TRTH$files.csv))
+ setwd(.TRTH$tmp_archive_dir) #this directory contains the big CSVs that were unzipped
+ .TRTH$big.csv <- list.files(.TRTH$tmp_archive_dir)
+ for (i in 1:length(.TRTH$big.csv))
{
- filename.csv <- .TRTH$files.csv[i]
+ filename.csv <- .TRTH$big.csv[i]
# Use awk to split the big CSV into daily CSVs. Each CSV will have a single
# row which we will then overwrite with the column headers. Then we'll
# use awk again to put the data into the split files
@@ -355,7 +350,7 @@
print(paste('Making headers from', filename.csv))
system(paste('awk -v f2="" -F "," '," '",'{f1 = $1"."$2".csv";if(f1 != f2) { print >> f1; close(f2); f2=f1; } }',"' ",filename.csv, sep=""))
- tmpfiles <- list.files(.TRTH$tmp$archive_dir)
+ tmpfiles <- list.files(.TRTH$tmp_archive_dir)
files.header <- tmpfiles[grep("RIC",tmpfiles)]
big.files <- tmpfiles[grep("@", tmpfiles)] #Big zipped CSVs from Reuters have e-mail address in name
@@ -378,9 +373,7 @@
# mv header.csv "RIC.Date[G].csv"
for (fl in tmp.files.csv) { # make files with header that awk will later populate
- # -n means don't overwrite files; useful if e.g. part001.csv.gz ends halfway through the day
- # and part002.csv.gz has a different number of columns (I'm not sure if that ever happens, though)
- system(paste('cp -n "', files.header, '" ', paste(.TRTH$tmp$archive_dir, fl, sep=""), sep=""))
+ system(paste('cp "', files.header, '" ', paste(.TRTH$tmp_archive_dir, fl, sep=""), sep=""))
#cp "#RIC.Date[G].csv" /home/garrett/TRTH/archive/GEM1-U1.01-APR-2008.csv
}
# after we've put a header in a file, we need to ignore that file the
@@ -389,14 +382,16 @@
# If all of the files that awk just created already exist in csv_dir and overwrite==FALSE, then
# there is no need to split this csv because we're not going to move any to csv_dir anyway.
- # a file in csv_dir might be "2012.02.10.AAPL.O.csv", but we have "AAPL.O.10-FEB-2012.csv"
+ # a file in csv_dir might be "2012.02.10.AAPL.O.csv", (or *.csv.gz) but we have "AAPL.O.10-FEB-2012.csv"
tmp <- gsub("\\.csv", "", tmp.files.csv)
new.names <- do.call(c, lapply(strsplit(tmp, "\\."), function(x) {
day <- gsub("-", ".", as.Date(x[length(x)], format='%d-%b-%Y'))
fl <- make.names(paste(x[-length(x)], collapse="."))
paste(day, "/", day, ".", fl, sep="")
}))
- if (!all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv", sep=""))) || isTRUE(.TRTH$overwrite)) {
+ if (!all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv", sep=""))) ||
+ !all(file.exists(paste(paste(.TRTH$csv_dir, new.names, sep=""), ".csv.gz", sep=""))) ||
+ isTRUE(.TRTH$overwrite)) {
## Split the Files
print(paste("Splitting ",filename.csv,sep=""))
# The following awk will put data in our CSV files which currently only have column headers;
@@ -408,9 +403,9 @@
print(paste('Done splitting ', filename.csv, sep=""))
} else print('All CSVs created by awk already exist. Not re-splitting')
# remove header file
- invisible(file.remove(paste(.TRTH$tmp$archive_dir, files.header, sep="")))
+ invisible(file.remove(paste(.TRTH$tmp_archive_dir, files.header, sep="")))
# remove unzipped csv
- invisible(file.remove(paste(.TRTH$tmp$archive_dir, filename.csv, sep="")))
+ invisible(file.remove(paste(.TRTH$tmp_archive_dir, filename.csv, sep="")))
## Zip the File
# print(paste("zipping ",filename.csv,sep=""))
# system(paste("gzip -f ",archive_dir,filename.csv,sep=""))
@@ -441,12 +436,11 @@
dir.create(paste(.TRTH$csv_dir, date.format, "/", sep=""), showWarnings=FALSE, recursive=TRUE, mode='0775') #mode='0664'
## Move files to appropriate place
- #system(paste("mv -vf ", path.output,"Archives/",name.csv, " ", path.output,date.format,"/",date.format,".",name.new,".csv", sep=""))
if (isTRUE(.TRTH$overwrite)) {
system(paste("mv -fv ", name.csv, " ", .TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv", sep=""))
- } else {
+ } else if (!file.exists(paste(.TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv.gz", sep=""))) {
system(paste("mv -nv ", name.csv, " ", .TRTH$csv_dir, date.format, "/", date.format, ".", name.new, ".csv", sep=""))
- }
+ } else print(paste(date.format, ".", name.new, ".csv.gz not overwritten.", sep=""))
#print(paste(date.format, name.new, "moved", sep=" "))
files.xts <- rbind(files.xts,as.data.frame(cbind(name.new,date.format),stringsAsFactors=FALSE))
}
@@ -454,6 +448,7 @@
.TRTH$files.xts <- files.xts
assign('.TRTH', .TRTH, pos=.GlobalEnv)
+ setwd(.TRTH$archive_dir)
if (isTRUE(.TRTH$use.instrument)) {
missing_i <- NULL
More information about the Blotter-commits
mailing list