[Eventstudies-commits] r31 - in pkg: R data man

Thu Feb 7 08:46:07 CET 2013

Author: vikram
Date: 2013-02-07 08:46:07 +0100 (Thu, 07 Feb 2013)
New Revision: 31

Added:
   pkg/R/identifyExtremeEvents.R
   pkg/data/IdentifyExevent.rda
   pkg/man/exact.pattern.location.Rd
   pkg/man/extreme.events.distribution.Rd
   pkg/man/gen.data.Rd
   pkg/man/get.cluster.distribution.Rd
   pkg/man/get.clusters.formatted.Rd
   pkg/man/get.event.count.Rd
   pkg/man/identify.extreme.events.Rd
   pkg/man/identify.mixedclusters.Rd
   pkg/man/numbers2words.Rd
   pkg/man/quantlie.extreme.values.Rd
   pkg/man/runlength.dist.Rd
   pkg/man/summarise.cluster.Rd
   pkg/man/summarise.rle.Rd
   pkg/man/sumstat.Rd
   pkg/man/yearly.exevent.dist.Rd
   pkg/man/yearly.exevent.summary.Rd
Log:
Added Rd documentation files, data and functions for Identify extreme events

Added: pkg/R/identifyExtremeEvents.R
===================================================================

--- pkg/R/identifyExtremeEvents.R	                        (rev 0)
+++ pkg/R/identifyExtremeEvents.R	2013-02-07 07:46:07 UTC (rev 31)
@@ -0,0 +1,737 @@
+
+# Total 16 functions
+############################
+# Identifying extreme events
+############################
+# libraries required
+library(xts)
+#----------------------------------------------------------------
+# INPUT:
+# 'input'     : Data series for which extreme events are 
+#               to be identified. More than one series 
+#               is permissible. The 'input' should be in time
+#               series format.
+# 'prob.value': This is the tail value for which event is
+#               to be defined. For eg: prob.value=5 will
+#               consider 5% tail on both sides
+#-----------------------------------------------------------------
+# OUTPUT:
+# Result will be in a list of 3 with following tables:
+# 1. Summary statistics
+#    a. Summary of whole data-set
+# 2. Lower tail: Extreme event tables
+#    a. Distribution of extreme events
+#    b. Run length distribution
+#    c. Quantile values
+#    d. Yearly distribution
+#    e. Extreme event data
+#     - Clustered, Un-clustered and Both
+# 3. Upper tail: Extreme event tables
+#    a. Distribution of extreme events
+#    b. Run length distribution
+#    c. Quantile values
+#    d. Yearly distribution
+#    e. Extreme event data
+#     - Clustered, Un-clustered and Both
+#------------------------------------------------------------------
+# NOTE:
+identify.extreme.events <- function(input,prob.value){
+  # Returns list(data.summary, lower.tail, upper.tail) for 'input'.
+
+  #------------------------------------------------
+  # Breaking the function if any input is not given
+  #------------------------------------------------
+  # class(input) can hold several names (the error message below expects
+  # "xts" and "zoo"), so test with inherits(), which returns one value.
+  class.input <- inherits(input,c("xts","zoo"))
+  if(!class.input){
+    stop("Input data is not in time series format. Valid 'input' should be of class xts and zoo")
+  }
+  
+  # Converting an xts object to zoo series
+  input.class <- inherits(input,"xts")
+  if(input.class){
+    input <- zoo(input)
+  }
+
+  #-----------------------------------------
+  # Event series: Clustered and un-clustered
+  # (setdiff keeps every row when no cluster exists; -which() kept none)
+  tmp <- get.clusters.formatted(event.series=input,
+                                response.series=input,
+                                probvalue=prob.value,
+                                event.value="nonreturns",
+                                response.value="nonreturns")
+  tail.events <- tmp[which(tmp$left.tail==1 | tmp$right.tail==1),]
+  clustered.tail.events <- tmp[which(tmp$cluster.pattern>1),]
+  unclustered.tail.events <- tmp[setdiff(seq_len(NROW(tmp)),which(tmp$cluster.pattern>1)),]
+  # Left tail data
+  left.tail.clustered <- clustered.tail.events[which(clustered.tail.events$left.tail==1),c("event.series","cluster.pattern")]
+  left.tail.unclustered <- unclustered.tail.events[which(unclustered.tail.events$left.tail==1),c("event.series","cluster.pattern")]
+  left.all <- tail.events[which(tail.events$left.tail==1),c("event.series","cluster.pattern")]
+  # Right tail data
+  right.tail.clustered <- clustered.tail.events[which(clustered.tail.events$right.tail==1),c("event.series","cluster.pattern")]
+  right.tail.unclustered <- unclustered.tail.events[which(unclustered.tail.events$right.tail==1),c("event.series","cluster.pattern")]
+  right.all <- tail.events[which(tail.events$right.tail==1),c("event.series","cluster.pattern")]
+  
+  #---------------------
+  # Extreme event output
+  #---------------------
+  # Summary statistics
+  summ.st <- sumstat(input)
+
+  # Distribution of events
+  event.dist <- extreme.events.distribution(input,prob.value)
+
+  # Run length distribution
+  runlength <- runlength.dist(input,prob.value)
+
+  # Quantile extreme values 
+  qnt.values <- quantile.extreme.values(input,prob.value)
+
+  # Yearly distribution of extreme event dates
+  yearly.exevent <- yearly.exevent.dist(input,prob.value)
+
+  #---------------------
+  # Compiling the output
+  #---------------------
+  output <- lower.tail <- upper.tail <- list()
+  # Compiling lower tail and upper tail separately
+  # Lower tail
+  lower.tail$data <- list(left.all,left.tail.clustered,
+                          left.tail.unclustered)
+  names(lower.tail$data) <- c("All","Clustered","Un-clustered")
+  lower.tail$extreme.event.distribution <- event.dist$lower.tail
+  lower.tail$runlength <- runlength$lower.tail
+  lower.tail$quantile.values <- qnt.values$lower.tail
+  lower.tail$yearly.extreme.event <- yearly.exevent$lower.tail
+  # Upper tail
+  upper.tail$data <- list(right.all,right.tail.clustered,
+                          right.tail.unclustered)
+  names(upper.tail$data) <- c("All","Clustered","Un-clustered")
+  upper.tail$extreme.event.distribution <- event.dist$upper.tail
+  upper.tail$runlength <- runlength$upper.tail
+  upper.tail$quantile.values <- qnt.values$upper.tail
+  upper.tail$yearly.extreme.event <- yearly.exevent$upper.tail
+  # Output
+  output$data.summary <- summ.st
+  output$lower.tail <- lower.tail
+  output$upper.tail <- upper.tail
+  return(output)
+}
+
+########################################
+# Functions used for formatting clusters
+########################################
+#------------------------
+# Categorizing tail events
+# for ES analysis
+#------------------------
+# Generates returns for the series
+# Mark left tail, right tail events
+gen.data <- function(d,probvalue,value="nonreturns"){
+  # d: zoo/xts series; probvalue: one-sided tail size in percent;
+  # value: "returns" => work on 100*diff(log(d)), else on d itself.
+  res <- data.frame(dates=index(d),value=coredata(d))
+  want.returns <- value=="returns"
+  if(want.returns){
+    # Difference the raw values so the result always has n-1 elements
+    # (diff() on an xts object keeps a leading NA, giving n+1 with the pad).
+    res$returns <- c(NA,diff(log(coredata(d)))*100)
+  }else{
+    res$returns <- d
+  }
+  pval <- quantile(res$returns,probs=c(probvalue/100,1-probvalue/100),na.rm=TRUE)
+  res$left.tail <- as.numeric(res$returns < pval[1])
+  res$right.tail <- as.numeric(res$returns > pval[2])
+  res$both.tails <- res$left.tail + res$right.tail
+  if(want.returns) res[-1,] else res
+}
+
+
+#-------------------
+# Summarise patterns
+summarise.rle <- function(oneseries){
+  # summary() of the lengths of all runs of 1s in 'oneseries'
+  runs <- rle(oneseries)
+  one.runs <- runs$lengths[which(runs$values==1)]
+  summary(one.runs)
+}
+
+# Summarise the pattern of cluster
+summarise.cluster <- function(obj){
+  # Exact names: obj$both.tail only found "both.tails" via partial matching
+  rbind(both=summarise.rle(obj[["both.tails"]]),
+        left=summarise.rle(obj[["left.tail"]]),
+        right=summarise.rle(obj[["right.tail"]]))
+}
+  
+# Getting location for the length
+exact.pattern.location <- function(us,pt,pt.len){
+  # Start positions of every run in 'us' whose value is 'pt' and whose
+  # length is exactly 'pt.len'.
+  runs <- rle(us)
+  hit <- which(runs$values==pt & runs$lengths==pt.len)
+  cumsum(runs$lengths)[hit]-pt.len+1   # run end -> run start
+}
+
+# Identify and mark mixed clusters
+identify.mixedclusters <- function(m,j){
+  # m: data frame with 'returns' and a logical 'pattern' column flagging
+  #    the first row of each run of length j. Adds 'remove.mixed' = 5 on
+  #    runs whose returns are neither all > 0 nor all <= 0 (0 elsewhere).
+  m$remove.mixed <- 0
+  # Looping over the flagged rows themselves (not 1:length()) makes an
+  # empty selection a no-op instead of an error on rownum[1] being NA.
+  for(first in which(m$pattern==TRUE)){
+    span <- first:(first+j-1)
+    n.pos <- sum(m$returns[span] > 0)
+    if(n.pos!=j && n.pos!=0) m$remove.mixed[span] <- 5
+  }
+  m
+}
+
+#--------------------
+# Formatting clusters
+#--------------------
+# This function performs the following transformations:
+#----------------------------------------------------
+# What does the function do?
+# i.   Get extreme events from event.series
+# ii.  Remove all the mixed clusters
+# iii. Mark the clusters of each run length
+# iv.  Further club the clusters for event series and
+#      corresponding response series to get
+#      clustered returns
+# v.   Return the output as a zoo time series
+#----------------------------------------------------
+# Input for the function
+#    event.series = Series in levels or returns on which events
+#                   are to be defined
+# response.series = Series in levels or returns on which
+#                   response is to be generated
+#       probvalue = Tail value (in percent) for defining an event
+#    event.value  = What value is to be studied:
+#                   "returns", or any other string for levels
+# Similarly for response.value
+#----------------------------------------------------
+# Output = zoo series: event.series, left.tail, right.tail, response.series, cluster.pattern
+#----------------------------------------------------
+get.clusters.formatted <- function(event.series,
+                                   response.series,
+                                   probvalue=5,
+                                   event.value="returns",
+                                   response.value="returns"){
+  # Marking tail events in the event and response series (gen.data)
+  tmp <- gen.data(event.series,
+                  probvalue=probvalue,
+                  value=event.value)
+  res.ser <- gen.data(response.series,
+                      probvalue=probvalue,
+                      value=response.value)
+  # Storing old data points (tmp.old is not read again in this function)
+  tmp.old <- tmp
+
+  # Get pattern with maximum length: longest run of tail events
+  res <- summarise.cluster(tmp)
+  max.len <- max(res[,"Max."])
+
+  #------------------------
+  # Removing mixed clusters
+  #------------------------
+  for(i in max.len:2){  # NOTE(review): iterates i = 1, 2 when max.len is 1
+    which.pattern <- rep(1,i)  # not read again in this function
+    patrn <- exact.pattern.location(tmp$both.tails,1,i)
+    # If pattern does not exist move to next pattern
+    if(length(patrn)==0){next}
+    tmp$pattern <- FALSE
+    tmp$pattern[patrn] <- TRUE
+    tmp <- identify.mixedclusters(m=tmp,i)
+    me <- length(which(tmp$remove.mixed==5))  # rows flagged as mixed
+    
+    if(me!=0){
+      tmp <- tmp[-which(tmp$remove.mixed==5),]
+      cat("Pattern of:",i,";",
+          "Disarded event:",me/i,"\n")
+    }
+  }
+  tmp.nc <- tmp  # not read again in this function
+
+  # Merging event and response series (response columns are read back
+  # below as value.1 and returns.1)
+  tmp.es <- xts(tmp[,-1],as.Date(tmp$dates))
+  tmp.rs <- xts(res.ser[,-1],as.Date(res.ser$dates))
+  tmp.m <- merge(tmp.es,res.ser=tmp.rs[,c("value","returns")],
+                 all=F)
+  
+  # Event-series column to report: "returns", otherwise "value"
+  if(event.value=="returns"){
+    which.value <- event.value
+  }else{
+    which.value <- "value"
+  }
+  # Converting to data.frame
+  temp <- as.data.frame(tmp.m)
+  temp$dates <- rownames(temp)
+  n <- temp
+  # Get pattern with maximum length
+  res <- summarise.cluster(temp)
+  max.len <- max(res[,"Max."])
+  cat("Maximum length after removing mixed clusters is",
+      max.len,"\n")
+  # Marking clusters: the first row of each run receives the run length
+  n$cluster.pattern <- n$both.tails
+  for(pt.len in max.len:1){
+    mark <- exact.pattern.location(n$both.tails,1,pt.len)
+    if(length(mark)==0){next}
+    n$cluster.pattern[mark] <- pt.len
+  }
+  
+  #-------------------
+  # Clustering returns
+  #-------------------
+  print("Clustering events.")
+  for(pt.len in max.len:2){  # NOTE(review): same max.len == 1 caveat as above
+    rownum <- exact.pattern.location(n$both.tails,1,pt.len)
+    # If pattern does not exist
+    if(length(rownum)==0){
+      cat("Pattern",pt.len,"does not exist.","\n");next
+    }
+    # Clustering: collapse one run per pass, then search again
+    while(length(rownum)>0){
+      prevnum <- rownum[1]-1  # NOTE(review): 0 when the run starts in row 1
+      lastnum <- rownum[1]+pt.len-1
+    # Clustering event series: % change over the run, or sum of the run
+      if(event.value=="returns"){
+        newreturns <- (n$value[lastnum]-n$value[prevnum])*100/n$value[prevnum]
+        n[rownum[1],c("value","returns")] <-  c(n$value[lastnum],newreturns)
+      }else{
+        newreturns <- sum(n$value[rownum[1]:lastnum],na.rm=T)
+        n[rownum[1],c("value","returns")] <-  c(n$value[lastnum],newreturns)
+      }
+    # Clustering response series: same rule applied to value.1/returns.1
+      if(response.value=="returns"){
+        newreturns.rs <- (n$value.1[lastnum]-n$value.1[prevnum])*100/n$value.1[prevnum]
+        n[rownum[1],c("value.1","returns.1")] <-  c(n$value.1[lastnum],newreturns.rs)
+      }else{
+        newreturns <- sum(n$value.1[rownum[1]:lastnum],na.rm=T)
+        n[rownum[1],c("value.1","returns.1")] <-  c(n$value.1[lastnum],newreturns)
+      }
+      n <- n[-c((rownum[1]+1):lastnum),]  # keep only the first row of the run
+      rownum <- exact.pattern.location(n$both.tails,1,pt.len)
+    }
+  }
+  # Columns to keep
+  cn <- c(which.value,"left.tail","right.tail",
+          "returns.1","cluster.pattern")
+  tmp.ts <- zoo(n[,cn],order.by=as.Date(n$dates))
+  colnames(tmp.ts) <- c("event.series","left.tail","right.tail",
+                        "response.series","cluster.pattern")
+
+  # Results
+  return(tmp.ts)
+}
+
+##############################
+# Summary statistics functions
+##############################
+#---------------------------------------------
+# Table 1: Summary statistics
+# INPUT: Time series data-set for which
+#        summary statistics is to be estimated
+# OUTPUT: A data frame with:
+# - Values: "Minimum", 5%,"25%","Median",
+#           "Mean","75%","95%","Maximum",
+#           "Standard deviation","IQR",
+#           "Observations"
+#----------------------------------------------
+sumstat <- function(input){
+  # Summary table (one column per series, 11 statistics as rows),
+  # rounded to two decimals.
+  if(NCOL(input)==1){input <- xts(input)}
+  stat.names <- c("Min","5%","25%","Median","Mean","75%","95%",
+                  "Max","sd","IQR","Obs.")
+  # One function per row, in the same order as 'stat.names'
+  stat.funs <- list(function(x) min(x,na.rm=TRUE),
+                    function(x) quantile(x,0.05,na.rm=TRUE),
+                    function(x) quantile(x,0.25,na.rm=TRUE),
+                    function(x) median(x,na.rm=TRUE),
+                    function(x) mean(x,na.rm=TRUE),
+                    function(x) quantile(x,0.75,na.rm=TRUE),
+                    function(x) quantile(x,0.95,na.rm=TRUE),
+                    function(x) max(x,na.rm=TRUE),
+                    function(x) sd(x,na.rm=TRUE),
+                    function(x) IQR(x,na.rm=TRUE),
+                    function(x) NROW(x))
+  out <- data.frame(matrix(NA,nrow=length(stat.funs),ncol=NCOL(input)))
+  colnames(out) <- colnames(input)
+  rownames(out) <- stat.names
+  for(k in seq_along(stat.funs)) out[k,] <- apply(input,2,stat.funs[[k]])
+  round(out,2)
+}
+
+######################
+# Yearly summary stats
+######################
+#----------------------------
+# INPUT:
+# 'input': Data series for which event cluster distribution
+#        is to be calculated;
+# 'prob.value': Probability value for which the tail is to be constructed;
+#       this value is the size of one tail, e.g. if prob.value=5
+#       then we have values of the 5% tail on both sides
+# Functions used: yearly.exevent.summary()
+# OUTPUT:
+# Yearly distribution of extreme events
+#----------------------------
+yearly.exevent.dist <- function(input, prob.value){
+  # Yearly count and median of extreme events, split by tail.
+  #   input      : series passed as both event and response series
+  #   prob.value : tail size in percent, passed on as 'probvalue'
+  # Returns list(lower.tail=, upper.tail=), each holding two columns of
+  # the table built by yearly.exevent.summary().
+
+  #--------------------
+  # Formatting clusters
+  #--------------------
+  clusters <- get.clusters.formatted(event.series=input,
+                                     response.series=input,
+                                     probvalue=prob.value,
+                                     event.value="nonreturns",
+                                     response.value="nonreturns")
+
+  # Yearly table for both tails; every NA cell in it is replaced
+  # by zero.
+  yearly <- yearly.exevent.summary(clusters)
+  yearly[is.na(yearly)] <- 0
+
+  # Columns 1-2: bad (left-tail) days; columns 3-4: good (right-tail) days
+  list(lower.tail=yearly[,1:2],
+       upper.tail=yearly[,3:4])
+}
+
+#------------------------------------------------
+# Get yearly no. and median for good and bad days
+#------------------------------------------------
+yearly.exevent.summary <- function(tmp){
+  # Per-year number of events and median of column 1 for one tail flag
+  yearly.tail <- function(flag){
+    sel <- tmp[which(tmp[,flag]==1),]
+    res <- apply.yearly(xts(sel),function(x)nrow(x))
+    res <- merge(res,apply.yearly(xts(sel[,1]),function(x)median(x,na.rm=TRUE)))
+    # Index rebuilt from the year only (first four characters)
+    index(res) <- as.yearmon(as.Date(substr(index(res),1,4),"%Y"))
+    res
+  }
+  # Bad days = left tail, good days = right tail
+  both <- merge(yearly.tail("left.tail"),yearly.tail("right.tail"))
+  colnames(both) <- c("number.baddays","median.baddays",
+                      "number.gooddays","median.goodays")
+  output <- as.data.frame(both)
+  # Keep only the last four characters of each row name
+  yr <- rownames(output)
+  rownames(output) <- substr(yr,nchar(yr)-3,nchar(yr))
+  output
+}
+
+#############################
+# Getting event segregation
+# - clustered and unclustered
+#############################
+#----------------------------
+# INPUT:
+# 'input': Data series for which event cluster distribution
+#        is to be calculated;
+# Note: The input series is expected to be in levels, not in returns;
+#       if some of the inputs are already in returns format one has to
+#       use the other variable 'already.return.series'
+# 'already.return.series': column name is to be given which already has
+#       return series in the data-set
+# 'prob.value': Probability value for which the tail is to be constructed;
+#       this value is the size of one tail, e.g. if prob.value=5
+#       then we have values of the 5% tail on both sides
+# Functions used: get.event.count()
+# OUTPUT:
+# Distribution of extreme events
+#----------------------------
+
+extreme.events.distribution <- function(input,prob.value){
+  # Thin wrapper around get.event.count() that splits its two-row
+  # table into one element per tail.
+  #
+  # Arguments
+  #   input      : series handed to get.event.count() as 'series'
+  #   prob.value : tail size in percent, handed on as 'probvalue'
+  #
+  # Value
+  #   list(lower.tail=, upper.tail=): rows 1 and 2 of the table
+  #   returned by get.event.count(), i.e. one-row data frames with
+  #   the columns
+  #     unclstr, used.clstr, removed.clstr, tot.clstr, Tot, Tot.used
+  #
+  # The series is always analysed as it is (value="nonreturns");
+  # no returns are computed here.
+
+  #--------------
+  # Cluster count
+  #--------------
+  counts <- get.event.count(input,probvalue=prob.value,
+                            value="nonreturns")
+
+  #-----------------------------
+  # Naming the tail distribution
+  #-----------------------------
+  mylist <- list(lower.tail=counts[1,],
+                 upper.tail=counts[2,])
+  mylist
+}
+
+# Functions used in event count calculation
+get.event.count <- function(series,
+                            probvalue=5,
+                            value="returns"){
+  # Tail-event counts before (gen.data) and after clustering
+  # (get.clusters.formatted): 2 x 6 data frame, rows "lower"/"upper".
+  tmp.old <- gen.data(series,probvalue,value)
+  tmp <- get.clusters.formatted(event.series=series,
+                                response.series=series,
+                                probvalue,
+                                event.value=value,
+                                response.value=value)
+  
+  cp <- tmp[,"cluster.pattern"]
+  lvl <- as.numeric(levels(as.factor(cp)))
+  lvl.use <- lvl[which(lvl>1)]
+  # Calculating Total events
+  tot.ev.l <- length(which(tmp.old[,"left.tail"]==1))
+  tot.ev.r <- length(which(tmp.old[,"right.tail"]==1))
+  # Calculating Unclustered events
+  un.clstr.l <- length(which(tmp[,"left.tail"]==1 &
+                             tmp[,"cluster.pattern"]==1))
+  un.clstr.r <- length(which(tmp[,"right.tail"]==1 &
+                             tmp[,"cluster.pattern"]==1))
+  # Calculating Used clusters: each cluster of length k stands for k
+  # original events. Start from 0 and loop with seq_along() so that
+  # "no cluster longer than 1" needs no NA arithmetic to yield 0.
+  us.cl.l <- us.cl.r <- 0
+  for(i in seq_along(lvl.use)){
+    in.lvl <- tmp[,"cluster.pattern"]==lvl.use[i]
+    us.cl.l <- us.cl.l+length(which(in.lvl & tmp[,"left.tail"]==1))*lvl.use[i]
+    us.cl.r <- us.cl.r+length(which(in.lvl & tmp[,"right.tail"]==1))*lvl.use[i]
+  }
+
+  # Making a table
+  tb <- data.frame(matrix(NA,2,6))
+  colnames(tb) <- c("unclstr","used.clstr","removed.clstr","tot.clstr","Tot","Tot.used")
+  rownames(tb) <- c("lower","upper")
+  tb[,"Tot"] <- c(tot.ev.l,tot.ev.r)
+  tb[,"unclstr"] <- c(un.clstr.l,un.clstr.r)
+  tb[,"used.clstr"] <- c(us.cl.l,us.cl.r)
+  tb[,"Tot.used"] <- tb$unclstr+tb$used.clstr
+  tb[,"tot.clstr"] <- tb$Tot-tb$unclstr
+  tb[,"removed.clstr"] <- tb$tot.clstr-tb$used.clstr
+
+  tb
+}
+
+####################################
+# Quantile values for extreme events
+####################################
+#-----------------------------------
+# INPUT:
+# 'input': Data series in time series format
+# Note: The input series is expected to be in levels, not in returns;
+#       if some of the inputs are already in returns format one has to
+#       use the other variable 'already.return.series'
+# 'already.return.series': column name is to be given which already has
+#       return series in the data-set
+# Functions used: get.clusters.formatted()
+# OUTPUT:
+# Lower tail and Upper tail quantile values
+#-----------------------------------
+quantile.extreme.values <- function(input, prob.value){
+  # Quantiles and mean of the extreme-event values in each tail.
+  #
+  # Arguments
+  #   input      : series used as both event and response series
+  #   prob.value : tail size in percent, passed on as 'probvalue'
+  #
+  # Value
+  #   list(lower.tail=, upper.tail=); each element is a one-row matrix
+  #   with the columns "0%","25%","Median","75%","100%","Mean",
+  #   rounded to two decimals.
+  #
+  # NOTE(review): the name has the form of an S3 method of quantile()
+  # for a class "extreme.values" - confirm that no such dispatch is
+  # intended.
+
+  #--------------------
+  # Formatting clusters
+  #--------------------
+  clusters <- get.clusters.formatted(event.series=input,
+                                     response.series=input,
+                                     probvalue=prob.value,
+                                     event.value="nonreturns",
+                                     response.value="nonreturns")
+
+  # Quantile/mean row for the events flagged in one tail column
+  tail.row <- function(flag){
+    vals <- clusters[which(clusters[,flag]==1),
+                     "event.series"]
+    qnt <- t(data.frame(quantile(vals,c(0,0.25,0.5,0.75,1))))
+    row <- round(cbind(qnt,mean(vals)),2)
+    rownames(row) <- NULL
+    colnames(row) <- c("0%","25%","Median","75%","100%","Mean")
+    row
+  }
+
+  # Left tail first, then right tail
+  lower.tail.qnt.value <- tail.row("left.tail")
+  upper.tail.qnt.value <- tail.row("right.tail")
+
+  mylist <- list(lower.tail=lower.tail.qnt.value,
+                 upper.tail=upper.tail.qnt.value)
+  mylist
+}
+
+##########################
+# Run length distribution
+##########################
+#-----------------------------------
+# INPUT:
+# 'input': Data series in time series format
+# Note: The input series is expected to be in levels, not in returns;
+#       if some of the inputs are already in returns format one has to
+#       use the other variable 'already.return.series'
+# 'already.return.series': column name is to be given which already has
+#       return series in the data-set
+# Functions used: get.clusters.formatted()
+#                 get.cluster.distribution()
+#                 numbers2words()
+# OUTPUT:
+# Lower tail and Upper tail Run length distribution
+#-----------------------------------
+runlength.dist <- function(input, prob.value){
+  # Run-length distribution of clustered extreme events per tail.
+  #   input      : series used as both event and response series
+  #   prob.value : tail size in percent, passed on as 'probvalue'
+  # Returns list(lower.tail=, upper.tail=): data frames with one column
+  # per run length 2..max (named in words via numbers2words()) holding
+  # the number of clusters of that length; absent lengths are 0.
+
+  no.var <- NCOL(input)
+
+  #---------------------------
+  # Estimating max. Run length
+  #---------------------------
+  tmp <- get.clusters.formatted(event.series=input,
+                                response.series=input,
+                                probvalue=prob.value,
+                                event.value="nonreturns",
+                                response.value="nonreturns")
+
+  # Columns of 'tmp.res' are named after the run lengths that occur, in
+  # increasing order, so the last name is the longest run. Computed once
+  # and reused below.
+  tmp.res <- get.cluster.distribution(tmp,"event.series")
+  max.runlength <- max(0,as.numeric(colnames(tmp.res)[NCOL(tmp.res)]))
+
+  # Generating empty frame
+  # seq(2:max.runlength)+1 produced c(2,3) for max.runlength==2, because
+  # seq() of the single number 2 is 1:2; build 2..max explicitly.
+  col.names <- seq_len(max(max.runlength-1,0))+1
+  lower.tail.runlength <- data.frame(matrix(NA,nrow=no.var,
+                                            ncol=length(col.names)))
+  upper.tail.runlength <- data.frame(matrix(NA,nrow=no.var,
+                                            ncol=length(col.names)))
+  colnames(lower.tail.runlength) <- col.names
+  rownames(lower.tail.runlength) <- colnames(input)
+  colnames(upper.tail.runlength) <- col.names
+  rownames(upper.tail.runlength) <- colnames(input)
+
+  #----------------------
+  # Run length estimation
+  #----------------------
+  for(col.number in colnames(tmp.res)){
+    lower.tail.runlength[1,col.number] <- tmp.res[1,col.number]
+    upper.tail.runlength[1,col.number] <- tmp.res[2,col.number]
+  }
+
+  # Replacing NA's with zeroes
+  lower.tail.runlength[is.na(lower.tail.runlength)] <- 0
+  upper.tail.runlength[is.na(upper.tail.runlength)] <- 0
+
+  # creating column names
+  word.cn <- vapply(col.names,numbers2words,character(1))
+  colnames(lower.tail.runlength) <- word.cn
+  colnames(upper.tail.runlength) <- word.cn
+  list(lower.tail=lower.tail.runlength,upper.tail=upper.tail.runlength)
+}
+
+#-------------------------
+# Get cluster distribution
+#-------------------------
+# Input for this function is the output of get.cluster.formatted
+get.cluster.distribution <- function(tmp,variable){
+  # tmp: output of get.clusters.formatted (needs the columns
+  #      cluster.pattern, left.tail, right.tail); variable: row label.
+  # Returns a 2-row data frame (lower/upper tail), one column per
+  # cluster length > 1 found in 'tmp', holding the cluster counts.
+  cp <- tmp[,"cluster.pattern"]
+  lvl <- as.numeric(levels(as.factor(cp)))
+  lvl.use <- lvl[which(lvl>1)]
+  tb <- data.frame(matrix(NA,2,length(lvl.use)))
+  colnames(tb) <- as.character(lvl.use)
+  rownames(tb) <- paste(variable,c(":lower tail",":upper tail"))
+  # seq_along(): with no cluster longer than 1 the table keeps zero
+  # columns; 1:length() would have added a bogus column for index 1.
+  for(i in seq_along(lvl.use)){
+    tb[1,i] <- length(which(cp==lvl.use[i] & tmp[,"left.tail"]==1))
+    tb[2,i] <- length(which(cp==lvl.use[i] & tmp[,"right.tail"]==1))
+  }
+  tb
+}
+
+#----------------------------
+# Converting numbers to words
+#----------------------------
+numbers2words <- function(x){
+  # Spells out round(x) in English words (21 -> "twenty one"); vectorised.
+  helper <- function(x){
+    digits <- rev(strsplit(as.character(x), "")[[1]])
+    nDigits <- length(digits)
+    if (nDigits == 1) as.vector(ones[digits])
+    else if (nDigits == 2)
+      if (x <= 19) as.vector(teens[digits[1]])
+      else trim(paste(tens[digits[2]],
+                      Recall(as.numeric(digits[1]))))
+    else if (nDigits == 3) trim(paste(ones[digits[3]], "hundred",
+               Recall(makeNumber(digits[2:1]))))
+    else {
+      nSuffix <- ((nDigits + 2) %/% 3) - 1
+      if (nSuffix > length(suffixes)) stop(paste(x, "is too large!"))
+      trim(paste(Recall(makeNumber(digits[nDigits:(3*nSuffix + 1)])),
+                 suffixes[nSuffix],
+                 Recall(makeNumber(digits[(3*nSuffix):1]))))
+    }
+  }
+  trim <- function(text){
+    gsub("^\ ", "", gsub("\ *$", "", text))
+  }
+  makeNumber <- function(...) as.numeric(paste(..., collapse=""))
+  opts <- options(scipen=100)
+  on.exit(options(opts))
+  ones <- c("", "one", "two", "three", "four", "five", "six", "seven",
+            "eight", "nine")
+  names(ones) <- 0:9
+  # No padding inside the words: the teens branch returns them untrimmed
+  teens <- c("ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
+             "sixteen", "seventeen", "eighteen", "nineteen")
+  names(teens) <- 0:9
+  tens <- c("twenty", "thirty", "forty", "fifty", "sixty", "seventy",
+            "eighty", "ninety")
+  names(tens) <- 2:9
+  x <- round(x)
+  suffixes <- c("thousand", "million", "billion", "trillion")
+  if (length(x) > 1) return(sapply(x, helper))
+  helper(x)
+}

Added: pkg/data/IdentifyExevent.rda
===================================================================
(Binary files differ)


Property changes on: pkg/data/IdentifyExevent.rda
___________________________________________________________________
Added: svn:mime-type
   + application/octet-stream

Added: pkg/man/exact.pattern.location.Rd
===================================================================
--- pkg/man/exact.pattern.location.Rd	                        (rev 0)
+++ pkg/man/exact.pattern.location.Rd	2013-02-07 07:46:07 UTC (rev 31)
@@ -0,0 +1,39 @@
+\name{exact.pattern.location}
+\alias{exact.pattern.location}
+
+\title{
+ Gets the location of the pattern
+}
+
+\description{
+ The function gives the exact location of the exact pattern in the series.
+ }
+
+\usage{
+exact.pattern.location(us,pt,pt.len)
+}
+
+\arguments{
+  \item{us}{It is the series in which the location of the pattern is to
+    be found}
+  \item{pt}{It is the pattern which is to be searched in the series for
+    eg. 1}
+  \item{pt.len}{It is the length of the pattern which is to be searched
+  for}
+}
+
+\value{
+Output is the row number of the start of the pattern
+}
+
+
+\examples{
+# Loading data
+data(IdentifyExevent)
+# Series input
+input <- diff(log(input.data[,"sp500"]))
+# Marking left tail and right tail
+res <- gen.data(d=input,probvalue=5,value="nonreturns")
+# Getting summarised run length
+output <- exact.pattern.location(res$left.tail, pt=1, pt.len=2)
+}
\ No newline at end of file

Added: pkg/man/extreme.events.distribution.Rd
===================================================================
--- pkg/man/extreme.events.distribution.Rd	                        (rev 0)
+++ pkg/man/extreme.events.distribution.Rd	2013-02-07 07:46:07 UTC (rev 31)
@@ -0,0 +1,40 @@
+\name{extreme.events.distribution}
+\alias{extreme.events.distribution}
+
+\title{
+ Estimating extreme event distribution of clustered and unclustered data
+}
+
+\description{
+  It is the wrapper over the get.event.count function for estimating
+  extreme event distribution of clustered and unclustered data 
+  
+ }
+
+\usage{
+extreme.events.distribution(input, prob.value)
+}
+
+\arguments{
+  \item{input}{Series for which cluster distribution is to be checked}
+  \item{prob.value}{It is tail value for which the extreme event is to
+    be defined. For eg: prob.value of 5 will consider 5\% tail on both
+    sides}
+}
+
+\value{
+Output will be the distribution of clustered and unclustered extreme
+event of the series 
+}
+
+\seealso{
+  get.event.count
+}
+
+\examples{
+data(IdentifyExevent)
+# Series input
+input <- diff(log(input.data[,"sp500"]))
+output <- extreme.events.distribution(input,prob.value=5)
+
+}
\ No newline at end of file

Added: pkg/man/gen.data.Rd
===================================================================
--- pkg/man/gen.data.Rd	                        (rev 0)
+++ pkg/man/gen.data.Rd	2013-02-07 07:46:07 UTC (rev 31)
@@ -0,0 +1,41 @@
+\name{gen.data}
+\alias{gen.data}
+
+\title{
+Marking upper and lower tail events for extreme event analysis.
+}
+
+\description{
+This function generates a column as left.tail and right.tail which has
+binary numbers. If the observation belongs to left tail then the
+left.tail will be 1 else 0. 
+ }
+
+\usage{
+gen.data(d,probvalue,value)
+}
+
+\arguments{
+  \item{d}{'d' is the time-series on which extreme event analysis is
+    done.} 
+  \item{probvalue}{It is tail value for which the extreme event is to
+    be defined. For eg: prob.value of 5 will consider 5\% tail on both
+    sides.}
+  \item{value}{If the series 'd' is in returns format then
+  value="nonreturns" else value="returns"}
+}
+
+\value{
+  Output is a data frame with the columns dates, value, returns, left.tail
+  (the left tail dummy), and similarly right.tail and both.tails.
+
+}
+
+
+\examples{
+data(IdentifyExevent)
+# Series input
+input <- diff(log(input.data[,"sp500"]))
+# Marking left tail and right tail
+res <- gen.data(d=input,probvalue=5,value="nonreturns")
+}
\ No newline at end of file

Added: pkg/man/get.cluster.distribution.Rd
===================================================================
--- pkg/man/get.cluster.distribution.Rd	                        (rev 0)
+++ pkg/man/get.cluster.distribution.Rd	2013-02-07 07:46:07 UTC (rev 31)
@@ -0,0 +1,38 @@
+\name{get.cluster.distribution}
+\alias{get.cluster.distribution}
+
+\title{
+ Estimating runlength distribution for the clusters
+}
+
+\description{
+ Estimating runlength distribution of the clusters in the extreme
+ event. 
+ }
+
+\usage{
+get.cluster.distribution(tmp, variable)
+}
+
+\arguments{
+  \item{tmp}{It is the output of the get.clusters.formatted}
+  \item{variable}{Variable on which cluster distribution is to be
+  estimated; variable="event.series"}
+}
+
+\value{
+  Output is the runlength distribution for the clusters
+}
+
+\seealso{
+  get.clusters.formatted
+}
+
+\examples{
+data(IdentifyExevent)
+# Series input
+input <- diff(log(input.data[,"sp500"]))
+tmp <- get.clusters.formatted(event.series=input,response.series=input,
+probvalue=5,event.value="nonreturns",response.value="nonreturns")
[TRUNCATED]

To get the complete diff run:
    svnlook diff /svnroot/eventstudies -r 31