[Returnanalytics-commits] r2058 - pkg/PerformanceAnalytics/sandbox/Meucci/R

noreply at r-forge.r-project.org noreply at r-forge.r-project.org
Sun Jun 24 19:21:03 CEST 2012


Author: mkshah
Date: 2012-06-24 19:21:03 +0200 (Sun, 24 Jun 2012)
New Revision: 2058

Modified:
   pkg/PerformanceAnalytics/sandbox/Meucci/R/DetectOutliersviaMVE.R
Log:
Cleaned file and checked against Meucci's Matlab Code

Modified: pkg/PerformanceAnalytics/sandbox/Meucci/R/DetectOutliersviaMVE.R
===================================================================
--- pkg/PerformanceAnalytics/sandbox/Meucci/R/DetectOutliersviaMVE.R	2012-06-24 17:20:00 UTC (rev 2057)
+++ pkg/PerformanceAnalytics/sandbox/Meucci/R/DetectOutliersviaMVE.R	2012-06-24 17:21:03 UTC (rev 2058)
@@ -20,27 +20,27 @@
 #' @export
 RejectOutlier = function( sample )
 {   
-    library( matlab )
+  library( matlab )
     
-    # parameter checks
-    if ( ncol( corruptSample ) > nrow( corruptSample ) ) { stop("The number of assets must be greater than number of observations (otherwise system is singular)") }    
+  # parameter checks
+  if ( ncol( corruptSample ) > nrow( corruptSample ) ) { stop("The number of assets must be greater than number of observations (otherwise system is singular)") }    
     
-    # initialize parameters
-    T = nrow( sample )
-    m = matrix( colMeans( sample ) , nrow = ncol( sample ) )
-    U = sample - ones( T , 1 ) %*% t( m )
+  # initialize parameters
+  T = nrow( sample )
+  m = matrix( colMeans( sample ) , nrow = ncol( sample ) )
+  U = sample - ones( T , 1 ) %*% t( m )
     
-    # measure lambdas associated with each observation
-    lambdas = diag( U %*% solve( t(U) %*% U ) %*% t(U) ) # result is from Poston, Wegman, Priebe and Solka (1997)
-    # the various lambdas denote the t-th element of the diagonal of the information matrix
-    # the farthest outlier corresponds to the highest value of lambda
-    a = max( lambdas )
-    rejected = which.max( lambdas ) # identify the farthest outlier
+  # measure lambdas associated with each observation
+  lambdas = diag( U %*% solve( t(U) %*% U ) %*% t(U) ) # result is from Poston, Wegman, Priebe and Solka (1997)
+  # the various lambdas denote the t-th element of the diagonal of the information matrix
+  # the farthest outlier corresponds to the highest value of lambda
+  a = max( lambdas )
+  rejected = which.max( lambdas ) # identify the farthest outlier
     
-    if ( max(lambdas > 1) == 1 ) { stop( "Lambdas cannot be greater than 1")}
-    if ( max(lambdas < 0) == 1 ) { stop( "Lambdas cannot be less than 0")}
+  if ( max(lambdas > 1) == 1 ) { stop( "Lambdas cannot be greater than 1")}
+  if ( max(lambdas < 0) == 1 ) { stop( "Lambdas cannot be less than 0")}
     
-    return( list( rejected = rejected , lambdas = lambdas ) )
+  return( list( rejected = rejected , lambdas = lambdas ) )
 }
 
 #' Compute the minimum volume ellipsoid for a given (multi-variate) time-series
@@ -64,57 +64,55 @@
 #' @export
 ComputeMVE = function ( data )
 {
-    library( matlab )    
-    NumObservations = nrow( data )
-    Ones = ones( NumObservations , 1 )    
-    det_S_New = 0    
+  library( matlab )    
+  NumObservations = nrow( data )
+  Ones = ones( NumObservations , 1 )    
+  det_S_New = 0    
     
-    # step 0: initialize the relative weights
-    w = 1 / NumObservations * Ones
+  # step 0: initialize the relative weights
+  w = 1 / NumObservations * Ones
     
-    # step 1: compute the location parameter (m), and the scatter matrix (S)
-    m = matrix( colMeans( data ) , nrow = ncol( data ) )
-    S = cov( data ) # notice the weights in the scatter matrix are not normalized
+  # step 1: compute the location parameter (m), and the scatter matrix (S)
+  m = matrix( colMeans( data ) , nrow = ncol( data ) )
+  S = cov( data ) # notice the weights in the scatter matrix are not normalized
     
-    keeploop = TRUE
-    while ( keeploop == TRUE )
+  keeploop = TRUE
+  while ( keeploop == TRUE )
+  {
+    Mahalanobis = matrix( , ncol = 0 , nrow = 1 )          
+    for ( t in 1:NumObservations )
     {
-        Mahalanobis = matrix( , ncol = 0 , nrow = 1 )          
-        for ( t in 1:NumObservations )
-        {
-            # cycle thru each observation...
-            x_t = t( data[ t , , drop = FALSE ] )                            
+      # cycle thru each observation...
+      x_t = t( data[ t , , drop = FALSE ] )                            
             
-            # ...and calculate the square Mahalanobis distances
-            Mahalanobis = cbind( Mahalanobis , t((x_t - m)) %*% solve(S) %*% ( x_t - m ) )
-        }              
+      # ...and calculate the square Mahalanobis distances
+      Mahalanobis = cbind( Mahalanobis , t((x_t - m)) %*% solve(S) %*% ( x_t - m ) )
+    }                
+    # Step 3: update the weights if the Mahalanobis distance squared > 1 as follows...
+    update = matlab:::find(Mahalanobis > 1 )        
+    w[ update ] = w[ update ] * t( Mahalanobis[ update ] ) 
+    # ... otherwise leave the weight unchanged
         
+    # Step 4: If convergence is reached, stop and define mu and Sigma, otherwise go to Step 1
+    m = t( data ) %*% w / sum (w )
+    S = t(( data - Ones %*% t( m ) )) %*% diag( as.vector( w ) ) %*% ( data - Ones %*% t(m) )
         
-        # Step 3: update the weights if the Mahalanobis distance squared > 1 as follows...
-        update = matlab:::find(Mahalanobis > 1 )        
-        w[ update ] = w[ update ] * t( Mahalanobis[ update ] ) 
-        # ... otherwise leave the weight unchanged
+    det_S_Old = det_S_New        
+    det_S_New = base:::det(S)          
         
-        # Step 4: If convergence is reached, stop and define mu and Sigma, otherwise go to Step 1
-        m = t( data ) %*% w / sum (w )
-        S = t(( data - Ones %*% t( m ) )) %*% diag( as.vector( w ) ) %*% ( data - Ones %*% t(m) )
+    exitFlag = ( ( det_S_Old / det_S_New ) < .99999 ) # loop exits when evaluates to FALSE
         
-        det_S_Old = det_S_New        
-        det_S_New = base:::det(S)          
+    if ( det_S_New == 0 ) { exitFlag = TRUE }
         
-        exitFlag = ( ( det_S_Old / det_S_New ) < .99999 ) # loop exits when evaluates to FALSE
+    if ( !is.logical( exitFlag ) ) { browser() }
         
-        if ( det_S_New == 0 ) { exitFlag = TRUE }
-        
-        if ( !is.logical( exitFlag ) ) { browser() }
-        
-        keeploop = ( exitFlag )
-    }    
+    keeploop = ( exitFlag )
+  }    
     
-    MVE_Location = m
-    MVE_Dispersion = S
+  MVE_Location = m
+  MVE_Dispersion = S
     
-    return( list( MVE_Location = MVE_Location , MVE_Dispersion = MVE_Dispersion ) )
+  return( list( MVE_Location = MVE_Location , MVE_Dispersion = MVE_Dispersion ) )
 }
 
 #' Use the minimum volume ellipsoid to detect outliers
@@ -136,54 +134,54 @@
 #' @export
 DetectOutliersViaMVE = function( corruptSample )
 {
-    library( matlab )    
+  library( matlab )    
     
-    # parameter checks
-    if ( ncol( corruptSample ) > nrow( corruptSample ) ) { stop("The number of assets must be greater than number of observations (otherwise system is singular)") }        
+  # parameter checks
+  if ( ncol( corruptSample ) > nrow( corruptSample ) ) { stop("The number of assets must be greater than number of observations (otherwise system is singular)") }        
     
-    # initialize variables
-    T = nrow( corruptSample )    
-    index = seq( from = 1 , to = T , 1 )
-    vol_MVE = sample_Length = matrix( , nrow = 1 , ncol = 0 )
-    store = rep( list() , length = ceil( T / 2 ) )
+  # initialize variables
+  T = nrow( corruptSample )    
+  index = seq( from = 1 , to = T , 1 )
+  vol_MVE = sample_Length = matrix( , nrow = 1 , ncol = 0 )
+  store = rep( list() , length = ceil( T / 2 ) )
     
-    lambdas = RejectOutlier( corruptSample )$lambdas
-    cutofflist = order( lambdas , decreasing = TRUE )
+  lambdas = RejectOutlier( corruptSample )$lambdas
+  cutofflist = order( lambdas , decreasing = TRUE )
     
-    # compute high-breakdown estimates
-    for ( j in 1:ceil( T / 2 ) ) # stop when the number of data left is less than half the original sample, otherwise repeat
-    {    
-        # step 1: compute MVE location and dispersion 
-        MVE = ComputeMVE( corruptSample )
+  # compute high-breakdown estimates
+  for ( j in 1:ceil( T / 2 ) ) # stop when the number of data left is less than half the original sample, otherwise repeat
+  {    
+    # step 1: compute MVE location and dispersion 
+    MVE = ComputeMVE( corruptSample )
         
-        # store results
-        store[[j]]$MVE_Location = MVE$MVE_Location
-        store[[j]]$MVE_Dispersion = MVE$MVE_Dispersion
+    # store results
+    store[[j]]$MVE_Location = MVE$MVE_Location
+    store[[j]]$MVE_Dispersion = MVE$MVE_Dispersion
         
-        sample_Length = cbind( sample_Length , nrow( corruptSample ) )
-        vol_MVE = cbind( vol_MVE , sqrt( det( MVE$MVE_Dispersion ) ) )
+    sample_Length = cbind( sample_Length , nrow( corruptSample ) )
+    vol_MVE = cbind( vol_MVE , sqrt( det( MVE$MVE_Dispersion ) ) )
         
-        store[[j]]$index = index
+    store[[j]]$index = index
         
-        # step 2: find the farthest outlier among the data and remove the observation (one at a time)        
-        rejected = RejectOutlier( corruptSample )$rejected
-        corruptSample = corruptSample[ -rejected  , ] # drop outlier rows from sample
-        index = index[ -rejected ]
-    }
+    # step 2: find the farthest outlier among the data and remove the observation (one at a time)        
+    rejected = RejectOutlier( corruptSample )$rejected
+    corruptSample = corruptSample[ -rejected  , ] # drop outlier rows from sample
+    index = index[ -rejected ]
+  }
     
-    # plot volume of ellipsoid as function of sample length    
-    # shows an abrupt jump when the first outlier is added to the data
-    # The respective sample covariance is the minimum covariance determinant and the respective ellipsoid
-    plot( sample_Length , vol_MVE , type = "l" , main = "Outlier detection" , ylab = "volume of Min volume ellipsoid" , xlab = "sample length" )
+  # plot volume of ellipsoid as function of sample length    
+  # shows an abrupt jump when the first outlier is added to the data
+  # The respective sample covariance is the minimum covariance determinant and the respective ellipsoid
+  plot( sample_Length , vol_MVE , type = "l" , main = "Outlier detection" , ylab = "volume of Min volume ellipsoid" , xlab = "sample length" )
     
-    # return the minimum volume ellipsoid at each sample length
-    result = rbind( sample_Length , vol_MVE )
-    rownames( result ) = c("index" , "volumeMVE" )   
+  # return the minimum volume ellipsoid at each sample length
+  result = rbind( sample_Length , vol_MVE )
+  rownames( result ) = c("index" , "volumeMVE" )   
     
-    # TODO: measure the slope of result$volumeMVE. When slope doubles that marks the first outlier
-    numOutliers = NULL
+  # TODO: measure the slope of result$volumeMVE. When slope doubles that marks the first outlier
+  numOutliers = NULL
     
-    return( list( plotdata = t( result ) , cutofflist = cutofflist , numOutliers = numOutliers ) )
+  return( list( plotdata = t( result ) , cutofflist = cutofflist , numOutliers = numOutliers ) )
 }
 
 
@@ -200,21 +198,21 @@
 #' @export
 NoisyObservations = function( numGoodSamples , numOutliers , covarianceMatrix , shuffle = FALSE )
 {    
-    mu = matrix( rep( 0 , nrow( covarianceMatrix ) ) )    
-    T = numGoodSamples
+  mu = matrix( rep( 0 , nrow( covarianceMatrix ) ) )    
+  T = numGoodSamples
     
-    # "good" observations
-    library( MASS )
-    sample = mvrnorm( n = T , mu = mu , Sigma = covarianceMatrix ) # draw T samples from mvrnorm distribution
+  # "good" observations
+  library( MASS )
+  sample = mvrnorm( n = T , mu = mu , Sigma = covarianceMatrix ) # draw T samples from mvrnorm distribution
     
-    # generate outliers
-    outliers = 10 * matrix( runif( numOutliers * nrow( covarianceMatrix ) ) , nrow = numOutliers , ncol = ncol( covarianceMatrix ) )
+  # generate outliers
+  outliers = 10 * matrix( runif( numOutliers * nrow( covarianceMatrix ) ) , nrow = numOutliers , ncol = ncol( covarianceMatrix ) )
     
-    # add "bad" observation(s)
-    corruptSample = rbind ( sample , outliers )
+  # add "bad" observation(s)
+  corruptSample = rbind ( sample , outliers )
     
-    # shuffle rows
-    if ( shuffle == TRUE ) { corruptSample = corruptSample[ sample( nrow( corruptSample ) ) , ] }
+  # shuffle rows
+  if ( shuffle == TRUE ) { corruptSample = corruptSample[ sample( nrow( corruptSample ) ) , ] }
     
-    return( corruptSample = corruptSample )
+  return( corruptSample = corruptSample )
 }



More information about the Returnanalytics-commits mailing list