library(timeSeries)
library(kohonen)
# This R code is to accompany the article "R Tutorial on Machine Learning: How to Visualize Option-Like Hedge Fund Returns for Risk Analysis", published in Wilmott Magazine, January 2019.
# Created in 2018 by Rodex Risk Advisers LLC, Altendorf SZ, Switzerland.
# The information contained in this file is for general information purposes only. Rodex Risk Advisers assumes no responsibility for errors or omissions in the contents of the Service.
# ---- User settings ----
# Save the downloaded CSV file on a local drive, then point the working directory at that folder:
setwd("myDrive")
# Name of the CSV file that holds the data.
s_data_file <- "Data Tutorial.csv"
# First and last date of the in-sample period.
d_start_INS <- "1994-01-31"
d_end_INS <- "2017-12-31"
# Number of units of the SOM.
no_units <- 25
# Read the SPX index data: column 1 holds the dates (dd/mm/yyyy), column 2 the index values.
s_dd <- read.table(s_data_file, sep = ",", header = TRUE)
s_date <- as.Date(s_dd[ , 1], format = "%d/%m/%Y") # Convert column 1 to date format.
z <- timeSeries(s_dd[ , 2], s_date) # Create z as timeSeries.
z <- apply(z, 2, as.numeric) # Convert z to numeric.
x_ret <- returns(z, method = "discrete") # Calculate returns from indices.
# Two-column matrix: the return itself and its absolute value.
x_SPX <- matrix(NA, nrow = nrow(x_ret), ncol = 2)
x_SPX[ , 1] <- x_ret[ , 1]
x_SPX[ , 2] <- abs(x_SPX[ , 1]) # ABS function mimics a long straddle.
colnames(x_SPX) <- c("SPX", "ABS_SPX")
x_SPX_TS <- timeSeries(x_SPX, time(x_ret)) # Create timeSeries for later analysis.
x_INS <- window(x_SPX_TS, d_start_INS, d_end_INS) # Cut window for training data (= in-sample).
# Row number of the last in-sample data point: the last observation dated on or
# before d_end_INS. An exact-date match() would return NA whenever d_end_INS is
# not itself an observation date. Assumes the rows are in chronological order.
y <- which(as.Date(time(x_SPX_TS)) <= as.Date(d_end_INS))
if (length(y) == 0) {
  stop("No observations on or before d_end_INS (", d_end_INS, ").", call. = FALSE)
}
y <- max(y)
if (y >= nrow(x_ret)) {
  stop("No observations after d_end_INS (", d_end_INS, "); the out-of-sample window would be empty.", call. = FALSE)
}
x_OOS <- x_SPX_TS[(1 + y):nrow(x_ret), ] # Cut window for prediction data (= out-of-sample).
# Prepare scatterplot:
# The colour and size vectors below have one entry per in-sample row followed by
# one entry per out-of-sample row. x_OOS runs to the last row of the data and
# x_INS ends directly before it, so together they are the last n_pts rows of
# x_SPX. Plot exactly those rows: plotting all of x_SPX would misalign (recycle)
# the colours whenever the in-sample window starts after the first observation.
n_pts <- nrow(x_INS) + nrow(x_OOS)
x_plot <- x_SPX[(nrow(x_SPX) - n_pts + 1):nrow(x_SPX), , drop = FALSE]
# In-sample data points in black, out-of-sample data points in red.
s_dd <- rep(c("black", "red"), times = c(nrow(x_INS), nrow(x_OOS)))
# In-sample data points in small size, out-of-sample data points in larger size.
s_cex <- rep(c(1, 3), times = c(nrow(x_INS), nrow(x_OOS)))
plot(x_plot, main = "Scatterplot", col = s_dd, cex = s_cex)
cor(x_INS) # Print linear correlation matrix.
# Standardise the in-sample variables to mean 0 and variance 1. embed(..., 1)
# removes the dates from the timeSeries x_INS, which is needed before the data
# can be processed by som() below.
x <- scale(embed(x_INS, 1))
# Fixing the seed of the random generator makes the results reproducible. The
# SOM changes from run to run if the seed is not set (comment this line out and
# re-run the training below to see the effect).
set.seed(7)
# Square SOM grid (5 x 5 for 25 units) with hexagonal units and a bubble
# neighbourhood function:
x.grid <- somgrid(
  xdim = sqrt(no_units),
  ydim = sqrt(no_units),
  topo = "hexagonal",
  neighbourhood.fct = "bubble"
)
x.som <- som(
  x,
  grid = x.grid,
  rlen = 10000,
  alpha = c(0.05, 0.01),
  keep.data = TRUE,
  mode = "online",
  dist.fcts = "euclidean"
)
summary(x.som)
plot(x.som, type = "changes", main = "Training Progress")
# Number of monthly returns mapped onto each unit of the SOM.
plot(x.som, type = "counts", main = "Mapping Frequencies", shape = "straight")
# Mapping quality.
plot(x.som, type = "quality", shape = "straight")
# Which monthly returns were mapped onto unit 20?
# unit.classif has one entry per row of the training data, i.e. per row of
# x_INS, so the resulting index must be applied to x_INS. Applying it to the
# full series x_SPX_TS is only correct when the in-sample window starts at the
# very first observation.
y <- which(x.som$unit.classif == 20)
x_INS[y, ] # Output monthly returns mapped onto unit 20.
plot(x.som, type = "codes", main = "Codebook Vectors", shape = "straight")
som_cluster <- cutree(hclust(object.distances(x.som, "codes")), 5) # Separate 5 areas on the SOM.
add.cluster.boundaries(x.som, som_cluster) # Draw cluster boundaries on SOM.
# Try set.seed(10) and compare results!
# Analyse clusters:
x.som$unit.classif # Show onto which units the monthly returns were mapped.
# Variable x_units determines which units are shown:
x_units <- 25 # Change this to analyse the other clusters described below.
# Cluster 5: x_units <- 25; SPX v.lo (< -15%), ABS_SPX v.hi (> +15%)
# Cluster 4: x_units <- c(22, 23); SPX v.hi (+7 to +11%), ABS_SPX v.hi (+7 to +11%)
# Cluster 3: x_units <- c(14, 15, 20, 24); SPX lo (-5 to -8%), ABS_SPX hi (+5 to +8%)
# Cluster 2: x_units <- c(12, 13, 16, 17, 18, 19, 21); SPX hi (+3 to +7%), ABS_SPX hi (+3 to +7%)
# Cluster 1: x_units <- c(1:11); SPX mid (-4 to +2%), ABS_SPX mid (+2 to +4%)
# Print the monthly returns mapped onto cluster as defined above:
# which() returns the row numbers of the training data whose unit is in x_units.
y <- which(x.som$unit.classif %in% x_units)
# The row numbers refer to the training data x_INS, not to the full series
# x_SPX_TS; the two only coincide when the in-sample window starts at the first
# observation.
x_INS[y, ] # Print months based on index.
##########
# Place unknown data on the map (prediction):
# Scale out-of-sample returns with the in-sample mean and st.dev., column by
# column. scale() applies center/scale per column; a plain
# (matrix - vector) / vector would recycle the two-element vectors down the
# rows and mix up the statistics of the two columns.
ins_mean <- as.numeric(apply(x_INS, 2, mean))
ins_sd <- as.numeric(apply(x_INS, 2, sd))
zz_OOS <- scale(embed(x_OOS, 1), center = ins_mean, scale = ins_sd)
# These scaled data points can then be applied to generate the out-of-sample predictions.
# The training data is passed by name: the third positional argument of
# predict() for kohonen objects is unit.predictions, not the training data.
x_pred <- predict(x.som, newdata = zz_OOS, trainingdata = x) # Out-of-sample prediction.
x_pred$unit.classif # Print to which units the out-of-sample monthly returns are mapped.