library(timeSeries)
library(kohonen)

# This R code is to accompany the article "R Tutorial on Machine Learning: How to Visualize Option-Like Hedge Fund Returns for Risk Analysis", published in Wilmott Magazine, January 2019.
# Created in 2018 by Rodex Risk Advisers LLC, Altendorf SZ, Switzerland.
# The information contained in this file is for general information purposes only. Rodex Risk Advisers assumes no responsibility for errors or omissions in the contents of the Service.

# Download the CSV file and save it on your local drive, then set the working directory to that folder:
# The data file is read relative to the working directory; replace "myDrive"
# with the folder in which the CSV file was saved.
setwd("myDrive")

s_data_file <- "Data Tutorial.csv"   # Name of CSV-file with data.
no_units <- 25                       # Set the number of units of the SOM.
d_start_INS <- "1994-01-31"          # Set start and ...
d_end_INS <- "2017-12-31"            # ... end date for in-sample period.

# Read SPX index data: column 1 is parsed as dates in dd/mm/YYYY format,
# column 2 is used as the index series.
s_dd <- read.table(s_data_file, sep = ",", header = TRUE)
s_date <- as.Date(s_dd[ , 1], format = "%d/%m/%Y")
# as.Date() returns NA (without an error) for entries that do not match the
# format. Stop here instead of building a time series on missing dates.
if (anyNA(s_date)) {
  stop("Column 1 of '", s_data_file,
       "' contains entries that are not dates in dd/mm/YYYY format.",
       call. = FALSE)
}

z <- timeSeries(s_dd[ , 2], s_date)        # Create z as timeSeries.
z <- apply(z, 2, as.numeric)               # Convert z to numeric.
x_ret <- returns(z, method = "discrete")   # Calculate returns from indices.

# Two-column matrix of model inputs: the return itself and its absolute
# value (the ABS function mimics a long straddle).
x_SPX <- matrix(NA, nrow = nrow(x_ret), ncol = 2)
x_SPX[ , 1] <- x_ret[ , 1]
x_SPX[ , 2] <- abs(x_SPX[ , 1])
colnames(x_SPX) <- c("SPX", "ABS_SPX")

# Same data with the return dates attached, for date-based windowing later.
x_SPX_TS <- timeSeries(x_SPX, time(x_ret))

# Cut window for training data (= in-sample).
x_INS <- window(x_SPX_TS, d_start_INS, d_end_INS)

# Row number of the last observation dated on or before d_end_INS.
# Counting dates (rather than match()-ing the exact end date) also works when
# d_end_INS is not itself an observation date; match() would return NA then.
# Assumes the rows of x_SPX_TS are in chronological order.
y <- sum(as.Date(time(x_SPX_TS)) <= as.Date(d_end_INS))

# Cut window for prediction data (= out-of-sample): all rows after row y.
# seq_len() gives an empty index when nothing follows the in-sample period,
# whereas (1 + y):nrow(...) would count backwards in that case.
x_OOS <- x_SPX_TS[y + seq_len(nrow(x_SPX_TS) - y), ]

# Prepare scatterplot:
# Flag the out-of-sample rows by date. Deriving the flag from the dates of
# x_SPX_TS (which has the same rows as x_SPX) keeps colours and sizes aligned
# with the plotted points even if the in-sample window does not start at the
# first row of the data.
is_oos <- as.Date(time(x_SPX_TS)) > as.Date(d_end_INS)
s_dd <- ifelse(is_oos, "red", "black")   # Out-of-sample in red, all other points in black.
s_cex <- ifelse(is_oos, 3, 1)            # Out-of-sample drawn larger than the other points.

plot(x_SPX, main = "Scatterplot", col = s_dd, cex = s_cex)
cor(x_INS)                               # Print linear correlation matrix.

# Scale variables to mean 0 and variance 1. embed() eliminates the dates from
# the timeSeries x_INS (needed for processing with function som below).
x <- scale(embed(x_INS, 1))

# The map is a square grid with sqrt(no_units) units per side, so no_units
# has to be a square number; any other value cannot produce a grid with
# exactly no_units units.
grid_dim <- round(sqrt(no_units))
if (grid_dim^2 != no_units) {
  stop("no_units must be a square number (e.g. 16, 25, 36).", call. = FALSE)
}

# Setting the seed for the random generator leads to reproducible results.
# SOMs will change if the seed is not set (try re-running the lines below
# with this line commented out).
set.seed(7)
# Create a 5 x 5 SOM with hexagonal units and a bubble neighbourhood function:
x.grid <- somgrid(grid_dim, grid_dim, topo = "hexagonal", neighbourhood.fct = "bubble")
x.som <- som(x, x.grid, rlen = 10000, alpha = c(0.05, 0.01), keep.data = TRUE, mode = "online", dist.fcts = "euclidean")
summary(x.som)
plot(x.som, type = "changes", main = "Training Progress")
# Plot the SOM with the number of monthly returns mapped onto each unit.
plot(x.som, type = "counts", main = "Mapping Frequencies", shape = "straight")
# Mapping quality.
plot(x.som, type = "quality", shape = "straight")

# Which monthly returns were mapped onto unit 20?
y <- which(x.som$unit.classif == 20)
# The SOM was trained on the in-sample rows, so the positions in unit.classif
# are row numbers of x_INS. Index x_INS (not the full series x_SPX_TS) so the
# right months are shown even if the in-sample window starts after row 1.
x_INS[y, ]   # Output monthly returns mapped onto unit 20.

# Draw the codebook vectors, then overlay a partition of the map into 5 areas.
plot(x.som, shape = "straight", main = "Codebook Vectors", type = "codes")
unit_dist <- object.distances(x.som, "codes")   # Distances between the units' codebook vectors.
unit_tree <- hclust(unit_dist)                  # Hierarchical clustering of the units.
som_cluster <- cutree(unit_tree, 5)             # Separate 5 areas on the SOM.
add.cluster.boundaries(x.som, som_cluster)      # Draw cluster boundaries on SOM.
# Try set.seed(10) and compare results!

# Analyse clusters:
x.som$unit.classif   # Show onto which units the monthly returns were mapped.

# Variable x_units determines which units are shown:

x_units <- 25        # Change this to analyse the other clusters described below.

# Cluster 5: x_units <- 25; SPX v.lo (< -15%), ABS_SPX v.hi (> +15%)
# Cluster 4: x_units <- c(22, 23); SPX v.hi (+7 to +11%), ABS_SPX v.hi (+7 to +11%)
# Cluster 3: x_units <- c(14, 15, 20, 24); SPX lo (-5 to -8%), ABS_SPX hi (+5 to +8%)
# Cluster 2: x_units <- c(12, 13, 16, 17, 18, 19, 21); SPX hi (+3 to +7%), ABS_SPX hi (+3 to +7%)
# Cluster 1: x_units <- c(1:11); SPX mid (-4 to +2%), ABS_SPX mid (+2 to +4%)

# Print the monthly returns mapped onto the cluster as defined above.
# which() returns the positions of the months whose unit is in x_units.
y <- which(x.som$unit.classif %in% x_units)
# Those positions are row numbers of the training data, so index x_INS
# (not the full series x_SPX_TS) to print the matching months.
x_INS[y, ]

##########
# Place unknown data on the map (prediction):

# Scale out-of-sample returns with the in-sample mean and st.dev. of each column.
ins_mean <- apply(x_INS, 2, mean)
ins_sd <- apply(x_INS, 2, sd)
# sweep() applies each statistic to its own column. Plain `matrix - vector`
# arithmetic recycles the vector down the rows instead, which mixes up the
# two columns' statistics as soon as there is more than one out-of-sample row.
# embed() strips the dates, as was done for the training data.
zz_OOS <- sweep(embed(x_OOS, 1), 2, ins_mean, "-")
zz_OOS <- sweep(zz_OOS, 2, ins_sd, "/")

# These scaled data points can then be applied to generate the out-of-sample predictions.
# The training data is passed by name: a third positional argument is not
# guaranteed to be interpreted as training data by predict().
# NOTE(review): argument name per kohonen 3.x predict.kohonen - confirm against the installed version.
x_pred <- predict(x.som, newdata = zz_OOS, trainingdata = x)   # Out-of-sample prediction.
x_pred$unit.classif   # Print to which units the out-of-sample monthly returns are mapped.