Basic Data Analytics: 12/16/19

rm(list = ls())
#install.packages("vars")
library(dplyr)
library(vars)
library(tidyverse)
library(readxl)
library(urca)
library(car)
#install.packages(foreca)
#install.packages("forecast")
library(forecast)
dev.off()

# A function for black theme:
my_theme <- function(...) {
theme(
axis.line = element_blank(),
axis.text.x = element_text(color = "white", lineheight = 0.9),
axis.text.y = element_text(color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(color = "white"),
legend.title = element_text(face = "bold", hjust = 0, color = "white"),
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
panel.background = element_rect(fill = "gray10", color = NA),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(color = "white"),
strip.text.y = element_text(color = "white", angle = -90),
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25,
margin = margin(2, 2, 2, 2)),
plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),
plot.caption = element_text(color = "white", hjust = 0),
plot.margin = unit(rep(1, 4), "lines"))

}

#Loading data

data <- read.csv("D:/Techcombank/data1.csv", sep = ";", header = TRUE)

data.frame(data)

data %>% head()

data <- na.omit(data)

#plot mutivariate data plot
scatterplotMatrix(data[2:4])

dev.off()
plot(data$avg_bal, data$SLGD)

#calculating Correlations for Multivariate DAta

cor.test(data$avg_bal, data$SLGD)
cor.test(data$avg_bal, data$SLKH)
cor.test(data$SLKH, data$SLGD)

#Building a VAR Model
#Vector Autoregression (VAR) is a multivariate forecasting
#algorithm that is used when two or more time series influence each other

#1.Analyze the time series characteristics
#2.Test for causation amongst the time series
#3.Test for stationarity
#4.Transform the series to make it stationary, if needed
#5.Find optimal order (p)
#6.Prepare training and test datasets
#7.Train the model
#8.Roll back the transformations, if any.
#9.Evaluate the model using test set
#10.Forecast to future

#1.Analyze the time series characteristics

data %>% str()

data$MONTHID <- as.character(data$MONTHID)
data$time <- as.Date(data$Monthid,format = "%m/%d/%Y")

data$log.avgbal <- log(data$avg_bal)
data$log.SLGD <- log(data$SLGD)
data$log.SLHD <- log(data$SLHD)
data$log.SLKH <- log(data$SLKH)

data %>% head()

# Visualize time series:
data %>% ggplot(aes(time, avg_bal, series = "red")) +
geom_line() +
my_theme() +
scale_color_manual(values = c("red", "red")) +
labs(x = NULL, y = NULL,
title = "visualization")

#2.Test for causation amongst the time series
# AUTOMATICALLY SEARCH FOR THE MOST SIGNIFICANT RESULT
for (i in 1:4)
{
cat("LAG =", i)
print(causality(VAR(data, p = i, type = "const"), cause = "AVG_BAL")$Granger)
}

#3.Test for stationarity
##Perform Augmented Dicket Fuller

ur.df(data$avg_bal, type = "none") %>%
summary()

ur.df(data$SLGD, type = "none") %>%
summary()

ur.df(data$SLKH, type = "none") %>%
summary()

#test-statistics more than critical values => non-stationary

###difference

ur.df(diff(data$avg_bal, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLGD, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLHD, differences = 2), type = "none") %>%
summary()

ur.df(diff(data$SLKH, differences = 2), type = "none") %>%
summary()

dif.avgbal <- diff(data$avg_bal, differences = 2)
dif.SLGD <- diff(data$SLGD, differences = 2)
dif.SLHD <- diff(data$SLHD, differences = 2)
dif.SLKH <- diff(data$SLKH, differences = 2)

df2 <- data.frame(dif.avgbal = dif.avgbal, dif.SLGD = dif.SLGD, dif.SLKH = dif.SLKH)

df2 %>% head()

df2 <- ts(df2, frequency = 12, start = c(2017,3))

df2$Monthid <- as.Date(df2$Month, "%d-%m-%y")
str(df2)

df2 %>% str()

df2$Monthid <- as.Date(df2$Month, format = "%m/%d/%Y")

df2 %>% head()

dev.off()
df2 %>% ggplot(aes(Monthid, dif.SLKH)) +
geom_line()

VARselect(df2$dif.avgbal, lag.max = 5, type = "const")
VARselect(df2$dif.SLGD, lag.max = 5, type = "const")
VARselect(df2$dif.SLHD, lag.max = 5, type = "const")
VARselect(df2$dif.SLKH, lag.max = 5, type = "const")

##Plot ACF
acf(df2$dif.avgbal)
acf(df2$dif.SLGD)
acf(df2$dif.SLHD)
acf(df2$dif.SLKH)

#Var estimate
str(df2)

df <- cbind(df2$Monthid, df2$dif.avgbal, df2$dif.SLGD, df2$dif.SLKH)
df$month <- ts(start = c(2017,3), end = c(2019, 11), frequency = 1)

varp <- VAR(df, p = 2, type = "const", season = 4)
varp

lmp <- varp$varresult
summary(lmp)

#install.packages("stargazer")
library(stargazer)
stargazer(lmp$y1, lmp$y2, lmp$y3, type = "text", dep.var.labels.include = FALSE)

#plot residuals and their ACF and PACF
plot(varp$varresult)

#Granger causility
causality(varp, cause = "y1")
causality(varp, cause = "y2")
causality(varp, cause = "y3")
causality(varp, cause = "y4")

varp.f <- predict(varp, n.ahead = 6, ci = 0.95)
data.frame(varp.f)
par(mar=c(1,1,1,1))
plot(varp.f, names = "y1")
fanchart(varp.f, names = "y4")

varp.f %>% head()
#Generate impulse funtion
irf.avgbal <- irf(varp, impulse = "y1", response = "y4",
n.ahead = 12, boot = TRUE)
plot(irf.avgbal, ylab = "avgbal", main = "Shock from SLGD")

head(varp.f)

#undifference a time series

df3 <- diffinv(df2$dif.avgbal, xi = 0.5)
df3 %>% head()

#Backtest result

Result Accuracy: MAPE 0.0025

Basic Data Analytics

Monday, 16 December 2019

Using Vector AutoRegression Model for Multivariate Time series forecast