Thursday, 2 January 2020

Autoregressive Moving Average with Exogenous Inputs Model

rm(list = ls())
#install.packages("TSA")
library(forecast)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(TSA)

data(airquality)

data <- airquality

data %>% head()

#clear n.a data
set.seed(123)

ozone <- subset(na.omit(data))

#split trainset and test set
train.set <- ceiling(0.7 * nrow(ozone))
test.set <- nrow(ozone) - train.set

# ensure to take only subsequent measurements for time-series analysis:
trainset <- seq_len(nrow(ozone))[1:train.set]
testset <- setdiff(seq_len(nrow(ozone)), trainset)

#Create time series
year <- 1973

date <- as.Date(paste0(year, "-", ozone$Month, "-", ozone$Day))

min.date <- as.numeric(format(min(date), "%j"))
max.date <- as.numeric(format(max(date), "%j"))

ozone.ts <- ts(ozone$Ozone, start = min.date, end = max.date, frequency = 1)

ozone.ts <- window(ozone.ts, 121, 231) # deal with repetition due to missing time values

ozone$t <- seq(start(ozone.ts)[1], end(ozone.ts)[1]) # assumes that measurements are consecutive although they are not

ozone %>% head()

#Visualize
ggplot(ozone, aes(x = t, y = Ozone)) + geom_line() +
  geom_point()

#The time-series data seem to be stationary.
#Let us consider the ACF and pACF plots to see which AR and MA terms we should consider

par(mfrow = c(1,2))
acf(ozone.ts)
pacf(ozone.ts)

# ARIMAX(0,0,0) model

features <- c("Solar.R", "Wind", "Temp") # exogenous features

model <- arimax( x = ozone$Ozone[trainset],
                 xreg = ozone[trainset, features],
                 order = c(0,0,0))

model %>% head()

preds.temporal <- predict(model, newxreg = ozone[testset, features])

preds.temporal