#install.packages("TSA")
library(forecast)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(TSA)
data(airquality)
data <- airquality
data %>% head()
#clear n.a data
set.seed(123)
ozone <- subset(na.omit(data))
#split trainset and test set
train.set <- ceiling(0.7 * nrow(ozone))
test.set <- nrow(ozone) - train.set
# ensure to take only subsequent measurements for time-series analysis:
trainset <- seq_len(nrow(ozone))[1:train.set]
testset <- setdiff(seq_len(nrow(ozone)), trainset)
#Create time series
year <- 1973
date <- as.Date(paste0(year, "-", ozone$Month, "-", ozone$Day))
min.date <- as.numeric(format(min(date), "%j"))
max.date <- as.numeric(format(max(date), "%j"))
ozone.ts <- ts(ozone$Ozone, start = min.date, end = max.date, frequency = 1)
ozone.ts <- window(ozone.ts, 121, 231) # deal with repetition due to missing time values
ozone$t <- seq(start(ozone.ts)[1], end(ozone.ts)[1]) # assumes that measurements are consecutive although they are not
ozone %>% head()
#Visualize
ggplot(ozone, aes(x = t, y = Ozone)) + geom_line() +
geom_point()
#The time-series data seem to be stationary.
#Let us consider the ACF and pACF plots to see which AR and MA terms we should consider
par(mfrow = c(1,2))
acf(ozone.ts)
pacf(ozone.ts)
# ARIMAX(0,0,0) model
features <- c("Solar.R", "Wind", "Temp") # exogenous features
model <- arimax( x = ozone$Ozone[trainset],
xreg = ozone[trainset, features],
order = c(0,0,0))
model %>% head()
preds.temporal <- predict(model, newxreg = ozone[testset, features])
preds.temporal