# Time Series Analysis Laboratory with R
# Prof. Lea Petrella
# Faculty of Economics
# Department of Methods and Models for Economics, Territory and Finance

# Sapienza University of Rome
# a.y. 2019-2020
#####################################

# Useful materials and links
# R: https://www.r-project.org
# RStudio: https://www.rstudio.com/home/
# Manual: The Art of R Programming
# Repositories & Code: GitHub (https://github.com)
#####################################


########################################
# Lesson 6: R Laboratory (Part 4)
########################################


################################################## 
#
#   Index:
# 1)estimation of an ARIMA(p, i, q) process 
# 2)residuals analysis
# 3)estimation strategy
# 4)model selection

##########################################################################
rm(list = ls())
graphics.off()
set.seed(123)

# Let's study how to assess the goodness of fit of a time series
library(tseries)
library(lmtest)

##### Easy simulation of a known process
# We fix the number of observations to be generated and then set the first two observations to zero
n = 1000
Ysim1 = c(); Ysim1[1] = 0; Ysim1[2] = 0

for (i in 3:n){
  Ysim1[i] =  0.4 * Ysim1[i-1] + 0.3 * Ysim1[i-2] + rnorm(1, 0, 1)
}

# ACF and PACF
par(mfrow = c(2, 1), lwd = 2, cex.axis = 1.7, cex.lab = 1.3, cex.main = 1.5)
acf(Ysim1)
pacf(Ysim1)

# Fit the process starting from the order suggested by the above-mentioned plots
# NOTE: the arma() function optimizes (minimizes) the conditional sum of squared errors  
?arma

fit20 = arma(Ysim1, order = c(2, 0), include.intercept = F) # AR(2) without intercept
fit20int = arma(Ysim1, order = c(2, 0), include.intercept = T) # AR(2) with intercept
fit10 = arma(Ysim1, order = c(1, 0), include.intercept = F) # AR(1) without intercept
# !!! The last model produces a warning message!!!
# To solve this issue, we may consider
fit10 = arma(Ysim1, order = c(1, 0), include.intercept = F, method = "Brent", lower = -1, upper = 1)
 
# analyze the residuals to verify the goodness of the estimate
res20 = fit20$residuals
plot.ts(res20, main = "AR(2) residuals")
res10 = fit10$residuals
plot.ts(res10, main = "AR(1) residuals")

# compare the acf
acf(res20, na.action = na.omit, main = "AR(2)")
acf(res10, na.action = na.omit, main = "AR(1)")

# compare the pacf
pacf(res20, na.action = na.omit, main = "AR(2)")
pacf(res10, na.action = na.omit, main = "AR(1)")

# test for normality and independence of residuals of the correct model
jarque.bera.test(na.exclude(res20)) # H0: normality
shapiro.test(res20) # We can also test for normality of residuals using the Shapiro-Wilk test
Box.test(res20, lag = 12, type = "Ljung-Box") # H0: no correlation
Box.test(res20, lag = 12, type = "Box-Pierce") # Another possibility is to use the Box-Pierce test 

# test for normality and independence of residuals of the misspecified model
jarque.bera.test(na.exclude(res10)) # H0: normality
shapiro.test(res10) # 
Box.test(res10, lag = 12, type = "Ljung-Box") # H0: no correlation
Box.test(res10, lag = 12, type = "Box-Pierce") # Another possibility is to use the Box-Pierce test 

# to visually check the normality of the residuals we can draw the Quantile-Quantile (QQ) plots 
?qqnorm
qqnorm(res20, main = "Normal Q-Q Plot AR(2)")
qqline(res20, col = "red", lty = 2) # qqline adds the âtheoreticalâ normal line to the qqplot
qqnorm(res10, main = "Normal Q-Q Plot AR(1)")
qqline(res10, col = "red", lty = 2)

################################################## 

# To estimate the parameters of an ARIMA model, you can also use the arima() function
# NOTE: the arima() function allows you to optimize (maximize) the likelihood function
?arima

fit20i = arima(Ysim1, order = c(2, 0, 0), include.mean = F)
fit10i = arima(Ysim1, order = c(1, 0, 0), include.mean = F)

# The tsdiag() function allows you to analyze the model residuals by using a single command
# NOTE: the standardized residuals, the ACF and the p-values of the Ljung-Box test are plotted
?tsdiag

tsdiag(fit20i)
tsdiag(fit10i)


##### Simulation of a non-stationary process
n = 200 # we set the number of observations to be generated
Ysim1 = c(); Ysim1[1] = 0; Ysim1[2] = 0
shock = rnorm(n, 0, 1) # draw the error component from a Gaussian distribution

for (i in 3:n){
  Ysim1[i] = 0.04 * i + 0.6 * Ysim1[i-1] - 0.5 * Ysim1[i-2] - 0.1 * shock[i-1] + 0.3 * shock[i-2] + shock[i]
}

par(mfrow = c(1, 1))
plot.ts(Ysim1)

# firstly, we need to make the series stationary
acf(Ysim1)
DYsim = diff(Ysim1, differences = 1)
plot.ts(DYsim)

# Subsequently, the acf and pacf are plotted
par(mfrow = c(2, 1))
acf(DYsim)
pacf(DYsim)

# If we can't choose a single model by looking at these plots, 
# it is possible to fit and compare multiple models to see which one best fits the data
fit1 = arima(DYsim, order = c(1, 0, 1), include.mean = T) # ARMA(1, 1) with intercept
fit2 = arima(DYsim, order = c(2, 0, 2), include.mean = T) # ARMA(2, 2) with intercept

# analyze the residuals to verify the goodness of fit
res1 = fit1$residuals
res2 = fit2$residuals
acf(res1, main = "ARMA(1, 1)")
acf(res2, main = "ARMA(2, 2)")

# test for normality and independence of residuals
jarque.bera.test(na.exclude(res1)) # H0: normality
Box.test(res1, lag = 12, type = "Ljung-Box") # H0: no correlation

jarque.bera.test(na.exclude(res2)) # H0: normality
Box.test(res2, lag = 12, type = "Ljung-Box") # H0: no correlation

# If the residuals do not show particular problems graphically and pass the diagnostic tests,
# the model can be chosen by observing the significance of the coefficients,
# by comparing the information criteria (AIC) and the conditional Sum of Squared Errors (SSE)

# the coeftest() command returns a coefficient matrix containing the point estimates, associated standard errors, 
# test statistics and p-values 
?coeftest
coeftest(fit1)
coeftest(fit2)

# We choose the model with minimum SSE and AIC
sum(fit1$residuals^2, na.rm = T)
sum(fit2$residuals^2, na.rm = T)

fit1$aic
fit2$aic

# Once the model is selected, we can move on to the series forecast
?predict
forecast = predict(fit2, n.ahead = 50, se.fit = T)
forecast
# As the forecast horizon grows, we can see how:
# i) the fit converges to the unconditional average of the process
mean(DYsim)
# ii) the standard deviation of the error converges to the unconditional standard deviation of the process
sd(DYsim)


##### Now let's try to estimate a series that we have not simulated
# ... do you remember AirPassengers?
rm(list = ls())
?AirPassengers
AirPassengers
Y = AirPassengers

par(mfrow = c(1, 1))
plot(Y, ylab = "Passenger numbers (1000's)", main = "Air Passenger from 1949 to 1961")

# We need to transform the data to remove trend and seasonality
# At first, we apply the logarithm to eliminate heteroskedasticity and reduce variability
Y = log(Y)
plot(Y)

# we differentiate the series to make it stationary and eliminate seasonality
DY = diff(diff(Y, differences = 1), lag = 12)
plot(DY)

# To verify that the series has actually become stationary we can use the Dickey-Fuller test, 
# where the null hypothesis is that the process is a random walk
# NB! The command adf.test() allows to choose the alternative hypothesis H1:
adf.test(DY, alternative = "stationary")
adf.test(DY, alternative = "explosive")
# Please also note that the function tests for covariance stationarity only (unit root)

acf(DY)
pacf(DY)

# Estimation strategy:
# we choose a minimum lag and we increase it until we have WN residues
# ARMA(1, 1)
fit1 = arma(DY, order = c(1, 1), include.intercept = F)

# let's look at the residuals
res1 = fit1$residuals
plot(res1)
acf(res1, na.action = na.pass)

# we test for normality and no correlation of the residuals
jarque.bera.test(na.exclude(res1)) # H0: normality
Box.test (res1, lag = 12, type = "Ljung-Box") # H0: no correlation
summary(fit1) # the AR component is not significant => we increase the MA component

# MA(1)
fit2 = arma(DY, order = c(0, 1))

# let's look at the residuals
res2 = fit2$residuals
plot(res2)

# MA(12)
fit12 = arma(DY, order = c(0, 12))

# let's look at the residuals
res12 = fit12$residuals
plot(res12)

# we test for normality and independence of residuals for both the fitted MA(1) and MA(12) models
jarque.bera.test(na.exclude(res2)) # H0: normality
Box.test(res2, lag = 12, type = "Ljung-Box") # H0: no correlation
summary(fit2)

jarque.bera.test(na.exclude(res12)) # H0: normality
Box.test(res12, lag = 12, type = "Ljung-Box") # H0: no correlation
summary(fit12)


# Final fit
fit3 = arma(DY, lag = list(ar = NULL, ma = c(1, 12)))

# let's look at the residuals
res3 = fit3$residuals
plot(res3)
acf(res3, na.action = na.pass)
# we test residual normality and independence
jarque.bera.test(na.exclude(res3)) # H0: normality
Box.test(res3, lag = 12, type = "Ljung-Box") # H0: no correlation
adf.test(na.exclude(res3), alternative = "stationary") # check for stationarity
summary(fit3)

fit4 = arma(DY, lag = list(ar = 1, ma = c(1, 12)))
res4 = fit4$residuals
plot(res4)
acf(res4, na.action = na.pass)
# Finally, we test residual normality and independence
jarque.bera.test(na.exclude(res4)) # H0: normality
Box.test(res4, lag = 12, type = "Ljung-Box") # H0: no correlation
adf.test(na.exclude(res4), alternative = "stationary") # check for stationarity
summary(fit4)

##### NOW IT'S YOUR TURN!!!
# Fit the series "Studio1.txt" and "Studio2.txt"