# Time Series Analysis Laboratory with R # Prof. Lea Petrella # Faculty of Economics # Department of Methods and Models for Economics, Territory and Finance # Sapienza University of Rome # a.y. 2019-2020 ##################################### # Useful materials and links # R: https://www.r-project.org # RStudio: https://www.rstudio.com/home/ # Manual: The Art of R Programming # Repositories & Code: GitHub (https://github.com) ##################################### ######################################## # Lesson 6: R Laboratory (Part 4) ######################################## ################################################## # # Index: # 1)estimation of an ARIMA(p, i, q) process # 2)residuals analysis # 3)estimation strategy # 4)model selection ########################################################################## rm(list = ls()) graphics.off() set.seed(123) # Let's study how to assess the goodness of fit of a time series library(tseries) library(lmtest) ##### Easy simulation of a known process # We fix the number of observations to be generated and then set the first two observations to zero n = 1000 Ysim1 = c(); Ysim1[1] = 0; Ysim1[2] = 0 for (i in 3:n){ Ysim1[i] = 0.4 * Ysim1[i-1] + 0.3 * Ysim1[i-2] + rnorm(1, 0, 1) } # ACF and PACF par(mfrow = c(2, 1), lwd = 2, cex.axis = 1.7, cex.lab = 1.3, cex.main = 1.5) acf(Ysim1) pacf(Ysim1) # Fit the process starting from the order suggested by the above-mentioned plots # NOTE: the arma() function optimizes (minimizes) the conditional sum of squared errors ?arma fit20 = arma(Ysim1, order = c(2, 0), include.intercept = F) # AR(2) without intercept fit20int = arma(Ysim1, order = c(2, 0), include.intercept = T) # AR(2) with intercept fit10 = arma(Ysim1, order = c(1, 0), include.intercept = F) # AR(1) without intercept # !!! The last model produces a warning message!!! # To solve this issue, we may consider fit10 = arma(Ysim1, order = c(1, 0), include.intercept = F, method = "Brent", lower = -1, upper = 1) # analyze the residuals to verify the goodness of the estimate res20 = fit20$residuals plot.ts(res20, main = "AR(2) residuals") res10 = fit10$residuals plot.ts(res10, main = "AR(1) residuals") # compare the acf acf(res20, na.action = na.omit, main = "AR(2)") acf(res10, na.action = na.omit, main = "AR(1)") # compare the pacf pacf(res20, na.action = na.omit, main = "AR(2)") pacf(res10, na.action = na.omit, main = "AR(1)") # test for normality and independence of residuals of the correct model jarque.bera.test(na.exclude(res20)) # H0: normality shapiro.test(res20) # We can also test for normality of residuals using the Shapiro-Wilk test Box.test(res20, lag = 12, type = "Ljung-Box") # H0: no correlation Box.test(res20, lag = 12, type = "Box-Pierce") # Another possibility is to use the Box-Pierce test # test for normality and independence of residuals of the misspecified model jarque.bera.test(na.exclude(res10)) # H0: normality shapiro.test(res10) # Box.test(res10, lag = 12, type = "Ljung-Box") # H0: no correlation Box.test(res10, lag = 12, type = "Box-Pierce") # Another possibility is to use the Box-Pierce test # to visually check the normality of the residuals we can draw the Quantile-Quantile (QQ) plots ?qqnorm qqnorm(res20, main = "Normal Q-Q Plot AR(2)") qqline(res20, col = "red", lty = 2) # qqline adds the “theoretical” normal line to the qqplot qqnorm(res10, main = "Normal Q-Q Plot AR(1)") qqline(res10, col = "red", lty = 2) ################################################## # To estimate the parameters of an ARIMA model, you can also use the arima() function # NOTE: the arima() function allows you to optimize (maximize) the likelihood function ?arima fit20i = arima(Ysim1, order = c(2, 0, 0), include.mean = F) fit10i = arima(Ysim1, order = c(1, 0, 0), include.mean = F) # The tsdiag() function allows you to analyze the model residuals by using a single command # NOTE: the standardized residuals, the ACF and the p-values of the Ljung-Box test are plotted ?tsdiag tsdiag(fit20i) tsdiag(fit10i) ##### Simulation of a non-stationary process n = 200 # we set the number of observations to be generated Ysim1 = c(); Ysim1[1] = 0; Ysim1[2] = 0 shock = rnorm(n, 0, 1) # draw the error component from a Gaussian distribution for (i in 3:n){ Ysim1[i] = 0.04 * i + 0.6 * Ysim1[i-1] - 0.5 * Ysim1[i-2] - 0.1 * shock[i-1] + 0.3 * shock[i-2] + shock[i] } par(mfrow = c(1, 1)) plot.ts(Ysim1) # firstly, we need to make the series stationary acf(Ysim1) DYsim = diff(Ysim1, differences = 1) plot.ts(DYsim) # Subsequently, the acf and pacf are plotted par(mfrow = c(2, 1)) acf(DYsim) pacf(DYsim) # If we can't choose a single model by looking at these plots, # it is possible to fit and compare multiple models to see which one best fits the data fit1 = arima(DYsim, order = c(1, 0, 1), include.mean = T) # ARMA(1, 1) with intercept fit2 = arima(DYsim, order = c(2, 0, 2), include.mean = T) # ARMA(2, 2) with intercept # analyze the residuals to verify the goodness of fit res1 = fit1$residuals res2 = fit2$residuals acf(res1, main = "ARMA(1, 1)") acf(res2, main = "ARMA(2, 2)") # test for normality and independence of residuals jarque.bera.test(na.exclude(res1)) # H0: normality Box.test(res1, lag = 12, type = "Ljung-Box") # H0: no correlation jarque.bera.test(na.exclude(res2)) # H0: normality Box.test(res2, lag = 12, type = "Ljung-Box") # H0: no correlation # If the residuals do not show particular problems graphically and pass the diagnostic tests, # the model can be chosen by observing the significance of the coefficients, # by comparing the information criteria (AIC) and the conditional Sum of Squared Errors (SSE) # the coeftest() command returns a coefficient matrix containing the point estimates, associated standard errors, # test statistics and p-values ?coeftest coeftest(fit1) coeftest(fit2) # We choose the model with minimum SSE and AIC sum(fit1$residuals^2, na.rm = T) sum(fit2$residuals^2, na.rm = T) fit1$aic fit2$aic # Once the model is selected, we can move on to the series forecast ?predict forecast = predict(fit2, n.ahead = 50, se.fit = T) forecast # As the forecast horizon grows, we can see how: # i) the fit converges to the unconditional average of the process mean(DYsim) # ii) the standard deviation of the error converges to the unconditional standard deviation of the process sd(DYsim) ##### Now let's try to estimate a series that we have not simulated # ... do you remember AirPassengers? rm(list = ls()) ?AirPassengers AirPassengers Y = AirPassengers par(mfrow = c(1, 1)) plot(Y, ylab = "Passenger numbers (1000's)", main = "Air Passenger from 1949 to 1961") # We need to transform the data to remove trend and seasonality # At first, we apply the logarithm to eliminate heteroskedasticity and reduce variability Y = log(Y) plot(Y) # we differentiate the series to make it stationary and eliminate seasonality DY = diff(diff(Y, differences = 1), lag = 12) plot(DY) # To verify that the series has actually become stationary we can use the Dickey-Fuller test, # where the null hypothesis is that the process is a random walk # NB! The command adf.test() allows to choose the alternative hypothesis H1: adf.test(DY, alternative = "stationary") adf.test(DY, alternative = "explosive") # Please also note that the function tests for covariance stationarity only (unit root) acf(DY) pacf(DY) # Estimation strategy: # we choose a minimum lag and we increase it until we have WN residues # ARMA(1, 1) fit1 = arma(DY, order = c(1, 1), include.intercept = F) # let's look at the residuals res1 = fit1$residuals plot(res1) acf(res1, na.action = na.pass) # we test for normality and no correlation of the residuals jarque.bera.test(na.exclude(res1)) # H0: normality Box.test (res1, lag = 12, type = "Ljung-Box") # H0: no correlation summary(fit1) # the AR component is not significant => we increase the MA component # MA(1) fit2 = arma(DY, order = c(0, 1)) # let's look at the residuals res2 = fit2$residuals plot(res2) # MA(12) fit12 = arma(DY, order = c(0, 12)) # let's look at the residuals res12 = fit12$residuals plot(res12) # we test for normality and independence of residuals for both the fitted MA(1) and MA(12) models jarque.bera.test(na.exclude(res2)) # H0: normality Box.test(res2, lag = 12, type = "Ljung-Box") # H0: no correlation summary(fit2) jarque.bera.test(na.exclude(res12)) # H0: normality Box.test(res12, lag = 12, type = "Ljung-Box") # H0: no correlation summary(fit12) # Final fit fit3 = arma(DY, lag = list(ar = NULL, ma = c(1, 12))) # let's look at the residuals res3 = fit3$residuals plot(res3) acf(res3, na.action = na.pass) # we test residual normality and independence jarque.bera.test(na.exclude(res3)) # H0: normality Box.test(res3, lag = 12, type = "Ljung-Box") # H0: no correlation adf.test(na.exclude(res3), alternative = "stationary") # check for stationarity summary(fit3) fit4 = arma(DY, lag = list(ar = 1, ma = c(1, 12))) res4 = fit4$residuals plot(res4) acf(res4, na.action = na.pass) # Finally, we test residual normality and independence jarque.bera.test(na.exclude(res4)) # H0: normality Box.test(res4, lag = 12, type = "Ljung-Box") # H0: no correlation adf.test(na.exclude(res4), alternative = "stationary") # check for stationarity summary(fit4) ##### NOW IT'S YOUR TURN!!! # Fit the series "Studio1.txt" and "Studio2.txt"