# Time Series Analysis Laboratory with R
# Prof. Lea Petrella
# Faculty of Economics
# Department of Methods and Models for Economics, Territory and Finance
# Sapienza University of Rome
# a.y. 2019-2020
#####################################
# Useful materials and links
# R: https://www.r-project.org
# RStudio: https://www.rstudio.com/home/
# Manual: The Art of R Programming
# Repositories & Code: GitHub (https://github.com)
#####################################
##################################################
# Lesson 3: R Laboratory (Part 1)
########################################
# Index:
# 1) data import
# 2) descriptive statistics
# 3) graphical representation
# 4) distribution functions
# 5) the linear regression model
##################################################
# How to import data from external files:
# read.table(...)   # and its derivatives:
# read.csv(...)
# read.xls(...),    requires the 'gdata' package
# read_excel(...),  requires the 'readxl' package
# read.fwf(...)
# IMPORTANT: before importing a data set, it is recommended to open the file
# to see how to set the function arguments — for instance, whether there is a
# header with the variable names, or whether there are (and how many!) initial
# lines to skip, etc.
# When in doubt, use the help command:
# ?read.table
# FIRST STEP: save the data to a file.
# Check that decimals are entered with a dot instead of a comma!
# SECOND STEP: remove all objects from memory and close open graphics devices
rm(list = ls())
graphics.off()

# THIRD STEP: set the working directory to the folder where the data are located
getwd()
setwd(dir = getwd())  # NOTE: a no-op as written — replace getwd() with the actual data folder

# ::::: OPEN THE DATASET ::::: #
# .csv format [Comma Separated Values]
Data2 <- read.csv("consumitrimestrali.csv", header = TRUE, sep = ";", dec = ",")
# .txt format
Data3 <- read.table("prices_mensile.txt", header = TRUE, dec = ",")
# .dat format (header argument missing: defaults to FALSE)
Data4 <- read.table("prices_titles_day_days.dat")

### NB: the above commands do not open Excel files!!! An additional package is required.
library(readxl)
Data1 <- read_excel("annual_consumption.xls", sheet = 1)  # .xls / .xlsx format
Data1 <- as.data.frame(Data1)  # transform the tibble into a data.frame

# Otherwise, you can use the 'Import Dataset' command in the 'Environment'
# window without setting a new working directory.
# NOTE: R packages already contain well-known datasets used in the literature.
# To list the available datasets, use data()
data()
#### library(tseries)  # FIX: package name is 'tseries', lowercase — R is case-sensitive

# ::::: how to transform a dataset into a time series ::::: #
Y1 <- ts(Data1[, 2], frequency = 1, start = 1970)         # annual frequency, starting in 1970
Y2 <- ts(Data2[, 2], frequency = 4, start = c(1995, 1))   # quarterly frequency, starting 1995Q1
Y3 <- ts(Data3[, 2], frequency = 12, start = c(1996, 1))  # monthly frequency, starting Jan 1996
Y4 <- ts(Data4[, 1], start = 2001, deltat = 1/261)        # daily (261 obs/year), starting 2001

# ::::: BASIC STATISTICS AND TIME SERIES GRAPHIC ANALYSIS ::::: #
# summary() provides basic sample statistics
summary(Y1); summary(Y2); summary(Y3); summary(Y4)
# It is possible to obtain the individual descriptive statistics ...
# Individual descriptive statistics of the daily series
mean(Y4)
var(Y4)
sd(Y4)
quantile(Y4)
quantile(Y4, probs = c(0.005, 0.01, 0.9, 0.95))
# The 'moments' package provides commands for skewness and kurtosis.
# Alternatively, the 'fBasics' package contains basicStats(), which computes an
# overview of basic statistics (minimum, maximum, 1st quartile, 3rd quartile,
# mean, median, sum, SE mean, LCL mean, UCL mean, variance, st. dev.,
# skewness, kurtosis).

# Plot the daily price range of the Standard and Poor's 500 index
plot.ts(Y4, type = "l", col = "dark blue", ylab = "Price", main = "S&P 500")
# Additional arguments can be:
# 'lty': line type; 'lwd': line width
# Another popular package for graphics is 'ggplot2'

# Two graphs can be compared by using par(mfrow = c(r, c)),
# where r and c indicate the number of rows and columns of the layout
par(mfrow = c(2, 1))  # returns two graphs in a column
plot(Y2 / 1000, type = "l", ylab = "Quarterly consumption")
plot(Y3, type = "l", ylab = "Monthly prices")

# Let's go back to plotting a single series, but now use the histogram
par(mfrow = c(1, 1))
# histogram of the observed points using relative frequencies
hist(Y1 / 1000, nclass = 30, freq = FALSE, main = "annual consumption")
hist(diff(log(Y4)), nclass = 100, freq = FALSE, main = "Return")
# lines() adds the (estimated) density of what we are plotting
lines(density(diff(log(Y4))), col = "red")

#### PROBABILITY DISTRIBUTIONS AND RANDOM NUMBERS ::::: #
# Naming convention for probability distributions:
# 1) Probability Density Function            - prefix "d"
# 2) Cumulative Distribution Function        - prefix "p"
# 3) Quantile Function                       - prefix "q"
# 4) Random number generator                 - prefix "r"
# -------------------------------------------
# Binomial distribution
# -------------------------------------------
# Example: flip a coin 10 times.
# Let X = {number of heads}
# Then X ~ Binomial(10, 1/2).
# What is the probability of observing 5 heads?
# Pr(X = 5)
dbinom(5, size = 10, prob = 1/2)

# What about the probability of observing less than or equal to 6 heads?
# Pr(X <= 6)
sum(dbinom(0:6, size = 10, prob = 1/2))
# or
pbinom(6, size = 10, p = 1/2)

# And 7 or more heads?
# Pr(X >= 7)
sum(dbinom(7:10, size = 10, prob = 1/2))
# or
1 - pbinom(6, size = 10, p = 1/2)

# -------------------------------------------
# Gaussian distribution: mu and sigma parameters
# -------------------------------------------
# PDF
dnorm(0)  # default = standard Normal
dnorm(0, mean = 0, sd = 1)
dnorm(0, mean = 1, sd = 1)
# CDF
pnorm(0)
pnorm(0, 2, 2)
# Quantiles
qnorm(0.975)      # FIX: was qnorm(0975) — missing decimal point gives NaN
qnorm(0.975, 2, 2)
# Generate random numbers from the Gaussian distribution
rnorm(10)         # FIX: was RNorm(10) — R is case-sensitive
rnorm(10, 7, 5)

# Example: generate 1000 observations of a White Noise process
# with a standard deviation of 3
WN <- rnorm(1000, mean = 0, sd = 3)
par(mfrow = c(2, 1))
plot.ts(WN)
hist(WN, nclass = 100, freq = FALSE, main = "White Noise")
abline(v = mean(WN), col = "blue", lwd = 2)
# FIX: was curves(dnorm(x, 0.3), ...) — the function is curve(), and the
# overlaid density should be N(0, 3) to match the simulated White Noise
curve(dnorm(x, 0, 3), add = TRUE, lwd = 2, lty = 2, col = "red")
abline(v = 0, col = "red", lwd = 2)

##### THE LINEAR REGRESSION MODEL #####
# We use a well-known dataset shipped with R: "AirPassengers"
Y <- AirPassengers
# Firstly, we carry out an exploratory analysis of the data
Y
sum(is.na(Y))  # number of missing values
frequency(Y)   # frequency of observations (12 = monthly)
summary(Y)     # summary of the time series
par(mfrow = c(1, 1))
plot(Y, xlab = "Date", ylab = "Passenger numbers (1000's)",
     main = "Air Passenger numbers from 1949 to 1961")
# What do we observe?
# We decompose the series into the seasonal and trend components
additive.dec <- decompose(Y, type = c("additive"))     # Y(t) = T(t) + S(t) + e(t)
plot(additive.dec)   # results of the additive decomposition
multi.dec <- decompose(Y, type = c("multiplicative"))  # Y(t) = T(t) * S(t) * e(t)
plot(multi.dec)      # results of the multiplicative decomposition

# Plot the correlogram of Y, i.e. the sample autocorrelation function
par(mfrow = c(2, 1))
acf(Y)   # indicates the amplitude and duration of the process memory
pacf(Y)  # autocorrelation at two instants, net of the variables in between

# What about the ACF and PACF of a White Noise?
acf(WN)
pacf(WN)  # FIX: was PACF(WN) — R is case-sensitive
# Needed to evaluate whether the observed series presents autocorrelation or not

# We regress the series on seasonal dummies and a linear trend
t <- 1:length(Y)                      # linear trend
s <- kronecker(rep(1, 12), diag(12))  # seasonal dummies (one per month)
# Fit the linear models
reg1 <- lm(Y ~ t)         # regression with trend only
summary(reg1)
reg <- lm(Y ~ s + t - 1)  # trend and seasonality (no intercept: full dummy set)
summary(reg)
par(mfrow = c(2, 1))
res1 <- reg1$residuals  # residuals of the trend-only regression
plot.ts(res1)
res <- reg$residuals    # residuals of the trend + seasonality regression
plot.ts(res)

# Are the residuals distributed as a Gaussian r.v.? Are they homoskedastic?
standard.res <- (res - mean(res)) / (var(res)^0.5)  # standardized residuals
# alternatively, you can use the scale() command
# Test the hypothesis of normality for the residuals.
# Graphically, we expect the residuals to behave like a Gaussian r.v.
par(mfrow = c(1, 1))
plot(standard.res, main = "Standardized residuals plot")      # plot
hist(standard.res, nclass = 20, freq = FALSE)                 # histogram
curve(dnorm(x, mean = 0, sd = 1), col = "red", add = TRUE)
qqnorm(standard.res)  # QQ-plot
abline(0, 1, col = "red")

# Test normality with the Jarque-Bera test statistic.
# H0: the residuals are distributed as a Gaussian r.v.
library(tseries)  # FIX: jarque.bera.test() lives in 'tseries', never loaded before
jarque.bera.test(standard.res)
# Test normality with an alternative test (same H0)
shapiro.test(standard.res)

# Test the homoskedasticity hypothesis.
# Graphically, we expect the variance of the residuals not to vary over time
plot(standard.res, main = "Standardized residuals plot")
# We use the Breusch-Pagan test
# H0: residuals are homoskedastic
library(lmtest)
bptest(reg)

# Test residual autocorrelation.
# Visually: plot the residuals (ordinate) against the lagged residuals
# (abscissa); no obvious pattern should appear
n <- length(standard.res)
plot(standard.res[-n], standard.res[-1], pch = 16)
# Do they resemble a White Noise?
WN <- rnorm(n)  # FIX: was RNorm(n) — R is case-sensitive
lines(WN[-n], WN[-1], type = "p", col = "red", lwd = 2)

# An alternative way is to look at the residuals' ACF
par(mfrow = c(2, 1))
acf(standard.res)  # suggests an AR(2) process with complex roots
acf(WN)

# Ljung-Box and Box-Pierce statistics (H0: no autocorrelation up to lag k)
k <- 12
Box.test(standard.res, lag = k, type = "Ljung-Box")   # Ljung-Box test
Box.test(standard.res, lag = k, type = "Box-Pierce")  # Box-Pierce test
# Alternatively, it is possible to run the Durbin-Watson test
dwtest(reg)  # NB: only valid for order-1 autocorrelation (lag 1)