# Time Series Analysis Laboratory with R
# Prof. Lea Petrella
# Faculty of Economics
# Department of Methods and Models for Economics, Territory and Finance
# Sapienza University of Rome
# a.y. 2019-2020
#####################################
# Useful materials and links
# R: https://www.r-project.org
# RStudio: https://www.rstudio.com/home/
# Manual: The Art of R Programming
# Repositories & Code: GitHub (https://github.com)
#####################################
##################################################
# Lesson 3: R Laboratory (Part 1)
########################################
# Index:
# 1) data import
# 2) descriptive statistics
# 3) graphical representation
# 4) distribution functions
# 5) the linear regression model
##################################################
# How to import data from external files:
# read.table(...)   # and its derivatives:
# read.csv(...)
# read.xls(...),    requires the 'gdata' package
# read_excel(...),  requires the 'readxl' package
# read.fwf(...)
# IMPORTANT: before importing a data set, it is recommended to open the file
# to see how to set the function arguments — for instance, whether there is a
# header with the variable names, or whether there are (and how many!) initial
# lines to skip, etc.
# When in doubt, use the help command:
# ?read.table
# FIRST STEP: save the data to a file.
# Check that decimals are entered with a dot instead of a comma!
# SECOND STEP: remove all objects from memory and close open graphics devices
rm(list = ls())
graphics.off()

# THIRD STEP: set the working directory to the folder where the data are located
getwd()
setwd(dir = getwd())  # NOTE: a no-op as written — replace getwd() with the actual data folder

# ::::: OPEN THE DATASET ::::: #
# .csv format [Comma Separated Values]
Data2 <- read.csv("consumitrimestrali.csv", header = TRUE, sep = ";", dec = ",")
# .txt format
Data3 <- read.table("prices_mensile.txt", header = TRUE, dec = ",")
# .dat format (header argument missing: defaults to FALSE)
Data4 <- read.table("prices_titles_day_days.dat")

### NB: the above commands do not open Excel files!!! An additional package is required.
library(readxl)
Data1 <- read_excel("annual_consumption.xls", sheet = 1)  # .xls / .xlsx format
Data1 <- as.data.frame(Data1)  # transform the tibble into a data.frame

# Otherwise, you can use the 'Import Dataset' command in the 'Environment'
# window without setting a new working directory.
# NOTE: R packages already contain well-known datasets used in the literature.
# To list the available datasets, use data()
data()
#### library(tseries)  # FIX: package name is 'tseries', lowercase — R is case-sensitive

# ::::: how to transform a dataset into a time series ::::: #
Y1 <- ts(Data1[, 2], frequency = 1, start = 1970)         # annual frequency, starting in 1970
Y2 <- ts(Data2[, 2], frequency = 4, start = c(1995, 1))   # quarterly frequency, starting 1995Q1
Y3 <- ts(Data3[, 2], frequency = 12, start = c(1996, 1))  # monthly frequency, starting Jan 1996
Y4 <- ts(Data4[, 1], start = 2001, deltat = 1/261)        # daily (261 obs/year), starting 2001

# ::::: BASIC STATISTICS AND TIME SERIES GRAPHIC ANALYSIS ::::: #
# summary() provides basic sample statistics
summary(Y1); summary(Y2); summary(Y3); summary(Y4)
# It is possible to obtain the individual descriptive statistics ...
# Individual descriptive statistics of the daily series
mean(Y4)
var(Y4)
sd(Y4)
quantile(Y4)
quantile(Y4, probs = c(0.005, 0.01, 0.9, 0.95))
# The 'moments' package provides commands for skewness and kurtosis.
# Alternatively, the 'fBasics' package contains basicStats(), which computes an
# overview of basic statistics (minimum, maximum, 1st quartile, 3rd quartile,
# mean, median, sum, SE mean, LCL mean, UCL mean, variance, st. dev.,
# skewness, kurtosis).

# Plot the daily price range of the Standard and Poor's 500 index
plot.ts(Y4, type = "l", col = "dark blue", ylab = "Price", main = "S&P 500")
# Additional arguments can be:
# 'lty': line type; 'lwd': line width
# Another popular package for graphics is 'ggplot2'

# Two graphs can be compared by using par(mfrow = c(r, c)),
# where r and c indicate the number of rows and columns of the layout
par(mfrow = c(2, 1))  # returns two graphs in a column
plot(Y2 / 1000, type = "l", ylab = "Quarterly consumption")
plot(Y3, type = "l", ylab = "Monthly prices")

# Let's go back to plotting a single series, but now use the histogram
par(mfrow = c(1, 1))
# histogram of the observed points using relative frequencies
hist(Y1 / 1000, nclass = 30, freq = FALSE, main = "annual consumption")
hist(diff(log(Y4)), nclass = 100, freq = FALSE, main = "Return")
# lines() adds the (estimated) density of what we are plotting
lines(density(diff(log(Y4))), col = "red")

#### PROBABILITY DISTRIBUTIONS AND RANDOM NUMBERS ::::: #
# Naming convention for probability distributions:
# 1) Probability Density Function            - prefix "d"
# 2) Cumulative Distribution Function        - prefix "p"
# 3) Quantile Function                       - prefix "q"
# 4) Random number generator                 - prefix "r"
# -------------------------------------------
# Binomial distribution
# -------------------------------------------
# Example: flip a coin 10 times.
# Let X = {number of heads}
# Then X ~ Binomial(10, 1/2).
# What is the probability of observing 5 heads?
# Pr(X = 5)
dbinom(5, size = 10, prob = 1/2)

# What about the probability of observing less than or equal to 6 heads?
# Pr(X <= 6)
sum(dbinom(0:6, size = 10, prob = 1/2))
# or
pbinom(6, size = 10, p = 1/2)

# And 7 or more heads?
# Pr(X >= 7)
sum(dbinom(7:10, size = 10, prob = 1/2))
# or
1 - pbinom(6, size = 10, p = 1/2)

# -------------------------------------------
# Gaussian distribution: mu and sigma parameters
# -------------------------------------------
# PDF
dnorm(0)  # default = standard Normal
dnorm(0, mean = 0, sd = 1)
dnorm(0, mean = 1, sd = 1)
# CDF
pnorm(0)
pnorm(0, 2, 2)
# Quantiles
qnorm(0.975)      # FIX: was qnorm(0975) — missing decimal point gives NaN
qnorm(0.975, 2, 2)
# Generate random numbers from the Gaussian distribution
rnorm(10)         # FIX: was RNorm(10) — R is case-sensitive
rnorm(10, 7, 5)

# Example: generate 1000 observations of a White Noise process
# with a standard deviation of 3
WN <- rnorm(1000, mean = 0, sd = 3)
par(mfrow = c(2, 1))
plot.ts(WN)
hist(WN, nclass = 100, freq = FALSE, main = "White Noise")
abline(v = mean(WN), col = "blue", lwd = 2)
# FIX: was curves(dnorm(x, 0.3), ...) — the function is curve(), and the
# overlaid density should be N(0, 3) to match the simulated White Noise
curve(dnorm(x, 0, 3), add = TRUE, lwd = 2, lty = 2, col = "red")
abline(v = 0, col = "red", lwd = 2)

##### THE LINEAR REGRESSION MODEL #####
# We use a well-known dataset shipped with R: "AirPassengers"
Y <- AirPassengers
# Firstly, we carry out an exploratory analysis of the data
Y
sum(is.na(Y))  # number of missing values
frequency(Y)   # frequency of observations (12 = monthly)
summary(Y)     # summary of the time series
par(mfrow = c(1, 1))
plot(Y, xlab = "Date", ylab = "Passenger numbers (1000's)",
     main = "Air Passenger numbers from 1949 to 1961")
# What do we observe?
# We decompose the series into the seasonal and trend components
additive.dec <- decompose(Y, type = c("additive"))     # Y(t) = T(t) + S(t) + e(t)
plot(additive.dec)   # results of the additive decomposition
multi.dec <- decompose(Y, type = c("multiplicative"))  # Y(t) = T(t) * S(t) * e(t)
plot(multi.dec)      # results of the multiplicative decomposition

# Plot the correlogram of Y, i.e. the sample autocorrelation function
par(mfrow = c(2, 1))
acf(Y)   # indicates the amplitude and duration of the process memory
pacf(Y)  # autocorrelation at two instants, net of the variables in between

# What about the ACF and PACF of a White Noise?
acf(WN)
pacf(WN)  # FIX: was PACF(WN) — R is case-sensitive
# Needed to evaluate whether the observed series presents autocorrelation or not

# We regress the series on seasonal dummies and a linear trend
t <- 1:length(Y)                      # linear trend
s <- kronecker(rep(1, 12), diag(12))  # seasonal dummies (one per month)
# Fit the linear models
reg1 <- lm(Y ~ t)         # regression with trend only
summary(reg1)
reg <- lm(Y ~ s + t - 1)  # trend and seasonality (no intercept: full dummy set)
summary(reg)
par(mfrow = c(2, 1))
res1 <- reg1$residuals  # residuals of the trend-only regression
plot.ts(res1)
res <- reg$residuals    # residuals of the trend + seasonality regression
plot.ts(res)

# Are the residuals distributed as a Gaussian r.v.? Are they homoskedastic?
standard.res <- (res - mean(res)) / (var(res)^0.5)  # standardized residuals
# alternatively, you can use the scale() command
# Test the hypothesis of normality for the residuals.
# Graphically, we expect the residuals to behave like a Gaussian r.v.
par(mfrow = c(1, 1))
plot(standard.res, main = "Standardized residuals plot")      # plot
hist(standard.res, nclass = 20, freq = FALSE)                 # histogram
curve(dnorm(x, mean = 0, sd = 1), col = "red", add = TRUE)
qqnorm(standard.res)  # QQ-plot
abline(0, 1, col = "red")

# Test normality with the Jarque-Bera test statistic.
# H0: the residuals are distributed as a Gaussian r.v.
library(tseries)  # FIX: jarque.bera.test() lives in 'tseries', never loaded before
jarque.bera.test(standard.res)
# Test normality with an alternative test (same H0)
shapiro.test(standard.res)

# Test the homoskedasticity hypothesis.
# Graphically, we expect the variance of the residuals not to vary over time
plot(standard.res, main = "Standardized residuals plot")
# We use the Breusch-Pagan test
# H0: residuals are homoskedastic
library(lmtest)
bptest(reg)

# Test residual autocorrelation.
# Visually: plot the residuals (ordinate) against the lagged residuals
# (abscissa); no obvious pattern should appear
n <- length(standard.res)
plot(standard.res[-n], standard.res[-1], pch = 16)
# Do they resemble a White Noise?
WN <- rnorm(n)  # FIX: was RNorm(n) — R is case-sensitive
lines(WN[-n], WN[-1], type = "p", col = "red", lwd = 2)

# An alternative way is to look at the residuals' ACF
par(mfrow = c(2, 1))
acf(standard.res)  # suggests an AR(2) process with complex roots
acf(WN)

# Ljung-Box and Box-Pierce statistics (H0: no autocorrelation up to lag k)
k <- 12
Box.test(standard.res, lag = k, type = "Ljung-Box")   # Ljung-Box test
Box.test(standard.res, lag = k, type = "Box-Pierce")  # Box-Pierce test
# Alternatively, it is possible to run the Durbin-Watson test
dwtest(reg)  # NB: only valid for order-1 autocorrelation (lag 1)