library(zoo) library(tseries) library(xts) library(timeDate) library(timeSeries) library(PerformanceAnalytics) library(car) library(wavethresh) library(gridExtra) printTable = function( tab ) { grid.newpage() par(mfrow=c(1,1)) grid.table(round(tab, 6), gpar.rowtext=gpar(fontsize = 16), show.box=T, show.hlines=T, show.vlines=T, separator = "black") par(mfrow=c(1,1)) } # # Find the lag for the maximum cross correlation between two series # findMaxCCF = function(a,b) { d <- ccf(a, b, plot = FALSE) cor = d$acf[,,1] lag = d$lag[,,1] res = data.frame(cor,lag) res_max = res[which.max(res$cor),] return(res_max) } getPrices = function( tickerSym, startDate, endDate ) { ts = get.hist.quote( instrument = tickerSym, start = startDate, end = endDate, quote="AdjClose", provider="yahoo", compression="d", quiet=T) return(ts) } # # For a given stock: # Return a time series of the CC returns # # Arguments: # tickerSym - the ticker symbol recognized by Yahoo for the stock # startDate - a character string with the start data (in ISO format) # endDate - a character string with the end data (in ISO format) # getReturns = function( tickerSym, startDate, endDate ) { ts = getPrices( tickerSym, startDate, endDate) ret = diff(log(ts)) colnames(ret) = tickerSym return(ret) } # Calculate the simple weekly return. Here the stock would be bought on day 1 and sold # seven days later. # simpleWeekReturn = function( tickerSym, dateRange) { startDate = dateRange[1] endDate = dateRange[length(dateRange)] ts = getPrices(tickerSym, startDate, endDate) buyDates = dateRange[1:(length(dateRange)-1)] sellDates = dateRange[2:length(dateRange)] p = as.numeric(ts[buyDates]) p_plus_1 = as.numeric(ts[sellDates]) ret = (p_plus_1 - p)/p ret.z = as.zoo(ret) index(ret.z) = sellDates return(ret.z) } waveletWeekReturn = function( tickerSym, dateRange, smoothPrice) { startDate = dateRange[1] endDate = dateRange[length(dateRange)] ts = getPrices(tickerSym, startDate, endDate) buyDates = dateRange[1:(length(dateRange)-1)] sellDates = dateRange[2:length(dateRange)] p = as.numeric(smoothPrice[buyDates]) p_plus_1 = as.numeric(ts[sellDates]) ret = (p_plus_1 - p)/p ret.z = as.zoo(ret) index(ret.z) = sellDates return(ret.z) } # # This file is for SPX options, which is quoted as ^GSPC rootPath = "/home/iank/Documents/options_data" file = "SPX2011.csv" filePath = paste(rootPath, file, sep="/") # options.df = read.table(filePath,header=TRUE,sep=",", row.names=NULL, # colClasses = columns) optionsDataObj = "optionsData.R" if (file.exists( optionsDataObj) == T) { load(optionsDataObj) } else { options.df = read.table(filePath, header=TRUE,sep=",", row.names=NULL) save(options.df, file=optionsDataObj) } # ---------------------------------------------------------------------------------------------------- # This version of the R-script works for 2011 data. The data for 2005 has slightly different columns so # the code must be modified slightly. # ---------------------------------------------------------------------------------------------------- # # colnames(options.df) = colNames # options.df = options.df[1:18] # get rid of a sequential value column (I don't know how it got there) # remvoe the Underlying, exchange, optionext, optionalias temp = options.df[,c(-1,-3,-4, -5, -20)] options.df = temp rm(temp) columns = c("UnderlyingLast","Type","Expiration","DataDate","Strike","Last", "Bid","Ask","Volume","OpenInterest","IV","Delta","Gamma","Theta","Vega") colnames(options.df) = columns # Convert the data in the form month/day/year to Date objects and put them back in the data frame. strikeDate = as.Date(strptime(options.df[,"Expiration"], format="%m/%d/%Y", tz="EST")) optionDate = as.Date(strptime(options.df[,"DataDate"], format="%m/%d/%Y", tz="EST")) options.df[,"Expiration"] = strikeDate options.df[,"DataDate"] = optionDate # # To calculate the realized volatility for week t, we need twenty trading days back in time. # So the return start date is a month before the option start. minDate = min(optionDate) maxDate = max(optionDate) optionStartDate = minDate - 30 # days optionEndDate = maxDate sp500Sym = "^GSPC" sp500Ret = getReturns(sp500Sym, optionStartDate, optionEndDate) # # Get a vector of trading days. These are week days, with holidays removed. The trading days # are relative to the New York Stock Exchange (NYSE). # # startDate and endDate are Date objects. The function returns a date ordered vector of Date # objects which are the trading days. # # This function requires the timeDate package # tradingCalendar = function( startDate, endDate) { require(timeDate) timeSeq = timeSequence(from=as.character(startDate), to=as.character(endDate), by="day", format = "%Y-%m-%d", zone = "NewYork", FinCenter = "America/New_York") ix = as.logical(isWeekday(timeSeq, wday=1:5)) tradingDays = timeSeq[ix] startYear = as.POSIXlt(startDate)$year + 1900 endYear = as.POSIXlt(endDate)$year + 1900 tradingDays.dt = as.Date(tradingDays) hol = as.Date(holidayNYSE(startYear:endYear)) ix = which(tradingDays.dt %in% hol) tradingDays.dt = tradingDays.dt[-ix] return(tradingDays.dt) } # tradingCalendar # The notes and code here references: # # Exploiting Option Information in the Equity Market # GUIDO BALTUSSEN, BART VAN DER GRIENT, WILMA DE GROOT, # ERIK HENNINK AND WEILI ZHOU, December 2011 # # We apply the following screens on all options to ensure that we select liquid and heavily traded options that we # believe contain the most reliable information. We filter out options with zero volume or # open interest. volIx = which( options.df[,"Volume"] > 0) options.df = options.df[volIx,] intIx = which( options.df[,"OpenInterest"] > 0) options.df = options.df[intIx,] startTime = options.df[,"DataDate"] endTime = options.df[,"Expiration"] # as it turns out, there are some very long term options (leaps). So weed these out. rangeIx = which(endTime < (maxDate + 60)) startTime = startTime[rangeIx] endTime = endTime[rangeIx] # Build a trading day hash table: index by a date and get an trading day back tradingDays = tradingCalendar(minDate, maxDate + 60) #------------------------------------------------------------------------------------------ # Buld a trading calendar so that the number of trading days between the quote date and the # expiration date can be calculated. #------------------------------------------------------------------------------------------ # As it turns out, some options are marked for expiration on Saturday. No new options an be purchased # after market close on Friday, but the option can still be effected by after hours trading. Unfortunately # the trading calendar doesn't include saturday. So plug in the saturdays if they exist in the endTime # vector (the option expirations) optionDays = tradingDays uniqueEndTime = unique(endTime) # Unique option expiration days. saturdayIx = which(! uniqueEndTime %in% tradingDays) if (length(saturdayIx > 0)) { saturdays = uniqueEndTime[saturdayIx] optionDays = sort(c(tradingDays, saturdays)) } dayIx.z = as.zoo(seq(from=1, to=length(optionDays), by=1)) index(dayIx.z) = optionDays # For every start date, build a vector that contains the trading day (number) # for that start date. uniqueStart = unique(startTime) # option quote dates (e.g., the moving start of the option period) startIx = dayIx.z[uniqueStart] # the trading day number for the start days rleLen = rle(as.numeric(startTime))$lengths ends = cumsum(rleLen) starts = c(0, ends[1:(length(ends)-1)]) + 1 startDay = rep(0, length(startTime)) for (i in 1:length(starts)) { startDay[starts[i]:ends[i]] = startIx[i] } # For every option expiration date, build a vector that has the trading day # for that option expiration date. Note that option expirations are not # sequential, but they do exist in blocks. endIx = dayIx.z[uniqueEndTime] # The trading day number for the expiration day rleLen = rle(as.numeric(endTime))$length # find the "run lengths" in the option expirations ends = cumsum(rleLen) starts = c(0, ends[1:(length(ends)-1)]) + 1 endDay = rep(0, length(startTime)) for (i in 1:length(starts)) { tryCatch( { secDate = endTime[starts[i]] # get the section date ix = endIx[secDate] endDay[starts[i]:ends[i]] = ix }, error = function(e) { print(sprintf("i = %d", i)) break; } ) } # From the article: # As most activity in options is concentrated in the short end, we select options # with a remaining maturity of approximately one month by requiring a maturity between 10 # and 40 trading days. deltaDay = (endDay - startDay) + 1 daysInRange = (deltaDay >= 10) & (deltaDay <= 40) temp = options.df[daysInRange,] options.df = temp rm(temp) # Mark the options that are at-the-money (atm) # # From the article: # We follow Xing et al. (2010) when separating options into at-themoney (ATM) # and out-of-the-money (OTM). A put or call option is defined as ATM when the # ratio of the strike price to the stock price is between 0.95 and 1.05 ratio = options.df[,"Strike"] / options.df[,"UnderlyingLast"] atm = sapply(ratio, FUN=function(v) {b = (v >= 0.95) && (v <= 1.05)}) options.df[,"atm"] = atm options.df[,"otm"] = (!atm) # # This function is used to generate the weekly values for calculating the skew. # # "We use the weekly average of the IV variables to reduce the effect of noise, # while requiring at least two non-missing values during the past five days to # compute the measure." # weekAggregateSkew = function(option.type.df, isCall) { weekBreaks = endpoints(option.type.df[,"DataDate"], on="weeks") starts = weekBreaks[1:(length(weekBreaks)-1)]+1 ends = weekBreaks[2:length(weekBreaks)] colNames = c("UnderlyingLast", "Expiration", "DataDate", "Strike", "Bid", "Ask", "IV", "atm", "otm") week.df = data.frame() for (i in 1:length(starts)) { week = option.type.df[starts[i]:ends[i], colNames] if (isCall) { ix = which(week[,"atm"]) } else { ix = which(week[,"otm"]) } avgIV = mean(week[ix,"IV"]) avgStrike = mean(week[ix, "Strike"]) avgBid = mean(week[ix,"Bid"]) avgAsk = mean(week[ix, "Ask"]) avgLast = mean(week[ix, "UnderlyingLast"]) date = week[nrow(week),"DataDate"] expir = week[1,"Expiration"] row = as.data.frame(list(last = avgLast, expir = expir, date = date, strike = avgStrike, ask = avgBid, bid = avgAsk, IV = avgIV)) week.df = rbind(week.df, row) } return(week.df) } # weekAggregateSkew # # Weekly average aggregate of the put and call IV for ATM puts and calls. The argument to the # function is a data frame of both call and put options. # # "ATM[i,t] IV , is the average of the implied volatility of the ATM call and ATM # put option on stock i in week t." # weekAggregateATMIV = function( option.df ) { weekBreaks = endpoints(option.df[,"DataDate"], on="weeks") starts = weekBreaks[1:(length(weekBreaks)-1)]+1 ends = weekBreaks[2:length(weekBreaks)] colNames = c("UnderlyingLast", "Expiration", "DataDate", "Strike", "IV", "atm" ) week.df = data.frame() for (i in 1:length(starts)) { week = option.df[starts[i]:ends[i], colNames] ix = which(week[,"atm"]) avgIV = mean(week[ix,"IV"]) avgStrike = mean(week[ix, "Strike"]) avgLast = mean(week[ix, "UnderlyingLast"]) date = week[nrow(week),"DataDate"] expir = week[1,"Expiration"] row = as.data.frame(list(last = avgLast, expir = expir, date = date, strike = avgStrike, IV = avgIV)) week.df = rbind(week.df, row) } return(week.df) } # weekAggregateSkew weekVol = function(sp500Ret, weekEndDate, startDate, endDate ) { retData = as.vector(sp500Ret) charvec = as.character(index(sp500Ret)) tradingDays = tradingCalendar(startDate, endDate) retIx = index(sp500Ret) realizedVol = rep(0, length(weekEndDate)) for (i in 1:length(weekEndDate)) { endIx = which(tradingDays == weekEndDate[i]) # print(paste(endIx, ":", tradingDays.dt[ix])) startIx = endIx - 20 # 20 days endDate = tradingDays[endIx] startDate = tradingDays[startIx] retStart = which(retIx == startDate) retEnd = which(retIx == endDate) dayBlock = sp500Ret[retStart:retEnd] dayVar = var(dayBlock) # implied volatility is yearly volitility, so convert from daily vol to # yearly vol yearVol = sqrt( 252 * dayVar) realizedVol[i] = yearVol } return(realizedVol) } # weekVol # # Generate a linear regresison plot with the regression line, the regression values, # the confidence interval and the prediction bands. scatterPlot = function(y, x, colNames, main, xLabel, yLabel ) { df = as.data.frame(cbind(y, x)) colnames(df) = colNames f.str = paste(colNames[1], "~", colNames[2]) f = as.formula(f.str) l = lm(f, data=df) p = predict(l, interval="confidence", df=l$df.residual) topConf = p[,"upr"] lowConf = p[,"lwr"] p = predict(l, interval="prediction", df=l$df.residual) topPred = p[,"upr"] lowPred = p[,"lwr"] par(mfrow=c(1,1)) plot(x = x, y = y, xlab = xLabel, ylab = yLabel, type = "p", col="blue", pch=19, main= main) abline(l, lwd=2, col="red") lines(x = x, y = topConf, type="l", col="green") lines(x = x, y = lowConf, type="l", col="green") lines(x = x, y = topPred, type="l", col="magenta") lines(x = x, y = lowPred, type="l", col="magenta") legend("bottomright", legend=c("regression line", "95% Confidence Interval", "95% Prediction Band"), col=c("red", "green", "magenta"), lwd=4, cex=0.90) par(mfrow=c(1,1)) } # # Call options # callIx = which(options.df[,"Type"] == "call") call.options.df = options.df[callIx,-2] call.options.week = weekAggregateSkew(call.options.df, isCall=TRUE) putIx = which(options.df[,"Type"] == "put") put.options.df = options.df[putIx,-2] # Mark the put options that are out-of-the-money (otm) # # a put option as OTM when the ratio is lower than 0.95, but higher than 0.80. # When multiple options fall into the same group, we select options with # moneyness closest to 1.00 (ATM) or 0.95 (OTM) ratio = put.options.df[,"Strike"] / put.options.df[,"UnderlyingLast"] otm = sapply(ratio, FUN=function(v) {b = (v >= 0.80) && (v <= 0.95)}) put.options.df[,"otm"] = otm put.options.week = weekAggregateSkew(put.options.df, isCall = FALSE) skew.df = data.frame(put.options.week[,"date"], put.options.week[,"IV"] - call.options.week[,"IV"] ) colnames(skew.df) = c("date", "skew") skew.z = as.zoo(skew.df[,"skew"]) index(skew.z) = skew.df[,"date"] # The second measure is the realized (historical) versus implied volatility # spread, which is thought to capture the volatility risk of a stock. We measure # the realized versus implied volatility spread by the difference between the # realized volatility of the past 20 daily stock returns and the implied # volatility. Where RV[i,t] is the realized volatility of stock i measured in # week t and ATM[i,t] IV , is the average of the implied volatility of the ATM # call and ATM put option on stock i in week t. atmIV = weekAggregateATMIV(options.df) # Now we need to calculate the realized volatility weekEndDate = atmIV[,"date"] # realized annual volatility realVol = weekVol(sp500Ret, weekEndDate, optionStartDate, optionEndDate ) rviv = as.zoo(realVol - atmIV[,"IV"]) dateIndex = atmIV[,"date"] # option quote date - no Saturdays index(rviv) = dateIndex weekATMIVCall = weekAggregateATMIV( call.options.df ) weekATMIVPut = weekAggregateATMIV( put.options.df ) skewATM = as.zoo(weekATMIVPut[,"IV"] - weekATMIVCall[,"IV"]) index(skewATM) = dateIndex retEnd1 = dateIndex[length(dateIndex)] retEndIx = which(tradingDays == retEnd1) retEnd2 = tradingDays[ retEndIx + 5] returnRange = c(dateIndex, retEnd2) weekRet = simpleWeekReturn(sp500Sym, returnRange) regData.df = as.data.frame(cbind(as.numeric(weekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM))) colnames(regData.df) = c("weekRet", "skew", "rviv", "skewATM") # Regression of the skewed values and the realized volatility difference against the weekly returns l = lm(weekRet ~ skew + rviv + skewATM, data=regData.df) s = summary(l) par(mfrow=c(1,1)) printTable(s$coefficients) par(mfrow=c(1,1)) print(sprintf("R^2 = %0.6f", s$r.squared)) resid = residuals(l) resid.std = (resid - mean(resid)) / sd(resid) fitted.vals = fitted.values(l) # Look at the residual distribution par(mfrow=c(1,2)) plot(x=fitted.vals, y = resid.std, xlab="Fitted Values", ylab="Standardized Residuals", pch=16, col="blue") qqPlot(resid, main="Regression Residuals", ylab="Emperical Quantiles", col.lines=c("black", "blue", "green")) par(mfrow=c(1,1)) resid = residuals(l) resid.std = (resid - mean(resid)) / sd(resid) fitted.vals = fitted.values(l) # Plot the regression of the realized volatility difference against the weekly returns scatterPlot(as.numeric(weekRet), as.numeric(rviv), colNames = c("weekRet", "rviv"), main="Weekly return vs. Realized Volatility Difference", xLabel = "Realized Vol. Diff", yLabel = "Weekly Return" ) # Plot the volatility skew against the weekly returns scatterPlot(as.numeric(weekRet), as.numeric(skew.z), colNames = c("weekRet", "skew"), main="Weekly return vs. Volatility Skew", xLabel = "Volatility Skew", yLabel = "Weekly Return" ) # Plot the volatility skew against the weekly returns scatterPlot(as.numeric(weekRet), as.numeric(skewATM), colNames = c("weekRet", "skewATM"), main="Weekly return vs. ATM Volatility Skew", xLabel = "ATM Volatility Skew", yLabel = "Weekly Return" ) # Plot the distribution of the weekly returns # weekRet.std = (weekRet - mean(weekRet))/sd(weekRet) par(mfrow=c(1,2)) chart.Histogram(R = weekRet.std, methods="add.normal", xlab="Standardized Weekly Returns", breaks=40 ) qqPlot(as.numeric(weekRet.std), main="Standardized Weekly Ret.", ylab="Emperical Quantiles") par(mfrow=c(1,1)) # # Lets also look at the linear regression relative to the current week returns curRange = c(minDate, dateIndex) curWeekRet = simpleWeekReturn(sp500Sym, curRange) curData.df = as.data.frame(cbind(as.numeric(curWeekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM))) colnames(curData.df) = c("weekRet", "skew", "rviv", "skewATM") # Regression of the skewed values and the realized volatility difference against the weekly returns curl = lm(weekRet ~ skew + rviv + skewATM, data=curData.df) s = summary(curl) par(mfrow=c(1,1)) printTable(s$coefficients) par(mfrow=c(1,1)) print(sprintf("R^2 = %0.6f", s$r.squared)) # Plot the volatility skew against the current weekly returns scatterPlot(as.numeric(curWeekRet), as.numeric(skewATM), colNames = c("weekRet", "skewATM"), main="Current Weekly return vs. ATM Volatility Skew", xLabel = "ATM Volatility Skew", yLabel = "Current Weekly Return" ) # # Look at the effects of wavelet smoothing # # --------------------------------------------------------------------------------------- # This is a failed experiement. Calculating the return from a wavelet smoothed present value and a # (historical) future value makes the association with the volatility worse and makes the regression # worse: # # > cor(skewATM, weekRet) # [1] 0.1562891 # > cor(skewATM, waveWeekRet) # [1] -0.0682818 # Estimate Std. Error t value Pr(>|t|) # (Intercept) 0.004044 0.008946 0.452 0.653 # skew -0.034507 0.053915 -0.640 0.525 # rviv -0.019966 0.044709 -0.447 0.657 # skewATM -0.013437 0.059027 -0.228 0.821 # # Residual standard error: 0.02686 on 48 degrees of freedom # Multiple R-squared: 0.01901, Adjusted R-squared: -0.04231 # sp500Price = getPrices(sp500Sym, minDate, maxDate) # There are 252 values. We need 256, so fill in 4 values at the low end where they should not # have a lot of effect. sp500PriceFilled = c(as.numeric(sp500Price[1:4]), as.numeric(sp500Price)) # calculate the wavelet transform for the price ywd = wd(sp500PriceFilled) # threshold the wavelet trasform result and invert to reproduce the smoothed time series ywr <- wr(threshold(ywd, by.level=TRUE, policy="universal")) # Remove the extra values that were added sp500Smooth = as.zoo(ywr[5:length(ywr)]) index(sp500Smooth) = index(sp500Price) # # Plot the original price series and the smoothed time series # par(mfrow=c(1,1)) plot(sp500Price, type="l", col="blue", xlab="2011", ylab="Daily Close Price, 2011", lwd=2, main="SP500 Daily Close Price and Wavelet Threshold Smoothed Close Price") lines(sp500Smooth, type="l", col="magenta", lwd=2) grid(col="grey30") legend("bottomleft", legend=c("Daily Close Price", "Wavelet Threshold"), col=c("blue", "magenta", "magenta"), lwd=4 ) par(mfrow=c(1,1)) waveWeekRet = waveletWeekReturn(sp500Sym, returnRange, sp500Smooth) par(mfrow=c(1,1)) plot(weekRet, type="l", col="blue", ylab="Weekly Return", xlab="2011", main="Weekly Return vs. Wavelet Weekly Return") lines(waveWeekRet, type="l", col="magenta") abline(h=0, col="black") legend("bottomleft", legend=c("Weekly Return", "Wavelet Weekly Return"), col=c("blue", "magenta", "magenta"), lwd=4 ) par(mfrow=c(1,1)) waveData.df = as.data.frame(cbind(as.numeric(waveWeekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM))) colnames(waveData.df) = c("weekRet", "skew", "rviv", "skewATM") # Regression of the skewed values and the realized volatility difference against the weekly returns wavel = lm(weekRet ~ skew + rviv + skewATM, data=waveData.df) s = summary(wavel) par(mfrow=c(1,1)) printTable(s$coefficients) par(mfrow=c(1,1)) print(sprintf("R^2 = %0.6f", s$r.squared))