library(zoo)
library(tseries)
library(xts)
library(timeDate)
library(timeSeries)
library(PerformanceAnalytics)
library(car)
library(wavethresh)
library(gridExtra)


printTable = function( tab )
{
  grid.newpage()
  par(mfrow=c(1,1))
  grid.table(round(tab, 6), gpar.rowtext=gpar(fontsize = 16), show.box=T, 
             show.hlines=T, show.vlines=T, separator = "black")
  par(mfrow=c(1,1))
}


#
# Find the lag for the maximum cross correlation between two series
#
findMaxCCF = function(a,b)
{
  d <- ccf(a, b, plot = FALSE)
  cor = d$acf[,,1]
  lag = d$lag[,,1]
  res = data.frame(cor,lag)
  res_max = res[which.max(res$cor),]
  return(res_max)
} 

getPrices = function( tickerSym, startDate, endDate )
{
  ts = get.hist.quote( instrument = tickerSym, start = startDate, end = endDate,
                       quote="AdjClose", provider="yahoo", 
                       compression="d",  quiet=T)
  return(ts)
}

#
# For a given stock:
#   Return a time series of the CC returns
#
# Arguments:
#   tickerSym - the ticker symbol recognized by Yahoo for the stock
#   startDate - a character string with the start data (in ISO format)
#   endDate   - a character string with the end data (in ISO format)
#
getReturns = function( tickerSym, startDate, endDate )
{
  ts = getPrices( tickerSym, startDate, endDate)
  ret = diff(log(ts))
  colnames(ret) = tickerSym
  return(ret)
}

# Calculate the simple weekly return. Here the stock would be bought on day 1 and sold
# seven days later.
#
simpleWeekReturn = function( tickerSym, dateRange)
{
  startDate = dateRange[1]
  endDate = dateRange[length(dateRange)]
  ts = getPrices(tickerSym, startDate, endDate)
  buyDates = dateRange[1:(length(dateRange)-1)]
  sellDates = dateRange[2:length(dateRange)]
  p = as.numeric(ts[buyDates])
  p_plus_1 = as.numeric(ts[sellDates])
  ret = (p_plus_1 - p)/p
  ret.z = as.zoo(ret)
  index(ret.z) = sellDates
  return(ret.z)
}

waveletWeekReturn = function( tickerSym, dateRange, smoothPrice)
{
  startDate = dateRange[1]
  endDate = dateRange[length(dateRange)]
  ts = getPrices(tickerSym, startDate, endDate)
  buyDates = dateRange[1:(length(dateRange)-1)]
  sellDates = dateRange[2:length(dateRange)]
  p = as.numeric(smoothPrice[buyDates])
  p_plus_1 = as.numeric(ts[sellDates])
  ret = (p_plus_1 - p)/p
  ret.z = as.zoo(ret)
  index(ret.z) = sellDates
  return(ret.z)
}


#
# This file is for SPX options, which is quoted as ^GSPC
rootPath = "/home/iank/Documents/options_data"
file = "SPX2011.csv"
filePath = paste(rootPath, file, sep="/")


# options.df = read.table(filePath,header=TRUE,sep=",", row.names=NULL,
#                         colClasses = columns)
optionsDataObj = "optionsData.R"
if (file.exists( optionsDataObj) == T) 
{
  load(optionsDataObj)
} else {
  options.df = read.table(filePath, header=TRUE,sep=",", row.names=NULL)
  save(options.df, file=optionsDataObj)
}

# ----------------------------------------------------------------------------------------------------
# This version of the R-script works for 2011 data. The data for 2005 has slightly different columns so
# the code must be modified slightly.
# ----------------------------------------------------------------------------------------------------

# # colnames(options.df) = colNames
# options.df = options.df[1:18] # get rid of a sequential value column (I don't know how it got there)
# remvoe the Underlying, exchange, optionext, optionalias
temp = options.df[,c(-1,-3,-4, -5, -20)]
options.df = temp
rm(temp)
columns = c("UnderlyingLast","Type","Expiration","DataDate","Strike","Last",
            "Bid","Ask","Volume","OpenInterest","IV","Delta","Gamma","Theta","Vega")
colnames(options.df) = columns

# Convert the data in the form month/day/year to Date objects and put them back in the data frame.
strikeDate = as.Date(strptime(options.df[,"Expiration"], format="%m/%d/%Y", tz="EST"))
optionDate = as.Date(strptime(options.df[,"DataDate"], format="%m/%d/%Y", tz="EST"))
options.df[,"Expiration"] = strikeDate
options.df[,"DataDate"] = optionDate

#
# To calculate the realized volatility for week t, we need twenty trading days back in time.
# So the return start date is a month before the option start.
minDate = min(optionDate)
maxDate = max(optionDate)
optionStartDate = minDate - 30 # days
optionEndDate = maxDate

sp500Sym = "^GSPC"
sp500Ret = getReturns(sp500Sym, optionStartDate, optionEndDate)

#
# Get a vector of trading days. These are week days, with holidays removed. The trading days
# are relative to the New York Stock Exchange (NYSE).
#
# startDate and endDate are Date objects. The function returns a date ordered vector of Date
# objects which are the trading days.
#
# This function requires the timeDate package
#
tradingCalendar = function( startDate, endDate)
{
  require(timeDate)
  timeSeq = timeSequence(from=as.character(startDate), to=as.character(endDate), 
                         by="day", format = "%Y-%m-%d",
                         zone = "NewYork", FinCenter = "America/New_York")
  ix = as.logical(isWeekday(timeSeq, wday=1:5))
  tradingDays = timeSeq[ix]
  startYear = as.POSIXlt(startDate)$year + 1900
  endYear = as.POSIXlt(endDate)$year + 1900
  tradingDays.dt = as.Date(tradingDays)
  hol = as.Date(holidayNYSE(startYear:endYear))
  ix = which(tradingDays.dt %in% hol)
  tradingDays.dt = tradingDays.dt[-ix]
  return(tradingDays.dt)
} # tradingCalendar


# The notes and code here references:
#
# Exploiting Option Information in the Equity Market
# GUIDO BALTUSSEN, BART VAN DER GRIENT, WILMA DE GROOT,
# ERIK HENNINK AND WEILI ZHOU, December 2011
# 
# We apply the following screens on all options to ensure that we select liquid and heavily traded options that we
# believe contain the most reliable information. We filter out options with zero volume or
# open interest.
volIx = which( options.df[,"Volume"] > 0)
options.df = options.df[volIx,]
intIx = which( options.df[,"OpenInterest"] > 0)
options.df = options.df[intIx,]

startTime =  options.df[,"DataDate"]
endTime = options.df[,"Expiration"]
# as it turns out, there are some very long term options (leaps). So weed these out.
rangeIx = which(endTime < (maxDate + 60))
startTime = startTime[rangeIx]
endTime = endTime[rangeIx]

# Build a trading day hash table: index by a date and get an trading day back
tradingDays = tradingCalendar(minDate, maxDate + 60)

#------------------------------------------------------------------------------------------
# Buld a trading calendar so that the number of trading days between the quote date and the 
# expiration date can be calculated. 
#------------------------------------------------------------------------------------------

# As it turns out, some options are marked for expiration on Saturday. No new options an be purchased
# after market close on Friday, but the option can still be effected by after hours trading.  Unfortunately
# the trading calendar doesn't include saturday. So plug in the saturdays if they exist in the endTime
# vector (the option expirations)
optionDays = tradingDays
uniqueEndTime = unique(endTime)  # Unique option expiration days.
saturdayIx = which(! uniqueEndTime %in% tradingDays)
if (length(saturdayIx > 0)) {
  saturdays = uniqueEndTime[saturdayIx]
  optionDays = sort(c(tradingDays, saturdays))
}


dayIx.z = as.zoo(seq(from=1, to=length(optionDays), by=1))
index(dayIx.z) = optionDays

# For every start date, build a vector that contains the trading day (number)
# for that start date.
uniqueStart = unique(startTime)  # option quote dates (e.g., the moving start of the option period)
startIx = dayIx.z[uniqueStart]   # the trading day number for the start days
rleLen = rle(as.numeric(startTime))$lengths
ends = cumsum(rleLen)
starts = c(0, ends[1:(length(ends)-1)]) + 1
startDay = rep(0, length(startTime))
for (i in 1:length(starts)) {
  startDay[starts[i]:ends[i]] = startIx[i]
}

# For every option expiration date, build a vector that has the trading day
# for that option expiration date.  Note that option expirations are not
# sequential, but they do exist in blocks.
endIx = dayIx.z[uniqueEndTime]   # The trading day number for the expiration day
rleLen = rle(as.numeric(endTime))$length   # find the "run lengths" in the option expirations
ends = cumsum(rleLen)
starts = c(0, ends[1:(length(ends)-1)]) + 1
endDay = rep(0, length(startTime))
for (i in 1:length(starts)) {
  tryCatch( {
    secDate = endTime[starts[i]]  # get the section date
    ix = endIx[secDate]
    endDay[starts[i]:ends[i]] = ix
  },
            error = function(e) {
              print(sprintf("i = %d", i))
              break;
            }
  )
}

# From the article:
# As most activity in options is concentrated in the short end, we select options
# with a remaining maturity of approximately one month by requiring a maturity between 10
# and 40 trading days.

deltaDay = (endDay - startDay) + 1
daysInRange = (deltaDay >= 10) & (deltaDay <= 40)

temp = options.df[daysInRange,]
options.df = temp
rm(temp)

# Mark the options that are at-the-money (atm)
#
# From the article:
# We follow Xing et al. (2010) when separating options into at-themoney (ATM)
# and out-of-the-money (OTM). A put or call option is defined as ATM when the 
# ratio of the strike price to the stock price is between 0.95 and 1.05
ratio = options.df[,"Strike"] / options.df[,"UnderlyingLast"]
atm = sapply(ratio, FUN=function(v) {b = (v >= 0.95) && (v <= 1.05)})
options.df[,"atm"] = atm
options.df[,"otm"] = (!atm)

#
# This function is used to generate the weekly values for calculating the skew.
#
# "We use the weekly average of the IV variables to reduce the effect of noise, 
# while requiring at least two non-missing values during the past five days to
# compute the measure."
#
weekAggregateSkew = function(option.type.df, isCall)
{
  weekBreaks = endpoints(option.type.df[,"DataDate"], on="weeks")
  starts = weekBreaks[1:(length(weekBreaks)-1)]+1
  ends = weekBreaks[2:length(weekBreaks)]
  colNames = c("UnderlyingLast", "Expiration", "DataDate", "Strike", "Bid",   "Ask", "IV", "atm", "otm")
  week.df = data.frame()
  for (i in 1:length(starts)) {
    week = option.type.df[starts[i]:ends[i], colNames]
    if (isCall) {
      ix = which(week[,"atm"])
    } else {
      ix = which(week[,"otm"])
    }
    avgIV = mean(week[ix,"IV"])
    avgStrike = mean(week[ix, "Strike"])
    avgBid = mean(week[ix,"Bid"])
    avgAsk = mean(week[ix, "Ask"])
    avgLast = mean(week[ix, "UnderlyingLast"])
    date = week[nrow(week),"DataDate"]
    expir = week[1,"Expiration"]
    row = as.data.frame(list(last = avgLast, expir = expir, date = date, strike = avgStrike, 
                             ask = avgBid, bid = avgAsk, IV = avgIV))
    week.df = rbind(week.df, row)
  }
  return(week.df)
} # weekAggregateSkew

#
# Weekly average aggregate of the put and call IV for ATM puts and calls. The argument to the
# function is a data frame of both call and put options.
# 
# "ATM[i,t] IV , is the average of the implied volatility of the ATM call and ATM
# put option on stock i in week t."
# 
weekAggregateATMIV = function( option.df )
{
  weekBreaks = endpoints(option.df[,"DataDate"], on="weeks")
  starts = weekBreaks[1:(length(weekBreaks)-1)]+1
  ends = weekBreaks[2:length(weekBreaks)]
  colNames = c("UnderlyingLast", "Expiration", "DataDate", "Strike", "IV", "atm" )
  week.df = data.frame()
  for (i in 1:length(starts)) {
    week = option.df[starts[i]:ends[i], colNames]
    ix = which(week[,"atm"])
    avgIV = mean(week[ix,"IV"]) 
    avgStrike = mean(week[ix, "Strike"])
    avgLast = mean(week[ix, "UnderlyingLast"])
    date = week[nrow(week),"DataDate"]
    expir = week[1,"Expiration"]
    row = as.data.frame(list(last = avgLast, expir = expir, date = date, strike = avgStrike, 
                             IV = avgIV))
    week.df = rbind(week.df, row)
  }
  return(week.df)
} # weekAggregateSkew


weekVol = function(sp500Ret, weekEndDate, startDate, endDate )
{
  retData = as.vector(sp500Ret)
  charvec = as.character(index(sp500Ret))
  tradingDays = tradingCalendar(startDate, endDate)
  retIx = index(sp500Ret)
  realizedVol = rep(0, length(weekEndDate))
  for (i in 1:length(weekEndDate)) {
    endIx = which(tradingDays == weekEndDate[i])
#     print(paste(endIx, ":", tradingDays.dt[ix]))
    startIx = endIx - 20 # 20 days
    endDate = tradingDays[endIx]
    startDate = tradingDays[startIx]
    retStart = which(retIx == startDate)
    retEnd = which(retIx == endDate)
    dayBlock = sp500Ret[retStart:retEnd]
    dayVar = var(dayBlock)
    # implied volatility is yearly volitility, so convert from daily vol to
    # yearly vol
    yearVol = sqrt( 252 * dayVar)
    realizedVol[i] = yearVol
  }
  return(realizedVol)
} # weekVol

#
# Generate a linear regresison plot with the regression line, the regression values,
# the confidence interval and the prediction bands.
scatterPlot = function(y, x, colNames, main, xLabel, yLabel )
{
  df = as.data.frame(cbind(y, x))
  colnames(df) = colNames
  f.str = paste(colNames[1], "~", colNames[2])
  f = as.formula(f.str)
  l = lm(f, data=df)
  p = predict(l, interval="confidence", df=l$df.residual)
  topConf = p[,"upr"]
  lowConf = p[,"lwr"]
  p = predict(l, interval="prediction", df=l$df.residual)
  topPred = p[,"upr"]
  lowPred = p[,"lwr"]
  
  par(mfrow=c(1,1))
  plot(x = x, y = y, xlab = xLabel, ylab = yLabel, 
       type = "p", col="blue", pch=19, main= main)
  abline(l, lwd=2, col="red")
  lines(x = x, y = topConf, type="l", col="green")
  lines(x = x, y = lowConf, type="l", col="green")
  lines(x = x, y = topPred, type="l", col="magenta")
  lines(x = x, y = lowPred, type="l", col="magenta")
  legend("bottomright", legend=c("regression line", "95% Confidence Interval", "95% Prediction Band"), 
         col=c("red", "green", "magenta"), lwd=4, cex=0.90)
  par(mfrow=c(1,1))
}


#
#  Call options
#
callIx = which(options.df[,"Type"] == "call")
call.options.df = options.df[callIx,-2]

call.options.week = weekAggregateSkew(call.options.df, isCall=TRUE)

putIx = which(options.df[,"Type"] == "put")
put.options.df = options.df[putIx,-2]

# Mark the put options that are out-of-the-money (otm)
#
# a put option as OTM when the ratio is lower than 0.95, but higher than 0.80.
# When multiple options fall into the same group, we select options with
# moneyness closest to 1.00 (ATM) or 0.95 (OTM)
ratio = put.options.df[,"Strike"] / put.options.df[,"UnderlyingLast"]
otm = sapply(ratio, FUN=function(v) {b = (v >= 0.80) && (v <= 0.95)})
put.options.df[,"otm"] = otm

put.options.week = weekAggregateSkew(put.options.df, isCall = FALSE)
skew.df = data.frame(put.options.week[,"date"], put.options.week[,"IV"] - call.options.week[,"IV"] )
colnames(skew.df) = c("date", "skew")
skew.z = as.zoo(skew.df[,"skew"])
index(skew.z) = skew.df[,"date"]

# The second measure is the realized (historical) versus implied volatility
# spread, which is thought to capture the volatility risk of a stock. We measure
# the realized versus implied volatility spread by the difference between the
# realized volatility of the past 20 daily stock returns and the implied
# volatility. Where RV[i,t] is the realized volatility of stock i measured in
# week t and ATM[i,t] IV , is the average of the implied volatility of the ATM
# call and ATM put option on stock i in week t.

atmIV = weekAggregateATMIV(options.df)
# Now we need to calculate the realized volatility
weekEndDate = atmIV[,"date"]
# realized annual volatility
realVol = weekVol(sp500Ret, weekEndDate, optionStartDate, optionEndDate )
rviv = as.zoo(realVol - atmIV[,"IV"])
dateIndex = atmIV[,"date"]  # option quote date - no Saturdays
index(rviv) = dateIndex

weekATMIVCall = weekAggregateATMIV( call.options.df )
weekATMIVPut = weekAggregateATMIV( put.options.df )

skewATM = as.zoo(weekATMIVPut[,"IV"] - weekATMIVCall[,"IV"])
index(skewATM) = dateIndex

retEnd1 = dateIndex[length(dateIndex)]
retEndIx = which(tradingDays == retEnd1)
retEnd2 = tradingDays[ retEndIx + 5]
returnRange = c(dateIndex, retEnd2)

weekRet = simpleWeekReturn(sp500Sym, returnRange)

regData.df = as.data.frame(cbind(as.numeric(weekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM)))
colnames(regData.df) = c("weekRet", "skew", "rviv", "skewATM")

# Regression of the skewed values and the realized volatility difference against the weekly returns
l = lm(weekRet ~ skew + rviv + skewATM, data=regData.df)

s = summary(l)
par(mfrow=c(1,1))
printTable(s$coefficients)
par(mfrow=c(1,1))

print(sprintf("R^2 = %0.6f", s$r.squared))

resid = residuals(l)

resid.std = (resid - mean(resid)) / sd(resid)
fitted.vals = fitted.values(l)

# Look at the residual distribution
par(mfrow=c(1,2))
plot(x=fitted.vals, y = resid.std, xlab="Fitted Values", ylab="Standardized Residuals", pch=16, col="blue")
qqPlot(resid, main="Regression Residuals", ylab="Emperical Quantiles", col.lines=c("black", "blue", "green"))
par(mfrow=c(1,1))


resid = residuals(l)

resid.std = (resid - mean(resid)) / sd(resid)
fitted.vals = fitted.values(l)

# Plot the regression of the realized volatility difference against the weekly returns
scatterPlot(as.numeric(weekRet), as.numeric(rviv), colNames = c("weekRet", "rviv"), 
                       main="Weekly return vs. Realized Volatility Difference", 
                       xLabel = "Realized Vol. Diff", yLabel = "Weekly Return" )

# Plot the volatility skew against the weekly returns
scatterPlot(as.numeric(weekRet), as.numeric(skew.z), colNames = c("weekRet", "skew"), 
            main="Weekly return vs. Volatility Skew", 
            xLabel = "Volatility Skew", yLabel = "Weekly Return" )

# Plot the volatility skew against the weekly returns
scatterPlot(as.numeric(weekRet), as.numeric(skewATM), colNames = c("weekRet", "skewATM"), 
            main="Weekly return vs. ATM Volatility Skew", 
            xLabel = "ATM Volatility Skew", yLabel = "Weekly Return" )

# Plot the distribution of the weekly returns
#
weekRet.std = (weekRet - mean(weekRet))/sd(weekRet)
par(mfrow=c(1,2))
chart.Histogram(R = weekRet.std, methods="add.normal", xlab="Standardized Weekly Returns", breaks=40 )
qqPlot(as.numeric(weekRet.std), main="Standardized Weekly Ret.", ylab="Emperical Quantiles")
par(mfrow=c(1,1))

#
# Lets also look at the linear regression relative to the current week returns
curRange = c(minDate, dateIndex)
curWeekRet = simpleWeekReturn(sp500Sym, curRange)

curData.df = as.data.frame(cbind(as.numeric(curWeekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM)))
colnames(curData.df) = c("weekRet", "skew", "rviv", "skewATM")

# Regression of the skewed values and the realized volatility difference against the weekly returns
curl = lm(weekRet ~ skew + rviv + skewATM, data=curData.df)

s = summary(curl)
par(mfrow=c(1,1))
printTable(s$coefficients)
par(mfrow=c(1,1))

print(sprintf("R^2 = %0.6f", s$r.squared))

# Plot the volatility skew against the current weekly returns
scatterPlot(as.numeric(curWeekRet), as.numeric(skewATM), colNames = c("weekRet", "skewATM"), 
            main="Current Weekly return vs. ATM Volatility Skew", 
            xLabel = "ATM Volatility Skew", yLabel = "Current Weekly Return" )

#
# Look at the effects of wavelet smoothing
#
# ---------------------------------------------------------------------------------------
# This is a failed experiement. Calculating the return from a wavelet smoothed present value and a
# (historical) future value makes the association with the volatility worse and makes the regression
# worse:
#
# > cor(skewATM, weekRet)
# [1] 0.1562891
# > cor(skewATM, waveWeekRet)
# [1] -0.0682818
# Estimate Std. Error t value Pr(>|t|)
# (Intercept)  0.004044   0.008946   0.452    0.653
# skew        -0.034507   0.053915  -0.640    0.525
# rviv        -0.019966   0.044709  -0.447    0.657
# skewATM     -0.013437   0.059027  -0.228    0.821
# 
# Residual standard error: 0.02686 on 48 degrees of freedom
# Multiple R-squared:  0.01901,  Adjusted R-squared:  -0.04231 
# 

sp500Price = getPrices(sp500Sym, minDate, maxDate)
# There are 252 values. We need 256, so fill in 4 values at the low end where they should not
# have a lot of effect.
sp500PriceFilled = c(as.numeric(sp500Price[1:4]), as.numeric(sp500Price))
# calculate the wavelet transform for the price
ywd = wd(sp500PriceFilled)
# threshold the wavelet trasform result and invert to reproduce the smoothed time series
ywr <- wr(threshold(ywd, by.level=TRUE, policy="universal"))
# Remove the extra values that were added
sp500Smooth = as.zoo(ywr[5:length(ywr)])
index(sp500Smooth) = index(sp500Price)
#
# Plot the original price series and the smoothed time series
#
par(mfrow=c(1,1))
plot(sp500Price, type="l", col="blue", xlab="2011", ylab="Daily Close Price, 2011", lwd=2,
     main="SP500 Daily Close Price and Wavelet Threshold Smoothed Close Price")
lines(sp500Smooth, type="l", col="magenta", lwd=2)
grid(col="grey30")
legend("bottomleft", legend=c("Daily Close Price", "Wavelet Threshold"), 
       col=c("blue", "magenta", "magenta"), lwd=4 )
par(mfrow=c(1,1))

waveWeekRet = waveletWeekReturn(sp500Sym, returnRange, sp500Smooth)

par(mfrow=c(1,1))
plot(weekRet, type="l", col="blue", ylab="Weekly Return", xlab="2011",
     main="Weekly Return vs. Wavelet Weekly Return")
lines(waveWeekRet, type="l", col="magenta")
abline(h=0, col="black")
legend("bottomleft", legend=c("Weekly Return", "Wavelet Weekly Return"), 
       col=c("blue", "magenta", "magenta"), lwd=4 )
par(mfrow=c(1,1))

waveData.df = as.data.frame(cbind(as.numeric(waveWeekRet), as.numeric(skew.z), as.numeric(rviv), as.numeric(skewATM)))
colnames(waveData.df) = c("weekRet", "skew", "rviv", "skewATM")

# Regression of the skewed values and the realized volatility difference against the weekly returns
wavel = lm(weekRet ~ skew + rviv + skewATM, data=waveData.df)

s = summary(wavel)
par(mfrow=c(1,1))
printTable(s$coefficients)
par(mfrow=c(1,1))

print(sprintf("R^2 = %0.6f", s$r.squared))