#
# Fix Compustat corporate data.
#
# The CRSP/Compustat data is used by researchers and graduate students in
# finance. While this is an important and critical resource, the data cannot be
# used without being corrected (fixed). The corporate factor data from compustat
# occasionally is missing critical information like the outstanding number of
# shares or even the close price. This code attempts to patch these holes
#
# A detailed discussion on the CRSP/Compustat data can be found on 
# http://www.bearcave.com/finance/wrds_data.html
#
# The Warton Research Data Service is gratefully acknowledged.  My access to this
# data has been through the University of Washington Computatonal Finance and Risk
# Management graduate program.
#
library(tseries)
library(timeDate)
library(zoo)
library(xts)
library(RCurl)

NUM_SHARES = "CSHOQ"
SHARE_PRICE = "PRCCQ"
SALES = "SALEQ"
COMMON_EQT = "CEQQ"
INCOME = "NIQ"
LIABILITIES = "LTQ"
TICKER = "tic"
DATE = "datadate"
FYEAR = "fyearq"
FQTR = "fqtr"

#
# vec : a vector of values
# stockFact.df : a compustat data frame that includes a date column
# return: vec as a zoo object with dates from stockFact.df as the zoo index.
#
addDate = function(vec, stockFact.df)
{
  dates = as.Date(strptime(as.vector(as.character(stockFact.df[,DATE])), format="%Y-%m-%d", tz="EST"))
  vec.z = as.zoo(vec)
  index(vec.z) = dates
  return(vec.z)
}

# 
# Reading Yahoo data can be tricky. The stock may not exist (for example, the ticker may be some custom
# Compustat ticker that doesn't exist on Yahoo).  Or the stock may exist, but not at that date.
# 
getStockInfo = function(ticker, date)
{
  close = NA
  date = as.Date(date)
  start = date
  end = date
  url <- paste("http://chart.yahoo.com/table.csv?s=", ticker, 
               format(start, "&a=%m&b=%d&c=%Y"), 
               format(end, "&d=%m&e=%d&f=%Y"), "&g=d&q=q&y=0&z=", ticker, "&x=.csv", sep = "")
  page = getURL(url) 
  readURL = TRUE
  ix = grep(pattern="404 Not Found", x = page)
  if (length(ix) == 0) {
    # we could still have a URL like  "Date,Open,High,Low,Close,Volume,Adj Close\n"
    # so check for a number as would be found in
    # "Date,Open,High,Low,Close,Volume,Adj Close\n2001-07-30,14.50,14.78,14.50,14.70,3654300,12.61\n"
    readURL = any(grepl("[[:digit:]]",page))
    if (readURL) {
      # We found something with numbers. Now check that the date is right
      table = read.csv(textConnection(page))
      dates = as.Date(as.vector(table[,"Date"]))
      ix = which(dates == date)
      if (length(ix) > 0) { # the date is right
        close = as.numeric(table[ix,"Close"])
      }
      else {
        readURL = FALSE
      }
    }
  } else {
    readURL = FALSE
  }
  rslt = list(readOK = readURL, close = close)
  return(rslt)
} # getStockInfo


#
# Given a time series of stock prices (a zoo time series, with a date index), fill in the closest
# close price for missing quotes by referencing finance.yahoo.com.
#
fixSharePriceHoles = function(sharePrice.z, ticker)
{
  newPrices.z = sharePrice.z
  ix = which(is.na(sharePrice.z))
  if (length(ix) > 0) {
    for (i in ix) {
      date = index(sharePrice.z[i])
      # Find the nearest trading day (this appears unnecessary)
      while(! isBizday(x = as.timeDate(date), holidays=holidayNYSE(as.numeric(format(date, "%Y"))))) {
        date = date - 1
      }
      rslt = getStockInfo(ticker, date)
      sharePrice.z[i] = rslt$close
    } # for
    zooIx = index(sharePrice.z)
    v = coredata(sharePrice.z)
    v_approx = na.approx(v, maxgap=4, na.rm=FALSE)
    v_fill = na.fill(v_approx, fill="extend")
    newPrices.z = zoo(v_fill)
    index(newPrices.z) = zooIx
  } # if
  return(newPrices.z)
} # fixSharePriceHoles


#
# There are cases where Compustat is missing a share price value. This code will attempt to
# fill the value in from a finance.yahoo.com lookup. If the ticker is not available on Yahoo
# (perhaps because the company is out of business), interpolate the value if possible.
#
fixSharePrice = function(factors.df)
{
  stocks = sort(unique(as.vector(factors.df[,TICKER])))
  for (i in 1:length(stocks)) {
    stock = stocks[i]
    stockIx = which(factors.df[,TICKER] == stock)
    stockFactors.df = factors.df[stockIx,]
    # Add close prices for missing close prices
    rawPrices = stockFactors.df[, SHARE_PRICE]
    if (any(is.na(rawPrices))) {
      numNA = length(which(is.na(rawPrices)))
      fracNA = numNA / length(rawPrices)
      if (fracNA < 0.2) {
        rawPrices.z = addDate(rawPrices, stockFactors.df)
        prices.z = fixSharePriceHoles(rawPrices.z, stock)
        factors.df[stockIx, SHARE_PRICE] = data.frame(prices.z)
      }
    }
  }
  return(factors.df)
} # fixSharePrice


#
# Replace NA values with interpolated values, if the "gap" (the run of NA values) is 4 or less.
# Note that the interploation has to be done for a stock region, so it has to be performed one
# stock at a time.
#
# There are some cases where values are critical. For example, there must be a value for
# the number of shares outstanding or the market and enterprise values can't be calculated.
# Similarly, there must be a close price. If these values can't be interpolated and there
# are too many missing, then that stock should be removed from the data set. In this case
# the maxFracAllowed should be set (perhaps to a value like 0.1) and remove should be set
# to TRUE.
#
filterOutBadFactor = function(factors.df, factorLabel, maxFracAllowed = 1, extend = FALSE, remove=FALSE)
{
  ix = which(is.na(factors.df[,factorLabel]))
  factorNA.df = factors.df[ix,]
  # stocks: Stocks that have factors with NA values
  stocks = as.vector(unique(factorNA.df[,TICKER]))
  ixBlock = c()
  for (stock in stocks) {
    ixStock = which(factors.df[,TICKER] == stock)
    facVal = factors.df[ixStock,factorLabel]
    if (sum(is.na(facVal)) < (length(facVal)-2)) {
      facVal = na.approx(facVal, maxgap = 4, na.rm = FALSE)
      if (extend) {
         facVal = na.fill(facVal, fill="extend", maxgap=4, na.rm=F)
      }
      factors.df[ixStock,factorLabel] = facVal
    }
    if (remove) {
      numNA = sum(is.na(facVal))
      percnt = numNA / length(facVal)
      if (percnt > maxFracAllowed) {
        ixBlock = c(ixBlock, ixStock)
      }
    }
  } # for
  if (length(ixBlock) > 0) {
    factorsCommonFilt.df = factors.df[-ixBlock,]
  } else {
    factorsCommonFilt.df = factors.df
  }
  return(factorsCommonFilt.df)
}

#
# fix out of order dates. This exists for the stock "MYL"
# and perhaps others.
#
fixDateOrder = function(factors.df)
{
  newFactors.df = factors.df
  stocks = as.vector(factors.df[,TICKER])
  for (sym in stocks) {   
    symIx = which(newFactors.df[,TICKER] == sym)
    block = newFactors.df[symIx,]
    dates = as.Date(as.vector(block[,DATE]))
    ordIx = order(dates)
    sequential = seq(from=1, to=length(dates))
    if (! all(ordIx == sequential)) {
      newBlock = block[ordIx,]
      newFactors.df[symIx,] = newBlock
    }
  }
  return(newFactors.df)
}


# 
# There are cases where the date in the "datadate" column does not match the 
# date in the fyearq fqtr columns. The fyearq and fqtr columns do appear to have
# the correct dates.  This code fixes the dates so that they do not screw up
# later processing when the data is placed in a zoo (date indexed) data structure.
#
fixDates = function( factors.df )
{
  quarters = c("03-31", "06-30", "09-30", "12-31")
  qtrIx = factors.df[,FQTR]
  fyear = factors.df[,FYEAR]
  qtr = quarters[qtrIx]
  dates = paste(fyear, qtr, sep="-")
  factors.df[,DATE] = dates
  newFactors.df = fixDateOrder( factors.df )
  return(newFactors.df)
}


#
# Build a table that shows the percentage of NA values for each corporate factor.
#
buildPrcntNATable = function(factors.df)
{
  metaCols = c("gvkey", "datadate", "fyearq", "fqtr", "tic", "conm", "DATAFQTR", "GGROUP", "GSECTOR" )
  facCols = colnames(factors.df)
  metIx = which(facCols %in% metaCols)
  dataCols = facCols[-metIx]
  #
  # calculation the ratio of NA values to defined values in the data
  #
  percntNA = apply(factors.df[,dataCols], 2,
                   FUN=function(v) {
                     NAs = is.na(v)
                     s = sum(NAs)
                     frac = s / length(v)
                     return(frac)
                   })
  
  rowNames = names(percntNA)
  ix = order(percntNA, decreasing=T)
  percntNASort = percntNA[ix]
  names(percntNASort) = rowNames[ix]
  percntNASort = round(as.matrix(percntNASort, nrow=length(percntNASort), ncol=1) * 100, 2)
  colnames(percntNASort) = c("% missing values")
  return(percntNASort)
}

#
# Find stocks that are in the constituent index data frame but are not in the close prices
# and return the stockks as a vector to ticker symbols. These can be used in the WRDS database
# to download some of the missing data.
#
findMissingStocks = function(sp500Ix.df, sp500Close.df)
{
  indexCusip = as.vector(sp500Ix.df[,"co_cusip"])
  cusipLen = nchar(indexCusip)
  indexCusipTrunc = substr(indexCusip, start=1, stop=(cusipLen-1))
  
  closeCusip = as.vector(sp500Close.df[,"CUSIP"])
  
  indexUnique = unique(indexCusipTrunc)
  closeUnique = unique(closeCusip)
  
  missingIx = which(! indexUnique %in% closeUnique)
  missingIndexCusip = indexUnique[missingIx]
  indexIx = which(indexCusipTrunc %in% missingIndexCusip)
  indexBlock = sp500Ix.df[indexIx,]
  missingStocks = unique(as.vector(indexBlock[,"co_tic"]))
  return(missingStocks)
}


#
# Using the S&P 500 index CUSIP, find the matching close price CUSIP and fill in the index symbol
# for the close price symbol. This make the ticker symbols consistent.
#
# The CUSIP has an extra digit.  For example, in the index Honeywell's CUSIP is 438516106.  In the
# Compustat close price data set the CUSIP is 43851610, one digit less. To get the close price CUSIP 
# to match the index CUSIP, the longer index CUSIP is truncated by one digit. 
#
replaceNames = function(sp500ix.df, sp500Close.df)
{
  indexCusip = as.vector(sp500Ix.df[,"co_cusip"])
  cusipLen = nchar(indexCusip)
  indexCusipTrunc = substr(indexCusip, start=1, stop=(cusipLen-1))
  indexSym = as.vector(sp500Ix.df[,"co_tic"])
  indexTbl = unique(cbind(indexCusipTrunc, indexSym))
  colnames(indexTbl) = c("CUSIP", "sym")
  
  closeCUSIP = as.vector(sp500Close.df[,"CUSIP"])
  closeSyms = as.vector(sp500Close.df[,"TICKER"])
  for (i in 1:nrow(indexTbl)) {
    ixCUSIP = as.character(indexTbl[i,"CUSIP"])
    ixSym = as.character(indexTbl[i,"sym"])
    ix = which(closeCUSIP == ixCUSIP)
    if (length(ix) > 0) {
      closeSyms[ix] = ixSym
    }
  }
  sp500CloseNew = cbind(sp500Close.df[,"date"], 
                        closeSyms, sp500Close.df[,c("COMNAM", "CUSIP", "PRC", "sprtrn")])
  colnames(sp500CloseNew) = c("date", "TICKER", "COMNAM", "CUSIP", "PRC", "sprtrn")
  return(sp500CloseNew)
}


# Root path to the directory that contains the Compustat corporate
# factors.
dataDir = "/home/iank/Documents/thesis_project/data"

compustatFactors = "sp500_compustat.csv"
fixedFactors = "sp500_compustat_fixed.csv"

sp500Close = "sp500_close_prices.csv"
sp500CloseQtr = "sp500_close_qtr.csv"
sp500CloseMonth = "sp500_close_mon.csv"

sp500Index = "s_and_p_500.csv"

factorPath = paste(dataDir, compustatFactors, sep="/")
factorOutPath = paste(dataDir, fixedFactors, sep="/")

closePath = paste(dataDir, sp500Close, sep="/")
closeOut = paste(dataDir, sp500CloseQtr, sep="/")
closeMon = paste(dataDir, sp500CloseMonth, sep="/")

sp500Path = paste(dataDir, sp500Index, sep="/")

factorsRaw.df = read.csv(file=factorPath)

# These are quarterly corporate factors that are used to synthesize portfolio
# factors where there are less than five percent 6% of values which are NA.
# Because there are only a few missing values we can try to use interpolation to
# fill in some values.
# 
fixList = c(COMMON_EQT, "CHEQ", "COGSQ", "DLCQ", "DLTTQ", "IBCOMQ", "IBQ", "CHEQ", "TXTQ", "PIQ", INCOME, LIABILITIES)

factors1.df = fixDates( factorsRaw.df )
factors2.df = fixSharePrice( factors1.df )
factors3.df = filterOutBadFactor(factors2.df, NUM_SHARES, maxFracAllowed = 0.1, extend = TRUE, remove = TRUE)
fixed.df = factors3.df
for (fact in fixList) {
  fixed.df = filterOutBadFactor(fixed.df, fact)
}
factorsFilt.df = filterOutBadFactor(fixed.df, SALES, extend=TRUE)

naTable = buildPrcntNATable(factorsFilt.df)
par(mfrow=c(1,1))
barplot(t(naTable), beside=T, las=2, ylab="Percent of values that are NA", main="NA/Value Distribution")
par(mfrow=c(1,1))

write.csv(x = factorsFilt.df, file=factorOutPath, quote=F, row.names=F)

# sort table alphabetically
names = rownames(naTable)
namesSrt = order(names)
naNameSrt = matrix(naTable[namesSrt,])
rownames(naNameSrt) = names[namesSrt]

txtq = factorsFilt.df[,"TXTQ"]
piq = factorsFilt.df[,"PIQ"]
trate = txtq / piq
# get rid of NA values
trate[is.na(trate)] = 0
# There should not be a tax rate over 0.4 or less than zero
trate[ trate < 0] = 0
trate[ trate > 0.4] = 0.4

# par(mfrow=c(1,1))
# hist(trate, xlab="Tax Rate", main="Tax Rate Distribution", col="grey")
# par(mfrow=c(1,1))

#
# Now do the S&P 500 close prices. The close prices are obtained from the Compustat monthly update
# (e.g., they are monthly close prices).  The ticker symbols are not consistent between the S&P 500
# index constituents and the close prices. So a few things must be done to prepare the close prices:
#
# 1. Using the CUSIP numbers, fill in the ticker symbol so that there is a consistent symbol.
# 2. Regularize the quarterly close dates.
#

sp500Ix.df = read.csv(file=sp500Path)

sp500Close.df = read.csv(file=closePath)

# fill in ticker symbols from the index in the close price data set to make the symbol
# reference consistent.
sp500CloseNew.df = replaceNames(sp500ix.df, sp500Close.df)
dates = as.Date(strptime(as.vector(sp500CloseNew.df[,"date"]), format="%m/%d/%Y", tz="EST"))
sp500CloseNew.df[,"date"] = dates
zeroLenTicker = which(nchar(as.vector(sp500CloseNew.df[,"TICKER"])) == 0)
sp500CloseNew2.df = sp500CloseNew.df
if (length(zeroLenTicker) > 0) {
  sp500CloseNew2.df = sp500CloseNew.df[-zeroLenTicker,]
}

saveCols = colnames(sp500CloseNew2.df)
colnames(sp500CloseNew2.df) = c("datadate", "tic", "conm", "CUSIP", "PRCCQ", saveCols[length(saveCols)])
sp500CloseFixed.df = fixSharePrice( sp500CloseNew2.df )
# There should be no share prices that are NA at this point. If there are, remove the stocks from the data
# set since it's not been possible to correct them.
naIx = which(is.na(sp500CloseFixed.df[,"PRCCQ"]))
naSyms = as.vector(sp500CloseFixed.df[naIx,"tic"])
runLen = rle(naSyms)
removeSyms = runLen$values[runLen$lengths > 1]
removeIx = which(as.vector(sp500CloseFixed.df[,"tic"]) %in% removeSyms)
sp500CloseFilt.df = sp500CloseFixed.df
if (length(removeIx) > 0) {
  sp500CloseFilt.df = sp500CloseFixed.df[-removeIx,]
}

colnames(sp500CloseFilt.df) = saveCols
sp500CloseFilt.df[is.na(sp500CloseFilt.df)] = 0

dates = sp500CloseFilt.df[,"date"]

months = as.numeric(format(dates, format="%m"))
ix = which(months %in% c(3, 6, 9, 12))
cols = c("date", "TICKER", "COMNAM", "CUSIP", "PRC")

qtr = dates[ix]
q1Ix = which(as.numeric(format(qtr, format="%m")) == 3)
q1Yr = format(qtr[q1Ix], format="%Y")
qtr[q1Ix] = as.Date(paste(q1Yr, "03-31", sep="-"))

q2Ix = which(as.numeric(format(qtr, format="%m")) == 6)
q2Yr = format(qtr[q2Ix], format="%Y")
qtr[q2Ix] = as.Date(paste(q2Yr, "06-30", sep="-"))

q3Ix = which(as.numeric(format(qtr, format="%m")) == 9)
q3Yr = format(qtr[q3Ix], format="%Y")
qtr[q3Ix] = as.Date(paste(q3Yr, "09-30", sep="-"))

q4Ix = which(as.numeric(format(qtr, format="%m")) == 12)
q4Yr = format(qtr[q4Ix], format="%Y")
qtr[q4Ix] = as.Date(paste(q4Yr, "12-31", sep="-"))

qtrDates = as.Date(strptime(qtr, format="%Y-%m-%d", tz="EST"))
sp500CloseFilt.df[ix,"date"] = qtrDates

sp500CloseQtr.df = sp500CloseFilt.df[ix, cols]

write.csv(x = sp500CloseQtr.df, file=closeOut, quote=F, row.names=F)

write.csv(x = sp500CloseFilt.df, file=closeMon, quote=F, row.names=F)