# # Fix Compustat corporate data. # # The CRSP/Compustat data is used by researchers and graduate students in # finance. While this is an important and critical resource, the data cannot be # used without being corrected (fixed). The corporate factor data from compustat # occasionally is missing critical information like the outstanding number of # shares or even the close price. This code attempts to patch these holes # # A detailed discussion on the CRSP/Compustat data can be found on # http://www.bearcave.com/finance/wrds_data.html # # The Warton Research Data Service is gratefully acknowledged. My access to this # data has been through the University of Washington Computatonal Finance and Risk # Management graduate program. # library(tseries) library(timeDate) library(zoo) library(xts) library(RCurl) NUM_SHARES = "CSHOQ" SHARE_PRICE = "PRCCQ" SALES = "SALEQ" COMMON_EQT = "CEQQ" INCOME = "NIQ" LIABILITIES = "LTQ" TICKER = "tic" DATE = "datadate" FYEAR = "fyearq" FQTR = "fqtr" # # vec : a vector of values # stockFact.df : a compustat data frame that includes a date column # return: vec as a zoo object with dates from stockFact.df as the zoo index. # addDate = function(vec, stockFact.df) { dates = as.Date(strptime(as.vector(as.character(stockFact.df[,DATE])), format="%Y-%m-%d", tz="EST")) vec.z = as.zoo(vec) index(vec.z) = dates return(vec.z) } # # Reading Yahoo data can be tricky. The stock may not exist (for example, the ticker may be some custom # Compustat ticker that doesn't exist on Yahoo). Or the stock may exist, but not at that date. # getStockInfo = function(ticker, date) { close = NA date = as.Date(date) start = date end = date url <- paste("http://chart.yahoo.com/table.csv?s=", ticker, format(start, "&a=%m&b=%d&c=%Y"), format(end, "&d=%m&e=%d&f=%Y"), "&g=d&q=q&y=0&z=", ticker, "&x=.csv", sep = "") page = getURL(url) readURL = TRUE ix = grep(pattern="404 Not Found", x = page) if (length(ix) == 0) { # we could still have a URL like "Date,Open,High,Low,Close,Volume,Adj Close\n" # so check for a number as would be found in # "Date,Open,High,Low,Close,Volume,Adj Close\n2001-07-30,14.50,14.78,14.50,14.70,3654300,12.61\n" readURL = any(grepl("[[:digit:]]",page)) if (readURL) { # We found something with numbers. Now check that the date is right table = read.csv(textConnection(page)) dates = as.Date(as.vector(table[,"Date"])) ix = which(dates == date) if (length(ix) > 0) { # the date is right close = as.numeric(table[ix,"Close"]) } else { readURL = FALSE } } } else { readURL = FALSE } rslt = list(readOK = readURL, close = close) return(rslt) } # getStockInfo # # Given a time series of stock prices (a zoo time series, with a date index), fill in the closest # close price for missing quotes by referencing finance.yahoo.com. # fixSharePriceHoles = function(sharePrice.z, ticker) { newPrices.z = sharePrice.z ix = which(is.na(sharePrice.z)) if (length(ix) > 0) { for (i in ix) { date = index(sharePrice.z[i]) # Find the nearest trading day (this appears unnecessary) while(! isBizday(x = as.timeDate(date), holidays=holidayNYSE(as.numeric(format(date, "%Y"))))) { date = date - 1 } rslt = getStockInfo(ticker, date) sharePrice.z[i] = rslt$close } # for zooIx = index(sharePrice.z) v = coredata(sharePrice.z) v_approx = na.approx(v, maxgap=4, na.rm=FALSE) v_fill = na.fill(v_approx, fill="extend") newPrices.z = zoo(v_fill) index(newPrices.z) = zooIx } # if return(newPrices.z) } # fixSharePriceHoles # # There are cases where Compustat is missing a share price value. This code will attempt to # fill the value in from a finance.yahoo.com lookup. If the ticker is not available on Yahoo # (perhaps because the company is out of business), interpolate the value if possible. # fixSharePrice = function(factors.df) { stocks = sort(unique(as.vector(factors.df[,TICKER]))) for (i in 1:length(stocks)) { stock = stocks[i] stockIx = which(factors.df[,TICKER] == stock) stockFactors.df = factors.df[stockIx,] # Add close prices for missing close prices rawPrices = stockFactors.df[, SHARE_PRICE] if (any(is.na(rawPrices))) { numNA = length(which(is.na(rawPrices))) fracNA = numNA / length(rawPrices) if (fracNA < 0.2) { rawPrices.z = addDate(rawPrices, stockFactors.df) prices.z = fixSharePriceHoles(rawPrices.z, stock) factors.df[stockIx, SHARE_PRICE] = data.frame(prices.z) } } } return(factors.df) } # fixSharePrice # # Replace NA values with interpolated values, if the "gap" (the run of NA values) is 4 or less. # Note that the interploation has to be done for a stock region, so it has to be performed one # stock at a time. # # There are some cases where values are critical. For example, there must be a value for # the number of shares outstanding or the market and enterprise values can't be calculated. # Similarly, there must be a close price. If these values can't be interpolated and there # are too many missing, then that stock should be removed from the data set. In this case # the maxFracAllowed should be set (perhaps to a value like 0.1) and remove should be set # to TRUE. # filterOutBadFactor = function(factors.df, factorLabel, maxFracAllowed = 1, extend = FALSE, remove=FALSE) { ix = which(is.na(factors.df[,factorLabel])) factorNA.df = factors.df[ix,] # stocks: Stocks that have factors with NA values stocks = as.vector(unique(factorNA.df[,TICKER])) ixBlock = c() for (stock in stocks) { ixStock = which(factors.df[,TICKER] == stock) facVal = factors.df[ixStock,factorLabel] if (sum(is.na(facVal)) < (length(facVal)-2)) { facVal = na.approx(facVal, maxgap = 4, na.rm = FALSE) if (extend) { facVal = na.fill(facVal, fill="extend", maxgap=4, na.rm=F) } factors.df[ixStock,factorLabel] = facVal } if (remove) { numNA = sum(is.na(facVal)) percnt = numNA / length(facVal) if (percnt > maxFracAllowed) { ixBlock = c(ixBlock, ixStock) } } } # for if (length(ixBlock) > 0) { factorsCommonFilt.df = factors.df[-ixBlock,] } else { factorsCommonFilt.df = factors.df } return(factorsCommonFilt.df) } # # fix out of order dates. This exists for the stock "MYL" # and perhaps others. # fixDateOrder = function(factors.df) { newFactors.df = factors.df stocks = as.vector(factors.df[,TICKER]) for (sym in stocks) { symIx = which(newFactors.df[,TICKER] == sym) block = newFactors.df[symIx,] dates = as.Date(as.vector(block[,DATE])) ordIx = order(dates) sequential = seq(from=1, to=length(dates)) if (! all(ordIx == sequential)) { newBlock = block[ordIx,] newFactors.df[symIx,] = newBlock } } return(newFactors.df) } # # There are cases where the date in the "datadate" column does not match the # date in the fyearq fqtr columns. The fyearq and fqtr columns do appear to have # the correct dates. This code fixes the dates so that they do not screw up # later processing when the data is placed in a zoo (date indexed) data structure. # fixDates = function( factors.df ) { quarters = c("03-31", "06-30", "09-30", "12-31") qtrIx = factors.df[,FQTR] fyear = factors.df[,FYEAR] qtr = quarters[qtrIx] dates = paste(fyear, qtr, sep="-") factors.df[,DATE] = dates newFactors.df = fixDateOrder( factors.df ) return(newFactors.df) } # # Build a table that shows the percentage of NA values for each corporate factor. # buildPrcntNATable = function(factors.df) { metaCols = c("gvkey", "datadate", "fyearq", "fqtr", "tic", "conm", "DATAFQTR", "GGROUP", "GSECTOR" ) facCols = colnames(factors.df) metIx = which(facCols %in% metaCols) dataCols = facCols[-metIx] # # calculation the ratio of NA values to defined values in the data # percntNA = apply(factors.df[,dataCols], 2, FUN=function(v) { NAs = is.na(v) s = sum(NAs) frac = s / length(v) return(frac) }) rowNames = names(percntNA) ix = order(percntNA, decreasing=T) percntNASort = percntNA[ix] names(percntNASort) = rowNames[ix] percntNASort = round(as.matrix(percntNASort, nrow=length(percntNASort), ncol=1) * 100, 2) colnames(percntNASort) = c("% missing values") return(percntNASort) } # # Find stocks that are in the constituent index data frame but are not in the close prices # and return the stockks as a vector to ticker symbols. These can be used in the WRDS database # to download some of the missing data. # findMissingStocks = function(sp500Ix.df, sp500Close.df) { indexCusip = as.vector(sp500Ix.df[,"co_cusip"]) cusipLen = nchar(indexCusip) indexCusipTrunc = substr(indexCusip, start=1, stop=(cusipLen-1)) closeCusip = as.vector(sp500Close.df[,"CUSIP"]) indexUnique = unique(indexCusipTrunc) closeUnique = unique(closeCusip) missingIx = which(! indexUnique %in% closeUnique) missingIndexCusip = indexUnique[missingIx] indexIx = which(indexCusipTrunc %in% missingIndexCusip) indexBlock = sp500Ix.df[indexIx,] missingStocks = unique(as.vector(indexBlock[,"co_tic"])) return(missingStocks) } # # Using the S&P 500 index CUSIP, find the matching close price CUSIP and fill in the index symbol # for the close price symbol. This make the ticker symbols consistent. # # The CUSIP has an extra digit. For example, in the index Honeywell's CUSIP is 438516106. In the # Compustat close price data set the CUSIP is 43851610, one digit less. To get the close price CUSIP # to match the index CUSIP, the longer index CUSIP is truncated by one digit. # replaceNames = function(sp500ix.df, sp500Close.df) { indexCusip = as.vector(sp500Ix.df[,"co_cusip"]) cusipLen = nchar(indexCusip) indexCusipTrunc = substr(indexCusip, start=1, stop=(cusipLen-1)) indexSym = as.vector(sp500Ix.df[,"co_tic"]) indexTbl = unique(cbind(indexCusipTrunc, indexSym)) colnames(indexTbl) = c("CUSIP", "sym") closeCUSIP = as.vector(sp500Close.df[,"CUSIP"]) closeSyms = as.vector(sp500Close.df[,"TICKER"]) for (i in 1:nrow(indexTbl)) { ixCUSIP = as.character(indexTbl[i,"CUSIP"]) ixSym = as.character(indexTbl[i,"sym"]) ix = which(closeCUSIP == ixCUSIP) if (length(ix) > 0) { closeSyms[ix] = ixSym } } sp500CloseNew = cbind(sp500Close.df[,"date"], closeSyms, sp500Close.df[,c("COMNAM", "CUSIP", "PRC", "sprtrn")]) colnames(sp500CloseNew) = c("date", "TICKER", "COMNAM", "CUSIP", "PRC", "sprtrn") return(sp500CloseNew) } # Root path to the directory that contains the Compustat corporate # factors. dataDir = "/home/iank/Documents/thesis_project/data" compustatFactors = "sp500_compustat.csv" fixedFactors = "sp500_compustat_fixed.csv" sp500Close = "sp500_close_prices.csv" sp500CloseQtr = "sp500_close_qtr.csv" sp500CloseMonth = "sp500_close_mon.csv" sp500Index = "s_and_p_500.csv" factorPath = paste(dataDir, compustatFactors, sep="/") factorOutPath = paste(dataDir, fixedFactors, sep="/") closePath = paste(dataDir, sp500Close, sep="/") closeOut = paste(dataDir, sp500CloseQtr, sep="/") closeMon = paste(dataDir, sp500CloseMonth, sep="/") sp500Path = paste(dataDir, sp500Index, sep="/") factorsRaw.df = read.csv(file=factorPath) # These are quarterly corporate factors that are used to synthesize portfolio # factors where there are less than five percent 6% of values which are NA. # Because there are only a few missing values we can try to use interpolation to # fill in some values. # fixList = c(COMMON_EQT, "CHEQ", "COGSQ", "DLCQ", "DLTTQ", "IBCOMQ", "IBQ", "CHEQ", "TXTQ", "PIQ", INCOME, LIABILITIES) factors1.df = fixDates( factorsRaw.df ) factors2.df = fixSharePrice( factors1.df ) factors3.df = filterOutBadFactor(factors2.df, NUM_SHARES, maxFracAllowed = 0.1, extend = TRUE, remove = TRUE) fixed.df = factors3.df for (fact in fixList) { fixed.df = filterOutBadFactor(fixed.df, fact) } factorsFilt.df = filterOutBadFactor(fixed.df, SALES, extend=TRUE) naTable = buildPrcntNATable(factorsFilt.df) par(mfrow=c(1,1)) barplot(t(naTable), beside=T, las=2, ylab="Percent of values that are NA", main="NA/Value Distribution") par(mfrow=c(1,1)) write.csv(x = factorsFilt.df, file=factorOutPath, quote=F, row.names=F) # sort table alphabetically names = rownames(naTable) namesSrt = order(names) naNameSrt = matrix(naTable[namesSrt,]) rownames(naNameSrt) = names[namesSrt] txtq = factorsFilt.df[,"TXTQ"] piq = factorsFilt.df[,"PIQ"] trate = txtq / piq # get rid of NA values trate[is.na(trate)] = 0 # There should not be a tax rate over 0.4 or less than zero trate[ trate < 0] = 0 trate[ trate > 0.4] = 0.4 # par(mfrow=c(1,1)) # hist(trate, xlab="Tax Rate", main="Tax Rate Distribution", col="grey") # par(mfrow=c(1,1)) # # Now do the S&P 500 close prices. The close prices are obtained from the Compustat monthly update # (e.g., they are monthly close prices). The ticker symbols are not consistent between the S&P 500 # index constituents and the close prices. So a few things must be done to prepare the close prices: # # 1. Using the CUSIP numbers, fill in the ticker symbol so that there is a consistent symbol. # 2. Regularize the quarterly close dates. # sp500Ix.df = read.csv(file=sp500Path) sp500Close.df = read.csv(file=closePath) # fill in ticker symbols from the index in the close price data set to make the symbol # reference consistent. sp500CloseNew.df = replaceNames(sp500ix.df, sp500Close.df) dates = as.Date(strptime(as.vector(sp500CloseNew.df[,"date"]), format="%m/%d/%Y", tz="EST")) sp500CloseNew.df[,"date"] = dates zeroLenTicker = which(nchar(as.vector(sp500CloseNew.df[,"TICKER"])) == 0) sp500CloseNew2.df = sp500CloseNew.df if (length(zeroLenTicker) > 0) { sp500CloseNew2.df = sp500CloseNew.df[-zeroLenTicker,] } saveCols = colnames(sp500CloseNew2.df) colnames(sp500CloseNew2.df) = c("datadate", "tic", "conm", "CUSIP", "PRCCQ", saveCols[length(saveCols)]) sp500CloseFixed.df = fixSharePrice( sp500CloseNew2.df ) # There should be no share prices that are NA at this point. If there are, remove the stocks from the data # set since it's not been possible to correct them. naIx = which(is.na(sp500CloseFixed.df[,"PRCCQ"])) naSyms = as.vector(sp500CloseFixed.df[naIx,"tic"]) runLen = rle(naSyms) removeSyms = runLen$values[runLen$lengths > 1] removeIx = which(as.vector(sp500CloseFixed.df[,"tic"]) %in% removeSyms) sp500CloseFilt.df = sp500CloseFixed.df if (length(removeIx) > 0) { sp500CloseFilt.df = sp500CloseFixed.df[-removeIx,] } colnames(sp500CloseFilt.df) = saveCols sp500CloseFilt.df[is.na(sp500CloseFilt.df)] = 0 dates = sp500CloseFilt.df[,"date"] months = as.numeric(format(dates, format="%m")) ix = which(months %in% c(3, 6, 9, 12)) cols = c("date", "TICKER", "COMNAM", "CUSIP", "PRC") qtr = dates[ix] q1Ix = which(as.numeric(format(qtr, format="%m")) == 3) q1Yr = format(qtr[q1Ix], format="%Y") qtr[q1Ix] = as.Date(paste(q1Yr, "03-31", sep="-")) q2Ix = which(as.numeric(format(qtr, format="%m")) == 6) q2Yr = format(qtr[q2Ix], format="%Y") qtr[q2Ix] = as.Date(paste(q2Yr, "06-30", sep="-")) q3Ix = which(as.numeric(format(qtr, format="%m")) == 9) q3Yr = format(qtr[q3Ix], format="%Y") qtr[q3Ix] = as.Date(paste(q3Yr, "09-30", sep="-")) q4Ix = which(as.numeric(format(qtr, format="%m")) == 12) q4Yr = format(qtr[q4Ix], format="%Y") qtr[q4Ix] = as.Date(paste(q4Yr, "12-31", sep="-")) qtrDates = as.Date(strptime(qtr, format="%Y-%m-%d", tz="EST")) sp500CloseFilt.df[ix,"date"] = qtrDates sp500CloseQtr.df = sp500CloseFilt.df[ix, cols] write.csv(x = sp500CloseQtr.df, file=closeOut, quote=F, row.names=F) write.csv(x = sp500CloseFilt.df, file=closeMon, quote=F, row.names=F)