#
# Calculate the distribution of factor values: for every factor column, the
# fraction of rows that hold a non-zero value (missing values count as zero).
# Factors are then grouped into 5%-wide coverage bins and printed, first as
# pipe-delimited 8-column tables and then as HTML tables for the bins whose
# lower bound is at least 60%.
#

# Identifier / metadata columns that are not factors.
omitCols <- c("gvkey", "datadate", "fyearq", "fqtr", "tic", "conm",
              "DATAFQTR", "NAICS")
dataDir <- "../data"
allFactorFile <- "all_factors.csv"
allFactorPath <- file.path(dataDir, allFactorFile)

factorData <- read.csv(file = allFactorPath)
# Set NA values to zero so that a missing value counts as "not present".
factorData[is.na(factorData)] <- 0

# Drop the metadata columns. A logical mask is used instead of negative
# indices because `df[, -integer(0)]` would select *no* columns when none of
# the omitCols exist in the file; drop = FALSE keeps a data frame even if a
# single factor column remains.
colNames <- colnames(factorData)
factorDataFilt <- factorData[, !(colNames %in% omitCols), drop = FALSE]

# Fraction of non-zero entries per column. Iterating column by column (rather
# than apply(), which converts the data frame to a matrix) keeps each column's
# own type, so a stray text column cannot turn the numeric columns into
# strings before the comparison with 0.
dataPrcnt <- vapply(factorDataFilt, function(v) mean(v != 0), numeric(1))
dataPrcntSrt <- round(sort(dataPrcnt, decreasing = TRUE), 4)

# Bin boundaries run downwards: 1.00, 0.95, ..., 0.05, 0.00. Bin k covers
# values v with bins[k + 1] <= v < bins[k]; the first bin also takes v == 1.
bins <- seq(from = 1.0, to = 0, by = -0.05)
nBins <- length(bins) - 1L

# findInterval() needs ascending break points, so the lower bounds are
# reversed and the resulting interval number is flipped back to the
# descending bin index.
lowerBoundsAsc <- rev(bins[-1])
binIx <- nBins - findInterval(dataPrcntSrt, lowerBoundsAsc) + 1L
factorNames <- as.character(names(dataPrcntSrt))
factorsByBin <- unname(split(factorNames,
                             factor(binIx, levels = seq_len(nBins))))

# One entry per bin: its upper bound (start), lower bound (end) and the names
# of the factors that fall into it, ordered by decreasing coverage.
factorBins <- lapply(seq_len(nBins), function(k) {
  list(start = bins[k], end = bins[k + 1L], factors = factorsByBin[[k]])
})

# Header line for a bin, e.g. "[100.00:95.00]".
binLabel <- function(bin) {
  sprintf("[%2.2f:%2.2f]", bin$start * 100, bin$end * 100)
}

#
# Print the factors out in 8 column tables: |f1|f2|...|f8|
# A bin without factors prints only its header.
#
for (bin in factorBins) {
  print(binLabel(bin), quote = FALSE)
  factors <- bin$factors
  rowIx <- ceiling(seq_along(factors) / 8)
  for (rowFactors in split(factors, rowIx)) {
    print(paste0("|", paste(rowFactors, collapse = "|"), "|"), quote = FALSE)
  }
}

#
# Print the bins with at least 60% coverage as HTML tables, one factor per
# table row.
# NOTE(review): the markup in this section was reconstructed. The strings in
# the file had been reduced to "" (and a detached "%s"), which made
# sprintf() drop the factor name entirely. Confirm the tags, and any
# attributes the original <table> tag carried, against the intended output.
#
for (bin in factorBins) {
  if (bin$end < 0.6) {
    break
  }
  print(binLabel(bin), quote = FALSE)
  factors <- bin$factors
  print("<table>", quote = FALSE)
  for (j in seq_along(factors)) {
    print("<tr>", quote = FALSE)
    print(sprintf("<td>%s</td>", factors[j]), quote = FALSE)
    print("</tr>", quote = FALSE)
  }
  print("</table>", quote = FALSE)
}