Getting Fama-French industry data into R

Here’s a little program to pull Fama-French industry classifications from Ken French’s website, process them and output them to a table. The tables (e.g., 48-industry, 17-industry) don’t seem to be in exactly the same format, so some tweaking may be required for the table of interest to you. Note that I include a little code at the end that I use to put the data in my PostgreSQL database; this won’t work for you, so you can delete that part.

I subsequently added code to pull in Fama-French factor data here.

########################################################################
# Small program to fetch and organize Fama-French industry data.
# The idea is to make a table that could be used for SQL merges.
########################################################################

# The URL for the data.
ff.url <- paste("http://mba.tuck.dartmouth.edu", 
								"pages/faculty/ken.french/ftp",
								"Industry_Definitions.zip", sep="/")

# Download the data and unzip it
f <- tempfile() 
download.file(ff.url, f) 
file.list <- unzip(f,list=TRUE)

trim <- function(string) {
	# Remove leading and trailing spaces from a string
	ifelse(grepl("^\\s*$", string, perl=TRUE),"", 
				gsub("^\\s*(.*?)\\s*$","\\1",string,perl=TRUE))
}

# Function to do the heavy lifting
extract_ff_ind_data <- function (file) {

  # Read in the data in a plain form
  ff_ind <- as.vector(read.delim(unzip(f, files=file), header=FALSE, 
                                 stringsAsFactors=FALSE))
  
  # The first 10 characters of each line are the industry data, but only the first
  # row of the data for the SIC codes in an industry are filled in;
  # so fill in the rest.
  ind_num <- trim(substr(ff_ind[,1],1,10))
  for (i in 2:length(ind_num)) { 
    if (ind_num[i]=="") ind_num[i] <- ind_num[i-1]
  }
  
  # The rest of each line is either detail on an industry or details about the
  # range of SIC codes that fit in each industry with a label for each group
  # of SIC codes.
  sic_detail <- trim(substr(ff_ind[,1],11,100))

  # If the line doesn't start with a number, it's an industry description
  is.desc <- grepl("^\\D",sic_detail,perl=TRUE)
 
  # Pull out information from rows about industries
  regex.ind <- "^(\\d+)\\s+(\\w+).*$"
  ind_num <- gsub(regex.ind,"\\1",ind_num,perl=TRUE)
  ind_abbrev <- gsub(regex.ind,"\\2",ind_num[is.desc],perl=TRUE)
  ind_list <- data.frame(ind_num=ind_num[is.desc],ind_abbrev, 
                         ind_desc=sic_detail[is.desc])
  
  # Pull out information rows about ranges of SIC codes
  regex.sic <- "^(\\d+)-(\\d+)\\s*(.*)$"
  ind_num <- ind_num[!is.desc]
  sic_detail <- sic_detail[!is.desc]
  sic_low  <- as.integer(gsub(regex.sic,"\\1",sic_detail,perl=TRUE))
  sic_high <- as.integer(gsub(regex.sic,"\\2",sic_detail,perl=TRUE))
  sic_desc <- gsub(regex.sic,"\\3",sic_detail,perl=TRUE)
  sic_list <- data.frame(ind_num, sic_low, sic_high, sic_desc)
  
  return(merge(ind_list,sic_list,by="ind_num",all=TRUE))
}

# Extract the data of interest
ind_48_table <- extract_ff_ind_data("Siccodes48.txt")
# ind_49_table <- extract_ff_ind_data("Siccodes49.txt")
# ind_12_table <- extract_ff_ind_data("Siccodes12.txt")
# ind_17_table <- extract_ff_ind_data("Siccodes17.txt")

# Load the data into my database
library(RPostgreSQL)
drv <- dbDriver("PostgreSQL")
pg <- dbConnect(drv, dbname = "crsp")
rs <- dbWriteTable(pg,c("ff","ind_48"),ind_48_table,overwrite=TRUE)

Once I got the data into PostgreSQL, it was a simple matter of joining up with my existing data:

SELECT a.company_id, a.fyear, a.sic, b.ind_desc, b.ind_num
FROM 
  execdata_plus AS a
  LEFT JOIN ff.ind_48 AS b
  ON a.sic::integer BETWEEN b.sic_low AND b.sic_high;
Advertisements
This entry was posted in R and tagged , , . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s