Perl script to load a CSV file exported from SAS into PostgreSQL

To run this, put (say) "dsi.csv.gz" and "dsi_schema.csv" in the working directory and then call "csv_to_pg.pl dsi", which runs the following:

#!/usr/bin/perl
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use Text::CSV_XS;
use DBI;
# Time::localtime provides an object-style localtime, used below for the
# progress timestamps printed around the bulk import.
use Time::Local;
use Time::localtime;
#use warnings;
# Get the table name from the command line; the data file is expected to
# be "<table>.csv.gz" in the working directory.
# NOTE(review): was "@ARGV[0]" -- a one-element array slice that happens
# to work but is the wrong idiom and warns under "use warnings".
$table_name = $ARGV[0];
die "Usage: $0 table_name\n" unless defined $table_name;
print "$table_name\n";
$gz_file = "./$table_name.csv.gz";


# Set up database connection
$dbname = "crsp";

# NOTE(review): credentials are hard-coded here and in the psql command
# below; consider ~/.pgpass or environment variables instead.
my $dbh = DBI->connect("dbi:Pg:dbname=$dbname", 'username', 'password')  
  or die "Cannot connect: " . $DBI::errstr;

# Get table information from the "schema" file: a CSV with one row per
# column giving (field_name, sql_type).
my $schema_csv = Text::CSV_XS->new ({ binary => 1 });
$schema = "$table_name" . "_schema.csv";
$db_schema = "crsp";

# All tables are created/loaded in the "crsp" schema.
$dbh->do("SET search_path TO $db_schema");

# Map of field name -> declared SQL type, read from the schema file.
my %hrec;

# NOTE(review): the open/while line in the original text was garbled
# (apparently by HTML extraction); reconstructed here as a standard
# three-arg open followed by a Text::CSV_XS getline loop.
open my $io, "<", $schema or die "$schema: $!";
while (my $row = $schema_csv->getline($io)) {
  my @fields = @$row;
  my $field = $fields[0];
  # "do" is a reserved word in PostgreSQL, so rename that column "do_"
  $field =~ s/^do$/do_/i;
  my $type = $fields[1];
  $hrec{$field} = $type;
#  print "$field, $type\n";
}

# Clean up, etc.
$schema_csv->eof or $schema_csv->error_diag;
close $io or die "$schema: $!";

# Read the header row of the compressed data file; its entries are the
# column names for the table we are about to create.
my $csv = Text::CSV_XS->new ({ binary => 1 });
my $fh = IO::Uncompress::Gunzip->new($gz_file)
        or die "IO::Uncompress::Gunzip failed: $GunzipError\n";

$row = $csv->getline($fh);

# Flags consulted later when deciding which indexes to build.
$has_date = 0;
$has_gvkey = 0;

# Build one "name type" fragment per column, then join them with commas
# into the CREATE TABLE statement.
my @col_defs;
foreach my $field (@$row) {

  # Flag some key fields for indexing. Probably should use the schema file
  # to indicate what fields should be used for indexing. Not sure if
  # WRDS SAS file contains useful information in this regard.
  if ($field =~ /^PERMNO$/i) { $has_gvkey = 1; }
  if ($field =~ /^DATE$/i)   { $has_date = 1; }
  $field =~ s/^do$/do_/i;

  # Dates are stored as integers initially, then converted to
  # dates later on (see below).
  my $col_type = ($hrec{$field} eq "date") ? "integer" : $hrec{$field};

  push @col_defs, $field . " " . $col_type;
}
$sql = "CREATE TABLE $table_name (" . join(", ", @col_defs) . ");";
print "$sql\n";

# NOTE(review): this section was wrapped in "if(TRUE)" -- TRUE is a
# bareword that is only truthy by accident without "use strict" (it
# evaluates as the string "TRUE"); the always-true wrapper is removed.

# Clean up, etc.
$csv->eof or $csv->error_diag;
close $fh or die "$gz_file: $!";

# Drop the table if it exists already, then create the new table
# using field names taken from the first row
$dbh->do("DROP TABLE IF EXISTS $table_name;");
$dbh->do($sql);

# Use PostgreSQL's COPY function to get data into the database.
# Time::localtime returns a Time::tm object; use its documented
# hour/min/sec accessors rather than slicing its internal array.
$time = localtime;
printf "Beginning file import at %d:%02d:%02d\n", $time->hour, $time->min, $time->sec;
# NOTE(review): $gz_file/$table_name are interpolated into a shell
# command; fine for trusted table names, but validate the argument if
# this script ever takes untrusted input.
$cmd = "gunzip -c $gz_file | psql -U username ";
# $cmd .= "-p test ";
$cmd .= "-d $dbname -c \"COPY $db_schema.$table_name FROM STDIN CSV HEADER\"";

print "$cmd\n";
$result = system($cmd);
print "Result of system command: $result\n";

$time = localtime;
printf "Completed file import at %d:%02d:%02d\n", $time->hour, $time->min, $time->sec;

# Convert date fields from integers to dates. The offset reflects the
# SAS date epoch of 1960-01-01 (dates arrive as days since that date).
foreach $key (keys %hrec) { 
  $value = $hrec{$key};

  if ($value eq 'date') {
    print "Converting $key from integer to date\n";
    $sql = "ALTER TABLE $table_name ALTER $key TYPE date ".
      "USING (date '1960-01-01' +  $key);";
    $dbh->do($sql); 
  } 
}

# Create indices for performance.
# ccmxpf_linktable: an index on lpermno plus a "main" index on gvkey,
# which the table is then clustered on.
if ($table_name eq "ccmxpf_linktable") {
  my %col_for = (lpermno_idx => "lpermno", main_idx => "gvkey");
  foreach my $suffix ("lpermno_idx", "main_idx") {
    $sql = "CREATE INDEX ${table_name}_$suffix ON $table_name ($col_for{$suffix});";
    print "$sql\n";
    $dbh->do($sql);
  }
  $dbh->do("CLUSTER $table_name USING ${table_name}_main_idx");
  $index = 1;
}

# dsf and erdport1: composite primary key on (permno, date), cluster on
# it, and add a secondary index on date alone.
if ($table_name eq "dsf" or $table_name eq "erdport1") {
  $sql = "ALTER TABLE $table_name ADD PRIMARY KEY (permno, date);";
  print "$sql\n";
  $dbh->do($sql);

  # Physically order the table by its primary key
  $dbh->do("CLUSTER $table_name USING ${table_name}_pkey");

  $sql = "CREATE INDEX ${table_name}_date_idx ON $table_name (date);";
  print "$sql\n";
  $dbh->do($sql);

  $index = 1;
}

# Tables keyed by a single column: add the primary key and cluster on
# it. (dsi is keyed by date, company by permno.)
my %pkey_col = (dsi => "date", company => "permno");
if (exists $pkey_col{$table_name}) {
  $sql = "ALTER TABLE $table_name ADD PRIMARY KEY ($pkey_col{$table_name});";
  print "$sql\n";
  $dbh->do($sql);
  $dbh->do("CLUSTER $table_name USING ${table_name}_pkey");
  $index = 1;
}

# Choose columns for a generic "main" index based on which key columns
# were found in the header row.
# NOTE(review): the original used the bitwise "&" operator; with 0/1
# flags it happens to give the right answer, but "&&" is the intended
# logical operator.
if ($has_date == 1 && $has_gvkey == 1) {
  $index_on = "(PERMNO, DATE)";
  $index += 1;
} elsif ($has_date == 1) {
  $index_on = "(DATE)";
  $index += 1;
} elsif ($has_gvkey == 1) {
  $index_on = "(PERMNO)";
  $index += 1;
}

# Build the generic index only when key columns were found ($index_on
# set) AND no table-specific index was created above: those branches set
# $index to 1 first, so the += 1 above yields 2 and we skip here.
# NOTE(review): the "defined $index_on" guard is new -- previously a
# table that took a table-specific branch but had no PERMNO/DATE columns
# (e.g. ccmxpf_linktable) reached this point with $index == 1 and an
# undefined $index_on, dropping its just-built main index and then
# issuing syntactically broken CREATE INDEX SQL.
if ($index == 1 && defined $index_on) {

  $sql = "DROP INDEX IF EXISTS ${table_name}_main_idx;";
  print "$sql\n";
  $dbh->do($sql);

  $sql = "CREATE INDEX ${table_name}_main_idx ON $table_name $index_on;";
  print "$sql\n";
  $dbh->do($sql);
  $dbh->do("CLUSTER $table_name USING ${table_name}_main_idx");
}

$dbh->disconnect();

Advertisements
This entry was posted in Uncategorized. Bookmark the permalink.

One Response to Perl script to load in CSV file from SAS file to PostgreSQL

  1. Pingback: Getting WRDS data into MySQL or PostgreSQL | iangow

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s