Winsorizing in R to match SAS

Here is a SAS macro for winsorizing data that seems to be commonly used in accounting and finance research. This code comes from replication materials provided by Fang, Huang and Karpoff (“FHK”) here. Below I provide an R function that produces equivalent results.

%MACRO winsorize(var, var_out, by_var, left, right, input, output);
	%LET var_number = 1; 
	%LET var&var_number = %QSCAN(&var, &var_number, %STR( ));
	%DO %WHILE (&&var&var_number NE);
		%LET var_number = %EVAL(&var_number + 1);
		%LET var&var_number = %QSCAN(&var, &var_number, %STR( ));
	%END;
	%LET var_number = %EVAL(&var_number - 1);

  %IF &left = AND &right = %THEN %DO;
    %PUT ERROR: Both top and bottom percentiles to winsorize are missing, no winsorize is needed!!!;
    %GOTO exit;
  %END;

	%IF &var_out = AUTONAME %THEN %DO;
		%DO i = 1 %TO &var_number;
			%LET &&var_out_&i = &&var&i.._w_&left._&right;
		%END;
	%END;

	%ELSE %DO; 
		%LET var_out_number = 1;
		%LET var_out&var_out_number = %QSCAN(&var_out, &var_out_number, %STR( ));
	  	%DO %WHILE (&&var_out&var_out_number NE);
			%LET var_out_number = %EVAL(&var_out_number + 1);
	 		%LET var_out&var_out_number = %QSCAN(&var_out, &var_out_number, %STR( ));
		%END;
		%LET var_out_number = %EVAL(&var_out_number - 1);
		
		%IF &var_number ^= &var_out_number %THEN %DO;
			%PUT ERROR: The Number of variables to winsorize does not equal the number of output variables!!!;
			%PUT variables to be winsorize:;
			%DO i = 1 %TO &var_number;	
				%PUT    &i: &&var&i;		
			%END;
			%PUT variable names after winsorize:;
			%DO i = 1 %TO &var_out_number; 
				%PUT    &i: &&var_out&i; 
			%END;		
				%GOTO exit;
		%END;
	%END;

	%IF &by_var =  %THEN %DO;
		%LET by_var = just_for_merge;
		DATA _winsorize_temp;
			SET &input;
			just_for_merge = 1;
		RUN;
	%END;
	%ELSE %DO;
		DATA _winsorize_temp;
			SET &input;
		RUN;
	%END;
	PROC SORT data = _winsorize_temp;
		BY &by_var;
	RUN;
	
	PROC UNIVARIATE DATA = _winsorize_temp NOPRINT;
		BY &by_var;
		VAR %DO i = 1 %TO &var_number; &&var&i %END;;
		OUTPUT OUT = winsorized PCTLPTS = %IF &left ^= %THEN %DO; &left %END; %IF &right ^= %THEN %DO; &right %END;
			PCTLPRE = %DO i=1 %TO &var_number; var_&i %END;
			PCTLNAME = %IF &left ^= %THEN %DO;  _pct_&left %END; %IF &right ^= %THEN %DO;  _pct_&right %END;;
	RUN;

	DATA &output;
		MERGE _winsorize_temp winsorized;
		BY &by_var;
		%DO i=1 %TO &var_number;
			IF &&var&i ^= . THEN DO;
				%IF &left ^= AND &right^= %THEN %DO;
					&&var_out&i = MIN(var_&i._pct_&right, MAX(var_&i._pct_&left, &&var&i));
					DROP var_&i._pct_&left var_&i._pct_&right;
				%END;
		        %ELSE %IF &left ^= %THEN %DO;
	  	        			&&var_out&i = MAX(var_&i._pct_&left, &&var&i);
							DROP var_&i._pct_&left;
	      	  	%END;
				%ELSE %IF &right ^= %THEN %DO;
  	        		&&var_out&i = MIN(var_&i._pct_&right, &&var&i);
					DROP var_&i._pct_&right;
				%END;
			END;
		%END;
		%IF &by_var = just_for_merge %THEN DROP just_for_merge;;
	RUN;
	%exit:
%MEND winsorize;

%winsorize(
		var = at mtob leverage roa da1_adj da2_adj da3_adj da4_adj da_lr1_adj da_lr2_adj da_lr3_adj da_lr4_adj,
		var_out = at mtob leverage roa da1_adj da2_adj da3_adj da4_adj da_lr1_adj da_lr2_adj da_lr3_adj da_lr4_adj,
		by_var = fyear, 
		left = 1, 
		right = 99, 
		input = sho_accruals2,
		output = sho_accruals2);

I ran the SAS code from FHK and dumped the pre- and post-winsorization data into Stata files.

Below is code that takes the pre-winsorization data, winsorizes it, then compares the result with the post-winsorization data from SAS. Here I have hard-coded the quantiles at which the data are winsorized; it would be easy to tweak the function to make this variable. For sure, the R function is much easier to follow. A critical element to getting the same results is the argument type = 2 (get the details here).

setwd("/Users/igow/git/fhk_replication")
library(haven) 
library(dplyr, warn.conflicts = FALSE)

sho_accruals2_post_wins <- read_dta("data/sho_accruals2_post_wins.dta")
sho_accruals2_pre_wins <- read_dta("data/sho_accruals2_pre_wins.dta")

win01 <- function(x) {
    cuts <- quantile(x, probs = c(0.01, 0.99), 
                     type = 2, na.rm = TRUE)
    x[x < cuts[1]] <- cuts[1]
    x[x > cuts[2]] <- cuts[2]
    x
}

win_vars <- c("at", "mtob", "leverage", "roa", "da1_adj", "da2_adj", 
                 "da3_adj", "da4_adj", "da_lr1_adj", "da_lr2_adj", 
                 "da_lr3_adj", "da_lr4_adj")

sho_accruals2_new <-
    sho_accruals2_pre_wins %>%
    group_by(fyear) %>%
    mutate_at(all_of(win_vars), win01) %>%
    ungroup()

merged <-
    sho_accruals2_new %>%
    inner_join(sho_accruals2_post_wins, 
               by = c("gvkey", "fyear"),
               suffix = c("_r", "_sas"))

tol <- 1e-8

merged %>% 
    mutate(diff = abs(da_lr1_adj_r - da_lr1_adj_sas)) %>%
    filter(diff > tol) %>% 
    select(gvkey, fyear, da_lr1_adj_r, da_lr1_adj_sas, diff) %>%
    arrange(desc(diff))
#> # A tibble: 0 x 5
#> # … with 5 variables: gvkey <chr>, fyear <dbl>, da_lr1_adj_r <dbl>,
#> #   da_lr1_adj_sas <dbl>, diff <dbl>

Created on 2020-10-09 by the reprex package (v0.3.0)

view raw winsor.md hosted with ❤ by GitHub
This entry was posted in Uncategorized. Bookmark the permalink.

Leave a comment