/*******************************************************************************
Description: This program runs a factor analysis to calculate the SEFI-2 index 
by DA using the 2011 Census.

Census variables included in SEFI-2 factor analysis:
1. Average household income 
2. Unemployment rate age 15+ 
3. Proportion of population age 15+ without high school graduation
4. Proportion of single parent households

Variables with missing values at the DA level were imputed with values at CSD 
level where possible.  Another imputation is done for First Nations Reserves 
using the weighted mean for North and South First Nations communities. 
*******************************************************************************/

******************************************;
title 'SEFI-2 Calculated from 2011 Census';
******************************************;

*** read in 2011 census data at DA level;
proc sort data = census2011 out = census (where=(geo_type='DA' and POPTOT ^= .) 
                 keep=geo_type geo_id geo_name POPTOT INC_PHH_AVEINC INC_PHH 
				      LF_UNEMRATE15 LF_POPTOT15 
					  FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM 
					  EDHS_WOHS15 EDHS_POP15);
   by geo_id;
run;

*** calculate derived variables;
data census;
   length dauid $8;
   set census;
   
   dauid = geo_id;
   
   *** set zero income to missing (and then impute);
   zero_income = 0;
   if INC_PHH_AVEINC = 0 then do;
      INC_PHH_AVEINC = .;
	  zero_income = 1;
   end;
   
   if FAMSTAT_TOTCENFAM > 0 then singleparent = FAMSTAT_TOTLPAR/FAMSTAT_TOTCENFAM;
   if EDHS_POP15 > 0 then nohigh = EDHS_WOHS15/EDHS_POP15;
   
   zero_ur = (LF_UNEMRATE15=0);
   zero_sp = (singleparent=0);
   zero_hs = (nohigh=0);  
         
   label POPTOT = 'Total Population'
         INC_PHH_AVEINC = 'Average Household Income ($)'
         LF_UNEMRATE15 = 'Unemployment Rate for Labour Force Population aged 15 years and older'
		 nohigh = 'Proportion of population 15 years and older without High School Graduation'
		 singleparent = 'Proportion of Lone Parent Families';
run;

proc means data = census mean n nmiss;
   var POPTOT INC_PHH_AVEINC INC_PHH 
       LF_UNEMRATE15 LF_POPTOT15 
       singleparent FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM
       nohigh EDHS_WOHS15 EDHS_POP15;
   title2 'Check for missing values on 2011 Census at DA level';
run;

proc print data = census;
   where LF_UNEMRATE15 = . | nohigh = . | singleparent = .;
   var dauid geo_name POPTOT INC_PHH_AVEINC INC_PHH 
       LF_UNEMRATE15 LF_POPTOT15 
       singleparent FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM 
	   nohigh EDHS_WOHS15 EDHS_POP15;
run; 

proc freq data = census;
   tables zero_income zero_ur zero_sp zero_hs;
run;

*** read in census data at CSD level;
proc sort data = census2011 out=census_csd (where=(geo_type='CSD' and POPTOT ^= .) 
                 keep=geo_type geo_id geo_name POPTOT 
				      INC_PHH_AVEINC INC_PHH 
					  LF_UNEMRATE15 LF_POPTOT15 
					  FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM
					  EDHS_WOHS15 EDHS_POP15);
   by geo_id;
run;

*** calculate derived variables;
data census_csd (rename=(INC_PHH_AVEINC=INC_PHH_AVEINC_csd LF_UNEMRATE15=LF_UNEMRATE15_csd));
   length csdid $7;
   set census_csd;
   
   csdid = geo_id;
   
   *** set zero income to missing (and then impute);
   zero_income_csd = 0;
   if INC_PHH_AVEINC = 0 then do;
      INC_PHH_AVEINC = .;
	  zero_income_csd = 1;
   end;
   
   if FAMSTAT_TOTCENFAM > 0 then singleparent_csd = FAMSTAT_TOTLPAR/FAMSTAT_TOTCENFAM;
   if EDHS_POP15 > 0 then nohigh_csd = EDHS_WOHS15/EDHS_POP15;
  
   label INC_PHH_AVEINC = 'Average Household Income ($)'
         LF_UNEMRATE15 = 'Unemployment Rate for Labour Force Population aged 15 years and older'
		 nohigh_csd = 'Proportion of population 15 years and older without High School Graduation'
		 singleparent_csd = 'Proportion of Lone Parent Families';
run;
  
proc means data = census_csd mean n nmiss;
   var POPTOT INC_PHH_AVEINC_csd INC_PHH 
       LF_UNEMRATE15_csd LF_POPTOT15 
       singleparent_csd FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM 
	   nohigh_csd EDHS_WOHS15 EDHS_POP15;
   title2 'Check for missing values on 2011 Census at CSD level';
run;

proc print data = census_csd;
   where LF_UNEMRATE15_csd = . | nohigh_csd = . | singleparent_csd = .;
   var csdid geo_name POPTOT INC_PHH_AVEINC_csd INC_PHH 
       LF_UNEMRATE15_csd LF_POPTOT15 
       singleparent_csd FAMSTAT_TOTLPAR FAMSTAT_TOTCENFAM 
	   nohigh_csd EDHS_WOHS15 EDHS_POP15;
run; 

proc freq data = census_csd;
   tables zero_income_csd;
run;

*** use concordance file to merge DAs to CSDs;
*** keep one record per DA (there is one record per DA block);
proc sort data = conc_2011 out = conc(keep=pruid cduid csduid dauid) nodupkey;
   where pruid = '46';
   by dauid;
run;

proc sort data = conc;
   by csduid;
run;

proc sort data = census_csd;
   by csdid;
run;

proc print data = conc (obs=20);
   var cduid csduid dauid;
   title2 'Check CSD & DA codes in Concordance File';
run; 

data census_csd;
   merge conc (in=m1 rename=(csduid=csdid cduid=cdid) drop=pruid)
         census_csd (in=m2);
   by csdid;
   if m2;
run;

proc sort data = census_csd;
   by dauid;
run;

proc sort data = census;
   by dauid;
run;

proc print data = census_csd (obs=20);
   var dauid;
   title2 'Check DA codes from Concordance File';
run; 

proc print data = census (obs=20);
   var dauid;
   title2 'Check DA codes from Census Data';
run; 

*** imput missing values at DA level with CSD values;
data census;
   merge census_csd (in=m1 keep=cdid csdid dauid INC_PHH_AVEINC_csd LF_UNEMRATE15_csd nohigh_csd singleparent_csd geo_name rename=(geo_name=csd_name))
         census (in=m2);
   by dauid;   
   if m2; 
   in_csd = m1;
   
   *** impute CSD value to DA if DA value is missing or zero;
   if INC_PHH_AVEINC = . and INC_PHH_AVEINC_csd > 0 then INC_PHH_AVEINC = INC_PHH_AVEINC_csd;  
      
   if LF_UNEMRATE15 = . then LF_UNEMRATE15 = LF_UNEMRATE15_csd;
   if nohigh = . then nohigh = nohigh_csd;
   if singleparent = . then singleparent = singleparent_csd;
run;

proc freq data = census;
   tables in_csd;
run;

proc means data = census mean n nmiss;
   var POPTOT INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent;
   title2 'Check for missing values after imputation at CSD level';
run;

proc print data = census;
   where INC_PHH_AVEINC <= 0 | LF_UNEMRATE15 = . | nohigh = . | singleparent = .;
   var csd_name geo_name dauid csdid POPTOT INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent;
run;

*** now impute weighted means of variables by north/south for FN Reserves;

*** use pccf to provide information on DAs such as FN Reserves (csdtype=R) and latitude;
proc freq data = pccf_2011;
   tables csdtype;
run;

proc sort data = pccf_2011 out = pccf(keep=dauid lat csdtype csdname) nodups;
   by dauid descending lat;
run;

*** add in postal codes retired before Jan 1, 2011 (160 DAs are not in PCCF but are in census data);
proc sort data = pccf_2011_ret out = pccf_ret(keep=dauid lat csdtype csdname) nodups;
   by dauid descending lat;
run;

data pccf;
   set pccf;
   by dauid;
   if first.dauid;
run;

data pccf_ret;
   set pccf_ret;
   by dauid;
   if first.dauid;
run;

data pccf_ret;
   merge pccf (in=m1)
         pccf_ret (in=m2);
   by dauid;
   if m2 & ^m1;
run;

data pccf;
   set pccf
       pccf_ret;
   by dauid;
run;

proc sort data = census nodupkey;
   by dauid;
run;

data census miss;
   merge census (in=m1)
         pccf (in=m2);
   by dauid;
   if m1;
   in_pccf = m2;
   
   *** identify FN communities for imputation including Fairford (46180005), Hollow Water (46190058),
   Jackhead (46190083) and Cross Lake (46220033) which are missing from this pccf file;
   fnda = (csdtype = 'IRI'); **or (dauid in ('46180005','46190058','46190083','46220033'));
   label fnda = 'First Nations DA (1=Yes)';

   *** identify FN communities north of 53 parallel or remote (Poplar River, Berens River, Pauingassi, Little Grand Rapids, Bloodvein;
   if fnda = 1 then do;
      fnnorth = (lat >= 53); **or dauid in ('46190070','46190068','46190076','46190077','46190061','46190063');
      label fnnorth = 'First Nations DA North of 53rd Parallel (1=Yes)';
   end;
   if m1 & ^m2 then output miss;
   output census;
run;

proc freq data = census;
   tables in_pccf;
run;

proc print data = census;
   where in_pccf = 0;
   var csdtype csdname dauid;
run;

proc freq data = census;
   where fnda = 1;
   tables fnnorth*csdname*dauid /list missing;
   title2 'North and South FN DAs';
run;

*** generate weighted means for FN DAs by north/south;

%macro wmean (var,wgt,output);

proc summary data = census nway;
   where fnda = 1;
   class fnnorth;
   var &var;
   weight &wgt;
   output out=&output sum= sumwgt= mean=/autoname;
   title2 "Calculate Weighted Means for &var in North and South FN DAs";
run;

%mend;

options mprint;

%wmean (var=INC_PHH_AVEINC,wgt=INC_PHH,output=impute_income);
%wmean (var=LF_UNEMRATE15,wgt=LF_POPTOT15,output=impute_umemp);
%wmean (var=nohigh,wgt=EDHS_POP15,output=impute_nohigh);
%wmean (var=singleparent,wgt=FAMSTAT_TOTCENFAM,output=impute_single);

proc sort data = census;
   by fnnorth;
run;

data census;
   merge census (in=m1)
         impute_income (in=m2 keep=fnnorth INC_PHH_AVEINC_mean)
         impute_umemp (in=m3 keep=fnnorth LF_UNEMRATE15_mean)
         impute_nohigh (in=m4 keep=fnnorth nohigh_mean)
         impute_single (in=m5 keep=fnnorth singleparent_mean);
   by fnnorth;
   if m1;
   fix_inc = 0;
   fix_unemp = 0;
   fix_hs = 0;
   fix_sp = 0;
   if m2 = 1 and INC_PHH_AVEINC = . and INC_PHH_AVEINC_mean ^= . then do;
      INC_PHH_AVEINC = int(INC_PHH_AVEINC_mean);
      fix_inc = 1;
   end;
   if m3 = 1 and LF_UNEMRATE15 = . and LF_UNEMRATE15_mean ^= . then do;
      LF_UNEMRATE15 = int(LF_UNEMRATE15_mean);
      fix_unemp = 1;
   end;
   if m4 = 1 and nohigh = . and nohigh_mean ^= . then do;
      nohigh = int(nohigh_mean);
      fix_hs = 1;
   end;
   if m5 = 1 and singleparent = . and singleparent_mean ^= . then do;
      singleparent = int(singleparent_mean);
      fix_sp = 1;
   end;
run;

proc freq data = census;
   tables fix_inc fix_unemp fix_hs fix_sp fix_inc*fix_unemp*fix_hs*fix_sp /list missing;
   title2 'Census Values based on DA level after imputation from FN communities'; 
run;

proc print data = census;   
   where INC_PHH_AVEINC = . | LF_UNEMRATE15 = . | nohigh = . | singleparent = .;
   var csd_name geo_name dauid csdid POPTOT INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent;
   title2 'No Imputed Values Found for at least one SEFI variable';
run;

proc means data = census n nmiss mean stddev median min max nolabels;
   var INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent;
   title2 'Mean and Median values for Census Variables';
run;

*****************************;
title2 'SEFI Factor Analysis';
*****************************;

proc factor data = census nfactors=1 out=sefi(rename=(factor1=sefi2));
  var INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent; 
run;

*** flip the sign for SEFI and scale it so that ranges from approximately 0-10, 10 being highest (best) SES; 
data sefi;
   set sefi;
   sefi2_c = (sefi2*-1) + 5;  
   label sefi2 = 'SEFI-2 from 2011 Census'
         sefi2_c = 'SEFI-2 from 2011 Census (scaled)';
   keep dauid sefi2 sefi2_c POPTOT INC_PHH_AVEINC LF_UNEMRATE15 nohigh singleparent;		 
run;  

proc sort data = sefi;
   by dauid;
run;

proc means data = sefi n nmiss mean var median min max;
   var sefi2 sefi2_c;
   title3 'Simple Statistics';
run;