/** This is modified code for identifying a matched case-control cohort using risk-set sampling. The code was originally created for the CNODEs project with authors and documentation below **/ /** Author: Wenbin Li Modified by Menglan Pang **/ /** This macro is developed for matching cases and controls with replacement **/ /** Features: 1. Individuals are eligible to be controls for more than 1 case; 2. Cases are eligible to be controls for another case, so long as the patient was free of the outcome at the time he/she was selected as a control; 3. Allows for 1:N matching In the first run, select as the controls for the case the first N patients in the risk set; For cases where there are fewer than N patients to serve as controls, select the whole list; For cases where there are no patients to serve as controls in the first run, conduct the second run, and widen the matching criterion to avoid losing the case **/ ** This code assumes the user has a cohort dataset already created which contains the following: 1. A binary outcome variable (0/1) 2. A cohort entry date (time 0) 3. An end of follow-up date (earliest of some set of criteria (project specific). Example: earliest of event outcome, death date, LTC facility entry, or end of study period) 4. An ID variable (Example: SCRPHIN) 5. All variables to be used to identify the risk set (project specific). Example: sex, age, treatment duration. **; ** Macro Parameters: 1. cohort_name = name of cohort dataset (include project. prefix if it is a saved dataset) 2. outcome_var = name of binary outcome variable 3. end_fu = end of follow-up date 4. randnum = any random number (seed value) 5. K = maximum number of controls to match to a case 6. 
control_index_dt = identifies how index date for control is to be calculated (case_fu, case_event)
        case_fu calculates the control to have index date such that the control has the same follow-up time
            as its matched case (control cohort entry dt + case follow-up duration)
        case_event assigns the value of the matched cases end of follow-up date (the outcome date) to the control
   7. rs_match_where = criteria for risk set matching (will be used in where statement of the SQL for
        identifying all possible controls within the risk set).
        Note: all cases related variables require the 'a.' alias to identify it is from the cases dataset,
        and all controls related variables require the 'b.' alias to identify it is from the controls dataset
        Example (same sex, cohort entry (time 0) within 90 days, age within 2 years, case end of followup
        within followup of control, and not same end of follow-up date where the control ends on the outcome date):
            a.male = b.male and abs(a.t0 - b.t0) <= 90 and abs(a.age - b.age) <= 2
            and b.t0 <= a.endfu <= b.endfu and not (a.endfu = b.endfu and b.outcome = 1)
   8. id = ID variable. Example: SCRPHIN
   9. time0 = time 0 variable (i.e. cohort entry date)
   10. gender = gender variable (i.e. sex, male, female)
   11. agevar = age variable
   12. output_name = name of output matched dataset
   13. no_match = flag whether or not limiting cases to those with no matches on prior runs (Y/N).
        If first run of matching this should be N (default) **;

/* Implementation notes for risk_set_match:
     - no_match and control_index_dt are compared case-insensitively.
     - Any control_index_dt value other than case_fu leaves the case event date as the
       control index date; an unrecognised value additionally writes a WARNING to the log.
     - The macro always adds "a.<id> ne b.<id>" to rs_match_where, so a case is never
       selected as its own control even when the supplied criteria do not exclude it.
     - WORK datasets created and removed on exit: controls, case, case_caco1_I, risk,
       risk_set, cohort_match, not_enough, matched_cases, matched_controls.
     - WORK datasets deliberately kept on exit: case_caco1 and no_match. A follow-up call
       with no_match = Y reads both of them.
     - The caller's NOTES/NONOTES option setting is restored after the matching loop. */
%macro risk_set_match (cohort_name = , outcome_var = , end_fu = , randnum = , K = , control_index_dt = , rs_match_where = , id = , time0 = , gender = , agevar = , output_name = , no_match = N );

    /* Keep every working macro variable out of the caller's symbol tables. */
    %local I N dsid n_no_match rc notes_opt;
    %let N = 0;

    /* Accept y/n as well as Y/N; anything else would otherwise skip both setup branches. */
    %let no_match = %upcase(&no_match);
    %if &no_match ne N and &no_match ne Y %then %do;
        %put ERROR: risk_set_match: no_match must be Y or N (received &no_match).;
        %return;
    %end;

    %if %upcase(&control_index_dt) ne CASE_FU and %upcase(&control_index_dt) ne CASE_EVENT %then %do;
        %put WARNING: risk_set_match: control_index_dt=&control_index_dt is not case_fu or case_event. The case event date will be used as the control index date.;
    %end;

    /* Create cases and controls datasets. Every cohort member is a potential control. */
    data controls;
        set &cohort_name;
        caco = 0;
    run;

    data case;
        set &cohort_name;
        if &outcome_var = 1;
        index_date = &end_fu;
        format index_date yymmddd10.;
    run;

    %if &no_match = N %then %do;
        /* First run: create a match_num variable (each case gets a different value 1, 2, 3, ... N).
           The data step counter _N_ is used instead of the undocumented PROC SQL MONOTONIC(). */
        data case_caco1;
            set case;
            caco = 1;
            match_num = _n_;
        run;
    %end;
    %else %do;
        /* Follow-up run: keep only the cases left unmatched by the previous run. */
        %if not %sysfunc(exist(case_caco1)) or not %sysfunc(exist(no_match)) %then %do;
            %put ERROR: risk_set_match: no_match=Y needs WORK.CASE_CACO1 and WORK.NO_MATCH from a previous run.;
            proc datasets library=work nolist nowarn;
                delete controls case;
            quit;
            %return;
        %end;

        proc sort data=case_caco1;
            by match_num;
        run;

        proc sort data=no_match;
            by match_num;
        run;

        data case_caco1;
            merge case_caco1 (in=a) no_match (in=b keep=match_num);
            by match_num;
            if a & b;
        run;

        proc datasets library=work nolist;
            delete no_match;
        quit;
    %end;

    /* N = match_num of the last case. On a first run this is the number of cases; on a
       follow-up run it is the largest match_num still unmatched (data are sorted by match_num).
       SYMPUTX strips blanks and writes to the local N; N stays 0 when there are no cases. */
    data _null_;
        set case_caco1 end = last;
        if last then call symputx('N', match_num, 'L');
    run;
    %put NOTE: risk_set_match: highest case match_num to process = &N;

    %if &N = 0 %then %do;
        %put WARNING: risk_set_match: there are no cases to match. No output dataset was created.;
        proc datasets library=work nolist nowarn;
            delete controls case;
        quit;
        %return;
    %end;

    /* PROC APPEND accumulates, so remove any risk_set left behind by an aborted earlier run. */
    proc datasets library=work nolist nowarn;
        delete risk_set;
    quit;

    /* Silence the per-case notes inside the loop, then restore the caller's own setting. */
    %let notes_opt = %sysfunc(getoption(notes));
    options nonotes;

    %do I = 1 %to &N;

        data case_caco1_I;
            set case_caco1 (where = (match_num = &I));
        run;

        /* Risk Set Matching - identifies all possible controls for case I that meet the
           supplied criteria. The caller's criteria are parenthesised so that an OR inside
           them cannot change precedence, and a case is never matched to itself. */
        proc sql;
            create table risk as
            select a.&id      as case_ID,
                   b.&id      as control_ID,
                   a.&agevar  as case_age,
                   b.&agevar  as control_age,
                   a.&gender  as case_male,
                   b.&gender  as control_male,
                   a.&time0   as case_t0,
                   b.&time0   as control_t0,
                   a.&end_fu  as case_endfu,
                   b.&end_fu  as control_endfu,
                   a.caco     as case_caco,
                   b.caco     as control_caco,
                   a.index_date,
                   a.match_num
            from case_caco1_I as a
                 inner join controls as b
                 on (&rs_match_where)
                    and a.&id ne b.&id;
        quit;

        proc append base = risk_set data = risk;
        run;

    %end;

    options &notes_opt;

    /* Fix the row order before drawing random numbers so a given seed is reproducible. */
    proc sort data = risk_set;
        by case_ID control_ID;
    run;

    /* One pass: attach the random sort key and, for case_fu, give the control the same
       length of follow-up as its case. Otherwise index_date stays the case event date. */
    data risk_set;
        set risk_set;
        rand_num = ranuni(&randnum);
        %if %upcase(&control_index_dt) = CASE_FU %then %do;
            index_date = control_t0 + (case_endfu - case_t0);
        %end;
        format index_date yymmddd10.;
    run;

    proc sort data = risk_set;
        by case_ID rand_num;
    run;

    /* Select the first K controls per case in random order. Cases that had some matches
       but fewer than K are also written (last row only) to not_enough. */
    data cohort_match not_enough;
        set risk_set;
        by case_ID;
        retain num;
        if first.case_ID then num = 1;
        if num le &K then do;
            output cohort_match;
            num = num + 1;
        end;
        /* num is still <= K at the last row only when fewer than K controls were output. */
        if last.case_ID and num le &K then output not_enough;
    run;

    /* Collect all cases that had 0 matches based on the current matching criteria.
       This dataset is kept so that a later call with no_match = Y can retry them. */
    proc sql;
        create table no_match as
        select *
        from case_caco1
        where match_num not in (select match_num from cohort_match);
    quit;

    /* Put the number of cases with no controls matched to it in the LOG */
    %let dsid = %sysfunc(open(no_match));
    %let n_no_match = %sysfunc(attrn(&dsid, nlobs));
    %let rc = %sysfunc(close(&dsid));
    %put THERE ARE &n_no_match CASES WITHOUT A CONTROL MATCHED TO IT;

    /* Create dataset of matched cases and controls */
    proc sql;
        create table matched_cases as
        select *
        from case_caco1
        where match_num in (select match_num from cohort_match)
        order by match_num;

        create table matched_controls as
        select c.*,
               m.match_num,
               m.index_date,
               m.num
        from controls as c
             inner join cohort_match as m
             on c.&id = m.control_ID
        order by m.match_num, m.num;
    quit;

    /* Interleave so each case row is followed by its own controls. */
    data &output_name;
        set matched_cases matched_controls;
        by match_num;
        drop &outcome_var num;
    run;

    /* case_caco1 and no_match are intentionally NOT deleted (needed by a no_match = Y call). */
    proc datasets library=work nolist nowarn;
        delete controls case case_caco1_I risk risk_set cohort_match not_enough matched_cases matched_controls;
    quit;

%mend risk_set_match;

/** EXAMPLE PROGRAM CALL
** FROM CNODES INCRETINS PROJECT: Primary analysis for the pancreatic cancer study cohort - case control matching **;
data cohort;
    set project.dsen_incretins_study_cohort2_cov;
    study_exit_date=end_fu_dt2;
    dur_treated_diab=(study_entry_dt-bc_prvddt);
    dur_fup=(study_exit_date-(study_entry_dt+365))+1;
    format study_exit_date yymmddd10.;
run;
%risk_set_match (cohort_name = cohort, outcome_var = outcome2, end_fu = study_exit_date, randnum = 123, K = 20, control_index_dt = case_fu, rs_match_where = 
a.sex=b.sex and abs(a.study_entry_dt-b.study_entry_dt)<=180 and abs((a.study_entry_dt - a.birthdt)-(b.study_entry_dt - b.birthdt))<=365 and abs(a.dur_treated_diab-b.dur_treated_diab)<=90 and (a.dur_fup <= b.dur_fup) and not (a.study_exit_date=b.study_exit_date and b.outcome2=1), id = scrphin, time0 = study_entry_dt, gender = sex, agevar = bc_age, output_name = matched1, no_match = N ); ** Because there were some cases that did not have a match, try to find at least one match by expanding the age window from 1 year (365 days) to 2 year (730 days) **; %risk_set_match (cohort_name = cohort, outcome_var = outcome2, end_fu = study_exit_date, randnum = 123, K = 20, control_index_dt = case_fu, rs_match_where = a.sex=b.sex and abs(a.study_entry_dt-b.study_entry_dt)<=180 and abs((a.study_entry_dt - a.birthdt)-(b.study_entry_dt - b.birthdt))<=730 and abs(a.dur_treated_diab-b.dur_treated_diab)<=90 and (a.dur_fup <= b.dur_fup) and not (a.study_exit_date=b.study_exit_date and b.outcome2=1), id = scrphin, time0 = study_entry_dt, gender = sex, agevar = bc_age, output_name = matched2, no_match = Y ); ** Set the matched sets together **; data matched; set matched1 (keep=scrphin sex st_age st_agegrp caco index_date study_entry_dt match_num birthdt bc_prvddt study_cyear dur_treated_diab alcohol pancreatitis statin neuropathy renal retinal p_arterio nhosps_grp nhosps nmeds_grp nmeds nmeds_ad_grp nmeds_ad dpp4 glp1 insulin meglitinide metformin other sulfonylurea thiazolidinedione study_exit_date dur_fup male alpha_glucosidase) matched2 (keep=scrphin sex st_age st_agegrp caco index_date study_entry_dt match_num birthdt bc_prvddt study_cyear dur_treated_diab alcohol pancreatitis statin neuropathy renal retinal p_arterio nhosps_grp nhosps nmeds_grp nmeds nmeds_ad_grp nmeds_ad dpp4 glp1 insulin meglitinide metformin other sulfonylurea thiazolidinedione study_exit_date dur_fup male alpha_glucosidase); run; **/