/*
Macro: identify_CCS_health_status

Developed by: Barret Monchka
***************************************************************************

***************************************************************************
******************************** Purpose **********************************
***************************************************************************
This macro was developed to ascertain Clinical Classification System (CCS) disease category status, given a diagnosis 
code crosswalk file. There are 130 CCS categories of chronic diseases as well as 30 higher-level categories (clusters) that are mapped to 
ICDA-8, ICD-9-CM and ICD-10-CA.

******************************************************************************
******************************** Input files *********************************
******************************************************************************
The macro requires a total of six input files, with one optional input file:

- Four files containing CCS category descriptions, CCS cluster descriptions, crosswalks between CCS categories and ICD codes, and mappings between CCS categories and CCS clusters: 
   1. CCS_categories.csv: provides descriptions for the CCS categories and contains two variables: 
                         -CCS_category_id (an ID assigned to each of the 130 categories) 
                         -CCS_category_descr (provides description for each category).
   2. CCS_clusters.csv: lists high level CCS categories and contains two variables: 
                        -CCS_cluster_id (an ID assigned to higher level clusters of CCS categories) 
                        -CCS_cluster_descr(provides description for each high level cluster)
   3. CCS_diag_crosswalk.csv:contains the mappings between CCS categories and ICD codes and has three variables:
                        -diagnosis_coding_system (an indicator of the coding system: ICDA-8, ICD-9-CM or ICD-10-CA)                        -
                        -CCS_category_id  
                        -diagnosis_code_match_pattern (lists the ICD codes that map to the CCS category and cluster, formatted
                         into uppercase, without whitespace, and matches all precise codes under the specific parent code) 
   4. CCS_cluster_mappings.csv:contains the mappings between CCS categories and clusters and has two variables:
                        -CCS_category_id  
                        -CCS_cluster_id 

- Two files to input the study cohort: 
  1. Cohort_diagnoses:  includes the cohort members and all ICD codes identified in inpatient and/or outpatient records during the ascertainment window.
     It should include four variables: Unique participant identifier (e.g., scrphin), encounter_date (date of diagnosis), 
     diagnosis_code (the ICD code), coding_system (an indicator of the diagnosis coding system: ICDA-8, ICD-9-CM, or ICD-10-CA). 
     Note: Diagnosis codes must be all uppercase letters or numeric, and whitespace must be trimmed. 
     Note: Must include a coding_system variable (allowed values: 'ICDA-8', 'ICD-9-CM', or 'ICD-10-CA').
     Note: One individual may have multiple diagnosis records, however, the diagnoses should be unique when grouped by scrphin, encounter_date, coding_system, diagnosis_code
  2. Cohort_scrphins: lists all study individuals' IDs (one value, e.g., scrphin).

- One file to input ICD code labels. This is an optional file. It could be supplied to produce labels of the ICD codes that were not mapped (skipped codes). 
	It should have three variables: coding_system, diagnosis_code and diagnosis_descr (which provides a description/label for the ICD code).
 
Note: Input files were split in this manner to improve computational efficiency 

******************************************************************************
******************************** Parameters **********************************
******************************************************************************
participant_id_var_: name of the variable that uniquely identifies study participants. For example, scrphin
participant_id_ds_: name of the dataset, including library name, with participant IDs. For example, work.cohort_scrphins	
diagnosis_codes_ds_: name of dataset, including library name, containing ICD diagnosis codes. For example, work.cohort_diagnoses
concept_labels_ds_: name of dataset, including library name, containing the CCS category descriptions (e.g., work.CCS_categories)  
cluster_labels_ds_: name of dataset, including library name, containing the higher-level CCS cluster descriptions (e.g., work.CCS_clusters)  
diag_crosswalk_ds_: name of dataset, including library name, containing the crosswalk between ICD codes and CCS categories (e.g., work.CCS_diag_crosswalk)
cluster_mappings_ds_: name of dataset, including library name, containing the crosswalk between CCS categories and clusters (e.g., work.CCS_cluster_mappings)
out_wide_concept_binary_ds_name_: name of dataset, including library name, where a summary will be saved of the status of each CCS condition for all participants
out_wide_concept_freq_ds_name_: name of dataset, including library name, where a summary will be saved of the number of CCS conditions identified per participant
out_wide_concept_date_ds_name_: name of dataset, including library name, where a summary will be saved of the earliest diagnosis date of each CCS condition per participant.
out_long_concept_ds_name_: name of dataset, including library name, where a summary will be saved of the earliest diagnosis date of each CCS condition per participant.
out_wide_cluster_binary_ds_name_: name of dataset, including library name, where a summary will be saved of the status of each high-level CCS cluster per participant.
out_skipped_codes_ds_name_: name of dataset, including library name, where a summary of all the skipped ICD codes will be saved (i.e., the diagnosis codes that 
                            were not included in the provided ICD-to-CCS crosswalk file. 
diag_code_labels_ds_: [optional] name of dataset, including library name, containing ICD code labels
var_name_prefix_: [optional] characters that will be used to prefix the output variables. For example, if you'd like
				  all generated variables to begin with child_ or mom_ or dad_, this can be specified here. The prefix must be provided
				  without quotes and is limited to a maximum of 8 characters.
debug_: [default=0] a flag indicating whether intermediate datasets should be deleted (debug_=0) or retained (debug_=1)
	
Note: wide format refers to datasets where each row refers to a unique study participant with several variables (e.g., one variable for each CCS condition), 
	  whereas in long format each row refers to a single CCS ascertainment and the earliest occurrence (participant id, CCS_category_id, first_occurrence) 

Note: The ICDA-8 and ICD-9-CM codes in the crosswalk file are limited to 3 digits. The macro truncates longer ICDA-8 and ICD-9-CM 
codes to the first 3 digits.
Note: It's recommended to review the skipped diagnosis codes (output_skipped_codes_ds_name_) to ensure relevant ICD codes 
were not excluded when ascertaining disease status.
Note: Computation time varies depending on cohort size and number of diagnoses codes. In previous research with a cohort size 
of 125,000 individuals and about 14 million diagnosis codes over 5 decades, the computation time to run the macro was approximately 30 minutes. 
*/


***************************************************************************
********************************Example calling code***********************
***************************************************************************;

*providing labels for diagnosis codes to better examine skipped diagnosis codes;

%identify_CCS_health_status(
	participant_id_var_ = scrphin,
	participant_id_ds_ = work.cohort_scrphins,
	diagnosis_codes_ds_ = work.cohort_diagnoses,
	concept_labels_ds_ = work.CCS_categories,
	cluster_labels_ds_ = work.CCS_clusters,
	diag_crosswalk_ds_ = work.CCS_diag_crosswalk,
	cluster_mappings_ds_ = work.CCS_cluster_mappings,
	out_wide_concept_binary_ds_name_ = work.cohort_CCS,
	out_wide_concept_freq_ds_name_ = work.cohort_CCS_freq,
	out_wide_concept_date_ds_name_ = work.cohort_CCS_date,
	out_long_concept_ds_name_ = work.cohort_CCS_long,
	out_wide_cluster_binary_ds_name_ = work.cohort_cluster,
	out_skipped_codes_ds_name_ = work.skipped_codes_summary,
	diag_code_labels_ds_ = work.diagnosis_code_labels,
	var_name_prefix_ = cohort_,
	debug_ = 0
);

*calling the macro without providing diagnosis code labels;

%identify_CCS_health_status(
	participant_id_var_ = scrphin,
	participant_id_ds_ = work.cohort_scrphins,
	diagnosis_codes_ds_ = work.cohort_diagnoses,
	concept_labels_ds_ = work.CCS_categories,
	cluster_labels_ds_ = work.CCS_clusters,
	diag_crosswalk_ds_ = work.CCS_diag_crosswalk,
	cluster_mappings_ds_ = work.CCS_cluster_mappings,
	out_wide_concept_binary_ds_name_ = work.cohort_CCS,
	out_wide_concept_freq_ds_name_ = work.cohort_CCS_freq,
	out_wide_concept_date_ds_name_ = work.cohort_CCS_date,
	out_long_concept_ds_name_ = work.cohort_CCS_long,
	out_wide_cluster_binary_ds_name_ = work.cohort_cluster,
	out_skipped_codes_ds_name_ = work.skipped_codes_summary,
	var_name_prefix_ = cohort_,
	debug_ = 0
);

*************************************************************************
********************************Macro************************************
*************************************************************************;
%macro identify_CCS_health_status(
	participant_id_var_ =,
	participant_id_ds_ =,
	diagnosis_codes_ds_ =,
	concept_labels_ds_ =,
	cluster_labels_ds_ =,
	diag_crosswalk_ds_ =,
	cluster_mappings_ds_ =,
	out_wide_concept_binary_ds_name_ =,
	out_wide_concept_freq_ds_name_ =,
	out_wide_concept_date_ds_name_ =,
	out_long_concept_ds_name_ =,
	out_wide_cluster_binary_ds_name_ =,
	out_skipped_codes_ds_name_ =,
	diag_code_labels_ds_ =,
	var_name_prefix_ =,
	debug_ = 0
);

	%let MAX_VAR_NAME_PREFIX_LENGTH = 8;
	%let error_occured = 0;

	*NOTE: the maximum length for variable name prefixes is dependent upon the maximum length of CCS category string identifiers, which is currently 12;
	*If the maximum length of CCS category string identifers is increased past 12 characters, then MAX_VAR_NAME_PREFIX_LENGTH must be decreased by the same amount;

	*check for error conditions;
	%if %length(&var_name_prefix_) > &MAX_VAR_NAME_PREFIX_LENGTH %then %do;
		%let error_occured = 1;
		%put ERROR: var_name_prefix_ cannot have a length greater than &MAX_VAR_NAME_PREFIX_LENGTH characters;
	%end;

	%if %length(&participant_id_var_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter participant_id_var_ is required;
	%end;

	%if %length(&participant_id_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter participant_id_ds_ is required;
	%end;

	%if %length(&diagnosis_codes_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter diagnosis_codes_ds_ is required;
	%end;

	%if %length(&concept_labels_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter concept_labels_ds_ is required;
	%end;

	%if %length(&cluster_labels_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter cluster_labels_ds_ is required;
	%end;

	%if %length(&diag_crosswalk_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter diag_crosswalk_ds_ is required;
	%end;

	%if %length(&cluster_mappings_ds_) < 1 %then %do;
		%let error_occured = 1;
		%put ERROR: parameter cluster_mappings_ds_ is required;
	%end;

	*proceed if no errors have been encountered;
	%if &error_occured = 0 %then %do;

		*ensure diagnosis codes are uppercase and strip whitespace to facilitate string comparisons, and add wildcard character;
		data &diag_crosswalk_ds_;

			set &diag_crosswalk_ds_;

			diagnosis_code_match_pattern = cats(diagnosis_code_match_pattern, '%');
			diagnosis_code_match_pattern = upcase(diagnosis_code_match_pattern);
			diagnosis_code_match_pattern = strip(diagnosis_code_match_pattern);

			diagnosis_coding_system = upcase(diagnosis_coding_system);

		run;

		data &diagnosis_codes_ds_;

			set &diagnosis_codes_ds_;

			diagnosis_code = upcase(diagnosis_code);
			diagnosis_code = strip(diagnosis_code);
			coding_system = upcase(coding_system);

		run;

		*ascertain CCS conditions, accepting all precise child codes under provided parent codes;
		proc sql noprint;

			create table CCS_long_1 as
			select diagnoses.*,
				   CCS_crosswalk.diagnosis_code_match_pattern,
				   CCS_crosswalk.CCS_category_id
			from &diagnosis_codes_ds_ as diagnoses
			left outer join &diag_crosswalk_ds_ as CCS_crosswalk
				on diagnoses.coding_system eq CCS_crosswalk.diagnosis_coding_system
				and diagnoses.diagnosis_code like strip(CCS_crosswalk.diagnosis_code_match_pattern);

		quit;

		*summarize skipped diagnosis codes (i.e., diagnosis codes that were not listed in the ICD-to-CCS crosswalk file);

		*extract diagnosis codes that were skipped;
		data skipped_diagnoses;

			set CCS_long_1;

			where missing(CCS_category_id);

		run;

		proc sql noprint;

			create table skipped_diagnoses_freq_1 as
			select coding_system,
				   diagnosis_code,
				   count(*) as count
			from skipped_diagnoses
			group by coding_system,
					 diagnosis_code;

		quit;

		*if diagnosis code descriptions were provided, then attach diagnosis code labels to the skipped codes summary dataset;
		%if %length(&diag_code_labels_ds_) > 0 %then %do;

			proc sql noprint;

				create table skipped_diagnoses_freq_2 as
				select skipped_codes.*,
					   code_labels.diagnosis_descr
				from skipped_diagnoses_freq_1 as skipped_codes
				left outer join &diag_code_labels_ds_ as code_labels
					on skipped_codes.coding_system eq code_labels.coding_system
					and skipped_codes.diagnosis_code eq code_labels.diagnosis_code;

			quit;

		%end;
		%else %do;

			data skipped_diagnoses_freq_2;
				set skipped_diagnoses_freq_1;
			run;

		%end;

		proc sort data=skipped_diagnoses_freq_2
				  out=skipped_diagnoses_freq_3;
			by descending count diagnosis_code coding_system;
		run;

		*process CCS clinical concept assignments;

		*exclude diagnosis records that did not map to a CCS category;
		data CCS_long_2;

			set CCS_long_1;

			where not missing(CCS_category_id);

			format encounter_date date11.;

			keep &participant_id_var_
				 encounter_date
				 CCS_category_id;

			label &participant_id_var_ = 'Participant ID';
			label encounter_date = 'Encounter date';
			label CCS_category_id = 'CCS concept ID';

		run;

		*binary health status;

		proc sql noprint;

			create table CCS_long_bin_1 as
			select &participant_id_var_,
				   CCS_category_id,
				   (count(*) > 0) as health_status
			from CCS_long_2
			group by &participant_id_var_,
				     CCS_category_id;

		quit;

		proc sql noprint;

			create table CCS_long_bin_2 as
			select CCS_bin.*,
				   CCS_category_labels.CCS_category_descr
			from CCS_long_bin_1 as CCS_bin
			inner join &concept_labels_ds_ as CCS_category_labels
				on CCS_bin.CCS_category_id eq CCS_category_labels.CCS_category_id;

		quit;

		*attach variable labels;
		data CCS_long_bin_2;

			set CCS_long_bin_2;

			label &participant_id_var_ = 'Participant ID';
			label CCS_category_id = 'CCS concept ID';
			label CCS_category_descr = 'CCS concept description';

		run;

		*prepare data to be transformed into wide format: create variable names with common and optional prefixes, and sort;
		data CCS_long_bin_2;

			length var_name $ 32;

			set CCS_long_bin_2;

			var_name = cats("&var_name_prefix_.", 'CCS_', CCS_category_id);

		run;

		proc sort data=CCS_long_bin_2;
			by &participant_id_var_ CCS_category_id;
		run;

		proc transpose data=CCS_long_bin_2 out=CCS_wide_binary_1 (drop=_NAME_);
			by &participant_id_var_;
			id var_name;
			idlabel CCS_category_descr;
			var health_status;
		run;

		*group ascertained conditions into higher-level clusters;

		*link CCS categories with higher-level clusters;
		proc sql noprint;

			create table cluster_long_bin_1 as
			select CCS_bin.*,
				   cluster_mappings.CCS_cluster_id
			from CCS_long_bin_1 as CCS_bin
			inner join &cluster_mappings_ds_ as cluster_mappings
				on CCS_bin.CCS_category_id eq cluster_mappings.CCS_category_id;

		quit;

		*remove duplicate cluster assignments;
		proc sql noprint;

			create table cluster_long_bin_2 as
			select cluster_bin.&participant_id_var_,
				   cluster_bin.CCS_cluster_id,
				   (count(*) > 0) as health_status
			from cluster_long_bin_1 as cluster_bin
			group by &participant_id_var_,
				     CCS_cluster_id;

		quit;

		*attach cluster labels;
		proc sql noprint;

			create table cluster_long_bin_3 as
			select cluster_bin.*,
				   cluster_labels.CCS_cluster_descr
			from cluster_long_bin_2 as cluster_bin
			inner join &cluster_labels_ds_ as cluster_labels
				on cluster_bin.CCS_cluster_id eq cluster_labels.CCS_cluster_id;

		quit;

		*attach variable labels;
		data cluster_long_bin_3;

			set cluster_long_bin_3;

			label &participant_id_var_ = 'Participant ID';
			label CCS_cluster_id = 'CCS cluster ID';
			label CCS_cluster_descr = 'CCS cluster description';

		run;

		*prepare data to be transformed into wide format: create variable names with common and optional prefixes, and sort;
		data cluster_long_bin_3;

			length var_name $ 32;

			set cluster_long_bin_3;

			*note: the following concatenation implicitly converts decimal numbers to character and replaces the decimal with an underscore;
			var_name = cats("&var_name_prefix_.", 'cluster_', CCS_cluster_id);

		run;

		proc sort data=cluster_long_bin_3;
			by &participant_id_var_ CCS_cluster_id;
		run;

		proc transpose data=cluster_long_bin_3 out=cluster_wide_binary_1 (drop=_NAME_);
			by &participant_id_var_;
			id var_name;
			idlabel CCS_cluster_descr;
			var health_status;
		run;

		*number of health conditions per participant;

		proc sql noprint;

			create table CCS_freq_1 as
			select &participant_id_var_,
				   count(*) as num_CCS_conditions
			from CCS_long_bin_2
			group by &participant_id_var_;

		quit;

		data CCS_freq_1;

			set CCS_freq_1;

			label &participant_id_var_ = 'Participant ID';
			label num_CCS_conditions = 'Number of CCS concepts';

		run;

		*earliest date of occurrence;

		proc sql noprint;

			create table CCS_long_date_1 as
			select &participant_id_var_,
				   CCS_category_id,
				   min(encounter_date) as first_occurrence format date11.
			from CCS_long_2
			group by &participant_id_var_,
				     CCS_category_id;

		quit;

		proc sql noprint;

			create table CCS_long_date_2 as
			select CCS_dates.*,
				   CCS_category_labels.CCS_category_descr
			from CCS_long_date_1 as CCS_dates
			inner join &concept_labels_ds_ as CCS_category_labels
				on CCS_dates.CCS_category_id eq CCS_category_labels.CCS_category_id;

		quit;

		*attach variable labels;
		data CCS_long_date_2;

			set CCS_long_date_2;

			label &participant_id_var_ = 'Participant ID';
			label CCS_category_id = 'CCS concept ID';
			label CCS_category_descr = 'CCS concept description';
			label first_occurrence = 'Date of first occurrence';

		run;

		*prepare data to be transformed into wide format: create variable names with common and optional prefixes, and sort;
		data CCS_long_date_2;

			length var_name $ 32;

			set CCS_long_date_2;

			var_name = cats("&var_name_prefix_.", 'CCS_', CCS_category_id, '_dt');

		run;

		proc sort data=CCS_long_date_2;
			by &participant_id_var_ CCS_category_id;
		run;

		proc transpose data=CCS_long_date_2 out=CCS_wide_date_1 (drop=_NAME_ _LABEL_);
			by &participant_id_var_;
			id var_name;
			idlabel CCS_category_descr;
			var first_occurrence;
		run;

		*post processing;

		*copy to new datasets in order to add individuals without disease/exposure (if requested);
		proc sql noprint;

			create table CCS_wide_binary_2 as
			select *
			from CCS_wide_binary_1;

			create table CCS_freq_2 as
			select *
			from CCS_freq_1;
			
			create table CCS_wide_date_2 as
			select *
			from CCS_wide_date_1;

			create table cluster_wide_binary_2 as
			select *
			from cluster_wide_binary_1;
			
		quit;

		*add individuals that have not been identified as having any of the health status categories;
		%if %length(&participant_id_ds_) > 0 %then %do;

			proc sql noprint;

				insert into CCS_wide_binary_2 (&participant_id_var_)
				select &participant_id_var_
				from &participant_id_ds_
				where &participant_id_var_ not in (select &participant_id_var_ from CCS_wide_binary_1);

				insert into CCS_freq_2 (&participant_id_var_)
				select &participant_id_var_
				from &participant_id_ds_
				where &participant_id_var_ not in (select &participant_id_var_ from CCS_freq_1);

				insert into CCS_wide_date_2 (&participant_id_var_)
				select &participant_id_var_
				from &participant_id_ds_
				where &participant_id_var_ not in (select &participant_id_var_ from CCS_wide_date_1);

				insert into cluster_wide_binary_2 (&participant_id_var_)
				select &participant_id_var_
				from &participant_id_ds_
				where &participant_id_var_ not in (select &participant_id_var_ from cluster_wide_binary_1);

			quit;

		%end;

		*set missing values to zero (false);
		proc stdize data=CCS_wide_binary_2 out=CCS_wide_binary_3 reponly missing=0;
		run;

		proc stdize data=CCS_freq_2 out=CCS_freq_3 reponly missing=0;
		run;

		proc stdize data=cluster_wide_binary_2 out=cluster_wide_binary_3 reponly missing=0;
		run;

		*copy final datasets to user-specified locations if the user requested they be outputted;
		%if %length(&out_wide_concept_binary_ds_name_) > 0 %then %do;
			data &out_wide_concept_binary_ds_name_;
				set CCS_wide_binary_3;
			run;
		%end;

		%if %length(&out_wide_concept_freq_ds_name_) > 0 %then %do;
			data &out_wide_concept_freq_ds_name_;
				set CCS_freq_3;
			run;
		%end;

		%if %length(&out_wide_concept_date_ds_name_) > 0 %then %do;
			data &out_wide_concept_date_ds_name_;
				set CCS_wide_date_2;
			run;
		%end;

		%if %length(&out_long_concept_ds_name_) > 0 %then %do;
			data &out_long_concept_ds_name_;
				set CCS_long_2;
			run;
		%end;

		%if %length(&out_skipped_codes_ds_name_) > 0 %then %do;
			data &out_skipped_codes_ds_name_;
				set skipped_diagnoses_freq_3;
			run;
		%end;

		%if %length(&out_wide_cluster_binary_ds_name_) > 0 %then %do;
			data &out_wide_cluster_binary_ds_name_;
				set cluster_wide_binary_3;
			run;
		%end;

		*delete all of the intermediary datasets unless the debug flag has been set;
		%if &debug_ = 0 %then %do;

			proc datasets library=work nolist;

				delete
				CCS_long_1
				skipped_diagnoses
				skipped_diagnoses_freq_1
				skipped_diagnoses_freq_2
				skipped_diagnoses_freq_3
				CCS_long_2
				CCS_long_bin_1
				CCS_long_bin_2
				CCS_wide_binary_1
				cluster_long_bin_1
				cluster_long_bin_2
				cluster_long_bin_3
				cluster_wide_binary_1
				CCS_freq_1
				CCS_long_date_1
				CCS_long_date_2
				CCS_wide_date_1
				CCS_wide_binary_2
				CCS_freq_2
				CCS_wide_date_2
				cluster_wide_binary_2
				CCS_wide_binary_3
				CCS_freq_3
				cluster_wide_binary_3;

			run;

		%end;

	%end;

%mend;