/*--------------------------------------------------------------------------- © Copyright 2018University of Manitoba This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ------------------------------------------------------------------------------ Programmer: Gurpreet Singh Pabla Please direct questions and comments to info@cpe.umanitoba.ca January 24, 2018 This macro produces a summary report containing the number of non-missing records in each variable for two versions of the same dataset. For more descriptive information, see the accompanying documentation for the PREDQ SAS Macro on the MCHP web site. ------------------------------------------------------------------------------ Macro Description: %PREDQ(DOMAIN = name of the library. ,DB = Database name to compare to. It will automatically pull last member in the cluster unless providing memnum in both DB & NEW_DTSET. May provide database prefix if new_dtset is kept blank. ,NEW_DTSET = New Database. Keep this field blank if both the databases to compare are in same cluster. provide memnum to compare specific member. ,OUTPUT_DIR = Location of output. ,FILE_ID = Text added to end of EXCEL filename e.g.: Compare last members of two different databases: %PREDQ(common,conc_2006,conc_2011, G:\dmusers\username); Compare last two members of same database:%PREDQ(HEALTH,MHMHLTH_MHID_2010APR,, G:\dmusers\username); Compare specific members of two same/different databases:%PREDQ(REGISTRY,VSA_MMDF_2000JAN(MEMNUM=2), VSA_MMDF_2000JAN(MEMNUM=3),G:\dmusers\username); Check the "Results" tab for the output or excel created at the location provided. >>Orange implies 0% to 10% drop in number of records >>Red implies 10% to 100% drop in number of records >>Yellow on "Previous vs Latest Data Type" column implies data type change >>Light Green on "Previous vs Latest Data Type" column implies data type is similar but format is different **************************************************************************/ /*OPTIONS MACROGEN MLOGIC MPRINT SYMBOLGEN;*/ %MACRO PREDQ(DOMAIN=,DB=,NEW_DTSET=,OUTPUT_DIR=,FILE_ID=); LIBNAME DB_DIR SASSPDS "&DOMAIN." HOST='SPDS' SERV='5190' USER="&SPDSUSER" PASSWD="&SPDSPW" ACLSPECIAL=YES ; /***************If both databases are not in same cluster or MEMNUM is provided**************/ %IF ((%INDEX(%UPCASE(&DB.),MEMNUM) > 0 ) OR (%INDEX(%UPCASE(&NEW_DTSET.),MEMNUM) > 0) OR ("&NEW_DTSET." NE %STR("")) ) %THEN %DO; /************Creating a flag if memnum is provided***********/ %IF ((%INDEX(%UPCASE(&DB.),MEMNUM) > 0 ) OR (%INDEX(%UPCASE(&NEW_DTSET.),MEMNUM) > 0)) %THEN %DO; %LET MEM_PRSNT_FLAG = 'Y'; %END; %ELSE %DO; %LET MEM_PRSNT_FLAG = 'N'; %END; %LET DB2 = &DB.; %LET DB1 = &NEW_DTSET.; %DO I = 1 %TO 2; /***********Get member number(if not provided)***********/ ODS SELECT NONE; ODS OUTPUT ENGINEHOST=CLSMEM_&I.; PROC CONTENTS DATA=DB_DIR.&&DB&I. OUT=CONTENTS&I.; RUN; ODS OUTPUT CLOSE; DATA _NULL_; SET CLSMEM_&I.; IF UPCASE(LABEL1) = 'CLUSTER MEMBERS ARE' THEN DO ; CNT = (COUNT(CVALUE1,',')); CALL SYMPUTX("MEM&I.", CNT+1 ); END; ELSE /*IF UPCASE(LABEL1) = 'DATA SET IS A CLUSTER' AND UPCASE(CVALUE1) = 'NO' THEN*/ CALL SYMPUTX("MEM&I.",1); RUN; PROC SQL; CREATE TABLE CL_NM&I. AS SELECT NAME,TYPE,LENGTH,FORMAT,FORMATL,FORMATD FROM CONTENTS&I. ORDER BY VARNUM; QUIT; PROC SQL NOPRINT; SELECT COUNT(DISTINCT NAME) INTO :TOT_VAR&I. FROM CL_NM&I. ; QUIT; /********Concatenating data type, length and format*******/ DATA DT_TYP&I.(drop=type length FORMAT:); SET CL_NM&I.; length typ $40.; IF TYPE = 2 THEN TYP = 'Char'||'('||strip(put(length,8.))||')('||strip(put(FORMAT,8.))|| strip(put(FORMATL,8.))||'.'||strip(put(FORMATD,8.))||')'; ELSE TYP = 'Num'||'('||strip(put(length,8.))||')('||strip(put(FORMAT,8.))||strip(put(FORMATL,8.))|| '.'||strip(put(FORMATD,8.))||')'; TYP=tranwrd(TYP, "(0.0)", ""); TYP=tranwrd(TYP, ".0)", ")"); run; data _null_; set CL_NM&I. end=end; count+1; call symputX("LIST_COL_NM"||left(count),left(trim(name))); run; /************Take non-missing count************/ PROC SQL; CREATE TABLE DQLST_MEM&I. AS SELECT %DO K = 1 %TO &&TOT_VAR&I.; %IF &K = &&TOT_VAR&I. %THEN %DO; COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. %END; %ELSE %DO; COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. , %END; %END; %IF &MEM_PRSNT_FLAG. = 'Y' %THEN %DO; FROM DB_DIR.&&DB&I. %END; %ELSE %DO; FROM DB_DIR.&&DB&I.(MEMNUM=&&MEM&I.) %END; ; QUIT; %if &I = 1 %THEN %DO; PROC TRANSPOSE DATA=DQLST_MEM&I. OUT=TSPOS_DQLST_MEM&I. NAME=Variables_in_cluster prefix=Latest_cluster_member;RUN; %END; %ELSE %DO; PROC TRANSPOSE DATA=DQLST_MEM&I. OUT=TSPOS_DQLST_MEM&I. NAME=Variables_in_cluster prefix=Previous_cluster_member;RUN; DATA VAR_ORD(KEEP=NAME ORD); SET CL_NM1 CL_NM2; ORD = _N_; RUN; PROC SORT DATA=VAR_ORD NODUPKEY ;BY NAME;RUN; %END; %END; /************Compare data type/formats of variables************/ PROC SQL; CREATE TABLE DQ_DTTYP AS SELECT COALESCE(A.NAME,B.NAME) AS NAM , CASE WHEN A.TYP NOT IN (' ') AND B.TYP NOT IN (' ') AND A.TYP NE B.TYP THEN STRIP(B.TYP)||" VS "||STRIP(A.TYP) ELSE COALESCE(B.TYP,A.TYP) END AS OLD_VS_NEW_TYP_DIFF FROM DT_TYP1 A FULL OUTER JOIN DT_TYP2 B ON A.NAME = B.NAME ; QUIT; PROC SQL; CREATE TABLE DQ_DTSET AS SELECT COALESCE(A.VARIABLES_IN_CLUSTER,B.VARIABLES_IN_CLUSTER) AS VARIABLES_IN_CLUSTER, B.PREVIOUS_CLUSTER_MEMBER1 AS PREVIOUS_CLUSTER_MEMBER, A.LATEST_CLUSTER_MEMBER1 AS LATEST_CLUSTER_MEMBER FROM TSPOS_DQLST_MEM1 A FULL OUTER JOIN TSPOS_DQLST_MEM2 B ON A.VARIABLES_IN_CLUSTER = B.VARIABLES_IN_CLUSTER ; QUIT; PROC SQL; CREATE TABLE DQ_DTSET_FIN AS SELECT A.* , B.OLD_VS_NEW_TYP_DIFF FROM DQ_DTSET A FULL OUTER JOIN DQ_DTTYP B ON A.VARIABLES_IN_CLUSTER = B.NAM ; QUIT; PROC SQL; CREATE TABLE DQ_DTSET_FINAL AS SELECT A.* FROM DQ_DTSET_FIN A FULL OUTER JOIN VAR_ORD B ON A.VARIABLES_IN_CLUSTER = B.NAME ORDER BY ORD ; QUIT; /*****************Creating Output Report****************/ ODS SELECT ALL; ods Tagsets.ExcelXP file="&OUTPUT_DIR.\%scan(&DB1,1,'(')_PREDQ&FILE_ID..xls" style=sasweb; ods tagsets.ExcelXP options(sheet_name="predq" embedded_TITLES='yes' AUTOFILTER='ALL' frozen_headers = 'yes' ) HEADTEXT=""; TITLE1 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Orange implies 0% to 10% drop in number of records'; TITLE2 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Red implies 10% to 100% drop in number of records'; TITLE3 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Yellow on "Previous vs Latest Data Type" column implies data type change'; TITLE4 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Light Green on "Previous vs Latest Data Type" column implies data type is similar but format is different'; %IF &MEM_PRSNT_FLAG. = 'Y' %THEN %DO; TITLE5 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: &DB1. Previous Cluster member: &DB2."; %END; %ELSE %DO; TITLE5 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: &DB1.(MEMNUM=&MEM1.) Previous Cluster member: &DB2.(MEMNUM=&MEM2.)"; %END; proc report data=DQ_DTSET_final nowd split='*'; columns Variables_in_cluster Previous_cluster_member Latest_cluster_member Percent_chng old_vs_new_typ_diff old_vs_new_typ; define Variables_in_cluster /display "Variables In Cluster" LEFT; define Previous_cluster_member /display "Previous Cluster Member Count" format=comma15. center ; define Latest_cluster_member /display "Latest Cluster Member Count" format=comma15. center ; define Percent_chng/computed "Percent Change" CENTER ; define old_vs_new_typ_diff/NOPRINT; define old_vs_new_typ/COMPUTED "Previous vs Latest Data Type"; compute Percent_chng/ character length=50; if Previous_cluster_member = 0 AND Latest_cluster_member > 0 then Percent_chng = '100% '; ELSE if Previous_cluster_member = 0 AND Latest_cluster_member = 0 then Percent_chng = '0% '; else Percent_chng=strip(PUT((((Latest_cluster_member-Previous_cluster_member)/Previous_cluster_member))*100,8.2))||'%'; IF Previous_cluster_member = . THEN Percent_chng = 'Variable not in Previous Cluster'; else IF latest_cluster_member = . THEN Percent_chng = 'Variable not in Latest Cluster'; if substr(Percent_chng,1,1) ='-' and substr(Percent_chng,3,1) not in ('.') then call define(_row_,"style"," style={background=red font_weight=bold}") ; else if substr(Percent_chng,1,1) ='-' then call define(_row_,"style","style={background=orange font_weight=bold}") ; endcomp; compute old_vs_new_typ/ character length=40; old_vs_new_typ = old_vs_new_typ_diff; if INDEX(old_vs_new_typ_diff,'VS') > 0 THEN DO; IF SUBSTR(old_vs_new_typ_diff,1,INDEX(old_vs_new_typ_diff,")")) = substr(substr(old_vs_new_typ_diff,INDEX(old_vs_new_typ_diff, " VS ")+4),1,index(substr(old_vs_new_typ_diff,INDEX(old_vs_new_typ_diff," VS ")+4),")") ) then call define(_col_,"style","style={background=light green font_weight=bold}") ; else call define(_col_,"style","style={background=yellow font_weight=bold}") ; END; endcomp; run; ods tagsets.ExcelXP close; TITLE1 ''; TITLE2 ''; TITLE3 ''; TITLE4 ''; TITLE5 ''; PROC DATASETS NOLIST LIBRARY=WORK KILL; QUIT; %end; /*******************If both databases are in same cluster**************/ %ELSE /*%IF &NEW_DTSET = %THEN */ %DO; /***********Get member number(if not provided)***********/ PROC SQL NOPRINT; SELECT MEMNAME INTO :DSN SEPARATED BY ' ' FROM DICTIONARY.MEMBERS WHERE LIBNAME = %UPCASE("&DOMAIN") AND %UPCASE(SUBSTR(MEMNAME, 1, %LENGTH(&DB))) = %UPCASE("&DB"); QUIT; %LET N_DSN = %SYSFUNC(COUNTW(&DSN)); %DO F = 1 %TO &N_DSN; ODS SELECT NONE; ODS OUTPUT ENGINEHOST=CLSMEM&F.; PROC CONTENTS DATA=DB_DIR.%scan(&dsn, &F) OUT=CONTENTS&F.; RUN; ODS OUTPUT CLOSE; DATA _NULL_; SET CLSMEM&F.; IF UPCASE(LABEL1) = 'CLUSTER MEMBERS ARE' THEN DO ; CNT = (COUNT(CVALUE1,',')); CALL SYMPUTX('MEM1',CNT+1); CALL SYMPUTX('MEM2',CNT); OUTPUT; END; ELSE IF UPCASE(STRIP(LABEL1)) = 'DATA SET IS A CLUSTER' AND UPCASE(STRIP(CVALUE1)) = 'NO' THEN DO; CALL SYMPUTX("MEM2",0); END; RUN; %PUT &MEM1. &MEM2.; %IF &MEM2. = 0 %THEN %DO; %PUT "ERROR: Cluster DB_DIR.%scan(&dsn, &F) has only one member"; %END; %ELSE %DO; PROC SQL; CREATE TABLE CL_NM&F. AS SELECT NAME FROM CONTENTS&F. ORDER BY VARNUM; QUIT; PROC SQL NOPRINT; SELECT COUNT(DISTINCT NAME) INTO :TOT_VAR&F. FROM CL_NM&F. ; QUIT; DATA _NULL_; SET CL_NM&F. END=END; COUNT+1; CALL SYMPUTX('LIST_COL_NM'||LEFT(COUNT),LEFT(TRIM(NAME))); RUN; /************Take non-missing count************/ %DO L = 1 %TO 2; PROC SQL; CREATE TABLE DQLST_MEM&L. AS SELECT %DO K = 1 %TO &&TOT_VAR&F.; %IF &K = &&TOT_VAR&F. %THEN %DO; COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. %END; %ELSE %DO; COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. , %END; %END; FROM DB_DIR.%scan(&dsn, &F) (MEMNUM=&&MEM&L.) ; QUIT; %if &l = 1 %THEN %DO; PROC TRANSPOSE DATA=DQLST_MEM&L. OUT=TSPOS_DQLST_MEM&L. NAME=Variables_in_cluster prefix=Latest_cluster_member;RUN; %END; %ELSE %DO; PROC TRANSPOSE DATA=DQLST_MEM&L. OUT=TSPOS_DQLST_MEM&L. NAME=Variables_in_cluster prefix=Previous_cluster_member;RUN; %END; %END; PROC SQL; CREATE TABLE DQ_DTSET AS SELECT A.Variables_in_cluster, B.Previous_cluster_member1 AS Previous_cluster_member, A.Latest_cluster_member1 AS Latest_cluster_member FROM TSPOS_DQLST_MEM1 A ,TSPOS_DQLST_MEM2 B WHERE A.Variables_in_cluster = B.Variables_in_cluster; QUIT; /***********Create output report***********/ ODS SELECT ALL; ods Tagsets.ExcelXP file="&OUTPUT_DIR.\%scan(&dsn, &F)_PREDQ&FILE_ID..xls" style=sasweb; ods tagsets.ExcelXP options(sheet_name="predq" embedded_TITLES='yes' AUTOFILTER='ALL' frozen_headers = 'yes' ) HEADTEXT=""; TITLE1 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Orange implies 0% to 10% drop in number of records'; TITLE2 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Red implies 10% to 100% drop in number of records'; TITLE3 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: %scan(&dsn, &F)(MEMNUM=&MEM1.) Previous Cluster member: %scan(&dsn, &F)(MEMNUM=&MEM2.)"; proc report data=DQ_DTSET nowd split='*'; columns Variables_in_cluster Previous_cluster_member Latest_cluster_member Percent_chng; define Variables_in_cluster /display "Variables In Cluster" CENTER; define Previous_cluster_member /display "Previous Cluster Member" format=comma15. CENTER; define Latest_cluster_member /display "Latest Cluster Member " format=comma15. CENTER; define Percent_chng/computed "Percent Change" format=percent10.2 CENTER; compute Percent_chng; if Previous_cluster_member = 0 AND Latest_cluster_member > 0 then Percent_chng = 1; ELSE if Previous_cluster_member = 0 AND Latest_cluster_member = 0 then Percent_chng = 0; else Percent_chng=((Latest_cluster_member-Previous_cluster_member)/Previous_cluster_member); if -0.1 < Percent_chng < 0 then call define(_row_,"style","style={background=orange font_weight=bold}") ; else if Percent_chng <= -0.1 then call define(_row_,"style","style={background=red font_weight=bold}") ; endcomp; run; ods Tagsets.ExcelXP close; TITLE1 ''; TITLE2 ''; TITLE3 ''; %END; PROC DATASETS NOLIST LIBRARY=WORK KILL; QUIT; %END; %END; %MEND; /*%PREDQ(DOMAIN=health,DB=MH_HOSP_2012APR_DAD,NEW_DTSET=MH_HOSP_2013APR_DAD,OUTPUT_DIR=G:\dmusers\username);*/ /*%PREDQ(DOMAIN=common,DB=conc_2006,NEW_DTSET=conc_2011,OUTPUT_DIR=G:\dmusers\username);*/ /*%PREDQ(DOMAIN=HEALTH,DB=MHMHLTH_MHID_2010APR,NEW_DTSET=,OUTPUT_DIR=G:\dmusers\username);*/ /*%PREDQ(DOMAIN=REGISTRY,DB=VSA_MMDF_2000JAN(MEMNUM=13),NEW_DTSET=VSA_MMDF_2000JAN(MEMNUM=14),OUTPUT_DIR=G:\dmusers\username);*/ /*%PREDQ(DOMAIN= Common ,DB= TARIFF_,NEW_DTSET=,OUTPUT_DIR=G:\dmusers\username);*/