/*---------------------------------------------------------------------------
© Copyright 2018University of Manitoba
This program is free software: you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
------------------------------------------------------------------------------
Programmer: Gurpreet Singh Pabla
Please direct questions and comments to info@cpe.umanitoba.ca
January 24, 2018
This macro produces a summary report containing the number of non-missing records in each
variable for two versions of the same dataset. For more descriptive information, see the
accompanying documentation for the PREDQ SAS Macro on the MCHP web site.
------------------------------------------------------------------------------
Macro Description:
%PREDQ(DOMAIN = name of the library.
,DB = Database name to compare to. It will automatically pull last member in the cluster
unless providing memnum in both DB & NEW_DTSET. May provide database prefix if new_dtset is
kept blank.
,NEW_DTSET = New Database. Keep this field blank if both the databases to compare are in
same cluster. provide memnum to compare specific member.
,OUTPUT_DIR = Location of output.
,FILE_ID = Text added to end of EXCEL filename
e.g.:
Compare last members of two different databases: %PREDQ(common,conc_2006,conc_2011,
G:\dmusers\username);
Compare last two members of same database:%PREDQ(HEALTH,MHMHLTH_MHID_2010APR,,
G:\dmusers\username);
Compare specific members of two same/different databases:%PREDQ(REGISTRY,VSA_MMDF_2000JAN(MEMNUM=2),
VSA_MMDF_2000JAN(MEMNUM=3),G:\dmusers\username);
Check the "Results" tab for the output or excel created at the location provided.
>>Orange implies 0% to 10% drop in number of records
>>Red implies 10% to 100% drop in number of records
>>Yellow on "Previous vs Latest Data Type" column implies data type change
>>Light Green on "Previous vs Latest Data Type" column implies data type is similar but format
is different
**************************************************************************/
/*OPTIONS MACROGEN MLOGIC MPRINT SYMBOLGEN;*/
%MACRO PREDQ(DOMAIN=,DB=,NEW_DTSET=,OUTPUT_DIR=,FILE_ID=);
LIBNAME DB_DIR SASSPDS "&DOMAIN." HOST='SPDS' SERV='5190' USER="&SPDSUSER" PASSWD="&SPDSPW"
ACLSPECIAL=YES ;
/***************If both databases are not in same cluster or MEMNUM is provided**************/
%IF ((%INDEX(%UPCASE(&DB.),MEMNUM) > 0 ) OR (%INDEX(%UPCASE(&NEW_DTSET.),MEMNUM) > 0) OR
("&NEW_DTSET." NE %STR("")) ) %THEN %DO;
/************Creating a flag if memnum is provided***********/
%IF ((%INDEX(%UPCASE(&DB.),MEMNUM) > 0 ) OR (%INDEX(%UPCASE(&NEW_DTSET.),MEMNUM) > 0)) %THEN %DO;
%LET MEM_PRSNT_FLAG = 'Y';
%END;
%ELSE %DO;
%LET MEM_PRSNT_FLAG = 'N';
%END;
%LET DB2 = &DB.;
%LET DB1 = &NEW_DTSET.;
%DO I = 1 %TO 2;
/***********Get member number(if not provided)***********/
ODS SELECT NONE;
ODS OUTPUT ENGINEHOST=CLSMEM_&I.;
PROC CONTENTS DATA=DB_DIR.&&DB&I. OUT=CONTENTS&I.;
RUN;
ODS OUTPUT CLOSE;
DATA _NULL_;
SET CLSMEM_&I.;
IF UPCASE(LABEL1) = 'CLUSTER MEMBERS ARE' THEN DO ;
CNT = (COUNT(CVALUE1,','));
CALL SYMPUTX("MEM&I.", CNT+1 );
END;
ELSE /*IF UPCASE(LABEL1) = 'DATA SET IS A CLUSTER' AND UPCASE(CVALUE1) = 'NO' THEN*/
CALL SYMPUTX("MEM&I.",1);
RUN;
PROC SQL;
CREATE TABLE CL_NM&I. AS
SELECT NAME,TYPE,LENGTH,FORMAT,FORMATL,FORMATD FROM CONTENTS&I. ORDER BY VARNUM;
QUIT;
PROC SQL NOPRINT;
SELECT COUNT(DISTINCT NAME) INTO :TOT_VAR&I. FROM CL_NM&I. ;
QUIT;
/********Concatenating data type, length and format*******/
DATA DT_TYP&I.(drop=type length FORMAT:);
SET CL_NM&I.;
length typ $40.;
IF TYPE = 2 THEN TYP = 'Char'||'('||strip(put(length,8.))||')('||strip(put(FORMAT,8.))||
strip(put(FORMATL,8.))||'.'||strip(put(FORMATD,8.))||')';
ELSE TYP = 'Num'||'('||strip(put(length,8.))||')('||strip(put(FORMAT,8.))||strip(put(FORMATL,8.))||
'.'||strip(put(FORMATD,8.))||')';
TYP=tranwrd(TYP, "(0.0)", "");
TYP=tranwrd(TYP, ".0)", ")");
run;
data _null_;
set CL_NM&I. end=end;
count+1;
call symputX("LIST_COL_NM"||left(count),left(trim(name)));
run;
/************Take non-missing count************/
PROC SQL;
CREATE TABLE DQLST_MEM&I. AS
SELECT
%DO K = 1 %TO &&TOT_VAR&I.;
%IF &K = &&TOT_VAR&I. %THEN %DO;
COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K.
%END;
%ELSE %DO;
COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. ,
%END;
%END;
%IF &MEM_PRSNT_FLAG. = 'Y' %THEN %DO;
FROM DB_DIR.&&DB&I.
%END;
%ELSE %DO;
FROM DB_DIR.&&DB&I.(MEMNUM=&&MEM&I.)
%END;
;
QUIT;
%if &I = 1 %THEN %DO;
PROC TRANSPOSE DATA=DQLST_MEM&I. OUT=TSPOS_DQLST_MEM&I. NAME=Variables_in_cluster
prefix=Latest_cluster_member;RUN;
%END;
%ELSE %DO;
PROC TRANSPOSE DATA=DQLST_MEM&I. OUT=TSPOS_DQLST_MEM&I. NAME=Variables_in_cluster
prefix=Previous_cluster_member;RUN;
DATA VAR_ORD(KEEP=NAME ORD);
SET CL_NM1
CL_NM2;
ORD = _N_;
RUN;
PROC SORT DATA=VAR_ORD NODUPKEY ;BY NAME;RUN;
%END;
%END;
/************Compare data type/formats of variables************/
PROC SQL;
CREATE TABLE DQ_DTTYP AS
SELECT COALESCE(A.NAME,B.NAME) AS NAM
, CASE WHEN A.TYP NOT IN (' ') AND B.TYP NOT IN (' ') AND A.TYP NE B.TYP
THEN STRIP(B.TYP)||" VS "||STRIP(A.TYP)
ELSE COALESCE(B.TYP,A.TYP)
END AS OLD_VS_NEW_TYP_DIFF
FROM DT_TYP1 A
FULL OUTER JOIN DT_TYP2 B
ON A.NAME = B.NAME
;
QUIT;
PROC SQL;
CREATE TABLE DQ_DTSET AS
SELECT COALESCE(A.VARIABLES_IN_CLUSTER,B.VARIABLES_IN_CLUSTER) AS VARIABLES_IN_CLUSTER,
B.PREVIOUS_CLUSTER_MEMBER1 AS PREVIOUS_CLUSTER_MEMBER, A.LATEST_CLUSTER_MEMBER1 AS LATEST_CLUSTER_MEMBER
FROM TSPOS_DQLST_MEM1 A
FULL OUTER JOIN TSPOS_DQLST_MEM2 B
ON A.VARIABLES_IN_CLUSTER = B.VARIABLES_IN_CLUSTER
;
QUIT;
PROC SQL;
CREATE TABLE DQ_DTSET_FIN AS
SELECT A.*
, B.OLD_VS_NEW_TYP_DIFF
FROM DQ_DTSET A
FULL OUTER JOIN DQ_DTTYP B
ON A.VARIABLES_IN_CLUSTER = B.NAM
;
QUIT;
PROC SQL;
CREATE TABLE DQ_DTSET_FINAL AS
SELECT A.*
FROM DQ_DTSET_FIN A
FULL OUTER JOIN VAR_ORD B
ON A.VARIABLES_IN_CLUSTER = B.NAME
ORDER BY ORD
;
QUIT;
/*****************Creating Output Report****************/
ODS SELECT ALL;
ods Tagsets.ExcelXP file="&OUTPUT_DIR.\%scan(&DB1,1,'(')_PREDQ&FILE_ID..xls" style=sasweb;
ods tagsets.ExcelXP options(sheet_name="predq" embedded_TITLES='yes' AUTOFILTER='ALL' frozen_headers = 'yes' )
HEADTEXT="";
TITLE1 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Orange implies 0% to 10% drop in number of records';
TITLE2 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Red implies 10% to 100% drop in number of records';
TITLE3 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Yellow on "Previous vs Latest Data Type" column
implies data type change';
TITLE4 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Light Green on "Previous vs Latest Data Type" column
implies data type is similar but format is different';
%IF &MEM_PRSNT_FLAG. = 'Y' %THEN %DO;
TITLE5 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: &DB1.
Previous Cluster member: &DB2.";
%END;
%ELSE %DO;
TITLE5 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: &DB1.(MEMNUM=&MEM1.)
Previous Cluster member: &DB2.(MEMNUM=&MEM2.)";
%END;
proc report data=DQ_DTSET_final nowd split='*';
columns Variables_in_cluster Previous_cluster_member Latest_cluster_member Percent_chng old_vs_new_typ_diff old_vs_new_typ;
define Variables_in_cluster /display "Variables In Cluster" LEFT;
define Previous_cluster_member /display "Previous Cluster Member Count" format=comma15. center ;
define Latest_cluster_member /display "Latest Cluster Member Count" format=comma15. center ;
define Percent_chng/computed "Percent Change" CENTER ;
define old_vs_new_typ_diff/NOPRINT;
define old_vs_new_typ/COMPUTED "Previous vs Latest Data Type";
compute Percent_chng/ character length=50;
if Previous_cluster_member = 0 AND Latest_cluster_member > 0 then Percent_chng = '100% ';
ELSE if Previous_cluster_member = 0 AND Latest_cluster_member = 0 then Percent_chng = '0% ';
else Percent_chng=strip(PUT((((Latest_cluster_member-Previous_cluster_member)/Previous_cluster_member))*100,8.2))||'%';
IF Previous_cluster_member = . THEN Percent_chng = 'Variable not in Previous Cluster';
else IF latest_cluster_member = . THEN Percent_chng = 'Variable not in Latest Cluster';
if substr(Percent_chng,1,1) ='-' and substr(Percent_chng,3,1) not in ('.') then call define(_row_,"style","
style={background=red font_weight=bold}") ;
else if substr(Percent_chng,1,1) ='-' then call define(_row_,"style","style={background=orange font_weight=bold}") ;
endcomp;
compute old_vs_new_typ/ character length=40;
old_vs_new_typ = old_vs_new_typ_diff;
if INDEX(old_vs_new_typ_diff,'VS') > 0 THEN DO;
IF SUBSTR(old_vs_new_typ_diff,1,INDEX(old_vs_new_typ_diff,")")) = substr(substr(old_vs_new_typ_diff,INDEX(old_vs_new_typ_diff,
" VS ")+4),1,index(substr(old_vs_new_typ_diff,INDEX(old_vs_new_typ_diff," VS ")+4),")") )
then call define(_col_,"style","style={background=light green font_weight=bold}") ;
else call define(_col_,"style","style={background=yellow font_weight=bold}") ;
END;
endcomp;
run;
ods tagsets.ExcelXP close;
TITLE1 '';
TITLE2 '';
TITLE3 '';
TITLE4 '';
TITLE5 '';
PROC DATASETS NOLIST LIBRARY=WORK KILL;
QUIT;
%end;
/*******************If both databases are in same cluster**************/
%ELSE /*%IF &NEW_DTSET = %THEN */ %DO;
/***********Get member number(if not provided)***********/
PROC SQL NOPRINT;
SELECT MEMNAME INTO :DSN SEPARATED BY ' '
FROM DICTIONARY.MEMBERS
WHERE LIBNAME = %UPCASE("&DOMAIN") AND %UPCASE(SUBSTR(MEMNAME, 1, %LENGTH(&DB))) = %UPCASE("&DB");
QUIT;
%LET N_DSN = %SYSFUNC(COUNTW(&DSN));
%DO F = 1 %TO &N_DSN;
ODS SELECT NONE;
ODS OUTPUT ENGINEHOST=CLSMEM&F.;
PROC CONTENTS DATA=DB_DIR.%scan(&dsn, &F) OUT=CONTENTS&F.;
RUN;
ODS OUTPUT CLOSE;
DATA _NULL_;
SET CLSMEM&F.;
IF UPCASE(LABEL1) = 'CLUSTER MEMBERS ARE' THEN DO ;
CNT = (COUNT(CVALUE1,','));
CALL SYMPUTX('MEM1',CNT+1);
CALL SYMPUTX('MEM2',CNT);
OUTPUT;
END;
ELSE IF UPCASE(STRIP(LABEL1)) = 'DATA SET IS A CLUSTER' AND UPCASE(STRIP(CVALUE1)) = 'NO' THEN DO;
CALL SYMPUTX("MEM2",0);
END;
RUN;
%PUT &MEM1. &MEM2.;
%IF &MEM2. = 0 %THEN %DO;
%PUT "ERROR: Cluster DB_DIR.%scan(&dsn, &F) has only one member";
%END;
%ELSE %DO;
PROC SQL;
CREATE TABLE CL_NM&F. AS
SELECT NAME FROM CONTENTS&F. ORDER BY VARNUM;
QUIT;
PROC SQL NOPRINT;
SELECT COUNT(DISTINCT NAME) INTO :TOT_VAR&F. FROM CL_NM&F. ;
QUIT;
DATA _NULL_;
SET CL_NM&F. END=END;
COUNT+1;
CALL SYMPUTX('LIST_COL_NM'||LEFT(COUNT),LEFT(TRIM(NAME)));
RUN;
/************Take non-missing count************/
%DO L = 1 %TO 2;
PROC SQL;
CREATE TABLE DQLST_MEM&L. AS
SELECT
%DO K = 1 %TO &&TOT_VAR&F.;
%IF &K = &&TOT_VAR&F. %THEN %DO;
COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K.
%END;
%ELSE %DO;
COUNT(&&LIST_COL_NM&K.) AS &&LIST_COL_NM&K. ,
%END;
%END;
FROM DB_DIR.%scan(&dsn, &F) (MEMNUM=&&MEM&L.)
;
QUIT;
%if &l = 1 %THEN %DO;
PROC TRANSPOSE DATA=DQLST_MEM&L. OUT=TSPOS_DQLST_MEM&L. NAME=Variables_in_cluster prefix=Latest_cluster_member;RUN;
%END;
%ELSE %DO;
PROC TRANSPOSE DATA=DQLST_MEM&L. OUT=TSPOS_DQLST_MEM&L. NAME=Variables_in_cluster prefix=Previous_cluster_member;RUN;
%END;
%END;
PROC SQL;
CREATE TABLE DQ_DTSET AS
SELECT A.Variables_in_cluster, B.Previous_cluster_member1 AS Previous_cluster_member, A.Latest_cluster_member1
AS Latest_cluster_member
FROM TSPOS_DQLST_MEM1 A
,TSPOS_DQLST_MEM2 B
WHERE A.Variables_in_cluster = B.Variables_in_cluster;
QUIT;
/***********Create output report***********/
ODS SELECT ALL;
ods Tagsets.ExcelXP file="&OUTPUT_DIR.\%scan(&dsn, &F)_PREDQ&FILE_ID..xls" style=sasweb;
ods tagsets.ExcelXP options(sheet_name="predq" embedded_TITLES='yes' AUTOFILTER='ALL' frozen_headers = 'yes' )
HEADTEXT="";
TITLE1 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Orange implies 0% to 10% drop in number of records';
TITLE2 JUSTIFY=LEFT ITALIC COLOR=BLACK FONT=Arial HEIGHT=9pt 'Red implies 10% to 100% drop in number of records';
TITLE3 JUSTIFY=LEFT ITALIC BOLD COLOR=BLACK FONT=Arial HEIGHT=9pt "Latest Cluster Member: %scan(&dsn, &F)(MEMNUM=&MEM1.)
Previous Cluster member: %scan(&dsn, &F)(MEMNUM=&MEM2.)";
proc report data=DQ_DTSET nowd split='*';
columns Variables_in_cluster Previous_cluster_member Latest_cluster_member Percent_chng;
define Variables_in_cluster /display "Variables In Cluster" CENTER;
define Previous_cluster_member /display "Previous Cluster Member" format=comma15. CENTER;
define Latest_cluster_member /display "Latest Cluster Member " format=comma15. CENTER;
define Percent_chng/computed "Percent Change" format=percent10.2 CENTER;
compute Percent_chng;
if Previous_cluster_member = 0 AND Latest_cluster_member > 0 then Percent_chng = 1;
ELSE if Previous_cluster_member = 0 AND Latest_cluster_member = 0 then Percent_chng = 0;
else Percent_chng=((Latest_cluster_member-Previous_cluster_member)/Previous_cluster_member);
if -0.1 < Percent_chng < 0 then call define(_row_,"style","style={background=orange font_weight=bold}") ;
else if Percent_chng <= -0.1 then call define(_row_,"style","style={background=red font_weight=bold}") ;
endcomp;
run;
ods Tagsets.ExcelXP close;
TITLE1 '';
TITLE2 '';
TITLE3 '';
%END;
PROC DATASETS NOLIST LIBRARY=WORK KILL;
QUIT;
%END;
%END;
%MEND;
/*%PREDQ(DOMAIN=health,DB=MH_HOSP_2012APR_DAD,NEW_DTSET=MH_HOSP_2013APR_DAD,OUTPUT_DIR=G:\dmusers\username);*/
/*%PREDQ(DOMAIN=common,DB=conc_2006,NEW_DTSET=conc_2011,OUTPUT_DIR=G:\dmusers\username);*/
/*%PREDQ(DOMAIN=HEALTH,DB=MHMHLTH_MHID_2010APR,NEW_DTSET=,OUTPUT_DIR=G:\dmusers\username);*/
/*%PREDQ(DOMAIN=REGISTRY,DB=VSA_MMDF_2000JAN(MEMNUM=13),NEW_DTSET=VSA_MMDF_2000JAN(MEMNUM=14),OUTPUT_DIR=G:\dmusers\username);*/
/*%PREDQ(DOMAIN= Common ,DB= TARIFF_,NEW_DTSET=,OUTPUT_DIR=G:\dmusers\username);*/