/*------------------------------------------------------------------------- © Copyright 2011-2013 University of Manitoba This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ----------------------------------------------------------------------------- Programmer: Say Hong Based on the previous work by Mahmoud Azimaee Please direct questions and comments to info@cpe.umanitoba.ca Version 2.0 Date : Feb. 12, 2013 updated to version 2.1 Jan. 06, 2014. revised 30 July 2014. This macro is used to produce a VIMO table for a specific dataset or a specific cluster member or a combination of cluster members. This macro can be used to perform a univariate data quality check if invalidchk=Y. Note that data quality check is based on the format defined, so that formats have to be loaded and dq_meta macro has to be run before running the macro. Otherwise set invalidchk=N. Macro Parameters: ds = Name of input dataset, could one or two level invalidchk = specifies whether data quality check should be performed, default value is N (valid value = Y/N) memnum = List of cluster members that are used to produce VIMO table, if blank then the macro will be run for a specific dataset (non-cluster) or the whole cluster if the dataset is a cluster postals = Space separated list of postal code variables to check for invalid postal code, if blank then no invalid check will be performed. muncodes = Space separated list of mun code variables to check for invalid mun code, if blank then no invalid check will be performed suppvar = Space separated list of variables that are suppressed in SPDS leave blank if non were suppressed idvars = Space separated list of ID variables to be put in the ID category of the DQ report nooutlier = Fields to suppress outlier calculation. This parameter can be either one of the following. 1. all (suppress outlier calculation for all numeric fields) 2. space separated list of variables to suppress outlier calculation 3. location and name of the text file that contains variables to suppress outlier calculation. validminmax = If set to y/Y then the min, max values for date and datetime variables are calculated based on the valid range defined by the format. missfmt = Variables that require format but were not provided by data provider ----------------------------------------------------------------------------*/ %macro dq_vimo(ds=, invalidchk=N, memnum=, postals=, muncodes=, suppvar=, idvars=, nooutlier=, validminmax=, missfmt=); %put; %put NOTE: Data Quality VIMO Macro (Version 2.1); %put; %if %upcase(&invalidchk) ^= Y & %upcase(&invalidchk) ^= N %then %do; %put; %put ERROR: The Parameter Invalidchk Must Be Y or N; %put ERROR: Please Check the Value of Invalidchk= Parameter; %put; %return; %end; %if &invalidchk=Y and %sysfunc(exist(metadata, data)) = 0 %then %do; %put; %put WARNING: Macro parameter invalidchk has been set to Y, but metadata data set does not exist; %put WARNING: Please make sure dq_meta macro has been run; %put WARNING: This macro stop without executing; %put; %return; %end; %if %sysfunc(exist(&ds, data)) = 0 %then %do; /* Check whether dataset exists */ %put; %put ERROR: Dataset &ds Does not exist; %return; %end; %else %do; %let dsid = %sysfunc(open(&ds)); %let ncnt = %sysfunc(attrn(&dsid,nlobs)); %let rc = %sysfunc(close(&dsid)); %if &ncnt = 0 %then %do; %put; %put WARNING: Data set &ds has no observations; %put WARNING: This macro stop without executing; %put; %return; %end; %end; proc datasets nolist memtype=data; delete vimo postmun; quit; %if %index(&ds,.) = 0 %then %do; %let lib = WORK; %let dsn = %upcase(&ds); %end; %else %do; %let lib = %upcase(%scan(&ds, 1, .)); %let dsn = %upcase(%scan(&ds, 2, .)); %end; %if &nooutlier ^= and %upcase(&nooutlier) ^= ALL %then %do; /* create a list to be suppressed for outlier calculation */ %if %index(&nooutlier, \) = 0 %then %do; %let nooutlierlst = %str(%")%upcase(%scan(&nooutlier, 1))%str(%"); %let i = 2; %do %while(%scan(&nooutlier, &i) ^= ); %let nooutlierlst = &nooutlierlst %str(%")%upcase(%scan(&nooutlier, &i))%str(%"); %let i = %eval(&i + 1); %end; %end; %else %do; data nooutlier; infile &nooutlier lrecl=32767; input var :$32. @@; run; proc sql noprint; select quote(upcase(compress(var))) into :nooutlierlst separated by ' ' from nooutlier; quit; %end; %*%put nooutlierlst = &nooutlierlst; %end; /* Add Feb. 26, 2014 */ %if &missfmt ^= %then %do; /* create a list to be suppressed for outlier calculation */ %if %index(&missfmt, \) = 0 %then %do; %let missfmtlst = "%upcase(%scan(&missfmt, 1))"; %let i = 2; %do %while(%scan(&missfmt, &i) ^= ); %let missfmtlst = &missfmtlst "%upcase(%scan(&missfmt, &i))"; %let i = %eval(&i + 1); %end; %end; %else %do; data missfmt; infile &missfmt lrecl=32767; input var :$32. @@; run; proc sql noprint; select quote(upcase(compress(var))) into :missfmtlst separated by ' ' from missfmt; quit; %end; %*%put missfmtlst = &missfmtlst; %end; /* */ %if &idvars ^= %then %do; /* create list of id variable to output on the id category of the DQ report */ %let idvarlist = %str(%")%upcase(%scan(&idvars, 1))%str(%"); %let i = 2; %do %while(%scan(&idvars, &i) ^= ); %let idvarlist = &idvarlist %str(%")%upcase(%scan(&idvars, &i))%str(%"); %let i = %eval(&i + 1); %end; %end; %local cluster lib_orig; %if &memnum ^= %then %do; %let cluster = YES; %let memn = %sysfunc(countw(&memnum)); %let cls = %scan(&ds, 2, '.'); %do i = 1 %to &memn; %let mem = %scan(&memnum, &i); data data&i; set &ds(memnum=&mem); run; %end; data &cls; set data1-data&memn; run; proc datasets nolist memtype=data lib=work; delete data1-data&memn; quit; %let lib_orig = &lib; %let lib = WORK; %let dsn = %upcase(&cls); %end; %let cdfmtlist=; %let userfmtvars=; %if %upcase(&invalidchk) = Y and %sysfunc(exist(metadata, data)) ^= 0 %then %do; %*%cdnum; %local usrfmtlist fmtlist; proc sql noprint; select distinct quote(compress(fmtname)) into :usrfmtlist separated by ' ' from dictionary.formats where source='C' and upcase(libname) = 'WORK'; quit; proc sql noprint; select distinct(format) into :fmtlist separated by " " from metadata where %IF &CLUSTER=YES %THEN %DO; libname="&LIB_ORIG" & memname="&DSN" & compress(format) in (&usrfmtlist) & upcase(type)='NUM'; %END; %ELSE %DO; libname="&LIB" & memname="&DSN" & compress(format) in (&usrfmtlist) & upcase(type)='NUM'; %END; quit; %if &fmtlist ^= %then %do; proc format library=work.formats cntlout=fmtdata(where=(compress(start)^='.' and compress(end)^='.') keep=fmtname start end label); select &fmtlist; run; proc sql noprint undo_policy=none; create table exclfmt as select distinct compress(fmtname) from fmtdata where compress(start) ^= compress(end); create table fmtdata as select * from fmtdata where compress(fmtname) not in (select * from exclfmt); /* obtain a list of variables to be put in the codenum category of VIMO table */ select quote(compress(name)) into :cdfmtlist separated by " " from metadata where memname="&DSN" & compress(format) in (select distinct compress(fmtname) from fmtdata); quit; %end; proc sql noprint; /* obtain a list of variables that have format */ select quote(compress(name)) into :userfmtvars separated by " " from metadata where memname="&DSN" & compress(format) in (&usrfmtlist); quit; proc datasets nolist memtype=data lib=work; delete formats fmtdata exclfmt; quit; %end; %let nvarlist=; %let cvarlist=; %let varlist=; %let outlierlist=; proc sql noprint; %if %upcase(&nooutlier) ^= ALL %then %do; select upcase(compress(name)) into :outlierlist separated by ' ' from dictionary.columns where libname="&lib" and memname="&dsn" and type='num' %if &nooutlier ^= and &cdfmtlist= %then and upcase(compress(name)) not in (%unquote(%bquote(&nooutlierlst))); %else %if &nooutlier= and &cdfmtlist^= %then and upcase(compress(name)) not in %upcase((%unquote(%bquote(&cdfmtlist)))); %else %if &nooutlier ^= and &cdfmtlist ^= %then and upcase(compress(name)) not in %upcase((%unquote(%bquote(&nooutlierlst &cdfmtlist)))); %str(;) %end; select upcase(compress(name)) into :nvarlist separated by ' ' from dictionary.columns where libname="&lib" and memname="&dsn" and type='num' %if &cdfmtlist ^= %then and upcase(compress(name)) not in %upcase((%unquote(%bquote(&cdfmtlist))));; select upcase(compress(name)) into :cvarlist separated by ' ' from dictionary.columns where libname="&lib" and memname="&dsn" and type='char'; quit; %let varlist = &nvarlist %upcase(%sysfunc(tranwrd(&cdfmtlist, %str(%"), %str( )))) &cvarlist; %if &nvarlist ne %then %do; proc means data=&lib..&dsn qmethod=P2 noprint; var &nvarlist; output out=nvout; output out=medians(drop=_type_ _freq_) median=; %if &outlierlist ^= %then /* outliers calculation if outlierlist is not empty */ output out=outlier q1= q3= qrange=/autoname%str(;); run; /************************************/ %if &outlierlist = %then %goto here; /* perform outlier check begin here */ data outlier; drop %do i = 1 %to %sysfunc(countw(&outlierlist)); %let var = %scan(&outlierlist, &i); %if %length(&var) > 27 %then %do; %substr(&var, 1, 27)_q1 %substr(&var, 1, 27)_q3 %substr(&var, 1, 23)_qrange %end; %else %if %length(&var) > 23 and %length(&var) <= 27 %then %do; &var._q1 &var._q3 %substr(&var, 1, 23)_qrange %end; %else %do; &var._q1 &var._q3 &var._qrange %end; %end; ; set outlier; %do i = 1 %to %sysfunc(countw(&outlierlist)); %let var = %scan(&outlierlist, &i); %if %length(&var) > 27 %then %do; %put %length(&var); %put &var; L&i = %substr(&var, 1, 27)_q1 - 2.5*%substr(&var, 1, 23)_qrange; U&i = %substr(&var, 1, 27)_q3 + 2.5*%substr(&var, 1, 23)_qrange; %end; %else %if %length(&var) > 23 and %length(&var) <= 27 %then %do; L&i = &var._q1 - 2.5*%substr(&var, 1, 23)_qrange; U&i = &var._q3 + 2.5*%substr(&var, 1, 23)_qrange; %end; %else %do; L&i = &var._q1 - 2.5*&var._qrange; U&i = &var._q3 + 2.5*&var._qrange; %end; %end; run; data outlier(keep=&outlierlist); drop &outlierlist; rename %do i = 1 %to %sysfunc(countw(&outlierlist)); _tmp&i = %scan(&outlierlist, &i) %end; %str(;) if _n_ = 1 then set outlier; set &lib..&dsn(keep=&outlierlist) end=eof; %do i = 1 %to %sysfunc(countw(&outlierlist)); %let var = %scan(&outlierlist, &i); if not missing(&var) and (&var < L&i | &var > U&i) then _tmp&i + 1; %end; if eof then output; run; proc transpose data=outlier out=outlier(rename=(col1=n_outlier _name_=varname)); run; /* End outlier check */ proc sort data=outlier; by Varname; run; %here: proc transpose data=medians out=medians(rename=(_name_=Varname col1=Median)); run; proc transpose data=nvout(drop=_type_ _freq_) out=nvout(drop=N rename=(_name_=Varname)); id _stat_; run; data medians; set medians; varname = upcase(varname); run; proc sort data=medians; by Varname; run; data nvout; set nvout; varname=upcase(varname); run; proc sort data=nvout; by Varname; run; data nvout; length Varname $32 type $8; retain type 'Num'; merge nvout medians %if &outlierlist ^= %then outlier;; by Varname; run; %end; /**********************************************/ %if &cdfmtlist ^= %then %let cvarlist = &cvarlist %upcase(%sysfunc(tranwrd(&cdfmtlist, %str(%"), %str( )))); /* add 30 July 2014 */ %let chrlist=; %if &cvarlist ne %then %do; %if &idvars ^= | &postals ^= %then %do; options minoperator; /* use IN (#) as operators in macro expressions */ %do i = 1 %to %sysfunc(countw(&cvarlist)); %if %scan(&cvarlist, &i) in %upcase(&idvars &postals) ^= 1 %then %let chrlist = &chrlist %scan(&cvarlist, &i); %end; %end; %else %let chrlist = &cvarlist; %if &chrlist ne %then %do; proc summary data=&lib..&dsn(keep=&chrlist) missing; class &chrlist; ways 1; output out=clevel(rename=(_freq_=Count) drop=_type_); run; %if &cdfmtlist ^= %then %do; /* add 30 July 2014 */ data clevel; %let numcode = %upcase(%sysfunc(tranwrd(&cdfmtlist, %str(%"), %str( )))); drop &numcode; rename %do i = 1 %to %sysfunc(countw(&numcode)); tmp&i = %scan(&numcode, &i) %end; ; set clevel; %do i = 1 %to %sysfunc(countw(&numcode)); tmp&i = put(%scan(&numcode, &i), best.); if compress(tmp&i) = '.' then tmp&i = ' '; %end; run; %end; data clevel(keep=varname values count); length varname $32 values $500; set clevel; %do i = 1 %to %sysfunc(countw(&chrlist)); if not missing(%scan(&chrlist, &i)) then varname = upcase("%scan(&chrlist, &i)"); %end; values = coalescec(%sysfunc(tranwrd(%sysfunc(compbl(&chrlist)), %str( ), %str(, )))); run; proc sort data=clevel; by varname descending count; run; data clevel(keep=varname values); set clevel; by varname; if first.varname then cnt = 0; cnt + 1; if cnt <= 10; run; proc transpose data=clevel out=clevel; by varname; var values; run; data clevel(keep=varname level Type); length level $1500 Type $8; *retain Type 'Char'; set clevel; where not missing(varname); %if &cdfmtlist ^= %then %do; if upcase(varname) in (%upcase(&cdfmtlist)) then type='Codenum'; else type='Char'; %end; %else type = 'Char'%str(;); level = catx(', ', of col:); run; %end; /* creating a list of postal codes that have a frequency > 20 */ %if &postals ^= %then %do; ods listing close; ods output OneWayFreqs = freqtable; proc freq data=&lib..&dsn order=freq; tables &postals; run; ods output close; ods listing; %let npostals = %sysfunc(countw(&postals)); proc sql outobs=20 nowarn noprint; %do i = 1 %to &npostals; select count(%scan(&postals, &i)) into :cnt from freqtable; %if &cnt > 0 %then %do; %let plist&i = ; select trim(left(%scan(&postals, &i))) into :plist&i separated by ', ' from freqtable where frequency > 20 and compress(%scan(&postals, &i)) not in ('.', ' ','n/a','N/A'); %if &cnt > 20 and &&plist&i ^= %then %let plist&i = %nrbquote(&&plist&i), ...; %end; %else %let plist&i = m; %end; quit; data clevel; %if &chrlist ^= %then %do; set clevel end=eof; output; if eof then do; %do i = 1 %to &npostals; Varname = upcase("%scan(&postals, &i)"); %if %nrbquote(&&plist&i) = %then %do; level = 'Postal codes suppressed due to small frequency count'; Type = 'Char'; *level = '...'; *Type = 'Char**'; %end; %else %if %nrbquote(&&plist&i) = m %then %do; level = ' '; Type = 'Char'; %end; %else %do; level = "&&plist&i"; Type = 'Char*'; %end; output; %end; end; %end; %else %do; %do i = 1 %to &npostals; Varname = upcase("%scan(&postals, &i)"); %if %nrbquote(&&plist&i) = %then %do; level = 'Postal codes suppressed due to small frequency count'; Type = 'Char'; *level = '...'; *Type = 'Char**'; %end; %else %if %nrbquote(&&plist&i) = m %then %do; level = ' '; Type = 'Char'; %end; %else %do; level = "&&plist&i"; Type = 'Char*'; %end; output; %end; %end; run; %end; /* end creating list of postal codes */ proc datasets nolist memtype=data; delete freqtable; quit; %end; /* End listing the values of the character variables */ data all; length Type $8; set %if %sysfunc(exist(clevel, data)) = 0 and %sysfunc(exist(nvout, data)) ^= 0 %then nvout%str(;); %else %if %sysfunc(exist(clevel, data)) ^= 0 and %sysfunc(exist(nvout, data)) = 0 %then clevel%str(;); %else nvout clevel%str(;); * if upcase(substr(varname, length(varname)-1)) = 'ID' then type='ID'; run; proc sort data=all; by Varname; run; proc contents data=&lib..&dsn out=contents(keep=name format formatl label rename=(name=Varname label=Varlabel)) noprint; run; data contents; set contents; varname = upcase(varname); run; proc sort data=contents; by Varname; run; data all; merge all contents; by Varname; if upcase(compress(format)) in ('YYMMDDD', 'YYMMDD', 'DATE', 'DDMMYY', 'MMDDYY') then Type = 'Date'; if upcase(compress(format)) in ('TIME', 'TIMEAMPM', 'HHMM') then Type = 'Time'; if upcase(compress(format)) in ('DATETIME', 'DATEAMPM') then Type = 'Datetime'; if index(upcase(Varname), 'PHIN') ^= 0 and cats(format,formatl) = 'Z15' then Type = 'ID'; if Type in ('Date', 'Time', 'Datetime') then do; if formatl ^= 0 then indatafmt = cats(format, put(formatl, best.)); else indatafmt = format; end; %if &idvars ^= %then if upcase(compress(Varname)) in (%unquote(%bquote(&idvarlist))) then Type = 'ID'%str(;); run; /****************************************************************/ /* Calucating the percentage of missing values for all variables*/ %let nvar = %sysfunc(countw(&varlist)); proc sql; create table nmiss as select count(*) as n, %do i = 1 %to &nvar-1; nmiss(%scan(&varlist, &i)) as miss&i, (calculated miss&i/calculated n)*100 as percent&i, %end; nmiss(%scan(&varlist, &nvar)) as miss&nvar, (calculated miss&nvar/calculated n)*100 as percent&nvar from &lib..&dsn; quit; data nmiss(keep=varname nmiss n pctmiss); length varname $32; array miss[&nvar]; array percent[&nvar]; set nmiss; %do i = 1 %to &nvar; varname = "%scan(&varlist, &i)"; nmiss = miss[&i]; pctmiss = percent[&i]; output; %end; run; /* End calculating the percentage of missing values */ %if %upcase(&validminmax) = Y %then %do; * Calculating valid min, max value for date and datetime variables; * Requested by Dave Towns; * Added Dec. 17, 2013; %local minmaxvar minmaxfmt; %let minmaxvar = ; %let minmaxfmt = ; proc sql noprint; select distinct upcase(name), format into :minmaxvar separated by ' ', :minmaxfmt separated by ' ' from metadata where memname = "&dsn" & compress(format) in (select distinct compress(fmtname) from dictionary.formats where source='C' and (substr(fmtname, 1, 4) = 'DTTM' or substr(fmtname, 1, 7) = 'YYMMDDD') and upcase(libname) = 'WORK'); %if &minmaxvar ^= %then %do; create table validminmax as select "%scan(&minmaxvar, 1)" as varname, min(%scan(&minmaxvar, 1)) as min, max(%scan(&minmaxvar, 1)) as max, 1 as validminmax from &lib..&dsn where upcase(put(%scan(&minmaxvar,1), %scan(&minmaxfmt,1).)) = 'VALID' %do i = 2 %to %sysfunc(countw(&minmaxvar)); union select "%scan(&minmaxvar, &i)" as varname, min(%scan(&minmaxvar, &i)) as min, max(%scan(&minmaxvar, &i)) as max, 1 as validminmax from &lib..&dsn where upcase(put(%scan(&minmaxvar,&i), %scan(&minmaxfmt,&i).)) = 'VALID' %end; order by varname; %end; %else %do; %put WARNING: No user-defined format have been assigned to Date or Datetime variables; %put WARNING: Min, Max value of Date or Datetime variables are calculated based on all data values; %goto exit; %end; ; quit; proc sql undo_policy=none; create table validminmax as select a.*, b.min, b.max, b.validminmax from all(where=(varname in (%upcase("%sysfunc(tranwrd(&minmaxvar, %str( ), %str(%" %")))"))) drop=min max) as a, validminmax as b where a.varname = b.varname; quit; data all; set all(where=(varname not in (%upcase("%sysfunc(tranwrd(&minmaxvar, %str( ), %str(%" %")))")))) validminmax; run; %exit: %end; proc sort data=all; by Varname; run; proc sort data=nmiss; by Varname; run; %let chr = ; %if &chrlist ^= %then %do; /* Add 31 July 2014 */ %do i = 1 %to %sysfunc(countw(&chrlist)); %let chr = &chr "%sysfunc(compress(%scan(&chrlist, &i)))"; %end; %end; data all; length _min $ 1600 _max $ 25; drop min max; rename _min=min _max=max; merge all nmiss; by Varname; %if &chr ^= %then %do; if pctmiss = 100 & Type = ' ' then do; /* Add 31 July 2014 */ if upcase(varname) in (&chr) then do; %if &cdfmtlist ^= %then %do; if upcase(varname) in (&cdfmtlist) then Type = 'Codenum'; else Type = 'Char'; %end; %else Type = 'Char'%str(;); end; end; %end; if upcase(Type) = 'NUM' and n_outlier ^= . then pctoutlier = (n_outlier/n)*100; if Type = 'ID' then do; mean = .; std = .; median = .; order = 1; end; else if Type in ('Date', 'Datetime', 'Time') then do; _min = putn(min, indatafmt); _max = putn(max, indatafmt); mean = .; std = .; median = .; order = 5*(Type='Date') + 6*(Type='Datetime') + 7*(Type='Time'); end; else if Type = 'Num' then do; _min = left(put(min, 10.2)); _max = left(put(max, 10.2)); order = 2; end; else if Type = 'Codenum' then do; _min = level; _max = ' '; order = 3; end; else do; _min = level; _max = ' '; order = 4; end; %if %symexist(minmaxvar) %then %do; %if %upcase(&validminmax) = Y and &minmaxvar ^= %then if validminmax = 1 then _min = cats(_min, '*')%str(;); %end; run; /* checking for invalid postal code and muncode */ %if &postals ^= | &muncodes ^= %then %do; %let postals = %upcase(&postals); %let muncodes = %upcase(&muncodes); %let postmun = &postals &muncodes; %let n_postmun = %sysfunc(countw(&postmun)); data postmun; array invalid_codes[&n_postmun] $500 (&n_postmun*' '); set &lib..&dsn(keep=&postals &muncodes) end=eof; %if &postals ^= %then %do; %let n_postals = %sysfunc(countw(&postals)); %do i = 1 %to &n_postals; %let postal = %scan(&postals, &i); if &postal ^= ' ' then do; if not(anyalpha(compress(&postal),1)=1 & anyalpha(compress(&postal),3)=3 & anyalpha(compress(&postal),5)=5 & anydigit(COMPRESS(&POSTAL),2)=2 & anydigit(COMPRESS(&POSTAL),4)=4 & anydigit(COMPRESS(&POSTAL),6)=6) then do; invalid&i + 1; if index(invalid_codes[&i],trim(left(&postal))) = 0 and index(invalid_codes[&i], '...') = 0 then do; counter&i + 1; if counter&i < 5 then invalid_codes[&i] = catx(', ', invalid_codes[&i], trim(left(&postal))); else if counter&i = 5 and index(invalid_codes[&i], '...') = 0 then invalid_codes[&i] = catx(', ', invalid_codes[&i], '...'); end; end; end; %end; %end; %if &muncodes ^= %then %do; %if &postals = %then %let n_postals = 0; %do i = 1 %to %sysfunc(countw(&muncodes)); %let j = %eval(&n_postals + &i); format &muncodes; %let muncode = %scan(&muncodes, &i); if &muncode ^= ' ' then do; if not ((anydigit(compress(&muncode),1)=1 | upcase(substr(compress(&muncode),1,1))='A') & anydigit(compress(&muncode),2)=2 & anydigit(compress(&muncode),3)=3 & length(strip(&muncode))= 3) then do; invalid&j + 1; if index(invalid_codes[&j], trim(left(&muncode))) = 0 and index(invalid_codes[&j], '...') = 0 then do; counter&j + 1; if counter&j < 5 then invalid_codes[&j] = catx(', ', invalid_codes[&j], strip(&muncode)); else if counter&j = 5 and index(invalid_codes[&j], '...') = 0 then invalid_codes[&j] = catx(', ', invalid_codes[&j], '...'); end; end; end; %end; %end; if eof then output; run; data postmun(keep=Varname ninvalid Invalid_codes); length Varname $32 Invalid_codes $500; array invalid[&n_postmun]; set postmun; %do i = 1 %to &n_postmun; if invalid_codes&i ^= ' ' | invalid[&i] then do; Varname = "%scan(&postmun, &i)"; ninvalid = invalid[&i]; Invalid_codes = cat(strip(invalid_codes&i), ' (', compress(put(ninvalid, 8.)), ' invalid obs. in total)'); output; end; %end; run; data _null_; if 0 then set postmun nobs=nobs; call symputx('nobs', nobs); stop; run; %if &nobs > 0 %then %do; proc sort data=postmun; by Varname; run; %end; %end; /* End checking for invalid postal code and muncode */ %if %upcase(&invalidchk) = Y %then %do; %invalidchk %if %sysfunc(exist(invalid, data)) ^= 0 %then %do; proc sort data=invalid; by varname; run; %end; data all; %if %sysfunc(exist(invalid, data)) ^= 0 %then %do; %if (&postals ^= | &muncodes ^=) %then %do; %if &nobs > 0 %then merge all invalid postmun%str(;); %else merge all invalid%str(;); %end; %else merge all invalid%str(;); %end; %else %do; %if (&postals ^= | &muncodes ^=) %then %do; %if &nobs > 0 %then merge all postmun%str(;); %else %do; length invalid_codes $20; set all; retain ninvalid . invalid_codes ' '; %end; %end; %else %do; length invalid_codes $20; set all; retain ninvalid . invalid_codes ' '; %end; %end; by Varname; if ninvalid ^= . then pctinvalid = (ninvalid/n)*100; if pctmiss = . and pctoutlier = . and pctinvalid = . then Valid = 100; else Valid = 100 - sum(pctmiss, pctoutlier, pctinvalid); run; %end; %else %if %upcase(&invalidchk) = N %then %do; data all; %if (&postals ^= | &muncodes ^=) %then %do; %if &nobs > 0 %then merge all postmun%str(;); %else %do; set all; retain ninvalid . invalid_codes ' '; %end; %end; %else %do; length invalid_codes $20; set all; retain ninvalid . invalid_codes ' '; %end; by Varname; if ninvalid ^= . then pctinvalid = (ninvalid/n)*100; if pctmiss = . and pctoutlier = . and pctinvalid = . then Valid = 100; else Valid = 100 - sum(pctmiss, pctoutlier, pctinvalid); run; %end; %if &suppvar ne %then %do; proc sql noprint; select max(length(Varname)) into :mxlen from all; quit; %let supvar = ; %do i = 1 %to %sysfunc(countw(&suppvar)); %let supvar = &supvar "%upcase(%scan(&suppvar, &i))"; %end; %put suppvar = &supvar; /*data all; set all; if upcase(Varname) in (&supvar) then Varname = catx(repeat(' ', &mxlen - length(Varname) + 2), Varname, '(S)'); run;*/ data all; set all; if upcase(Varname) in (&supvar) then do; min = 'SUPPRESSED'; max = ' '; mean = .; median = .; std = .; end; run; %end; /***/ data all; set all; /* flag comment column if the variable missing formt */ %if &missfmt ^= %then if varname in (&missfmtlst) then invalid_codes = 'Missing format'%str(;); %if &userfmtvars ^= %then %do; if Varname in (%upcase(&userfmtvars)) then do; if invalid_codes = ' ' then invalid_codes = 'Y'; end; %end; if pctmiss = 100 then invalid_codes = ' '; if Type = 'Char*' then do; Varname = catx(' ', Varname, '*'); Type = 'Char'; end; else if Type = 'Char**' then do; Varname = catx(' ', Varname, '**'); Type = 'Char'; end; run; /***/ proc sort data=all sortseq=linguistic(numeric_collation=on); by order varname; run; proc sql; create table vimo as select Type, Varname label='Variable Name', Varlabel label='Variable Label', Valid, pctinvalid as Invalid, pctmiss as Missing, pctoutlier as Outlier, Min as Min, Max as Max, Mean as Mean, Median, Std, invalid_codes as Comment from all /*order by order, Varname*/; quit; /* Invalid and Outlier are not mutually exclusive, if valid is negative then reset valid to 0*/ data vimo; set vimo; if valid < 0 then valid = 0; run; %if %substr(&dq_dir, %length(&dq_dir), 1) = \ %then %let dq_dir = %substr(&dq_dir, 1, %length(&dq_dir) - 1); proc export data=vimo outfile="&dq_dir\&dq_name..xls" dbms=excelcs label replace; sheet = 'vimo'; run; proc datasets nolist memtype=data; delete all clevel contents invalid medians nmiss nvout outlier value values nooutlier freqtable validminmax %if &memnum ^= %then &cls;; quit; %mend dq_vimo;