/*--------------------------------------------------------------------------- © Copyright 2011-2013 University of Manitoba This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . ----------------------------------------------------------------------------- Author: Say Hong Please direct questions and comments to info@cpe.umanitoba.ca Date: Apr. 08, 2013 Updated Jan. 06, 2014 Description: This macro is an intermediate macro that has been developed to automatically perform the following based on the format defined. 1. Check for the invalid value of the character variables 2. Perform a range check for the numeric variables 3. To add descriptive label for the comment column in the vimo table. This macro will be called from dq_vimo macro. ---------------------------------------------------------------------------*/ %macro invalidchk; %local nobs num sysfmtlist varlist; %let sysfmtlist=; proc sql noprint; select distinct quote(compress(fmtname)) into :sysfmtlist separated by ' ' from dictionary.formats where source='B'; quit; data formats; set work.metadata; %IF &CLUSTER=YES %THEN %DO; if libname="&LIB_ORIG" & memname="&DSN" & compress(format) not in (" " &sysfmtlist); %END; %ELSE %DO; if libname="&LIB" & memname="&DSN" & compress(format) not in (" " &sysfmtlist); %END; run; %let dsid = %sysfunc(open(formats)); %let num = %sysfunc(attrn(&dsid,nlobs)); %let rc = %sysfunc(close(&dsid)); %if &num = 0 %then %do; %put; %put WARNING: Macro parameter invalidchk has been set to Y, but no user defined formats can be found in the Metadata data set.; %put WARNING: No invalid value check or range check will be performed.; %put; %goto exit; %end; proc sql noprint; select format into :fmtlist separated by " " from formats; quit; proc format cntlout=_outfmt library=work.formats; run; data _cntlin; set _outfmt; label = 'VALID'; default = 5; /* set default of a format to length of 'VALID'*/ run; proc datasets nolist memtype=catalog lib=work; delete formats; quit; proc format cntlin=_cntlin; run; proc format library=work.formats cntlout=fmtvalue(keep=fmtname start end label); select &fmtlist; run; data fmtvalue; set fmtvalue; value = compress(start); run; proc sql noprint; select quote(compress(fmtname)) into :fmtname separated by ',' from fmtvalue where compress(start) ^= compress(end); quit; proc contents data=&lib..&dsn out=indatafmt(keep=name format formatl) noprint; run; %let fmtexl=; /*add 28MAR2014*/ proc sql noprint undo_policy=none; select distinct quote(compress(fmtname)) into :fmtexl separated by ' ' from fmtvalue where compress(start) = compress(end); create table rangechk as select c.format as indatafmt, c.formatl, aa.* from (select distinct a.name, b.* from formats as a, fmtvalue as b where a.format = b.fmtname) as aa, indatafmt as c where aa.name = c.name; quit; proc sort data=fmtvalue; by start; run; /* range value to be display in comment column of vimo table */ data rangechk; length range $50; set rangechk; %if &fmtexl ^= %then /*add 28MAR2014*/ where fmtname not in (&fmtexl)%str(;); /* exclude variable with format defined as range and one-to-one map*/ indatafmt = cats(indatafmt, put(formatl, best.)); if compress(start) ^= compress(end) then do; if length(compress(fmtname)) >= 7 and substr(compress(fmtname),1,7) = 'YYMMDDD' then range = '['||compress(putn(input(compress(start), best.), indatafmt))||', '||compress(putn(input(compress(end),best.), indatafmt))||']'; else if length(compress(fmtname)) >= 4 and substr(compress(fmtname),1,4) = "DTTM" then range = '['||compress(putn(input(compress(start), best.), indatafmt))||', '||compress(putn(input(compress(end),best.), indatafmt))||']'; else range = '['||compress(start)||', '||compress(end)||']'; output; end; run; data _null_; if 0 then set rangechk nobs=nobs; call symputx('nobs', nobs); stop; run; %if &nobs > 0 %then %do; /*modify to add by name srtvalue 06 August, 2014*/ data rangechk; set rangechk; srtvalue = input(compress(start), best.); run; proc sort data=rangechk; by fmtname name srtvalue; run; data rangechk; retain cnt 0; set rangechk; by fmtname name; if first.name then cnt + 1; run; /* end modify */ proc print data=rangechk; run; proc sql noprint; select max(cnt) into :cnt from rangechk; quit; %do i = 1 %to &cnt; data fmt&i(keep=fmtname range); set rangechk; where cnt = &i; run; proc sql noprint undo_policy=none; select count(*) into :cnts from fmt&i; %if &cnts > 1 %then %do; select range into :range separated by ',' from fmt&i; create table fmt&i as select distinct fmtname as fmtname, "&range" as range from fmt&i; %end; quit; %end; data fmt; length range $100; %if &cnt > 1 %then set fmt1-fmt%sysfunc(compress(&cnt))%str(;); %else set %str(fmt1;); run; proc datasets nolist memtype=data; delete fmt1-fmt%sysfunc(compress(&cnt)); quit; proc sort data=fmt; by fmtname; run; %end; proc sort data=fmtvalue out=values nodupkey; by fmtname; run; data formats; set formats; if substr(format, 1, 1)='$' then format=substr(format, 2); run; proc sort data=formats ; by format; run; %if %sysfunc(exist(fmt)) = 1 %then %do; data formats fmtrange; merge formats(in=in_format drop=label) values(in=in_value rename=(fmtname=format)) fmt(rename=(fmtname=format)); by format; if in_format & in_value; if compress(format) in (&fmtname) then output fmtrange; else output formats; run; %end; %else %do; data formats; merge formats(in=in_format) values(in=in_value rename=(fmtname=format)); by format; if in_format & in_value; run; %end; proc sort data=fmtvalue nodupkey out=value; by fmtname label; /* nodup by label var */ run; proc print data=value; run; proc print data=formats; run; proc sql; create table chkinvalid as select a.fmtname, a.label, b.name, b.format, b.formatdot from value as a, formats as b where a.fmtname = b.format; quit; %if &nobs > 0 %then %do; /* if range check is performed */ data chkinvalid; set fmtrange(keep=name format formatdot label range) chkinvalid; run; %end; proc print data=chkinvalid; run; /*******************************************************************/ proc sql noprint; select distinct compress(name) into :varlist separated by ' ' from chkinvalid; %*%put varlist = &varlist; %do i = 1 %to %sysfunc(countw(&varlist)); select distinct "'"||tranwrd(compress(label), "'", "''")||"'", formatdot, format into :labelval&i separated by ', ', :formatdot, :fmtname from chkinvalid where name = "%scan(&varlist, &i)"; %*%put &&labelval&i; %let fmtname&i = &fmtname; %let varname&i = %scan(&varlist, &i); %if %substr(&formatdot,1,1) = $ %then %do; %let labelval&i = " ", &&labelval&i; %let invalidchk&i = trim(left(&&varname&i)), &formatdot; /*compress leading blank of character varialbe*/ %end; %else %do; %let labelval&i = ".", &&labelval&i; %let invalidchk&i = &&varname&i, &formatdot; %end; %end; quit; /* check for for invalid value for each of the variable where formats have been defined this block of code require to read in the dataset once which will significantly speed up the process for huge dataset */ data %do i = 1 %to %sysfunc(countw(&varlist)); invalid&i(keep=&&varname&i) %end; ; set &lib..&dsn; %do i = 1 %to %sysfunc(countw(&varlist)); if compress(put(&&invalidchk&i)) not in (&&labelval&i) then output invalid&i; %end; run; /**************************************************************************************/ proc sql noprint undo_policy=none; %do i = 1 %to %sysfunc(countw(&varlist)); select count(*) into :nrow from invalid&i; %if &nrow > 0 %then %do; select distinct &&varname&i into :invalidvalue separated by ', ' from invalid&i; create table invalid&i as select "&&varname&i" as varname, "&&fmtname&i" as fmtname, count(*) as ninvalid, count(distinct &&varname&i) as cnt, symget('invalidvalue') as invalid_codes from invalid&i; %end; %end; quit; %let invalid = ; %do i = 1 %to %sysfunc(countw(&varlist)); %if %sysfunc(exist(invalid&i)) = 1 %then %do; %let dsid = %sysfunc(open(invalid&i)); %let cntobs = %sysfunc(attrn(&dsid,nlobs)); %if &cntobs > 0 %then %let invalid = &invalid invalid&i; %let rc = %sysfunc(close(&dsid)); %end; %end; %if &invalid ^= %then %do; /* If invalid variable exist */ data invalid; length fmtname varname $40 invalid_codes $500; set &invalid; run; %* check if variable range exist in the dataset; %local rc dsid varexist; %let dsid = %sysfunc(open(chkinvalid)); %if %sysfunc(varnum(&dsid, range)) > 0 %then %let varexist = 1; %else %let varexist = 0; %let rc = %sysfunc(close(&dsid)); proc sort data=chkinvalid %if &varexist = 1 %then (keep=name range); %else (keep=name); nodupkey; by name; run; proc sort data=invalid; by varname; run; data invalid(keep=varname fmtname ninvalid invalid_codes); merge invalid(in=a) chkinvalid(rename=(name=varname)); by varname; if a; varname=upcase(varname); %if &varexist = 1 %then %do; if range ^= ' ' then do; if cnt > 5 then invalid_codes = catx(' ', put(ninvalid, 8.), "invalid obs. out of", trim(left(range)), "range"); else invalid_codes = catx(' ', invalid_codes, ' (', put(ninvalid,8.), 'Invalid Obs. in total )'); end; else do; if cnt > 5 then invalid_codes = catx(' ', cats(substr(invalid_codes, 1, index(invalid_codes, scan(invalid_codes, 5)) - 3), ', ...'), '(', strip(put(ninvalid,8.)), 'Invalid Obs. in total)'); else invalid_codes = catx(' ', invalid_codes, ' (', put(ninvalid,8.), 'Invalid Obs. in total )'); end; %end; %else %do; if cnt > 5 then /*invalid_codes = catx(' ', put(ninvalid, 8.), 'Invalid Obs. in total');*/ invalid_codes = catx(' ', cats(substr(invalid_codes, 1, index(invalid_codes, scan(invalid_codes, 5)) - 3), ', ...'), '(', strip(put(ninvalid,8.)), 'Invalid Obs. in total)'); else invalid_codes = catx(' ', invalid_codes, '(', put(ninvalid,8.), 'Invalid Obs. in total)'); %end; run; proc print data=invalid; run; %end; /* End If invalid variable exist */ proc datasets nolist memtype=catalog lib=work; delete formats; quit; proc format cntlin=_outfmt; run; proc datasets nolist lib=work memtype=data; delete invalid1-invalid%sysfunc(countw(&varlist)) chkinvalid rangechk fmtrange fmtvalue formats fmt _cntlin _outfmt indatafmt; quit; %exit: %mend invalidchk;