I figured I'd list these here as even if they don't help you, they may help someone else in future searching for a similar solution.
We use the below two macros in SAS to extract certain attributes, elements, values, etc. from XML credit inquiries. I've provided examples which I'm hoping will help explain how they work. I don't have time right now to go through it in detail, but wanted to provide something that you may find useful in the meantime. If you provide these to the analysts, they should be able to run the code as-is, and working through the examples and parameters, extract some information for themselves to do preliminary investigations and maybe give you more concrete requirements.
The only condition for the below macros, is that the XML is no longer than 32767 chars, and that it's all in a single character observation on a single row in SAS (ie. not stored over multiple observations in SAS).
They shouldn't really need to understand how the macros work, they just need to understand how to call and use them.
/*****************************************************************************
** PROGRAM: MACROS.PRXCOUNT.SAS
**
** RETURNS THE NUMBER OF TIMES A SEGMENT IS FOUND IN AN XML FILE.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iXMLField : The name of the field that contains the XML blob to parse.
** iDelimiterType: (1 or 2). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 25-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 14-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
*****************************************************************************/
%macro prxCount(iElement=, iXMLField=, iDelimiterType=1);
%local delim_open delim_close;
crLf = byte(10) || byte(13);
&iXMLField = compress(&iXMLField,crLf,);
%if &iDelimiterType eq 1 %then %do;
%let delim_open = <;
%let delim_close = >;
%end;
%else %if &iDelimiterType eq 2 %then %do;
%let delim_open = \[;
%let delim_close = \];
%end;
%else %if &iDelimiterType eq 3 %then %do;
%let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
%let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
%end;
%else %do;
%put ERR%str()ROR (prxCount.sas): You specified an incorrect option for the iDelimiterType parameter.;
%end;
prx_id = prxparse("/&delim_open&iElement(\s+.*?&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
prx_count = 0;
prx_start = 1;
prx_stop = length(&iXMLField);
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
do while (prx_pos > 0);
prx_count = prx_count + 1;
call prxposn(prx_id, 1, prx_pos, prx_length);
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
end;
drop crLf prx_:;
%mend;
/*****************************************************************************
** PROGRAM: PRXEXTRACT.SAS
**
** SEARCHES THROUGH AN XML (OR HTML) FILE FOR AN ELEMENT AND EXTRACTS THE
** VALUE BETWEEN AN ELEMENTS TAGS.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iField : The fieldname to save the result to.
** iType : (N or C) for Numeric or Character.
** iLength : The length of the field to create.
** iXMLField : The name of the field that contains the XML blob to parse.
** iDelimiterType: (1 or 2). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 14-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 16-FEB-2011 BY:RP
** - ADDED OPTION TO CHANGE DELIMITERS FROM <> TO []
** 1.1 MODIFIED: 17-FEB-2011 BY:RP
** - CORRECTED ERROR WHEN MATCH RETURNS A LENGTH OF ZERO
** - CORRECTED MISSING AMPERSAND FROM IDELIMITERTYPE CHECK.
** - ADDED ESCAPING QUOTES TO [] DELIMITER TYPE
** - CORRECTED WARNING WHEN MATCH RETURNS MISSING NUMERIC FIELD
** 1.2 MODIFIED: 25-FEB-2011 BY:RP
** - ADDED DELIMITER TYPES TO WORK WITH MASKED HTML CODES
** 1.3 MODIFIED: 11-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
** 1.4 MODIFIED: 14-MAR-2011 BY:RP
** - CORRECTED TO REMOVE FALSE MATCHES FROM PRIOR VERSION. ADDED EXAMPLE.
** 1.5 MODIFIED: 10-APR-2012 BY:RP
** - CORRECTED PROBLEM WITH ZERO LENGTH STRING MATCHES
** 1.6 MODIFIED: 22-MAY-2012 BY:RP
** - ADDED ABILITY TO CAPTURE ATTRIBUTES
*****************************************************************************/
%macro prxExtract(iElement=, iField=, iType=, iLength=, iXMLField=, iDelimiterType=1, iSequence=1, iAttributesField=);
%local delim_open delim_close;
crLf = byte(10) || byte(13);
&iXMLField = compress(&iXMLField,crLf,);
%if &iDelimiterType eq 1 %then %do;
%let delim_open = <;
%let delim_close = >;
%end;
%else %if &iDelimiterType eq 2 %then %do;
%let delim_open = \[;
%let delim_close = \];
%end;
%else %if &iDelimiterType eq 3 %then %do;
%let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
%let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
%end;
%else %do;
%put ERR%str()ROR (prxExtract.sas): You specified an incorrect option for the iDelimiterType parameter.;
%end;
%if %sysfunc(index(&iField,[)) %then %do;
/* DONT DO THIS IF ITS AN ARRAY */
%end;
%else %do;
%if "%upcase(&iType)" eq "N" %then %do;
attrib &iField length=&iLength format=best.;
%end;
%else %do;
attrib &iField length=$&iLength format=$&iLength..;
%end;
%end;
/*
** BREAKDOWN OF REGULAR EXPRESSION (EXAMPLE USES < AND > AS DELIMS AND ANI AS THE ELEMENT BEING LOOKED FOR:
**
** &delim_open&iElement --> FINDS <ANI
** (\s+.*?&delim_close|&delim_close){1}? --> FINDS THE SHORTEST SINGLE INSTANCE OF EITHER:
** - ONE OR MORE SPACES FOLLOWED BY ANYTHING UNTIL A > CHARACTER
** - OR JUST A > CHARACTER
** THE ?: JUST TELLS IT NOT TO CAPTURE WHAT IT FOUND INBETWEEN THE ( AND )
** (.*?) --> FINDS WHAT WE ARE SEARCHING FOR AND CAPTURES IT INTO BUFFER 1.
** &delim_open --> FINDS <
** \/ --> FINDS THE / CHARACTER. THE FIRST SLASH ESCAPES IT SO IT KNOWS ITS NOT A SPECIAL REGEX SLASH
** &iElement&delim_close --> FINDS ANI>
*/
prx_id = prxparse("/&delim_open&iElement((\s+.*?)&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
prx_start = 1;
prx_stop = length(&iXMLField);
prx_sequence = 0;
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
do while (prx_pos > 0);
prx_sequence = prx_sequence + 1;
if prx_sequence = &iSequence then do;
if prx_length > 0 then do;
call prxposn(prx_id, 3, prx_pos, prx_length);
%if "%upcase(&iType)" eq "N" %then %do;
length prx_tmp_n $200;
prx_tmp_n = substr(&iXMLField, prx_pos, prx_length);
if cats(prx_tmp_n) ne "" then do;
&iField = input(substr(&iXMLField, prx_pos, prx_length), ?best.);
end;
%end;
%else %do;
if prx_length ne 0 then do;
&iField = substr(&iXMLField, prx_pos, prx_length);
end;
else do;
&iField = "";
end;
%end;
**
** ALSO SAVE THE ATTRIBUTES TO A FIELD IF REQUESTED
*;
%if "%upcase(&iAttributesField)" ne "" %then %do;
call prxposn(prx_id, 2, prx_pos, prx_length);
if prx_length ne 0 then do;
&iAttributesField = substr(&iXMLField, prx_pos, prx_length);
end;
else do;
&iAttributesField = "";
end;
%end;
end;
end;
call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
end;
drop crLf prx:;
%mend;
Example for single element:
data example;
xml = "<test><ANI2Digits>00</ANI2Digits><XNI xniattrib=1>7606256091</XNI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>"; * NOTE THE XML MUST BE ALL ON ONE LINE;
%prxExtract(iElement=xni, iField=my_xni, iType=c, iLength=15, iXMLField=xml, iSequence=1, iAttributesField=my_xni_attribs);
run;
Example for repeating elements:
data example;
xml = "<test><ANI2Digits>00</ANI2Digits><ANI>7606256091</ANI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>"; * NOTE THE XML MUST BE ALL ON ONE LINE;
%prxExtract(iElement=ani2digits, iField=ani2digits, iType=c, iLength=50, iXMLField=xml);
length ani1-ani6 $15;
length attr1-attr6 $100;
array arrani [1:6] $ ani1-ani6;
array arrattr [1:6] $ attr1-attr6;
%prxCount (iElement=ani, iXMLField=xml, iDelimiterType=1);
do cnt=1 to prx_count;
%prxExtract(iElement=ani, iField=arrani[cnt], iType=c, iLength=15, iXMLField=xml, iSequence=cnt, iAttributesField=arrattr[cnt]);
end;
run;