What @Joe says in the comments is true, but unfortunately it doesn't negate the fact that people often need to solve this kind of problem. Below are some macros that I use when I need to extract certain values out of XML/HTML. It's not perfect, but it's gotten the job done for everything I've needed.
The major limitation of the below macros is that they require the HTML/XML they are parsing to exist in a single field in SAS. The size limitation of a single field in SAS is 32767 chars, which means that if your HTML file is bigger than that then you will need to take just the subset of it that you need to work with.
Examples are included and the best way to figure out how it works is just to run the examples.
/*****************************************************************************
** PROGRAM: PRXEXTRACT.SAS
**
** SEARCHES THROUGH AN XML (OR HTML) FILE FOR AN ELEMENT AND EXTRACTS THE
** VALUE BETWEEN AN ELEMENTS TAGS.
**
** THE MACRO GENERATES DATA STEP STATEMENTS SO IT MUST BE CALLED FROM INSIDE
** A DATA STEP. NOTE THAT IT OVERWRITES iXMLField WITH A COPY THAT HAS ALL
** CARRIAGE RETURN AND LINE FEED CHARACTERS REMOVED.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iField : The fieldname to save the result to. If it contains a [ it is
**          treated as an array reference and no ATTRIB statement is
**          generated for it (declare the array elements yourself).
** iType : (N or C) for Numeric or Character.
** iLength : The length of the field to create.
** iXMLField : The name of the field that contains the XML blob to parse.
** iDelimiterType: (1/2/3). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
**                 3 USES ENCODED VALUES FOR <>.
**                 ANY OTHER VALUE WRITES AN ERROR TO THE LOG AND GENERATES
**                 NO DATA STEP CODE.
** iSequence : Which occurrence of the element to extract. Defaults to 1.
**             May be a constant or a data step variable/expression.
** iAttributesField: Optional. Fieldname to save the text found between the
**                   element name and the closing delimiter of the opening
**                   tag (the attributes). No length is declared for it here.
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 14-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 16-FEB-2011 BY:RP
** - ADDED OPTION TO CHANGE DELIMITERS FROM <> TO []
** 1.1 MODIFIED: 17-FEB-2011 BY:RP
** - CORRECTED ERROR WHEN MATCH RETURNS A LENGTH OF ZERO
** - CORRECTED MISSING AMPERSAND FROM IDELIMITERTYPE CHECK.
** - ADDED ESCAPING QUOTES TO [] DELIMITER TYPE
** - CORRECTED WARNING WHEN MATCH RETURNS MISSING NUMERIC FIELD
** 1.2 MODIFIED: 25-FEB-2011 BY:RP
** - ADDED DELIMITER TYPES TO WORK WITH MASKED HTML CODES
** 1.3 MODIFIED: 11-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
** 1.4 MODIFIED: 14-MAR-2011 BY:RP
** - CORRECTED TO REMOVE FALSE MATCHES FROM PRIOR VERSION. ADDED EXAMPLE.
** 1.5 MODIFIED: 10-APR-2012 BY:RP
** - CORRECTED PROBLEM WITH ZERO LENGTH STRING MATCHES
** 1.6 MODIFIED: 22-MAY-2012 BY:RP
** - ADDED ABILITY TO CAPTURE ATTRIBUTES
*****************************************************************************/
%macro prxExtract(iElement=, iField=, iType=, iLength=, iXMLField=, iDelimiterType=1, iSequence=1, iAttributesField=);
  %local delim_open delim_close;

  /*
  ** RESOLVE THE DELIMITERS BEFORE ANY DATA STEP CODE IS GENERATED SO THAT AN
  ** INVALID iDelimiterType STOPS THE MACRO INSTEAD OF BUILDING A PATTERN
  ** WITH EMPTY DELIMITERS (WHICH WOULD RETURN FALSE MATCHES).
  */
  %if &iDelimiterType eq 1 %then %do;
    %let delim_open = <;
    %let delim_close = >;
  %end;
  %else %if &iDelimiterType eq 2 %then %do;
    %let delim_open = \[;
    %let delim_close = \];
  %end;
  %else %if &iDelimiterType eq 3 %then %do;
    %let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
    %let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
  %end;
  %else %do;
    %put ERR%str()ROR (prxExtract.sas): You specified an incorrect option for the iDelimiterType parameter.;
    %return;
  %end;

  /* STRIP LINE BREAKS SO THE PATTERN CAN MATCH ACROSS WHAT WERE SEPARATE LINES */
  crLf = byte(10) || byte(13);
  &iXMLField = compress(&iXMLField,crLf,);

  %if %sysfunc(index(&iField,[)) %then %do;
    /* DONT DO THIS IF ITS AN ARRAY */
  %end;
  %else %do;
    %if "%upcase(&iType)" eq "N" %then %do;
      attrib &iField length=&iLength format=best.;
    %end;
    %else %do;
      attrib &iField length=$&iLength format=$&iLength..;
    %end;
  %end;

  /*
  ** BREAKDOWN OF REGULAR EXPRESSION (EXAMPLE USES < AND > AS DELIMS AND ANI AS THE ELEMENT BEING LOOKED FOR:
  **
  ** &delim_open&iElement --> FINDS <ANI
  ** ((\s+.*?)&delim_close|&delim_close){1}? --> CAPTURE BUFFER 1. FINDS A SINGLE INSTANCE OF EITHER:
  ** - ONE OR MORE SPACES FOLLOWED BY ANYTHING UNTIL A > CHARACTER.
  **   THE PART BEFORE THE > IS CAPTURE BUFFER 2 (THE ATTRIBUTES).
  ** - OR JUST A > CHARACTER (CAPTURE BUFFER 2 IS THEN NOT SET).
  ** (.*?) --> FINDS WHAT WE ARE SEARCHING FOR AND CAPTURES IT INTO BUFFER 3.
  ** &delim_open --> FINDS <
  ** \/ --> FINDS THE / CHARACTER. THE FIRST SLASH ESCAPES IT SO IT KNOWS ITS NOT A SPECIAL REGEX SLASH
  ** &iElement&delim_close --> FINDS ANI>
  */
  prx_id = prxparse("/&delim_open&iElement((\s+.*?)&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
  prx_start = 1;
  prx_stop = length(&iXMLField);
  prx_sequence = 0;

  call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  do while (prx_pos > 0);
    prx_sequence = prx_sequence + 1;

    if prx_sequence = &iSequence then do;

      /* BUFFER 3 HOLDS THE ELEMENT VALUE. A ZERO LENGTH MEANS AN EMPTY ELEMENT */
      /* AND MUST NEVER BE PASSED TO SUBSTR AS THE LENGTH ARGUMENT.             */
      call prxposn(prx_id, 3, prx_pos, prx_length);

      %if "%upcase(&iType)" eq "N" %then %do;
        if prx_length > 0 then do;
          /* SKIP ALL-BLANK VALUES SO THE TARGET IS LEFT UNTOUCHED */
          if not missing(substr(&iXMLField, prx_pos, prx_length)) then do;
            /* EXPLICIT WIDTH: A BARE BEST. INFORMAT ONLY READS THE FIRST 12 CHARACTERS */
            &iField = input(substr(&iXMLField, prx_pos, prx_length), ?best32.);
          end;
        end;
      %end;
      %else %do;
        if prx_length ne 0 then do;
          &iField = substr(&iXMLField, prx_pos, prx_length);
        end;
        else do;
          &iField = "";
        end;
      %end;

      /* ALSO SAVE THE ATTRIBUTES (BUFFER 2) TO A FIELD IF REQUESTED */
      %if "%upcase(&iAttributesField)" ne "" %then %do;
        call prxposn(prx_id, 2, prx_pos, prx_length);
        if prx_length ne 0 then do;
          &iAttributesField = substr(&iXMLField, prx_pos, prx_length);
        end;
        else do;
          &iAttributesField = "";
        end;
      %end;

      /* THE REQUESTED OCCURRENCE HAS BEEN HANDLED - NO NEED TO SCAN THE REST */
      leave;
    end;

    call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  end;

  /* ONLY DROP THE WORK VARIABLES CREATED HERE (ALL ARE PREFIXED PRX_) */
  drop crLf prx_:;
%mend;
Example for a single element:
data example;
  /* NOTE THE XML MUST BE ALL ON ONE LINE */
  xml = "<test><ANI2Digits>00</ANI2Digits><XNI xniattrib=1>7606256091</XNI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>";

  /* PULL THE FIRST XNI ELEMENT INTO MY_XNI AND ITS ATTRIBUTES INTO MY_XNI_ATTRIBS */
  %prxExtract(iElement         = xni,
              iField           = my_xni,
              iType            = c,
              iLength          = 15,
              iXMLField        = xml,
              iSequence        = 1,
              iAttributesField = my_xni_attribs);
run;
Example for repeating elements:
data example;
  /* NOTE THE XML MUST BE ALL ON ONE LINE */
  xml = "<test><ANI2Digits>00</ANI2Digits><ANI>7606256091</ANI><ANI>number2</ANI><ANI x=hmm y=yay>number3</ANI></test>";

  /* A SINGLE (NON-REPEATING) ELEMENT */
  %prxExtract(iElement=ani2digits, iField=ani2digits, iType=c, iLength=50, iXMLField=xml);

  /* TARGETS FOR THE REPEATING ELEMENT: UP TO 6 VALUES AND THEIR ATTRIBUTES */
  length ani1-ani6 $15;
  length attr1-attr6 $100;
  array arrani [1:6] $ ani1-ani6;
  array arrattr [1:6] $ attr1-attr6;

  /* PRX_COUNT IS SET BY THE MACRO TO THE NUMBER OF ANI ELEMENTS FOUND */
  %prxCount (iElement=ani, iXMLField=xml, iDelimiterType=1);

  /* NEVER LOOP PAST THE END OF THE ARRAYS IF THE XML HOLDS MORE THAN 6 MATCHES */
  do cnt=1 to min(prx_count, dim(arrani));
    %prxExtract(iElement=ani, iField=arrani[cnt], iType=c, iLength=15, iXMLField=xml, iSequence=cnt, iAttributesField=arrattr[cnt]);
  end;
run;
Finally - if you need the version for multiple elements you will also need the prxCount
macro:
/*****************************************************************************
** PROGRAM: MACROS.PRXCOUNT.SAS
**
** RETURNS THE NUMBER OF TIMES AN ELEMENT IS FOUND IN AN HTML/XML FILE.
**
** THE MACRO GENERATES DATA STEP STATEMENTS SO IT MUST BE CALLED FROM INSIDE
** A DATA STEP. THE RESULT IS LEFT IN THE DATA STEP VARIABLE PRX_COUNT, WHICH
** IS AVAILABLE FOR THE REST OF THE STEP BUT IS DROPPED FROM THE OUTPUT.
** NOTE THAT iXMLField IS OVERWRITTEN WITH A COPY THAT HAS ALL CARRIAGE
** RETURN AND LINE FEED CHARACTERS REMOVED.
**
** PARAMETERS:
** iElement : The element to search through the blob for.
** iXMLField : The name of the field that contains the XML blob to parse.
** iDelimiterType: (1/2/3). Defaults to 1. 1 USES <> AS DELIMS. 2 USES [].
** 3 USES ENCODED VALUES FOR <>.
** ANY OTHER VALUE WRITES AN ERROR TO THE LOG AND SETS PRX_COUNT TO 0.
**
******************************************************************************
** HISTORY:
** 1.0 MODIFIED: 25-FEB-2011 BY:RP
** - CREATED.
** 1.1 MODIFIED: 14-MAR-2011 BY:RP
** - MODIFIED TO ALLOW FOR OPTIONAL ATTRIBUTES ON THE ELEMENT BEING SEARCHED FOR.
*****************************************************************************/
%macro prxCount(iElement=, iXMLField=, iDelimiterType=1);
  %local delim_open delim_close;

  /*
  ** RESOLVE THE DELIMITERS BEFORE ANY SEARCH CODE IS GENERATED SO THAT AN
  ** INVALID iDelimiterType CANNOT BUILD A PATTERN WITH EMPTY DELIMITERS.
  */
  %if &iDelimiterType eq 1 %then %do;
    %let delim_open = <;
    %let delim_close = >;
  %end;
  %else %if &iDelimiterType eq 2 %then %do;
    %let delim_open = \[;
    %let delim_close = \];
  %end;
  %else %if &iDelimiterType eq 3 %then %do;
    %let delim_open = %nrbquote(&)lt%quote(%str(;)) ;
    %let delim_close = %nrbquote(&)gt%quote(%str(;)) ;
  %end;
  %else %do;
    %put ERR%str()ROR (prxCount.sas): You specified an incorrect option for the iDelimiterType parameter.;
    /* STILL DEFINE PRX_COUNT SO A CALLING LOOP OVER IT SIMPLY RUNS ZERO TIMES */
    prx_count = 0;
    drop prx_count;
    %return;
  %end;

  /* STRIP LINE BREAKS SO THE PATTERN CAN MATCH ACROSS WHAT WERE SEPARATE LINES */
  crLf = byte(10) || byte(13);
  &iXMLField = compress(&iXMLField,crLf,);

  /* OPENING TAG (WITH OR WITHOUT ATTRIBUTES), SHORTEST POSSIBLE CONTENT, CLOSING TAG. CASE INSENSITIVE. */
  prx_id = prxparse("/&delim_open&iElement(\s+.*?&delim_close|&delim_close){1}?(.*?)&delim_open\/&iElement&delim_close/i");
  prx_count = 0;
  prx_start = 1;
  prx_stop = length(&iXMLField);

  /* EACH SUCCESSFUL PRXNEXT CALL IS ONE MORE OCCURRENCE. PRX_POS IS 0 WHEN THERE ARE NO MORE MATCHES */
  call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  do while (prx_pos > 0);
    prx_count = prx_count + 1;
    call prxnext(prx_id, prx_start, prx_stop, &iXMLField, prx_pos, prx_length);
  end;

  drop crLf prx_:;
%mend;