All,
I been playing around with creating a C++ grammar from the Standards document N4567, which is the latest I could find. I believe the grammar is complete, but I need to test it. One issue I been trying to resolve is to have the lexer recognize Raw Strings from the standard. I've implemented a possible solution using Actions & Semantic Predicates. I need help determining if it actually works. I've read the ANTLR4 Reference on the interaction between Actions and Predicates but can decide if my solution is valid. A strip down grammar is included below. Any thoughts will be appreciated. I've tried to include my thoughts in the sample.
grammar SampleRaw;
@lexer::members {
string d_char_seq = "";
}
string_literal
: ENCODING_PREFIX? '\"' S_CHAR* '\"'
| ENCODING_PREFIX? 'R' Raw_String
;
ENCODING_PREFIX // one of
: 'u8'
| [uUL]
;
S_CHAR /* any member of the source character set except the
double_quote ", backslash \, or NEW_LINE character
*/
: ~[\"\\\n\r]
| ESCAPE_SEQUENCE
| UNIV_CHAR_NAME
;
fragment ESCAPE_SEQUENCE
: SIMPLE_ESCAPE_SEQ
| OCT_ESCAPE_SEQ
| HEX_ESCAPE_SEQ
;
fragment SIMPLE_ESCAPE_SEQ // one of
: '\\' '\''
| '\\' '\"'
| '\\' '?'
| '\\' '\\'
| '\\' 'a'
| '\\' 'b'
| '\\' 'f'
| '\\' 'n'
| '\\' 'r'
| '\\' 't'
| '\\' 'v'
;
fragment OCT_ESCAPE_SEQ
: [0-3] ( OCT_DIGIT OCT_DIGIT? )?
| [4-7] ( OCT_DIGIT )?
;
fragment HEX_ESCAPE_SEQ
: '\\' 'x' HEX_DIGIT+
;
fragment UNIV_CHAR_NAME
: '\\' 'u' HEX_QUAD
| '\\' 'U' HEX_QUAD HEX_QUAD
;
fragment HEX_QUAD
: HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
;
fragment HEX_DIGIT
: [a-zA-Z0-9]
;
fragment OCT_DIGIT
: [0-7]
;
/*
Raw_String
: '\"' D_CHAR* '(' R_CHAR* ')' D_CHAR* '\"'
;
*/
Raw_String
: ( /* CASE when D_CHAR is empty
ACTION in D_CHAR_SEQ attempts to reset variable d_char_seq
if it is empty, so handle it staticly
*/
'\"'
'('
( ~[)] // Anything but )
| [)] ~[\"] // ) Actually OK, can't be followed by "
// - )" - these are the terminating chars
)*
')'
'\"'
| '\"'
D_CHAR_SEQ /* Will the ACTION in D_CHAR_SEQ be an issue for
the Semantic Predicates Below????
*/
'('
( ~[)] // Anything but )
| [)] D_CHAR_SEQ { ( getText() != d_char_seq ) }?
/* ) Actually OK, can't be followed D_CHAR_SEQ match
IF D_CHAR_SEQs match, turn OFF the Alternative
*/
| [)] D_CHAR_SEQ { ( getText() == d_char_seq ) }? ~[\"]
/* ) Actually OK, must be followed D_CHAR_SEQ match
IF D_CHAR_SEQs match, turn ON the Alternative
Cant't match the final " , but
WE HAVE MATCHED OUR TERMINATING CHARS
*/
)*
')'
D_CHAR_SEQ /* No need to check here,
Matching Terminating CHARS is only way to get out
of loop above
*/
'\"'
)
{ d_char_seq = ""; } // Reset Variable
;
/*
fragment R_CHAR
// any member of the source character set, except a right
// parenthesis ) followed by the initial D_CHAR*
// (which may be empty) followed by a double quote ".
//
: ~[)]
;
*/
fragment D_CHAR
/* any member of the basic source character set except
space, the left parenthesis (, the right parenthesis ),
the backslash \, and the control characters representing
horizontal tab, vertical tab, form feed, and newline.
*/
: ~[ )(\\\t\v\f\n\r]
;
fragment D_CHAR_SEQ
: D_CHAR+ { d_char_seq = ( d_char_seq == "" ) ? getText() : d_char_seq ; }
;