I am trying to parse Windows header files to extract function prototypes. Microsoft being Microsoft, the function prototypes are not in a regular, easily parseable format. Routine arguments are usually, but not always, annotated with Microsoft's source-code annotation language (SAL). A SAL annotation starts with an identifier that begins and ends with an underscore, and may have underscores in the middle. A SAL identifier may be followed by parentheses that contain a variety of compile-time checks, but I don't care about the SAL stuff. Routines are generally annotated with an access specifier (a calling-convention macro), which is usually something like WINAPI, APIENTRY, etc., but there may be more than one. There are also cases where the arguments are specified only by their types. Sheesh!
My grammar looks like this:
//
// Parse C function declarations from a header file
//
grammar FuncDef;
//
// Parser rules
//
// Entry rule: the input must consist of one or more function declarations.
// NOTE(review): the rule is not anchored with EOF, so the parser is not forced
// to consume the whole token stream -- confirm whether that is intended.
start :
func_def+
;
// One function prototype, in order:
//   optional 'extern', a return type (a single IDENTIFIER), an access_spec,
//   the routine name (IDENTIFIER), '(' zero or more arg_list ')' and ';'.
// The '(' and ')' are implicit one-character literal tokens. In the sample run
// the parser reports "mismatched input '()' expecting '('" here, because the
// lexer delivers the whole parenthesized group as a single SAL_EXPR token.
// NOTE(review): arg_list already repeats 'argument' internally, so 'arg_list*'
// also admits several lists back to back -- 'arg_list?' may have been meant.
func_def :
'extern'? ret_type = IDENTIFIER access = access_spec routine = IDENTIFIER '(' arg_list* ')' ';'
;
// A SAL annotation: a SAL_NAME token optionally followed by one SAL_EXPR token
// (the parenthesized argument text of the annotation).
sal_statement :
SAL_NAME SAL_EXPR?
;
// Access specifier placed between the return type and the routine name.
// Matches at most ONE of the listed keywords; because of the trailing '?'
// the rule can also match nothing at all.
// NOTE(review): the sample input contains two specifiers in a row
// ('PASCAL FAR'); this rule consumes only the first, which is where the
// "extraneous input 'FAR' expecting IDENTIFIER" message in the run comes from.
access_spec :
('FAR' | 'PASCAL' | 'WINAPI' | 'APIENTRY' | 'WSAAPI' | 'WSPAPI')?
;
// One routine argument: a mandatory sal_statement, a type (single IDENTIFIER),
// an optional '*' and a mandatory argument name (IDENTIFIER).
// NOTE(review): as written this rule cannot match several shapes present in
// the sample input: an argument without a SAL annotation ('type param'), an
// argument given only by its type ('fd_set FAR *'), or a type followed by a
// 'FAR' qualifier.
argument :
sal_statement type = IDENTIFIER is_pointer = '*'? arg = IDENTIFIER
;
// A non-empty, comma-separated list of arguments.
arg_list :
argument (',' argument)*
;
// A hexadecimal literal: the literal token '0x' followed by one or more
// HEX_DIGIT tokens (each HEX_DIGIT token is a single character).
// Not referenced by any other parser rule visible in this grammar.
// NOTE(review): text such as 0x1F also fits IDENTIFIER as one longer token, so
// this rule presumably never sees '0x' followed by HEX_DIGIT tokens -- verify.
hex_number :
'0x' HEX_DIGIT+
;
//
// Lexer rules
//
// The lexer tokenizes the input without any knowledge of which parser rule is
// active. At each position the rule producing the longest match wins; on a tie
// the rule listed first wins. The order of the rules below is therefore
// significant.

// Decimal integer: one or more digits.
INTEGER : Digit+;
// Exactly one hexadecimal digit per token.
HEX_DIGIT : [a-fA-F0-9];
// SAL annotation name: '_', one capital letter, a non-greedy run of letters and
// underscores, then a closing '_'.
// NOTE(review): the non-greedy '+?' presumably ends the match at the first
// possible closing underscore, so a multi-part name such as _Out_opt_ may be
// out-matched by the longer IDENTIFIER -- verify with the TestRig.
SAL_NAME : '_' Capital (Letter | '_')+? '_'; // Restricted form of IDENTIFIER, so it must be first
// General identifier: one or more letters, digits, '_' or '$'. It does not
// require a non-digit first character.
IDENTIFIER : Id_chars+;
// Any balanced, possibly nested, parenthesized text.
// Nothing in this rule ties it to a preceding SAL_NAME: it matches wherever a
// '(' starts a balanced group, and that match is longer than the one-character
// '(' literal used by func_def. In the sample output every argument list is
// consequently a single SAL_EXPR token (tokens @3, @8, @15 and @22), and the
// SAL names inside the last one are never tokenized separately.
SAL_EXPR : '(' ( ~( '(' | ')' ) | SAL_EXPR )* ')'; // We don't care about anything within a SAL expression, so eat everything within matched and nested parentheses
// '//' comment up to and including the line terminator; sent to the HIDDEN channel.
CPP_COMMENT : '//' .*? '\r'? '\n' -> channel (HIDDEN);
// '/* ... */' comment, may span lines; sent to the HIDDEN channel.
C_COMMENT : '/*' .*? '*/' -> channel (HIDDEN);
WS : [ \t\r\n]+ -> skip; // Ignore all whitespace
// Fragments: building blocks for the rules above; they never produce tokens themselves.
// Characters allowed anywhere in an IDENTIFIER.
fragment Id_chars : Letter | Digit | '_' | '$';
// Upper-case ASCII letter.
fragment Capital : [A-Z];
// ASCII letter of either case.
fragment Letter : [a-zA-Z];
// ASCII decimal digit.
fragment Digit : [0-9];
I am using the TestRig, and providing the following input:
PVOID WINAPI routine ();
PVOID WINAPI routine (type param);
extern int PASCAL FAR __WSAFDIsSet(SOCKET fd, fd_set FAR *);
// comment
/*
Another comment*/
int
WSPAPI
WSCSetApplicationCategory(
_Out_writes_bytes_to_(nNumberOfBytesToRead, *lpNumberOfBytesRead) LPBYTE lpBuffer,
_In_ DWORD PathLength,
_In_reads_opt_(ExtraLength) LPCWSTR Extra,
_When_(pbCancel != NULL, _Pre_satisfies_(*pbCancel == FALSE))
DWORD ExtraLength,
_In_ DWORD PermittedLspCategories,
_Out_opt_ DWORD * pPrevPermLspCat,
_Out_ LPINT lpErrno
);
I'm getting this output:
[@0,0:4='PVOID',<IDENTIFIER>,1:0]
[@1,6:11='WINAPI',<'WINAPI'>,1:6]
[@2,13:19='routine',<IDENTIFIER>,1:13]
[@3,21:22='()',<SAL_EXPR>,1:21]
[@4,23:23=';',<';'>,1:23]
[@5,28:32='PVOID',<IDENTIFIER>,3:0]
[@6,34:39='WINAPI',<'WINAPI'>,3:6]
[@7,41:47='routine',<IDENTIFIER>,3:13]
[@8,49:60='(type param)',<SAL_EXPR>,3:21]
[@9,61:61=';',<';'>,3:33]
[@10,66:71='extern',<'extern'>,5:0]
[@11,73:75='int',<IDENTIFIER>,5:7]
[@12,77:82='PASCAL',<'PASCAL'>,5:11]
[@13,84:86='FAR',<'FAR'>,5:18]
[@14,88:99='__WSAFDIsSet',<IDENTIFIER>,5:22]
[@15,100:124='(SOCKET fd, fd_set FAR *)',<SAL_EXPR>,5:34]
[@16,125:125=';',<';'>,5:59]
[@17,130:141='// comment\r\n',<CPP_COMMENT>,channel=1,7:0]
[@18,142:162='/*\r\nAnother comment*/',<C_COMMENT>,channel=1,8:0]
[@19,167:169='int',<IDENTIFIER>,11:0]
[@20,172:177='WSPAPI',<'WSPAPI'>,12:0]
[@21,180:204='WSCSetApplicationCategory',<IDENTIFIER>,13:0]
[@22,205:568='(\r\n _Out_writes_bytes_to_(nNumberOfBytesToRead, *lpNumberOfBytesRead) LPBYTE lpBuffer,\r\n _In_ DWORD PathLength,\r\n _In_reads_opt_(ExtraLength) LPCWSTR Extra,\r\n _When_(pbCancel != NULL, _Pre_satisfies_(*pbCancel == FALSE))\r\nDWORD ExtraLength,\r\n _In_ DWORD PermittedLspCategories,\r\n _Out_opt_ DWORD * pPrevPermLspCat,\r\n _Out_ LPINT lpErrno\r\n )',<SAL_EXPR>,13:25]
[@23,569:569=';',<';'>,22:5]
[@24,572:571='<EOF>',<EOF>,23:0]
line 1:21 mismatched input '()' expecting '('
line 3:21 mismatched input '(type param)' expecting '('
line 5:18 extraneous input 'FAR' expecting IDENTIFIER
line 5:34 mismatched input '(SOCKET fd, fd_set FAR *)' expecting '('
line 13:25 mismatched input '(\r\n _Out_writes_bytes_to_(nNumberOfBytesToRead, *lpNumberOfBytesRead) LPBYTE lpBuffer,\r\n _In_ DWORD PathLength,\r\n _In_reads_opt_(ExtraLength) LPCWSTR Extra,\r\n _When_(pbCancel != NULL, _Pre_satisfies_(*pbCancel == FALSE))\r\nDWORD ExtraLength,\r\n _In_ DWORD PermittedLspCategories,\r\n _Out_opt_ DWORD * pPrevPermLspCat,\r\n _Out_ LPINT lpErrno\r\n )' expecting '('
What I don't understand is why SAL_EXPR is matching at tokens @3 and @8 of the output. It should only match something if SAL_NAME matches first.
And why doesn't SAL_NAME match anywhere inside token @22?