3

I am creating an antlr4 grammar for a moderately simple language. I am struggling to get the grammar to differentiate between unary and binary minus. I have read all the other posts that I can find on this topic here on Stackoverflow, but have found that the answers either apply to antlr3 in ways I cannot figure out how to express in antlr4, or that I seem not to be adept in translating the advice of these answers to my own situation. I often end with the problem that antlr cannot unambiguously resolve the rules if I play around with other alternatives.

Below is the antlr file in its entirety. The ambiguity in this version occurs around the production:

binop_expr
        : SUMOP product
        | product ( SUMOP product )*
        ;

(I had originally used UNARY_ABELIAN_OP instead of the second SUMOP, but that led to a different kind of ambiguity — the tool apparently couldn't recognise that it needed to differentiate between the same token in two different contexts. I mention this because one of the posts here recommends using a different name for the unary operator.)


grammar Kant;

program
        : type_declaration_list main
        ;

type_declaration_list
        : type_declaration
        | type_declaration_list type_declaration
        | /* null */
        ;

type_declaration
        : 'context' JAVA_ID '{' context_body '}'
        | 'class'   JAVA_ID '{' class_body   '}'
        | 'class'   JAVA_ID 'extends' JAVA_ID '{' class_body   '}'
        ;

context_body
        : context_body context_body_element
        | context_body_element
        | /* null */
        ;

context_body_element
        : method_decl
        | object_decl
        | role_decl
        | stageprop_decl
        ;

role_decl
        : 'role' JAVA_ID '{' role_body '}'
        | 'role' JAVA_ID '{' role_body '}' REQUIRES '{' self_methods '}'
        | access_qualifier 'role' JAVA_ID '{' role_body '}'
        | access_qualifier 'role' JAVA_ID '{' role_body '}' REQUIRES '{' self_methods '}'
        ;

role_body
        : method_decl
        | role_body method_decl
        | object_decl               // illegal
        | role_body object_decl     // illegal — for better error messages only
        ;

self_methods
        : self_methods ';' method_signature
        | method_signature
        | self_methods /* null */ ';'
        ;

stageprop_decl
        : 'stageprop' JAVA_ID '{' stageprop_body '}'
        | 'stageprop' JAVA_ID '{' stageprop_body '}' REQUIRES '{' self_methods '}'
        | access_qualifier 'stageprop' JAVA_ID '{' stageprop_body '}'
        | access_qualifier 'stageprop' JAVA_ID '{' stageprop_body '}' REQUIRES '{' self_methods '}'
        ;

stageprop_body
        : method_decl
        | stageprop_body method_decl
        ;

class_body
        : class_body class_body_element
        | class_body_element
        | /* null */
        ;

class_body_element
        : method_decl
        | object_decl
        ;

method_decl
        : method_decl_hook '{' expr_and_decl_list '}'
        ;

method_decl_hook
        : method_signature
        | method_signature CONST
        ;

method_signature
        : access_qualifier return_type method_name '(' param_list ')'
        | access_qualifier return_type method_name 
        | access_qualifier method_name '(' param_list ')'
        ;

expr_and_decl_list
        : object_decl
        | expr ';' object_decl
        | expr_and_decl_list object_decl
        | expr_and_decl_list expr
        | expr_and_decl_list /*null-expr */ ';'
        | /* null */
        ;

return_type
        : type_name
        | /* null */
        ;

method_name
        : JAVA_ID
        ;

access_qualifier
        : 'public' | 'private' | /* null */
        ;

object_decl
        : access_qualifier compound_type_name identifier_list ';'
        | access_qualifier compound_type_name identifier_list
        | compound_type_name identifier_list /* null expr */ ';'
        | compound_type_name identifier_list
        ;

compound_type_name
        : type_name '[' ']'
        | type_name
        ;

type_name
        : JAVA_ID
        | 'int'
        | 'double'
        | 'char'
        | 'String'
        ;

identifier_list
        : JAVA_ID
        | identifier_list ',' JAVA_ID
        | JAVA_ID ASSIGN expr
        | identifier_list ',' JAVA_ID ASSIGN expr
        ;

param_list
        : param_decl
        | param_list ',' param_decl
        | /* null */
        ;

param_decl
        : type_name JAVA_ID
        ;

main
        : expr
        ;

expr
        : block
        | expr '.' message
        | expr '.' CLONE
        | expr '.' JAVA_ID
        | ABELIAN_INCREMENT_OP expr '.' JAVA_ID
        | expr '.' JAVA_ID ABELIAN_INCREMENT_OP
        | /* this. */ message
        | JAVA_ID
        | constant
        | if_expr
        | for_expr
        | while_expr
        | do_while_expr
        | switch_expr
        | BREAK
        | CONTINUE
        | boolean_expr
        | binop_expr
        | '(' expr ')'
        | <assoc=right> expr ASSIGN expr
        | NEW message
        | NEW type_name '[' expr ']'
        | RETURN expr
        | RETURN
        ;

relop_expr
        : sexpr RELATIONAL_OPERATOR sexpr
        ;


// This is just a duplication of expr. We separate it out
// because a top-down antlr4 parser can't handle the
// left associative ambiguity. It is used only
// for abelian types.
sexpr
        : block 
        | sexpr '.' message
        | sexpr '.' CLONE
        | sexpr '.' JAVA_ID
        | ABELIAN_INCREMENT_OP sexpr '.' JAVA_ID
        | sexpr '.' JAVA_ID ABELIAN_INCREMENT_OP
        | /* this. */ message
        | JAVA_ID
        | constant
        | if_expr
        | for_expr
        | while_expr
        | do_while_expr
        | switch_expr
        | BREAK
        | CONTINUE
        | '(' sexpr ')'
        | <assoc=right> sexpr ASSIGN sexpr
        | NEW message
        | NEW type_name '[' expr ']'
        | RETURN expr
        | RETURN
        ;

block
        : '{' expr_and_decl_list '}'
        | '{' '}'
        ;

expr_or_null
        : expr
        | /* null */
        ;

if_expr
        : 'if' '(' boolean_expr ')' expr
        | 'if' '(' boolean_expr ')' expr 'else' expr
        ;

for_expr
        : 'for' '(' object_decl boolean_expr ';' expr ')' expr  // O.K. — expr can be a block
        | 'for' '(' JAVA_ID ':' expr ')' expr
        ;

while_expr
        : 'while' '(' boolean_expr ')' expr
        ;

do_while_expr
        : 'do' expr 'while' '(' boolean_expr ')'
        ;

switch_expr
        : SWITCH '(' expr ')' '{'  ( switch_body )* '}'
        ;

switch_body
        : ( CASE constant | DEFAULT ) ':' expr_and_decl_list
        ;

binop_expr
        : SUMOP product
        | product ( SUMOP product )*
        ;

product
        : atom  ( MULOP atom )*
        ;

atom
        : null_expr
        | JAVA_ID
        | JAVA_ID ABELIAN_INCREMENT_OP
        | ABELIAN_INCREMENT_OP JAVA_ID
        | constant
        | '(' expr ')'
        | array_expr '[' sexpr ']'
        | array_expr '[' sexpr ']' ABELIAN_INCREMENT_OP
        | ABELIAN_INCREMENT_OP array_expr '[' sexpr ']'
        ;

null_expr
        : NULL
        ;

array_expr
        : sexpr
        ;

boolean_expr
        : boolean_product ( BOOLEAN_SUMOP boolean_product )*
        ;

boolean_product
        : boolean_atom  ( BOOLEAN_MULOP boolean_atom )*
        ;

boolean_atom
        : BOOLEAN
        | JAVA_ID
        | '(' boolean_expr ')'
        | LOGICAL_NOT boolean_expr
        | relop_expr
        ;

constant
        : STRING
        | INTEGER
        | FLOAT
        | BOOLEAN
        ;

message
        : <assoc=right> JAVA_ID '(' argument_list ')'
        ;

argument_list
        : expr
        | argument_list ',' expr
        | /* null */
        ;

// Lexer rules

STRING : '"' ( ~'"' | '\\' '"' )* '"' ;

INTEGER : ('1' .. '9')+ ('0' .. '9')* | '0';

FLOAT : (('1' .. '9')* | '0') '.' ('0' .. '9')* ;

BOOLEAN : 'true' | 'false' ;

SWITCH : 'switch' ;

CASE : 'case' ;

DEFAULT : 'default' ;

BREAK : 'break' ;

CONTINUE : 'continue' ;

RETURN : 'return' ;

REQUIRES : 'requires' ;

NEW : 'new' ;

CLONE : 'clone' ;

NULL : 'null' ;

CONST : 'const' ;

RELATIONAL_OPERATOR : '!=' | '==' | '>' | '<' | '>=' | '<=';

LOGICAL_NOT : '!' ;

BOOLEAN_MULOP : '&&'  ;

BOOLEAN_SUMOP : '||' | '^' ;

SUMOP : '+' | '-' ;

MULOP : '*' | '/' ;

ABELIAN_INCREMENT_OP : '++' | '--' ;

JAVA_ID: (('a' .. 'z') | ('A' .. 'Z')) (('a' .. 'z') | ('A' .. 'Z') | ('0' .. '9') | '_')* ;

INLINE_COMMENT: '//' ~[\r\n]* -> channel(HIDDEN) ;

C_COMMENT: '/*' .*? '*/' -> channel(HIDDEN) ;

WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+ -> channel(HIDDEN) ;

ASSIGN : '=' ;

Typical of the problem is that the parser can't recognise the unary minus in this expression (it simply does not accept the construct):

Base b1 = new Base(-threetwoone);
Cope
  • 769
  • 7
  • 17

2 Answers2

1

Try removing the unary expression from binop_expr, and add it to the expr rule:

expr
 : ...
 | unary_expr
 | binop_expr
 | ...
 ;

unary_expr
 : SUMOP binop_expr
 ;

binop_expr
 : product ( SUMOP product )*
 ;
Bart Kiers
  • 166,582
  • 36
  • 299
  • 288
  • I ended up the same place I have been before, where the problem moves elsewhere. Consider the input counter = counter + 1. This can either parse as an assignment whose RHS is counter + 1 or as the assignment counter = counter followed by the expression +1. It now parses like this: expr : JAVA_ID (counter) / expr : JAVA_ID (counter) / expr : expr ASSIGN expr / expr_and_decl_list : expr_and_decl_list expr / constant : 1 / atom : constant / product : atom / unary_abelian_expr : '+' product / expr : expr unary_abelian_expr / expr_and_decl_list : expr_and_decl_list expr – Cope Aug 24 '15 at 10:58
1

The problem seems to arise from having two kinds of expressions in the grammar that need better to be grouped with each other instead of being intermixed as much as they are. The overall intent is to create a language where everything is an expression, including, for example, loops and conditionals. (You can use your imagination to deduce the values they generate; however, that is a semantic rather than syntactic issue).

One set of these expressions have nice abelian properties (they behave like ordinary arithmetic types with well-behaved operations). The language specifies how lower-level productions are tied together by operations that give them context: so two identifiers, "a" and "b", can be associated by the operator "+" as "a + b". It's a highly contexutalized grammar.

The other of these sets of expressions comprises productions with a much lower degree of syntactic cohesion. One can combine "if" expressions, "for" expressions, and declarations in any order. The only linguistic property associating them is catenation, as we find in productions for object_body, class_body, and expr_and_decl_list.

The basic problem is that these two grammars became intertwined in the antlr spec. Combined with the fact that operators like "+" are context-sensitive, and that the context can arise either from catenation or the more contextualised abelian alternative, the parser was left with two alternative ways of parsing some expressions. I managed to wiggle around most of the ambiguity, but it got pushed into the area of unary prefix operators. So if "a" is an expression (it is), and "b" is an expression, then the string "a + b" is ambiguous. It can mean either product ( SUMOP product )*, or it can mean expr_and_decl_list expr where expr_and_decl_list was earlier reduced from the expr "a". The expr in expr_and_decl_list expr can reduce from "+b".

I re-wrote the grammar, inspired by other examples I've found on the web. (One of the more influential was one that Bart Kiers pointed me to: If/else statements in ANTLR using listeners) You can find the new grammar below. Stuff that belonged together with the grammar for the abelian expressions but which used to reduce to the expr target, like NEW expr and expr '.' JAVA_ID, has now been separated out to reduce to an abelian_expr target. All the productions that reduce down to an abelian_expr are now more or less direct descendants of the abelian_expr node in the grammar; that helps antlr deal intelligently with what otherwise would be ambiguities. A unary "+" expression no longer can reduce directly to expr but can find its way to expr only through abelian_expr, so it will never be treated as production that can simply be catenated as a largely uncontextualized expression.

I can't prove that that's what was going on, but signs point in that direction. If someone else has a more formal or deductive analysis (particularly if it points to another conclusion) I'd love to hear your reasoning.

The lesson here seems to be that what amounted to a high-level, fundamental flaw in the design of the language managed to show up as only an isolated and in some sense minor bug: the inability to disambiguate between the unary and binary uses of an operator.

grammar Kant;

program
        : type_declaration_list main
        | type_declaration_list     // missing main
        ;

main
        : expr
        ;

type_declaration_list
        : type_declaration
        | type_declaration_list type_declaration
        | /* null */
        ;

type_declaration
        : 'context' JAVA_ID '{' context_body '}'
        | 'class'   JAVA_ID '{' class_body   '}'
        | 'class'   JAVA_ID 'extends' JAVA_ID '{' class_body   '}'
        ;

context_body
        : context_body context_body_element
        | context_body_element
        | /* null */
        ;

context_body_element
        : method_decl
        | object_decl
        | role_decl
        | stageprop_decl
        ;

role_decl
        : 'role' JAVA_ID '{' role_body '}'
        | 'role' JAVA_ID '{' role_body '}' REQUIRES '{' self_methods '}'
        | access_qualifier 'role' JAVA_ID '{' role_body '}'
        | access_qualifier 'role' JAVA_ID '{' role_body '}' REQUIRES '{' self_methods '}'
        ;

role_body
        : method_decl
        | role_body method_decl
        | object_decl               // illegal
        | role_body object_decl     // illegal — for better error messages only
        ;

self_methods
        : self_methods ';' method_signature
        | method_signature
        | self_methods /* null */ ';'
        ;

stageprop_decl
        : 'stageprop' JAVA_ID '{' stageprop_body '}'
        | 'stageprop' JAVA_ID '{' stageprop_body '}' REQUIRES '{' self_methods '}'
        | access_qualifier 'stageprop' JAVA_ID '{' stageprop_body '}'
        | access_qualifier 'stageprop' JAVA_ID '{' stageprop_body '}' REQUIRES '{' self_methods '}'
        ;

stageprop_body
        : method_decl
        | stageprop_body method_decl
        | object_decl               // illegal
        | stageprop_body object_decl        // illegal — for better error messages only
        ;

class_body
        : class_body class_body_element
        | class_body_element
        | /* null */
        ;

class_body_element
        : method_decl
        | object_decl
        ;

method_decl
        : method_decl_hook '{' expr_and_decl_list '}'
        ;

method_decl_hook
        : method_signature
        ;

method_signature
        : access_qualifier return_type method_name '(' param_list ')' CONST*
        | access_qualifier return_type method_name CONST*
        | access_qualifier method_name '(' param_list ')' CONST*
        ;

expr_and_decl_list
        : object_decl
        | expr ';' object_decl
        | expr_and_decl_list object_decl
        | expr_and_decl_list expr
        | expr_and_decl_list /*null-expr */ ';'
        | /* null */
        ;

return_type
        : type_name
        | /* null */
        ;

method_name
        : JAVA_ID
        ;

access_qualifier
        : 'public' | 'private' | /* null */
        ;

object_decl
        : access_qualifier compound_type_name identifier_list ';'
        | access_qualifier compound_type_name identifier_list
        | compound_type_name identifier_list /* null expr */ ';'
        | compound_type_name identifier_list
        ;

compound_type_name
        : type_name '[' ']'
        | type_name
        ;

type_name
        : JAVA_ID
        | 'int'
        | 'double'
        | 'char'
        | 'String'
        ;

identifier_list
        : JAVA_ID
        | identifier_list ',' JAVA_ID
        | JAVA_ID ASSIGN expr
        | identifier_list ',' JAVA_ID ASSIGN expr
        ;

param_list
        : param_decl
        | param_list ',' param_decl
        | /* null */
        ;

param_decl
        : type_name JAVA_ID
        ;

expr
        : abelian_expr
        | boolean_expr
        | block
        | if_expr
        | for_expr
        | while_expr
        | do_while_expr
        | switch_expr
        | BREAK
        | CONTINUE
        | RETURN expr
        | RETURN
        ;

abelian_expr
        : <assoc=right>abelian_expr POW abelian_expr           
        | ABELIAN_SUMOP expr                           
        | LOGICAL_NEGATION expr
        | NEW message
        | NEW type_name '[' expr ']'                            
        | abelian_expr ABELIAN_MULOP abelian_expr       
        | abelian_expr ABELIAN_SUMOP abelian_expr             
        | abelian_expr ABELIAN_RELOP abelian_expr                    
        | null_expr
        | /* this. */ message
        | JAVA_ID
        | JAVA_ID ABELIAN_INCREMENT_OP
        | ABELIAN_INCREMENT_OP JAVA_ID
        | constant
        | '(' abelian_expr ')'
        | abelian_expr '[' expr ']'
        | abelian_expr '[' expr ']' ABELIAN_INCREMENT_OP
        | ABELIAN_INCREMENT_OP expr '[' expr ']'
        | ABELIAN_INCREMENT_OP expr '.' JAVA_ID
        | abelian_expr '.' JAVA_ID ABELIAN_INCREMENT_OP
        | abelian_expr '.' message
        | abelian_expr '.' CLONE
        | abelian_expr '.' CLONE '(' ')'
        | abelian_expr '.' JAVA_ID
        | <assoc=right> abelian_expr ASSIGN expr
        ;

message
        : <assoc=right> JAVA_ID '(' argument_list ')'
        ;

boolean_expr
        : boolean_expr BOOLEAN_MULOP expr                       
        | boolean_expr BOOLEAN_SUMOP expr
        | constant      // 'true' / 'false'
        | abelian_expr
        ;

block
        : '{' expr_and_decl_list '}'
        | '{' '}'
        ;

expr_or_null
        : expr
        | /* null */
        ;

if_expr
        : 'if' '(' expr ')' expr
        | 'if' '(' expr ')' expr 'else' expr
        ;

for_expr
        : 'for' '(' object_decl expr ';' expr ')' expr  // O.K. — expr can be a block
        | 'for' '(' JAVA_ID ':' expr ')' expr
        ;

while_expr
        : 'while' '(' expr ')' expr
        ;

do_while_expr
        : 'do' expr 'while' '(' expr ')'
        ;

switch_expr
        : SWITCH '(' expr ')' '{'  ( switch_body )* '}'
        ;

switch_body
        : ( CASE constant | DEFAULT ) ':' expr_and_decl_list
        ;

null_expr
        : NULL
        ;

constant
        : STRING
        | INTEGER
        | FLOAT
        | BOOLEAN
        ;

argument_list
        : expr
        | argument_list ',' expr
        | /* null */
        ;


// Lexer rules

STRING : '"' ( ~'"' | '\\' '"' )* '"' ;

INTEGER : ('1' .. '9')+ ('0' .. '9')* | '0';

FLOAT : (('1' .. '9')* | '0') '.' ('0' .. '9')* ;

BOOLEAN : 'true' | 'false' ;

SWITCH : 'switch' ;

CASE : 'case' ;

DEFAULT : 'default' ;

BREAK : 'break' ;

CONTINUE : 'continue' ;

RETURN : 'return' ;

REQUIRES : 'requires' ;

NEW : 'new' ;

CLONE : 'clone' ;

NULL : 'null' ;

CONST : 'const' ;

ABELIAN_RELOP : '!=' | '==' | '>' | '<' | '>=' | '<=';

LOGICAL_NOT : '!' ;

POW : '**' ;

BOOLEAN_SUMOP : '||' | '^' ;

BOOLEAN_MULOP : '&&'  ;

ABELIAN_SUMOP : '+' | '-' ;

ABELIAN_MULOP : '*' | '/' | '%' ;

MINUS : '-' ;

PLUS : '+' ;

LOGICAL_NEGATION : '!' ;

ABELIAN_INCREMENT_OP : '++' | '--' ;

JAVA_ID: (('a' .. 'z') | ('A' .. 'Z')) (('a' .. 'z') | ('A' .. 'Z') | ('0' .. '9') | '_')* ;

INLINE_COMMENT: '//' ~[\r\n]* -> channel(HIDDEN) ;

C_COMMENT: '/*' .*? '*/' -> channel(HIDDEN) ;

WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+ -> channel(HIDDEN) ;

ASSIGN : '=' ;
Community
  • 1
  • 1
Cope
  • 769
  • 7
  • 17