Making my Yacc sit up and beg
I'm more convinced than ever that the correct approach here is a GLR grammar, if at all possible. However, inspired by @Kaz, I produced the following yacc/bison grammar with an LALR(1) grammar (not even using precedence declarations).
Of course, it cheats, since the problem cannot be solved with an LALR(1) grammar. At appropriate intervals, it walks the constructed tree of IF THEN
and IF THEN ELSE
expressions, and moves the ELSE
clauses as required.
Nodes which need to be re-examined for possible motion are given the AST nodetype IFSEQ
and the ELSE
clauses are attached with the traditional tightest match grammar, using a classic matched-if/unmatched-if grammar. A fully-matched IF THEN ELSE
clause does not need to be rearranged; the tree rewrite will apply to the expression associated with the first ELSE
whose right-hand operand is unmatched (if there is one). Keeping the fully-matched prefix of an IF
expression separate from the tail which needs to be rearranged required almost-duplicating some rules; the almost-duplicated rules differ in that their actions directly produce TERNARY
nodes instead if IFSEQ
nodes.
In order to correctly answer the question, it would also be necessary to rearrange some IFF
nodes, since the IFF
binds more weakly than the THEN
clause and more tightly than the ELSE
clause. I think this means:
IF p THEN q IFF IF r THEN s ==> ((p → q) ↔ (r → s))
IF p THEN q IFF r ELSE s IFF t ==> (p ? (q ↔ r) : (s ↔ t))
IF p THEN q IFF IF r THEN s ELSE t IFF u ==> (p ? (q ↔ (r → s)) : (t ↔ u))
although I'm not sure that is what is being asked for (particularly the last one) and I really don't think it's a good idea. In the grammar below, if you want IFF
to apply to an IF p THEN q
subexpression, you will have to use parentheses; IF p THEN q IFF r
produces p → (q ↔ r)
and p IFF IF q THEN r
is a syntax error.
Frankly, I think this whole thing would be easier using arrows for conditionals and biconditionals (as in the glosses above), and using IF THEN ELSE
only for ternary selector expressions (written above with C-style ? :
syntax, which is another possibility). That will generate far fewer surprises. But it's not my language.
One solution for the biconditional operator with floating precedence would be to parse in two passes. The first pass would only identify the IF p THEN q
operators without an attached ELSE
, using a mechanism similar to the one proposed here, and change them to p -> q
by deleting the IF
and changing the spelling of THEN
. Other operators would not be parsed and parentheses would be retained. It would then feed to resulting token stream into a second LALR parser with a more traditional grammar style. I might get around to coding that only because I think that two-pass bison parsers are occasionally useful and there are few examples floating around.
Here's the tree-rewriting parser. I apologise for the length:
%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void yyerror(const char* msg);
int yylex(void);
typedef struct Node Node;
enum AstType { ATOM, NEG, CONJ, DISJ, IMPL, BICOND, TERNARY,
IFSEQ
};
struct Node {
enum AstType type;
union {
const char* atom;
Node* child[3];
};
};
Node* node(enum AstType type, Node* op1, Node* op2, Node* op3);
Node* atom(const char* name);
void node_free(Node*);
void node_print(Node*, FILE*);
typedef struct ElseStack ElseStack;
struct ElseStack {
Node* action;
ElseStack* next;
};
ElseStack* build_else_stack(Node*, ElseStack*);
ElseStack* shift_elses(Node*, ElseStack*);
%}
%union {
const char* name;
struct Node* node;
}
%token <name> T_ID
%token T_AND "and"
T_ELSE "else"
T_IF "if"
T_IFF "iff"
T_NOT "not"
T_OR "or"
T_THEN "then"
%type <node> term conj disj bicond cond mat unmat tail expr
%%
prog : %empty | prog stmt;
stmt : expr '\n' { node_print($1, stdout); putchar('\n'); node_free($1); }
| '\n'
| error '\n'
term : T_ID { $$ = atom($1); }
| "not" term { $$ = node(NEG, $2, NULL, NULL); }
| '(' expr ')' { $$ = $2; }
conj : term
| conj "and" term { $$ = node(CONJ, $1, $3, NULL); }
disj : conj
| disj "or" conj { $$ = node(DISJ, $1, $3, NULL); }
bicond: disj
| disj "iff" bicond { $$ = node(BICOND, $1, $3, NULL); }
mat : bicond
| "if" expr "then" mat "else" mat
{ $$ = node(IFSEQ, $2, $4, $6); }
unmat: "if" expr "then" mat
{ $$ = node(IFSEQ, $2, $4, NULL); }
| "if" expr "then" unmat
{ $$ = node(IFSEQ, $2, $4, NULL); }
| "if" expr "then" mat "else" unmat
{ $$ = node(IFSEQ, $2, $4, $6); }
tail : "if" expr "then" mat
{ $$ = node(IFSEQ, $2, $4, NULL); }
| "if" expr "then" unmat
{ $$ = node(IFSEQ, $2, $4, NULL); }
cond : bicond
| tail { shift_elses($$, build_else_stack($$, NULL)); }
| "if" expr "then" mat "else" cond
{ $$ = node(TERNARY, $2, $4, $6); }
expr : cond
%%
/* Walk the IFSEQ nodes in the tree, pushing any
* else clause found onto the else stack, which it
* returns.
*/
ElseStack* build_else_stack(Node* ifs, ElseStack* stack) {
if (ifs && ifs->type != IFSEQ) {
stack = build_else_stack(ifs->child[1], stack);
if (ifs->child[2]) {
ElseStack* top = malloc(sizeof *top);
*top = (ElseStack) { ifs->child[2], stack };
stack = build_else_stack(ifs->child[2], top);
}
}
return stack;
}
/* Walk the IFSEQ nodes in the tree, attaching elses from
* the else stack.
* Pops the else stack as it goes, freeing popped
* objects, and returns the new top of the stack.
*/
ElseStack* shift_elses(Node* n, ElseStack* stack) {
if (n && n->type == IFSEQ) {
if (stack) {
ElseStack* top = stack;
stack = shift_elses(n->child[2],
shift_elses(n->child[1], stack->next));
n->type = TERNARY;
n->child[2] = top;
free(top);
}
else {
shift_elses(n->child[2],
shift_elses(n->child[1], NULL));
n->type = IMPL;
n->child[2] = NULL;
}
}
return stack;
}
Node* node(enum AstType type, Node* op1, Node* op2, Node* op3) {
Node* rv = malloc(sizeof *rv);
*rv = (Node){type, .child = {op1, op2, op3}};
return rv;
}
Node* atom(const char* name) {
Node* rv = malloc(sizeof *rv);
*rv = (Node){ATOM, .atom = name};
return rv;
}
void node_free(Node* n) {
if (n) {
if (n->type == ATOM) free((char*)n->atom);
else for (int i = 0; i < 3; ++i) node_free(n->child[i]);
free(n);
}
}
const char* typename(enum AstType type) {
switch (type) {
case ATOM: return "ATOM";
case NEG: return "NOT" ;
case CONJ: return "CONJ";
case DISJ: return "DISJ";
case IMPL: return "IMPL";
case BICOND: return "BICOND";
case TERNARY: return "TERNARY" ;
case IFSEQ: return "IF_SEQ";
}
return "**BAD NODE TYPE**";
}
void node_print(Node* n, FILE* out) {
if (n) {
if (n->type == ATOM)
fputs(n->atom, out);
else {
fprintf(out, "(%s", typename(n->type));
for (int i = 0; i < 3 && n->child[i]; ++i) {
fputc(' ', out); node_print(n->child[i], out);
}
fputc(')', out);
}
}
}
void yyerror(const char* msg) {
fprintf(stderr, "%s\n", msg);
}
int main(int argc, char** argv) {
return yyparse();
}
The lexer is almost trivial. (This one uses lower-case keywords because my fingers prefer that, but it's trivial to change.)
%{
#include "ifelse.tab.h"
%}
%option noinput nounput noyywrap nodefault
%%
and { return T_AND; }
else { return T_ELSE; }
if { return T_IF; }
iff { return T_IFF; }
not { return T_NOT; }
or { return T_OR; }
then { return T_THEN; }
[[:alpha:]]+ { yylval.name = strdup(yytext);
return T_ID; }
([[:space:]]{-}[\n])+ ;
\n { return '\n'; }
. { return *yytext;}
As written, the parser/lexer reads a line at a time, and prints the AST for each line (so multiline expressions aren't allowed). I hope it's clear how to change it.