I need to handle include directives, similar to the standard C/C++ preprocessor.
I already found a solution in C#, based on switching the lexer's input stream (see "C# and ANTLR4: Handling "include" directives when parsing a file"), so I did the same in C++, but I get a segfault in the ANTLR4 C++ runtime library.
Here is a very basic reproducer, based on the demo example provided with the C++ runtime:
- Modify the parsed string to embed an include directive:
diff --git a/runtime/Cpp/demo/Linux/main.cpp b/runtime/Cpp/demo/Linux/main.cpp
index 672ce2a3b..4e44347fb 100644
--- a/runtime/Cpp/demo/Linux/main.cpp
+++ b/runtime/Cpp/demo/Linux/main.cpp
@@ -20,7 +20,7 @@ using namespace antlrcpptest;
using namespace antlr4;
int main(int , const char **) {
- ANTLRInputStream input(u8" = + \"\";(((x * π))) * µ + ∰; a + (x * (y ? 0 : 1) + z);");
+ ANTLRInputStream input(u8" = + \"\"; @include \"test.txt\"");
TLexer lexer(&input);
CommonTokenStream tokens(&lexer);
- Create the included file that contains the rest of the string:
$ cat test.txt
(((x * π))) * µ + ∰; a + (x * (y ? 0 : 1) + z);
- Add support for include directive in the provided lexer:
diff --git a/runtime/Cpp/demo/TLexer.g4 b/runtime/Cpp/demo/TLexer.g4
index ac2128c8d..26d70e4ea 100644
--- a/runtime/Cpp/demo/TLexer.g4
+++ b/runtime/Cpp/demo/TLexer.g4
@@ -3,7 +3,10 @@ lexer grammar TLexer;
// These are all supported lexer sections:
// Lexer file header. Appears at the top of h + cpp files. Use e.g. for copyrights.
-@lexer::header {/* lexer header section */}
+@lexer::header {/* lexer header section */
+ #include <iostream>
+ #include <stack>
+}
// Appears before any #include in h + cpp files.
@lexer::preinclude {/* lexer precinclude section */}
@@ -21,6 +24,16 @@ lexer grammar TLexer;
// Appears in the public part of the lexer in the h file.
@lexer::members {/* public lexer declarations section */
+std::stack<antlr4::CharStream *> input_stack;
+virtual antlr4::Token *emitEOF() override {
+ if (input_stack.empty()) {
+ return Lexer::emitEOF();
+ };
+ hitEOF = false;
+ setInputStream(input_stack.top());
+ input_stack.pop();
+ return nextToken().get();
+}
bool canTestFoo() { return true; }
bool isItFoo() { return true; }
bool isItBar() { return true; }
@@ -69,6 +82,12 @@ Comma: ',' -> skip;
Dollar: '$' -> more, mode(Mode1);
Ampersand: '&' -> type(DUMMY);
+fragment SPACES : [ \t]+ ;
+INCLUDE : '@include' SPACES '"' {
+ std::cerr << "Got include directive " << getSourceName() << "\n";
+ std::cerr << "Current mode = " << mode << "\n";
+ } -> skip, pushMode(INCLUDEHANLDING);
+
String: '"' .*? '"';
Foo: {canTestFoo()}? 'foo' {isItFoo()}? { myFooLexerAction(); };
Bar: 'bar' {isItBar()}? { myBarLexerAction(); };
@@ -84,3 +103,36 @@ Dot: '.';
mode Mode2;
DotDot: '..';
+
+mode INCLUDEHANLDING;
+// Skipped to hide FILE token to parser
+FILE : ~["]+ {
+ {
+ // Create new input stream from the file mentioned
+ std::ifstream stream(getText());
+ if (stream.fail()) {
+ std::cerr << "Config error: " << std::strerror(errno) << " for "<< getText() << "\n";
+ } else {
+ // Push the old stream to stack
+ input_stack.push(getInputStream());
+ std::cerr << "Handling open file. mode = " << mode << "\n";
+ // This new stream will be popped and used right after, on DQUOTE.
+ input_stack.push(new ANTLRInputStream(stream));
+ }
+ }
+} -> skip;
+
+// Skipped to hide DQUOTE token to parser
+DQUOTE: '"' {
+ // Injecting the newly generated Stream.
+ std::cerr << "Current mode = " << mode << "\n";
+ setInputStream(input_stack.top());
+ input_stack.pop();
+ std::cerr << "Injected stream. Now reading from " << getSourceName() << "\n";
+ std::cerr << "Current mode = " << mode << "\n";
+} -> skip;
+
+NL : ('\r'? '\n' | '\r')+ -> skip;
- Recompile the demo example
make
- Run it and you get a segfault:
Got include directive <unknown>
Current mode = 0
Handling open file. mode = 3
Current mode = 3
Injected stream. Now reading from <unknown>
Current mode = 0
Segmentation fault (core dumped)
- Debugging with gdb: I recompiled the runtime library with debug support.
gdb demo/antlr4-demo
GNU gdb (Ubuntu 9.2-0ubuntu1~20.04) 9.2
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from demo/antlr4-demo...
(gdb) r
Starting program: /home/adrpes01/work/antlr/antlr4/runtime/Cpp/build/demo/antlr4-demo
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
Got include directive <unknown>
Current mode = 0
Handling open file. mode = 3
Current mode = 3
Injected stream. Now reading from <unknown>
Current mode = 0
Program received signal SIGSEGV, Segmentation fault.
antlr4::atn::LexerATNSimulator::failOrAccept (this=0x55555570c7b0, input=0x7fffffffd9b0, reach=0x555555712fc0, t=18446744073709551615)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/atn/LexerATNSimulator.cpp:213
213 return _prevAccept.dfaState->prediction;
(gdb) where
#0 antlr4::atn::LexerATNSimulator::failOrAccept (this=0x55555570c7b0, input=0x7fffffffd9b0, reach=0x555555712fc0, t=18446744073709551615)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/atn/LexerATNSimulator.cpp:213
#1 0x00005555555ca9d9 in antlr4::atn::LexerATNSimulator::execATN (this=0x55555570c7b0, input=0x7fffffffd9b0, ds0=0x555555712ab0)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/atn/LexerATNSimulator.cpp:167
#2 0x00005555555ca58c in antlr4::atn::LexerATNSimulator::match (this=0x55555570c7b0, input=0x7fffffffd9b0, mode=3)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/atn/LexerATNSimulator.cpp:76
#3 0x000055555558ea88 in antlr4::Lexer::nextToken (this=0x7fffffffdb00) at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/Lexer.cpp:80
#4 0x00005555555887d4 in antlr4::BufferedTokenStream::fetch (this=0x7fffffffd950, n=1000)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/BufferedTokenStream.cpp:96
#5 0x000055555558a812 in antlr4::BufferedTokenStream::fill (this=0x7fffffffd950)
at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/BufferedTokenStream.cpp:404
#6 0x000055555556a400 in main () at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/demo/Linux/main.cpp:27
(gdb) l
208 }
209
210 size_t LexerATNSimulator::failOrAccept(CharStream *input, ATNConfigSet *reach, size_t t) {
211 if (_prevAccept.dfaState != nullptr) {
212 accept(input, _prevAccept.dfaState->lexerActionExecutor, _startIndex, _prevAccept.index, _prevAccept.line, _prevAccept.charPos);
213 return _prevAccept.dfaState->prediction;
214 } else {
215 // if no accept and EOF is first char, return EOF
216 if (t == Token::EOF && input->index() == _startIndex) {
217 return Token::EOF;
(gdb) p _prevAccept
$1 = {index = 18446744073709551615, line = 0, charPos = 18446744073709551615, dfaState = 0x0}
(gdb)
Crash occurs at /home/adrpes01/work/antlr/antlr4/runtime/Cpp/runtime/src/atn/LexerATNSimulator.cpp:213 213 return _prevAccept.dfaState->prediction;
... which is weird, since _prevAccept.dfaState actually appears to be null:
(gdb) p _prevAccept $1 = {index = 18446744073709551615, line = 0, charPos = 18446744073709551615, dfaState = 0x0}
From what I understand, Lexer::reset() calls getInterpreter<atn::LexerATNSimulator>()->reset(), which itself calls _prevAccept.reset();
Question is:
- Am I doing something wrong that messes up ANTLR's internals?
- Am I failing to do something that I should be doing?
- Or is it a real bug in the C++ runtime?
Thanks for your help