Interesting. I had to learn entire new realms of Regex for this. Kudos for that.
The problem is recursive expressions. You need to make very sure that the patterns don't recurse too freely, so that you don't end up with infinite recursion or, as seems to be the case here, "just" recursion that quickly grows very deep on lengthy inputs.
So, first I tidied it up a bit:
// Perl-syntax regular expression that matches one function call. Each
// (?(DEFINE)...) group declares a named sub-pattern; (?P>NAME) recurses into it.
const std::string pattern(
    // NAMESPACE: a (possibly empty) word followed by "::".
    R"((?(DEFINE)(?'NAMESPACE'\w*::)))"
    // CONSTANT: a double-quoted string with backslash escapes, or a number
    // with optional fraction and optional 'f' suffix.
    R"((?(DEFINE)(?'CONSTANT'("(?:[^"\\]|\\.)*")|(\d+\.?\d*f?))))"
    // VARIABLE: optional namespace qualifiers, then '.'-separated identifiers.
    R"((?(DEFINE)(?'VARIABLE'(?P>NAMESPACE)*([A-Za-z_]\w*\.)*[A-Za-z_]\w*)))"
    // OPERAND: any number of unary signs, then a variable or a constant.
    R"((?(DEFINE)(?'OPERAND'(\+|-)*((?P>VARIABLE)|(?P>CONSTANT)))))"
    // EXPRESSION: operands joined by a binary operator. The hyphen is escaped
    // on purpose: written as `\+-\/` it forms the range '+'..'/', which would
    // also accept ',' and '.' as operators.
    R"((?(DEFINE)(?'EXPRESSION'\s*(?P>OPERAND)\s*(\s*[\*\+\-\/]\s*(?P>OPERAND))*)))"
    // ARGUMENTS: comma-separated expressions.
    R"((?(DEFINE)(?'ARGUMENTS'(?P>EXPRESSION)(,\s*(?P>EXPRESSION))*)))"
    // FUNCTION_CALL: callee name, '(' optional arguments ')'.
    R"((?(DEFINE)(?'FUNCTION_CALL'(?P>VARIABLE)\(\s*(?P>ARGUMENTS)?\s*\))))"
    // The actual match: exactly one FUNCTION_CALL.
    R"((?P>FUNCTION_CALL))");
Now that I started to "grok" the pattern, I decided I'd probably not use Regex for a grammar¹, and rewrote it in Spirit X3:
// Spirit X3 port of the function-call grammar. The upper-case names mirror the
// named groups of the regex version; simple_function is the entry rule.
namespace rules {
using namespace x3;
// One identifier character: alphanumeric or underscore.
auto WORD = (alnum | char_('_'));
// Namespace qualifier: one or more WORD characters followed by "::".
// NOTE(review): unlike ident below, this is not wrapped in lexeme[].
auto NAMESPACE = +WORD >> "::";
// Literal: a double-quoted string (ends at the first '"', no escape handling)
// or a floating-point number.
auto CONSTANT = ( lexeme [ '"' >> *~char_('"') >> '"' ] | double_ );
// Identifier: a letter or underscore followed by WORD characters, as one lexeme.
auto ident = lexeme [ char_("A-Za-z_") >> *WORD ];
// Zero or more namespace qualifiers, then '.'-separated identifiers.
auto VARIABLE = *NAMESPACE >> ident % '.';
// Any number of unary '+'/'-' signs followed by a variable or a constant.
auto OPERAND = *(char_("+-")) >> (VARIABLE | CONSTANT);
// Operands separated by one of the binary operators * + / -.
auto EXPRESSION = OPERAND % char_("*+/-");
// Comma-separated list of expressions.
auto ARGUMENTS = EXPRESSION % ',';
// Callee name followed by a parenthesized, optional argument list.
auto FUNCTION_CALL = VARIABLE >> '(' >> -ARGUMENTS >> ')';
// Entry rule: skips whitespace between tokens and, through raw[], exposes the
// matched source text as a std::string.
auto simple_function = rule<struct simple_function_, std::string> {"simple_function"}
= skip(space) [ x3::raw[FUNCTION_CALL] ];
}
Now this is subtly more accurate due to the acceptance of whitespace in more relevant places (skip
vs. lexeme
²). Also, it apparently doesn't suffer from the bad backtracking issues:
Live On Wandbox
#include <iostream>
#include <fstream>
#include <sstream>
#include <iterator>
#include <string>
#include <boost/regex.hpp>
#include <boost/spirit/home/x3.hpp>
namespace x3 = boost::spirit::x3;
// Spirit X3 grammar for a single function call. The upper-case names mirror the
// named groups of the regex version; simple_function is the entry rule.
namespace rules {
using namespace x3;
// One identifier character: alphanumeric or underscore.
auto WORD = (alnum | char_('_'));
// Namespace qualifier: one or more WORD characters followed by "::".
// NOTE(review): unlike ident below, this is not wrapped in lexeme[].
auto NAMESPACE = +WORD >> "::";
// Literal: a double-quoted string (ends at the first '"', no escape handling)
// or a floating-point number.
auto CONSTANT = ( lexeme [ '"' >> *~char_('"') >> '"' ] | double_ );
// Identifier: a letter or underscore followed by WORD characters, as one lexeme.
auto ident = lexeme [ char_("A-Za-z_") >> *WORD ];
// Zero or more namespace qualifiers, then '.'-separated identifiers.
auto VARIABLE = *NAMESPACE >> ident % '.';
// Any number of unary '+'/'-' signs followed by a variable or a constant.
auto OPERAND = *(char_("+-")) >> (VARIABLE | CONSTANT);
// Operands separated by one of the binary operators * + / -.
auto EXPRESSION = OPERAND % char_("*+/-");
// Comma-separated list of expressions.
auto ARGUMENTS = EXPRESSION % ',';
// Callee name followed by a parenthesized, optional argument list.
auto FUNCTION_CALL = VARIABLE >> '(' >> -ARGUMENTS >> ')';
// Entry rule: skips whitespace between tokens and, through raw[], exposes the
// matched source text as a std::string.
auto simple_function = rule<struct simple_function_, std::string> {"simple_function"}
= skip(space) [ x3::raw[FUNCTION_CALL] ];
}
// Reads the file "flask", collects every substring that matches
// rules::simple_function and prints each match on its own line.
// Returns 1, after reporting on stderr, when the file cannot be opened.
int main()
{
    std::ifstream file("flask");
    if (!file) {
        // Without this check a missing file would silently produce no output.
        std::cerr << "cannot open input file 'flask'\n";
        return 1;
    }

    std::string const context(std::istreambuf_iterator<char>(file), {});

    std::vector<std::string> calls;
    // seek[] advances to the next position where the rule matches; the
    // Kleene star repeats that to collect all matches.
    parse(context.begin(), context.end(), *x3::seek[rules::simple_function], calls);

    for (auto const& call : calls) {
        std::cout << call << "\n";
    }
}
Which prints
anno::copyright_notice("XXXXX")
anno::author("Someone")
anno::contributor("")
state::texture_coordinate(0)
state::texture_tangent_u(0)
state::texture_tangent_v(0)
¹ I know Perl6 is awesome, but still
² Boost spirit skipper issues
UPDATE/BONUS
Just to show some of what lies beyond just matching text with Spirit X3, here's a slight improvement of that quick port that shows how you can use the same rules to parse into strongly-typed AST data types.
Changes made:
- fixed a bug: the identifiers in namespace qualifiers were not wrapped in
lexeme
, so the skipper could act inside them
- at the same time made the identifier parsing consistent for namespaces (in all likelihood, namespace names cannot begin with a numeric character either)
- Parsing into strong-typed data types
AST::Variable
, AST::Literal
(for string or numeric literals) and AST::FunctionCall
- Support for escapes inside string literals. This means that
"A\"B"
will now be correctly parsed into an AST::Literal
containing A"B
.
- You can actually see these literals being parsed if you inspect the debug output (
#define BOOST_SPIRIT_X3_DEBUG
)
Live On Wandbox
//#define BOOST_SPIRIT_X3_DEBUG
#include <iostream>
#include <fstream>
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/include/support_istream_iterator.hpp>
namespace x3 = boost::spirit::x3;
namespace AST {
// A possibly namespace-qualified, possibly dotted name.
// The member names and their order are relied upon by the Fusion adaptation.
struct Variable {
    std::vector<std::string> namespaces, nested_objects;

    // Streams every namespace as "[ns]::" followed by the nested objects as
    // "[obj]" items joined with '.'.
    friend std::ostream& operator<<(std::ostream& os, Variable const& v) {
        // Bind by const reference so that no std::string is copied per element.
        for (auto const& ns : v.namespaces)
            os << '[' << ns << "]::";

        // Empty before the first object, "." before every following one.
        char const* separator = "";
        for (auto const& obj : v.nested_objects) {
            os << separator << '[' << obj << ']';
            separator = ".";
        }
        return os;
    }
};
// A literal value: either a string or a number.
using Literal = boost::variant<std::string, double>;
// One parsed call: the callee and its arguments.
struct FunctionCall {
// Callee, including namespace qualifiers and nested-object path.
Variable name;
// One entry per argument; the program prints each entry as-is.
std::vector<std::string> arguments;
};
}
// Adapt the AST structs as Fusion sequences, listing the members in order, so
// that the parser rules can use them as attribute types.
BOOST_FUSION_ADAPT_STRUCT(AST::Variable, namespaces, nested_objects)
BOOST_FUSION_ADAPT_STRUCT(AST::FunctionCall, name, arguments)
// Grammar for a single function call, with rules that carry typed attributes
// (std::string, AST::Literal, AST::Variable, AST::FunctionCall).
namespace rules {
using namespace x3;
// Identifier: a letter or underscore, then alphanumerics/underscores.
// lexeme[] disables skipping inside it; raw[] exposes the matched text.
auto ident = rule<struct ident_, std::string> {"ident"}
= lexeme [ raw [ (alpha|'_') >> *(alnum|'_') ] ];
// Namespace qualifier: an identifier followed by "::".
auto namespace_ = rule<struct namespace_, std::string> {"namespace_"}
= ident >> "::";
// Double-quoted string. A backslash makes the next character part of the
// content, so an escaped quote does not terminate the literal.
auto quoted_str = rule<struct quoted_str_, std::string> {"quoted_str"}
= lexeme [ '"' >> *('\\' >> char_ | ~char_('"')) >> '"' ];
// Literal: a quoted string or a floating-point number.
auto constant = rule<struct constant_, AST::Literal> {"constant"}
= quoted_str | double_;
// Zero or more namespace qualifiers, then '.'-separated identifiers.
auto variable = rule<struct variable_, AST::Variable> {"variable"}
= *namespace_ >> ident % '.';
// Any number of unary '+'/'-' signs followed by a variable or a constant.
// This rule declares no attribute type.
auto operand = rule<struct operand_> {"operand"}
= *char_("+-") >> (variable | constant);
// Operands separated by * + / -; raw[] exposes the matched source text.
auto expression = rule<struct expression_, std::string> {"expression"}
= raw [ operand % char_("*+/-") ];
// Comma-separated list of expressions.
auto arguments = expression % ',';
// Callee followed by a parenthesized, optional argument list.
auto function_call = rule<struct function_call_, AST::FunctionCall> {"function_call"}
= variable >> '(' >> -arguments >> ')';
// Entry point: whitespace is skipped between tokens.
auto simple_function = skip(space) [ function_call ];
}
// Parses the file "flask" twice: first printing the source text of every
// matched function call, then printing the parsed AST::FunctionCall objects.
// Returns 1, after reporting on stderr, when the file cannot be opened.
int main()
{
    // parsing the raw sources out as string
    {
        std::ifstream file("flask");
        if (!file) {
            std::cerr << "cannot open input file 'flask'\n";
            return 1;
        }
        // Clear skipws so the stream iterator hands whitespace to the parser
        // instead of dropping it; skipping is the grammar's job.
        file.unsetf(std::ios::skipws);
        boost::spirit::istream_iterator f(file), l;

        std::vector<std::string> src;
        parse(f, l, *x3::seek[x3::raw[rules::simple_function]], src);

        for (auto const& call : src)
            std::cout << call << "\n";
    }

    // parsing AST::FunctionCall objects
    {
        std::ifstream file("flask");
        if (!file) {
            std::cerr << "cannot open input file 'flask'\n";
            return 1;
        }
        file.unsetf(std::ios::skipws);
        boost::spirit::istream_iterator f(file), l;

        std::vector<AST::FunctionCall> parsed;
        parse(f, l, *x3::seek[rules::simple_function], parsed);

        for (auto const& call : parsed) {
            std::cout << call.name << "\n";
            for (auto const& argument : call.arguments)
                std::cout << " - argument: " << argument << "\n";
        }
    }
}
Which prints both the "source" parsing and the "AST" parsing:
anno::copyright_notice("XXXXX")
anno::author("Som\"e\"one")
anno::contributor("")
state::texture_coordinate(0)
state::texture_tangent_u(0)
state::texture_tangent_v(0)
[anno]::[copyright_notice]
- argument: "XXXXX"
[anno]::[author]
- argument: "Som\"e\"one"
[anno]::[contributor]
- argument: ""
[state]::[texture_coordinate]
- argument: 0
[state]::[texture_tangent_u]
- argument: 0
[state]::[texture_tangent_v]
- argument: 0