On reading, the first thing I notice is that self_tag_
uses expectation points. That won't fly because it is ordered before other things that can legally start with <
, like tag_block_
:
auto html_element__def = inner_text | self_tag_ | tag_block_ ;
And due to the expectation points it will never backtrack to reach that.
Many places use operator+
where operator*
is required, like:
auto inner_text = lexeme[*(char_-'<')];
All those charset differences can be phrased as inverse sets:
auto inner_text = lexeme[*~char_('<')];
//
= lexeme[*~char_(" />")];
Of course, XML has specific valid charsets for e.g. element names, but I'm assuming you expressly want to avoid writing a conformant parser. Even so, you really need to exclude '<', '>', '\r', '\t' etc. from your attribute name/value rules.
One smell is the re-use of parser rule tags. This should, as far as my understanding goes, be fine for immediately-defined rules, but certainly not for those that are defined through their tag type, with BOOST_SPIRIT_DEFINE.
Cleanup Exercism
First, a cleanup. This gets past the hurdle of template instantiation depth by commenting out *html_element_
inside tag_block__def
. Let's first see what works with that change:
Live On Coliru
//#define BOOST_SPIRIT_X3_DEBUG
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iomanip>
#include <iostream>
//// Unused mixin disabled for simplicity
// #include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
namespace x3 = boost::spirit::x3;
using namespace std::string_literals;
// AST node types produced by the grammar in namespace Parser.
namespace Ast {
// Empty tag type; Parser uses it as the x3::with<> context key under which
// the name of the currently open tag is stored.
struct tag_name {};
// Forward declaration: html_element refers to html_tag before it is complete.
struct html_tag;
// Declared but not defined or used anywhere in this listing.
struct html_comment;
// using mixin = x3::position_tagged;
// Empty stand-in base for the disabled position_tagged mixin.
struct mixin {};
// One attribute inside a tag; `value` is absent for value-less attributes.
struct attribute_data : mixin {
std::string name;
boost::optional<std::string> value;
};
using attribute_datas = std::vector<attribute_data>;
// Tag name plus its attribute list.
struct tag_header : mixin {
std::string name;
attribute_datas attributes;
};
// A self-closing tag: it carries a header only.
struct self_tag : mixin {
tag_header header;
};
// An element is text, a self-closing tag, or a tag block. html_tag is still
// incomplete here, hence the recursive_wrapper.
using element_base =
x3::variant<std::string, self_tag, boost::recursive_wrapper<html_tag>>;
struct html_element : mixin , element_base {
// Re-export the variant's constructors and assignment operators.
using element_base::element_base;
using element_base::operator=;
};
using html_elements = std::vector<html_element>;
// A tag block: header plus child elements.
struct html_tag : mixin {
tag_header header;
html_elements children;
};
} // namespace Ast
// Fusion adaptations; the member order given here is the order in which
// parsed sub-attributes are mapped onto each struct.
BOOST_FUSION_ADAPT_STRUCT(Ast::attribute_data, name, value)
BOOST_FUSION_ADAPT_STRUCT(Ast::tag_header, name, attributes)
BOOST_FUSION_ADAPT_STRUCT(Ast::self_tag, header)
BOOST_FUSION_ADAPT_STRUCT(Ast::html_tag, header, children)
namespace Parser {
auto attribute_identifier_ //
= x3::rule<struct AttributeIdentifier_tag, std::string>{"AttributeIdentifier"} //
= x3::lexeme[+~x3::char_(" /=>")];
auto attribute_value_ //
= x3::rule<struct AttributeValue_tag, std::string>{"AttributeValue"} //
= x3::lexeme //
[('"' > *~x3::char_('"') > '"') //
| ("'" > *~x3::char_("'") > "'") //
| *~x3::char_(" />") //
];
auto single_attribute_ =
x3::rule<struct attribute_identifier__tag, Ast::attribute_data>{"SingleAttribute"} //
= attribute_identifier_ >> -("=" >> attribute_value_);
auto attributes_ //
= x3::rule<struct attribute_data_tag, Ast::attribute_datas>{"Attributes"} //
= *single_attribute_;
[[maybe_unused]] static auto& header_of(x3::unused_type) {
thread_local Ast::tag_header s_dummy;
return s_dummy;
}
[[maybe_unused]] static auto& header_of(Ast::html_tag& ht) {
return ht.header;
}
auto tag_name_begin_func = [](auto &ctx){
get<Ast::tag_name>(ctx) = _attr(ctx).name;
// header_of(_val(ctx)).name = _attr(ctx);
// std::cout << typeid(_val(ctx)).name() << std::endl;
};
auto tag_name_end_func = [](auto& ctx){ _pass(ctx) = (get<Ast::tag_name>(ctx) == _attr(ctx)); };
auto self_tag_name_action = [](auto &ctx){ header_of(_val(ctx)).name = _attr(ctx); };
auto self_tag_attribute_action = [](auto& ctx) { header_of(_val(ctx)).attributes = _attr(ctx); };
auto tag_name_ //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];
auto self_tag_ //
= x3::rule<struct HtmlSelfTag_tag, Ast::self_tag>{"HtmlSelfTag"} //
= '<' >> tag_name_[self_tag_name_action] >> attributes_[self_tag_attribute_action] >> "/>";
auto tag_header_ //
= x3::rule<struct HtmlTagBlockHeader_tag, Ast::tag_header>{"HtmlTagBlockHeader"} //
= '<' >> tag_name_ >> attributes_ >> '>';
x3::rule<struct tag_block__tag, Ast::html_tag> tag_block_ = "TagBlock";
x3::rule<struct html_element__tag, Ast::html_element> html_element_ = "HtmlElement";
auto tag_block__def = x3::with<Ast::tag_name>(""s) //
[ //
tag_header_[tag_name_begin_func] >> /**html_element_ >>*/ "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>' //
];
auto inner_text = x3::lexeme[*~x3::char_('<')];
auto html_element__def = inner_text | self_tag_ | tag_block_;
BOOST_SPIRIT_DEFINE(tag_block_, html_element_)
}
namespace unit_tests {
template <bool ShouldSucceed = true, typename P>
void test(P const& rule, std::initializer_list<std::string_view> cases) {
for (auto input : cases) {
if constexpr (ShouldSucceed) {
typename x3::traits::attribute_of<P, x3::unused_type>::type result;
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space, result);
std::cout << quoted(input) << " -> " << (ok ? "Ok" : "FAILED") << std::endl;
} else {
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space);
if (!ok)
std::cout << "Fails as expected: " << quoted(input) << std::endl;
else
std::cout << "SHOULD HAVE FAILED: " << quoted(input) << std::endl;
}
}
}
}
// Drives the three test groups: self-closing tags, tag blocks, and inputs
// that must be rejected.
int main() {
    using Cases = std::initializer_list<std::string_view>;

    Cases const self_closing = {
        R"(<simple foo="" bar='' value-less qux=bareword/>)",
        R"(<div />)",
        R"(<div/>)",
        R"(< div/>)",
    };
    Cases const blocks = {
        R"(<simple foo="" bar='' value-less qux=bareword></simple>)",
        R"(<div ></div>)",
        R"(<div></div>)",
        R"(< div></div>)",
        R"(< div ></div>)",
        R"(<div data-src="https://www.google.com" id='hello world'></div>)",
        R"(<div></ div>)",
        R"(<div></ div >)",
    };
    // NOTE(review): the last two inputs are not self-closing tags at all, so
    // self_tag_ rejects them regardless of how closing tags are checked.
    Cases const rejected = {
        R"(<div/ >)",
        R"(<div>< /div>)",
        R"(<div></dov>)",
    };

    unit_tests::test(Parser::self_tag_, self_closing);
    unit_tests::test(Parser::html_element_, blocks);
    unit_tests::test<false>(Parser::self_tag_, rejected);
}
Outputs
"<simple foo=\"\" bar='' value-less qux=bareword/>" -> Ok
"<div />" -> Ok
"<div/>" -> Ok
"< div/>" -> Ok
"<simple foo=\"\" bar='' value-less qux=bareword></simple>" -> Ok
"<div ></div>" -> Ok
"<div></div>" -> Ok
"< div></div>" -> Ok
"< div ></div>" -> Ok
"<div data-src=\"https://www.google.com\" id='hello world'></div>" -> Ok
"<div></ div>" -> Ok
"<div></ div >" -> Ok
Fails as expected: "<div/ >"
Fails as expected: "<div>< /div>"
Fails as expected: "<div></dov>"
What Is The Trouble
As you can deduce from my hunch to comment-out the recursion *html_element_
, this is causing problems.
The real reason is that with<>
extends the context. This means that each level of recursion adds more data to the context type, causing new template instantiations.
The simplest trick is to move with<>
up outside the recursion:
// Tag block without its own with<> directive: header, any number of child
// elements, then a closing tag whose name is checked by tag_name_end_func.
auto tag_block__def = //
tag_header_[tag_name_begin_func] >> *html_element_ >> "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>' //
;
// NOTE(review): operator* allows zero repetitions, so inner_text can match
// the empty string -- and it is the first alternative of html_element_.
auto inner_text = x3::lexeme[*~x3::char_('<')];
auto html_element__def = inner_text | self_tag_ | tag_block_;
// The context value is injected once, outside the recursive rules.
auto start = x3::with<Ast::tag_name>(""s)[html_element_];
However this highlights the problem that elements can nest, and it's useless when inner tags overwrite the context data for tag_name
. So, instead of string
we could make it stack<string>
:
// Entry point: html_element_ parsed with a stack of tag names in the context.
auto start = x3::with<tag_stack>(std::stack<std::string>{})[html_element_];
And then amend the actions to match:
// Opening tag seen: push its name onto the stack kept in the context.
auto tag_name_begin_func = [](auto& ctx) {
    auto& open_tags = get<tag_stack>(ctx);
    open_tags.push(_attr(ctx).name);
};
// Closing tag seen: it passes only when it names the tag on top of the
// stack; that entry is popped either way.
auto tag_name_end_func = [](auto& ctx) {
    auto& open_tags = get<tag_stack>(ctx);
    bool const same_name = (open_tags.top() == _attr(ctx));
    _pass(ctx) = same_name;
    open_tags.pop();
};
See it Live On Coliru
//#define BOOST_SPIRIT_X3_DEBUG
#include <boost/fusion/include/adapt_struct.hpp>
#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iomanip>
#include <iostream>
#include <stack>
//// Unused mixin disabled for simplicity
// #include <boost/spirit/home/x3/support/ast/position_tagged.hpp>
namespace x3 = boost::spirit::x3;
using namespace std::string_literals;
// AST node types produced by the grammar in namespace Parser.
namespace Ast {
// Forward declaration: html_element refers to html_tag before it is complete.
struct html_tag;
// Declared but not defined or used anywhere in this listing.
struct html_comment;
// using mixin = x3::position_tagged;
// Empty stand-in base for the disabled position_tagged mixin.
struct mixin {};
// One attribute inside a tag; `value` is absent for value-less attributes.
struct attribute_data : mixin {
std::string name;
boost::optional<std::string> value;
};
using attribute_datas = std::vector<attribute_data>;
// Tag name plus its attribute list.
struct tag_header : mixin {
std::string name;
attribute_datas attributes;
};
// A self-closing tag: it carries a header only.
struct self_tag : mixin {
tag_header header;
};
// An element is text, a self-closing tag, or a tag block. html_tag is still
// incomplete here, hence the recursive_wrapper.
using element_base =
x3::variant<std::string, self_tag, boost::recursive_wrapper<html_tag>>;
struct html_element : mixin , element_base {
// Re-export the variant's constructors and assignment operators.
using element_base::element_base;
using element_base::operator=;
};
using html_elements = std::vector<html_element>;
// A tag block: header plus child elements.
struct html_tag : mixin {
tag_header header;
html_elements children;
};
} // namespace Ast
// Fusion adaptations; the member order given here is the order in which
// parsed sub-attributes are mapped onto each struct.
BOOST_FUSION_ADAPT_STRUCT(Ast::attribute_data, name, value)
BOOST_FUSION_ADAPT_STRUCT(Ast::tag_header, name, attributes)
BOOST_FUSION_ADAPT_STRUCT(Ast::self_tag, header)
BOOST_FUSION_ADAPT_STRUCT(Ast::html_tag, header, children)
namespace Parser {
struct tag_stack final {};
auto attribute_identifier_ //
= x3::rule<struct AttributeIdentifier_tag, std::string>{"AttributeIdentifier"} //
= x3::lexeme[+~x3::char_(" /=>")];
auto attribute_value_ //
= x3::rule<struct AttributeValue_tag, std::string>{"AttributeValue"} //
= x3::lexeme //
[('"' > *~x3::char_('"') > '"') //
| ("'" > *~x3::char_("'") > "'") //
| *~x3::char_(" />") //
];
auto single_attribute_ =
x3::rule<struct attribute_identifier__tag, Ast::attribute_data>{"SingleAttribute"} //
= attribute_identifier_ >> -("=" >> attribute_value_);
auto attributes_ //
= x3::rule<struct attribute_data_tag, Ast::attribute_datas>{"Attributes"} //
= *single_attribute_;
[[maybe_unused]] static auto& header_of(x3::unused_type) {
thread_local Ast::tag_header s_dummy;
return s_dummy;
}
[[maybe_unused]] static auto& header_of(Ast::html_tag& ht) {
return ht.header;
}
auto tag_name_begin_func = [](auto& ctx) { get<tag_stack>(ctx).push(_attr(ctx).name); };
auto tag_name_end_func = [](auto& ctx) {
auto& s = get<tag_stack>(ctx);
_pass(ctx) = (s.top() == _attr(ctx));
s.pop();
};
auto assign_name = [](auto& ctx) { header_of(_val(ctx)).name = _attr(ctx); };
auto assign_attrs = [](auto& ctx) { header_of(_val(ctx)).attributes = _attr(ctx); };
auto tag_name_ //
= x3::rule<struct HtmlTagName_tag, std::string>{"HtmlTagName"} //
= x3::lexeme[*~x3::char_(" />")];
auto self_tag_ //
= x3::rule<struct HtmlSelfTag_tag, Ast::self_tag>{"HtmlSelfTag"} //
= '<' >> tag_name_[assign_name] >> attributes_[assign_attrs] >> "/>";
auto tag_header_ //
= x3::rule<struct HtmlTagBlockHeader_tag, Ast::tag_header>{"HtmlTagBlockHeader"} //
= '<' >> tag_name_ >> attributes_ >> '>';
x3::rule<struct tag_block__tag, Ast::html_tag> tag_block_ = "TagBlock";
x3::rule<struct html_element__tag, Ast::html_element> html_element_ = "HtmlElement";
auto tag_block__def = //
tag_header_[tag_name_begin_func] >> *html_element_ >> "</" >> //
x3::omit[tag_name_[tag_name_end_func]] >> '>' //
;
auto inner_text = x3::lexeme[*~x3::char_('<')];
auto html_element__def = inner_text | self_tag_ | tag_block_;
auto start = x3::with<tag_stack>(std::stack<std::string>{})[html_element_];
BOOST_SPIRIT_DEFINE(tag_block_, html_element_)
}
namespace unit_tests {
template <bool ShouldSucceed = true, typename P>
void test(P const& rule, std::initializer_list<std::string_view> cases) {
for (auto input : cases) {
if constexpr (ShouldSucceed) {
typename x3::traits::attribute_of<P, x3::unused_type>::type result;
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space, result);
std::cout << quoted(input) << " -> " << (ok ? "Ok" : "FAILED") << std::endl;
} else {
auto ok = phrase_parse(input.begin(), input.end(), rule, x3::space);
if (!ok)
std::cout << "Fails as expected: " << quoted(input) << std::endl;
else
std::cout << "SHOULD HAVE FAILED: " << quoted(input) << std::endl;
}
}
}
}
// Drives the three test groups: self-closing tags, (nested) tag blocks via
// the `start` entry point, and inputs that must be rejected.
int main() {
    using Cases = std::initializer_list<std::string_view>;

    Cases const self_closing = {
        R"(<simple foo="" bar='' value-less qux=bareword/>)",
        R"(<div />)",
        R"(<div/>)",
        R"(< div/>)",
    };
    Cases const blocks = {
        R"(<simple foo="" bar='' value-less qux=bareword></simple>)",
        R"(<div ></div>)",
        R"(<div></div>)",
        R"(< div></div>)",
        R"(< div ></div>)",
        R"(<div data-src="https://www.google.com" id='hello world'></div>)",
        R"(<div></ div>)",
        R"(<div></ div >)",
        R"(<div><nest/><nest some="more">yay</nest></div>)",
    };
    // NOTE(review): the last two inputs are not self-closing tags at all, so
    // self_tag_ rejects them regardless of how closing tags are checked.
    Cases const rejected = {
        R"(<div/ >)",
        R"(<div>< /div>)",
        R"(<div></dov>)",
    };

    unit_tests::test(Parser::self_tag_, self_closing);
    unit_tests::test(Parser::start, blocks);
    unit_tests::test<false>(Parser::self_tag_, rejected);
}
Printing
"<simple foo=\"\" bar='' value-less qux=bareword/>" -> Ok
"<div />" -> Ok
"<div/>" -> Ok
"< div/>" -> Ok
"<simple foo=\"\" bar='' value-less qux=bareword></simple>" -> Ok
"<div ></div>" -> Ok
"<div></div>" -> Ok
"< div></div>" -> Ok
"< div ></div>" -> Ok
"<div data-src=\"https://www.google.com\" id='hello world'></div>" -> Ok
"<div></ div>" -> Ok
"<div></ div >" -> Ok
"<div><nest/><nest some=\"more\">yay</nest></div>" -> Ok
Fails as expected: "<div/ >"
Fails as expected: "<div>< /div>"
Fails as expected: "<div></dov>"
CLOSING THOUGHTS
I'm answering this assuming you are just doing this to learn X3. Otherwise the only recommendation is: do not do this. Use a library.
Not only does your grammar do a pretty poor job of parsing XML, it will utterly fail on HTML in the wild. Closing tags are not a given in HTML ("quirks mode"). Scripts, CDATA, entity references, Unicode, escapes will all f*ck your parser up.
Oh, have you noticed how you mostly broke attribute propagation by introducing some semantic actions? I could show you how to fix it, but I think I'd rather leave it for the moment.
Just use a library.