TL;DR
How can I parse the result of a boost::spirit grammar into a std::set?
Full problem statement
As an exercise to learn how to use boost::spirit, I am designing a parser for X.500/LDAP Distinguished Names. The grammar can be found in BNF format in RFC 1779.
I "unrolled" it and translated it into boost::spirit rules. That's the first step. Basically, a DN is a set of RDNs (Relative Distinguished Names), which are themselves tuples of (Key, Value) pairs.
I am thinking of using
typedef std::unordered_map<std::string, std::string> rdn_type;
to represent each RDN. The RDNs are then gathered into a std::set<rdn_type>.
My issue is that, after going through the (good) documentation of boost::spirit, I still could not find out how to populate the set.
My current code can be found on github and I'm trying to refine it whenever I can.
Starting a satanic dance to summon SO's most popular polar bear :p
Current code
In order to have an all-in-one-place question, I add a copy of the code here. It's a bit long, so I put it at the end :)
#include <map>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;
// One RDN: maps an attribute type (key) to its attribute value.
// An ordered std::map is used rather than std::unordered_map because the
// grammar collects RDNs into a std::set, which compares its elements with
// operator<; std::unordered_map only provides operator== / operator!=, so it
// cannot be used as a set element without a custom comparator.
typedef std::map<std::string, std::string> dn_key_value_map;
/// Boost.Spirit Qi grammar for an X.500/LDAP Distinguished Name (DN).
///
/// A DN is parsed as a list of RDNs separated by ',' or ';'. Each RDN is a
/// list of `key=value` pairs separated by '+'. The synthesized attribute is a
/// std::set holding one dn_key_value_map per RDN. Whitespace between tokens is
/// skipped with ascii::space; the lexical rules (key, value, ...) are declared
/// without a skipper so that they match their input verbatim.
///
/// Requirements on the surrounding file:
///  - Boost.Fusion's std::pair adaptation (boost/fusion/include/std_pair.hpp)
///    has to be included so that `rdn_pair` can fill a std::pair.
///  - std::set orders its elements with operator<, so dn_key_value_map must be
///    a type that provides one (std::map does, std::unordered_map does not).
template <typename Iterator>
struct dn_grammar_common : public qi::grammar<Iterator, std::set<dn_key_value_map>(), ascii::space_type> {
    /// Characters that must be escaped with a backslash inside a value.
    /// Each symbol maps to the literal character it stands for.
    struct dn_reserved_chars_ : public qi::symbols<char, char> {
        dn_reserved_chars_() {
            // The mapped type is `char`, so the values are character
            // literals (a string literal does not convert to char).
            add
                ("\\", '\\')
                ("=" , '=')
                ("+" , '+')
                ("," , ',')
                (";" , ';')
                ("#" , '#')
                ("<" , '<')
                (">" , '>')
                ("\"", '"')
                ("%" , '%');
        }
    } dn_reserved_chars;

    // The start rule must have exactly the signature given to qi::grammar.
    dn_grammar_common() : dn_grammar_common::base_type(dn) {
        // ---- Lexical rules (no skipper: whitespace is significant) ----

        // Key: one or more alphanumeric runs joined by single dashes.
        // Never empty, never starts or ends with a dash. raw[] yields the
        // matched text (dashes included) as the string attribute.
        key = qi::raw[+qi::alnum % '-'];

        // Backslash followed by exactly two hex digits. The backslash is
        // consumed (lit), the two digits are stored as-is (not decoded).
        escaped_hex_char = qi::lit('\\') >> qi::repeat(2)[qi::xdigit];

        // Either a hex escape or a backslash followed by a reserved char;
        // in the latter case the attribute is the reserved char itself.
        escaped_sequence = escaped_hex_char
                         | (qi::lit('\\') >> dn_reserved_chars);

        // Quoted string => "..." ; the surrounding quotes are dropped.
        // Inside, anything but backslash and double quote is taken verbatim.
        quote_string = qi::lit('"')
                    >> *(escaped_sequence | ~qi::char_("\\\""))
                    >> qi::lit('"');

        // Hexadecimal string => #23AD5D... ; the leading '#' is consumed and
        // at least one pair of hex digits is required.
        hex_string = qi::lit('#') >> +qi::repeat(2)[qi::xdigit];

        // Value is either:
        //  - a quoted string (may contain escaped sequences)
        //  - a hexadecimal string
        //  - a plain string (may contain escaped sequences, possibly empty)
        // The plain form is tried last: it is a Kleene star and therefore
        // always succeeds, which would hide the two other alternatives.
        // hold[] discards characters collected by an alternative that fails
        // half-way (e.g. a missing closing quote).
        value = qi::hold[quote_string]
              | qi::hold[hex_string]
              | *((qi::char_ - dn_reserved_chars) | escaped_sequence);

        // ---- Higher level rules (whitespace skipped between tokens) ----

        // lexeme[] pre-skips leading whitespace and then runs the
        // skipper-less lexical rule on the raw input.
        rdn_pair = qi::lexeme[key] >> '=' >> qi::lexeme[value];

        // A relative distinguished name is a sequence of
        // (Attribute = AttributeValue) pairs separated by '+'.
        rdn = rdn_pair % '+';

        // The DN is a set of RDNs separated by either ',' or ';'.
        // The two separators can coexist in a given DN, though it is not
        // recommended practice.
        dn = rdn % qi::char_(",;");
    }

    // Structural rules: use the ascii::space skipper.
    qi::rule<Iterator, std::set<dn_key_value_map>(), ascii::space_type> dn;
    qi::rule<Iterator, dn_key_value_map(), ascii::space_type> rdn;
    qi::rule<Iterator, std::pair<std::string, std::string>(), ascii::space_type> rdn_pair;

    // Lexical rules: declared without a skipper, so they can be invoked from
    // inside lexeme[] and from each other.
    qi::rule<Iterator, std::string()> key, value, hex_string, quote_string;
    qi::rule<Iterator, std::string()> escaped_hex_char, escaped_sequence;
};