1

This is a followup question to Using boost::spirit::qi to parse numbers with separators.


Following sehe's very good suggestions, I managed to get number parsing to work. I then attempted to update it to have a secondary parser which handled numbers with an optional sign. This second attempt failed. I suspect I have dome something incorrect with respect to how to handle a sub-grammar. Code is as follows:

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace phoenix = boost::phoenix;

template <typename Iterator, typename Num>
struct unsigned_parser : qi::grammar<Iterator, Num()> {
    unsigned_parser() : unsigned_parser::base_type(start) {
        using qi::_val;
        using qi::_1;
        using qi::eps;
        using qi::debug;
        using ascii::char_;

        bin = eps[_val=0] >> *(char_("01")[_val = _val * 2 + dval(_1)] | '_');
        oct = eps[_val=0] >> *(char_("0-7")[_val = _val * 8 + dval(_1)] | '_');
        dec = eps[_val=0]
              >> *(char_("0-9")[_val = _val * 10 + dval(_1)] | '_');
        hex = eps[_val=0]
              >> *(char_("0-9a-fA-F")[_val = _val * 16 + dval(_1)] | '_');
        start = (char_('0') >>
                 ((char_("xXhH") >> hex[_val=_1])
                  | (char_("bByY") >> bin[_val=_1])
                  | (char_("oOqQ") >> oct[_val=_1])
                  | (char_("dDtT") >> dec[_val=_1])))
                | (hex[_val=_1] >> char_("xXhH"))
                | (bin[_val=_1] >> char_("bByY"))
                | (oct[_val=_1] >> char_("oOqQ"))
                | (dec[_val=_1] >> -char_("dDtT"));
        start.name("unum");
        hex.name("hex");
        oct.name("oct");
        dec.name("dec");
        bin.name("bin");

        debug(start);
        debug(hex);
        debug(oct);
        debug(dec);
        debug(bin);
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> hex;
    qi::rule<Iterator, Num()> oct;
    qi::rule<Iterator, Num()> dec;
    qi::rule<Iterator, Num()> bin;
    struct _dval {
        template <typename> struct result { typedef uint8_t type; };
        template <typename T> uint8_t operator()(T ch) const {
            if (ch >= '0' || ch <= '9') {
                return ch - '0';
            }
            ch = std::tolower(ch);
            if (ch >= 'a' || ch <= 'f') {
                return ch - 'a' + 10;
            }
            assert(false);
        }
    };
    boost::phoenix::function<_dval> dval;
};

template <typename Iterator, typename Num>
struct signed_parser : qi::grammar<Iterator, Num()> {
    signed_parser() : signed_parser::base_type(start) {
        using qi::eps;
        using qi::_val;
        using qi::_1;
        using ascii::char_;
        using phoenix::static_cast_;
        unum = unsigned_parser<Iterator, Num>();
        start = (char_('-') >> unum[_val=-_1])
                | (-char_('+') >> unum[_val=_1]);
        unum.name("unum");
        start.name("snum");
        debug(start);
        /* debug(unum); */
    }
    qi::rule<Iterator, Num()> start;
    qi::rule<Iterator, Num()> unum;
};

int main(int argv, const char *argc[]) {
    using phoenix::ref;
    using qi::eoi;
    using qi::_1;

    typedef std::string::const_iterator iter;
    signed_parser<iter, int64_t> sp;
    int64_t val;
    if (argv != 2) {
        std::cerr << "Usage: " << argc[0] << " <input>" << std::endl;
        return 1;
    }
    std::string test(argc[1]);
    iter i = test.begin();
    iter end = test.end();
    bool rv = phrase_parse(i, end, sp[ref(val)=_1] >> eoi, ascii::space);
    if (rv) {
        assert(i == end);
        std::cout << "Succeeded: " << val << std::endl;
        return 0;
    }
    std::cout << "Failed." << std::endl;
    return 1;
}

With the signed_parser, every parse fails. Moreover, if I uncomment the commented-out debug(), the program segfaults.

I feel as if I am close to beginning to understand how to use this, so any help would be appreciated.

Community
  • 1
  • 1
md5i
  • 3,018
  • 1
  • 18
  • 32
  • 1
    I suspect lifetime shenanigans. After replacing `qi::rule unum;` in `signed_parser` with `unsigned_parser unum;` and removing the assignment `unum = ...;` in the constructor, it works for me (although that is not proof of correctness, of course). **Add:** Lifetime problems would make sense because Qi grammars aren't copyable. The rule probably ends up holding a reference to the temporary grammar object. – Wintermute Mar 24 '15 at 16:29
  • @Wintermute hehe. I took too long polishing my answer, then :) – sehe Mar 24 '15 at 17:03

1 Answers1

2

Using all those separate rules kills the opportunity for the compiler to optimize the parsing.

You cannot refer to a temporary grammar/rule. You need to have the grammar instance around:

template <typename Iterator, typename Num>
struct signed_parser : qi::grammar<Iterator, Num()> {
    signed_parser() : signed_parser::base_type(snum) {
        using namespace qi;

        snum = lit('-') >> unum
            | -lit('+') >> unum
            ;

        BOOST_SPIRIT_DEBUG_NODES((snum))
    }
private:
    qi::rule<Iterator, Num()> snum;
    unsigned_parser<Iterator, Num> unum;
};

Here's some cleanup for you:

  • swap argc and argv will you :)
  • use BOOST_SPIRIT_DEBUG* macros

    BOOST_SPIRIT_DEBUG_NODES((unum) (hex) (oct) (dec) (bin));
    
  • use bare literals instead if lit() or (worse!) char_()

  • prefer using automatic attribute propagation (Boost Spirit: "Semantic actions are evil"?). E.g. the rules can be much simpler:

        snum = lit('-') >> unum
            | -lit('+') >> unum
            ;
    
  • use %= to preserve automatic propagation in the presence of semantic actions:

        snum %= lit('-') >> unum [ _val = -_1 ]
             | -lit('+') >> unum
             ;
    
  • same thing goes for the phrase_parse call itself: you can pass bound references for the attributes. No need for semantic actions

  • doing tolower(ch) is likely slower (since you know it's ASCII), possibly incorrect (you get sign extension if your compiler has signed char)

  • UPDATE there was a rather gruesome bug in your dval actor. The range checks were wrong! Here's my fixed version:

    struct accum_f {
        template <typename...> struct result { typedef void type; };
        void operator()(char ch, Num& accum, int base) const {
            accum *= base;
    
            if      (ch >= '0' && ch <= '9') accum += ch - '0';
            else if (ch >= 'a' && ch <= 'f') accum += ch - 'a' + 10;
            else if (ch >= 'A' && ch <= 'F') accum += ch - 'A' + 10;
            else assert(false);
        }
    };
    boost::phoenix::function<accum_f> _accum;
    

    See below for consequential changes/simplifications to the semantic action

  • you can use the builting int_parser in the prefix branches; this could be (much) faster

  • caveat: when you write the unum semantic-action-less, it becomes essential that you don't "capture" the '0' with qi::char_ like you did. Otherwise, you'll be wondering why the result of any prefix-formatted number is always 48.

    unum = ('0' >>
                ( (omit[ char_("xXhH") ] >> hex)
                | (omit[ char_("bByY") ] >> bin)
                | (omit[ char_("oOqQ") ] >> oct)
                | (omit[ char_("dDtT") ] >> dec))
            )
        | (hex >> omit[  char_("xXhH") ])
        | (bin >> omit[  char_("bByY") ])
        | (oct >> omit[  char_("oOqQ") ])
        | (dec >> omit[ -char_("dDtT") ]);
    
  • using phrase_parse and a skipper has little effect as long as you're using parser expressions that don't use a skipper (see Boost spirit skipper issues)

Live On Coliru

//#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>

namespace qi      = boost::spirit::qi;
namespace ascii   = boost::spirit::ascii;

template <typename Iterator, typename Num>
struct unsigned_parser : qi::grammar<Iterator, Num()> {
    unsigned_parser() : unsigned_parser::base_type(unum) {
        using namespace qi;

        bin  = eps[_val=0] >> *(char_("01")        [ _accum(_1, _val, 2 )] | '_');
        oct  = eps[_val=0] >> *(char_("0-7")       [ _accum(_1, _val, 8 )] | '_');
        dec  = eps[_val=0] >> *(char_("0-9")       [ _accum(_1, _val, 10)] | '_');
        hex  = eps[_val=0] >> *(char_("0-9a-fA-F") [ _accum(_1, _val, 16)] | '_');
        unum = ('0' >>
                    ( (omit[ char_("xXhH") ] >> hex)
                    | (omit[ char_("bByY") ] >> bin)
                    | (omit[ char_("oOqQ") ] >> oct)
                    | (omit[ char_("dDtT") ] >> dec))
                )
            | (hex >> omit[  char_("xXhH") ])
            | (bin >> omit[  char_("bByY") ])
            | (oct >> omit[  char_("oOqQ") ])
            | (dec >> omit[ -char_("dDtT") ]);

        BOOST_SPIRIT_DEBUG_NODES((unum) (hex) (oct) (dec) (bin));
    }

  private:
    qi::rule<Iterator, Num()> unum,  hex, oct, dec, bin;

    struct accum_f {
        template <typename...> struct result { typedef void type; };
        void operator()(char ch, Num& accum, int base) const {
            accum *= base;

            if      (ch >= '0' && ch <= '9') accum += ch - '0';
            else if (ch >= 'a' && ch <= 'f') accum += ch - 'a' + 10;
            else if (ch >= 'A' && ch <= 'F') accum += ch - 'A' + 10;
            else assert(false);
        }
    };
    boost::phoenix::function<accum_f> _accum;
};

    template <typename Iterator, typename Num>
    struct signed_parser : qi::grammar<Iterator, Num()> {
        signed_parser() : signed_parser::base_type(snum) {
            using namespace qi;

            snum %= lit('-') >> unum [ _val = -_1 ]
                 | -lit('+') >> unum
                 ;

            BOOST_SPIRIT_DEBUG_NODES((snum))
        }
    private:
        qi::rule<Iterator, Num()> snum;
        unsigned_parser<Iterator, Num> unum;
    };

int main(int argc, const char *argv[]) {
    typedef std::string::const_iterator iter;
    signed_parser<iter, int64_t> const sp;

    for (std::string const& s : boost::make_iterator_range(argv+1, argv+argc))
    {
        std::cout << "\n-----------------------------\nParsing '" << s << "':\n";

        int64_t val;
        iter i = s.begin(), end = s.end();
        bool rv = phrase_parse(i, end, sp >> qi::eoi, ascii::space, val);

        if (rv) {
            std::cout << "Succeeded: " << val << std::endl;
        } else {
            std::cout << "Failed." << std::endl;
        }

        if (i!=end) {
            std::cout << "Remaining unparsed: '" << std::string(i,end) << "'\n";
        }
    }
}

Output:

-----------------------------
Parsing '-124_456d':
Succeeded: -124456

-----------------------------
Parsing '123_456D':
Succeeded: 123456

-----------------------------
Parsing '-123_456T':
Succeeded: -123456

-----------------------------
Parsing '123456t':
Succeeded: 123456

-----------------------------
Parsing '+1_bh':
Succeeded: 27

-----------------------------
Parsing '0_010Q':
Succeeded: 8

-----------------------------
Parsing '+1010_1010_0111_0111_b':
Succeeded: 43639

-----------------------------
Parsing '123_456':
Succeeded: 123456

-----------------------------
Parsing '-123456':
Succeeded: -123456

-----------------------------
Parsing '1_bh':
Succeeded: 27

-----------------------------
Parsing '-0_010Q':
Succeeded: -8

-----------------------------
Parsing '1010_1010_0111_0111_b':
Succeeded: 43639

-----------------------------
Parsing '+0d124_456':
Succeeded: 124456

-----------------------------
Parsing '0D123_456':
Succeeded: 123456

-----------------------------
Parsing '+0T123_456':
Succeeded: 123456

-----------------------------
Parsing '0t123456':
Succeeded: 123456

-----------------------------
Parsing '0h1_b':
Succeeded: 27

-----------------------------
Parsing '0Q0_010':
Succeeded: 8

-----------------------------
Parsing '0b1010_1010_0111_0111_':
Succeeded: 43639

-----------------------------
Parsing '06123_45':
Succeeded: 612345

-----------------------------
Parsing '0612345':
Succeeded: 612345

-----------------------------
Parsing '0h1_b':
Succeeded: 27

-----------------------------
Parsing '-0Q0_010':
Succeeded: -8

-----------------------------
Parsing '0b1010_1010_0111_0111_':
Succeeded: 43639
Community
  • 1
  • 1
sehe
  • 374,641
  • 47
  • 450
  • 633
  • Thank you for all the helpful suggestions! I need that sort of feedback in order to learn. A couple questions: 1) I can't use int_parser in the prefix branch and still support interspersed underscores, right? 2) If I am right about (1) then not using separate rules will cause a lot of copy/paste duplication, right? Finally, I figured out the 48 bit by trial and error, though I should have considered omit. Thanks again. – md5i Mar 24 '15 at 17:22
  • As a bonus, I implemented actual negation of the result and [removed a bug in your `dval`](http://stackoverflow.com/posts/29238681/revisions) that made all digits >9 (in hex) wrong. – sehe Mar 24 '15 at 17:24
  • To question 1) indeed, I forgot. I'd favour the duplication here (trust your profiler, but I trust my gut that it the impact is big here). Before considering `omit[]`, consider non-capturing parser directives in the first place (`lit()` doesn't capture, and bar literals are equivalent) – sehe Mar 24 '15 at 17:26
  • bare* literals, sorry – sehe Mar 25 '15 at 00:22