boost::spirit::qi::lexeme not capturing complete token

Question

I am trying to parse comma-separated tokens containing hyphens, but lexeme ignores all the hyphens. Part of the program is as follows.

#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/support_utree.hpp>

namespace qi = boost::spirit::qi;
namespace bs = boost::spirit;

// Qi grammar for a comma-separated list of tokens. The grammar uses an ASCII
// whitespace skipper and synthesizes a bs::utree attribute.
template<typename Iterator>
struct my_grammar : public qi::grammar<Iterator,bs::utree(),bs::ascii::space_type>
{
     my_grammar() : my_grammar::base_type(start,"MY")
    {
        // One or more tokens separated by ','.
        start = token % ',';
        // Unary + binds tighter than %, so this parses as
        // (+qi::alnum) % qi::char_('-'): runs of alphanumerics separated by '-'.
        // As with any `a % b` list, the '-' delimiters are matched but are not
        // part of the resulting attribute.
        token = qi::lexeme[ +qi::alnum % qi::char_('-') ];
    }

    // Entry rule; declared with the whitespace skipper.
    qi::rule<Iterator,bs::utree(),bs::ascii::space_type> start;
    // Single token; declared without a skipper and exposing a std::string.
    qi::rule<Iterator,std::string()> token;
};

template<typename Iterator>
bool parse(Iterator & begin,Iterator end,my_grammar<Iterator> const & grammar)
{
    bs::utree a;
    auto r = qi::phrase_parse(begin,end,grammar,bs::ascii::space,a);
    std::cout<<a<<'\n';
    return r;
}

int main()
{
    std::string input = "i-j-k,l-m-n,p3-14 ,5jhjj-kkk";

    auto it = input.begin();
    my_grammar<decltype(it)>  g;

    if(::parse(it,input.end(),g))
    {
        std::cout<<"parse success\n";
    }
    else
    {
        std::cout<<"parse failed\n";
    }
    std::cout<<"Unparsed input => "<< std::string{it,input.end()}<<'\n';
}

Coliru Link

score 3 · Accepted Answer · answered Jul 27 '16 at 14:44

+qi::alnum % qi::char_('-')

This matches one or more series of alpha-numeric characters, separated by '-'. That's what it does, per the docs. Therefore, you shouldn't expect the hyphen to be part of it.

Use

+(qi::alnum | qi::char_('-'))

instead. Or

+qi::char_("-A-Za-z0-9")

Or in context, even:

token = qi::raw[ qi::lexeme[+(qi::alnum | '-')] ];

Live On Coliru

#define BOOST_SPIRIT_DEBUG
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/support_utree.hpp>

namespace qi = boost::spirit::qi;
namespace bs = boost::spirit;

// Qi grammar for a comma-separated list of tokens made of alphanumerics and
// hyphens. Uses an ASCII whitespace skipper; the synthesized attribute is
// `Result`, which defaults to std::vector<std::string>.
template <typename Iterator, typename Result = std::vector<std::string> > struct my_grammar : public qi::grammar<Iterator, Result(), bs::ascii::space_type> {
    my_grammar() : my_grammar::base_type(start, "MY") {
        // One or more tokens separated by ','.
        start = token % ',';
        // raw[] exposes the matched input text as the attribute, so the hyphens
        // matched by the '-' literal end up in the token string.
        token = qi::raw[ qi::lexeme[+(qi::alnum | '-')] ];
        // Enables debug tracing for both rules (BOOST_SPIRIT_DEBUG is defined
        // before the Spirit headers are included).
        BOOST_SPIRIT_DEBUG_NODES((start)(token))
    }

    // Entry rule; declared with the whitespace skipper.
    qi::rule<Iterator, Result(), bs::ascii::space_type> start;
    // Single token; declared without a skipper and exposing a std::string.
    qi::rule<Iterator, std::string()> token;
};

// Parses [begin, end) with `grammar` (ASCII whitespace skipped) into a vector
// of strings, prints every element followed by a single space, then a newline,
// and returns phrase_parse's result.
template <typename Iterator> bool parse(Iterator &begin, Iterator end, my_grammar<Iterator> const &grammar) {
    std::vector<std::string> tokens;
    const bool matched = qi::phrase_parse(begin, end, grammar, bs::ascii::space, tokens);

    for (std::string const& token : tokens) {
        std::cout << token << " ";
    }
    std::cout << '\n';

    return matched;
}

int main() {
    std::string input = "i-j-k,l-m-n,p3-14 ,5jhjj-kkk";

    auto it = input.begin();
    my_grammar<decltype(it)> g;

    if (::parse(it, input.end(), g)) {
        std::cout << "parse success\n";
    } else {
        std::cout << "parse failed\n";
    }
    std::cout << "Unparsed input => " << std::string{ it, input.end() } << '\n';
}

Prints

i-j-k l-m-n p3-14 5jhjj-kkk 
parse success
Unparsed input =>

With debug enabled:

<start>
  <try>i-j-k,l-m-n,p3-14 ,5</try>
  <token>
    <try>i-j-k,l-m-n,p3-14 ,5</try>
    <success>,l-m-n,p3-14 ,5jhjj-</success>
    <attributes>[[i, -, j, -, k]]</attributes>
  </token>
  <token>
    <try>l-m-n,p3-14 ,5jhjj-k</try>
    <success>,p3-14 ,5jhjj-kkk</success>
    <attributes>[[l, -, m, -, n]]</attributes>
  </token>
  <token>
    <try>p3-14 ,5jhjj-kkk</try>
    <success> ,5jhjj-kkk</success>
    <attributes>[[p, 3, -, 1, 4]]</attributes>
  </token>
  <token>
    <try>5jhjj-kkk</try>
    <success></success>
    <attributes>[[5, j, h, j, j, -, k, k, k]]</attributes>
  </token>
  <success></success>
  <attributes>[[[i, -, j, -, k], [l, -, m, -, n], [p, 3, -, 1, 4], [5, j, h, j, j, -, k, k, k]]]</attributes>
</start>

You can give it a lot of liposuction with Spirit X3 if you can use C++14: **[Live On Coliru](http://coliru.stacked-crooked.com/a/b1539818a9062a78)** — sehe, Jul 27 '16 at 14:56
Problem with above suggestion is that I don't want to accept a string like `abc--xyz`, i.e. `double hyphen` is not allowed. — g-217, Jul 28 '16 at 06:15
No problem. Stackoverflow answers are for the general public though which is why I strive to make them more broadly informative. — sehe, Jul 28 '16 at 06:16
Just use a hybrid, everything you needed is there: `raw [lexeme [+alnum % '-'] ]`. The trick is of course `qi::raw` and I encourage you to read the documentation for it — sehe, Jul 28 '16 at 06:19
http://stackoverflow.com/q/7985661/85371 and http://stackoverflow.com/q/34599506/85371 might also serve as inspiration (picking up on a hint of X/Y problem) — sehe, Jul 28 '16 at 06:26

boost::spirit::qi::lexeme not capturing complete token

1 Answer