boost-sprit-lex unifying multiple tokens into a single token in lex differentiated by the id

Question

edit : I have ripped out the lexer as it does not cleanly integrate with Qi and just obfuscates grammars (see answer below).

My lexer looks as follows :

template <typename Lexer>
struct tokens : lex::lexer<Lexer>
{
tokens()
    : left_curly("\"{\""),
    right_curly("\"}\""),
    left_paren("\"(\""),
    right_paren("\")\""),
    colon(":"),
    scolon(";"),
    namespace_("(?i:namespace)"),
    event("(?i:event)"),
    optional("(?i:optional)"),
    required("(?i:required)"),
    repeated("(?i:repeated)"),
    t_int_4("(?i:int4)"),
    t_int_8("(?i:int8)"),
    t_string("(?i:string)"),
    ordinal("\\d+"),
    identifier("\\w+")

{
    using boost::spirit::lex::_val;

    this->self
        = 
        left_curly    
        | right_curly 
        | left_paren
        | right_paren
        | colon         
        | scolon
        | namespace_      
        | event             
        | optional           
        | required          
        | repeated
        | t_int_4
        | t_int_8
        | t_string
        | ordinal             
        | identifier         
        | lex::token_def<>("[ \\t\\n]+")   [lex::_pass = lex::pass_flags::pass_ignore];
}


lex::token_def<lex::omit> left_curly, right_curly, colon, scolon,repeated, left_paren, right_paren;
lex::token_def<lex::omit> namespace_, event, optional, required,t_int_4, t_int_8, t_string;
lex::token_def<boost::uint32_t> ordinal;
lex::token_def<std::string> identifier;

};

I want t_int_4,t_int_8, and t_string to represented by a single token type attributed by an integral type. At the moment my QI grammar has to do the lifting for this and then set the token in a qi::rule semantic action :

 atomic_type = tok.t_int_4     [ _val = RBL_INT4]
                | tok.t_int_8             [ _val = RBL_INT8]
                | tok.t_string            [ _val = RBL_STRING];

I think you can assign to the token value in Lex semantic actions, just like you do in Qi. (Be sure to use an actor_lexer<> model, though and add the datatype for 'RBL_*' to the mpl::vector<> of possible token attribute types) — sehe, Oct 08 '13 at 12:32

score 1 · Accepted Answer · answered Oct 09 '13 at 09:09

From your questions relating to integrating lex into qi grammar, from the last few days. It seems you've identified multiple integration issues. At this point you should ask yourself why you are even trying to integrate a lexer into a PEG grammar. PEG grammars can neatly capture tokenization in situ, and so you don't really gain much from introducing lexer especially considering the lex->qi case where introducing a lexer has shown you that not only do you need hacks to do what is neat in qi in terms of expressing your grammar but also hacks for getting error handling and annotation working properly. Therefore I suggest removing Lex and sticking to Qi.

Here is your grammar with the lexer removed. The ast is in a file of it's own.

#include "ast.hpp"
#define BOOST_SPIRIT_USE_PHOENIX_V3
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix_core.hpp>
#include <boost/range/iterator_range.hpp>
#include <vector>

namespace qi = boost::spirit::qi;
namespace ascii = boost::spirit::ascii;
namespace px = boost::phoenix;

template <typename Iterator>
struct skipper : qi::grammar<Iterator>
{
    skipper() : skipper::base_type(start)
    {
        using boost::spirit::ascii::char_;

        start = ascii::space | qi::lit("//") >> *(ascii::char_ - qi::eol) >> qi::eol;
    }

    qi::rule<Iterator> start;
};

struct error_handler_
{
    typedef void result_type;
    template<typename First, typename Last, typename ErrorPos, typename What>
    void operator()(First f, Last l, ErrorPos e, What w) const
    {
        std::cout << "Expected : " << w << std::endl;
        std::cout << std::string(f,l) << std::endl;
        int i = std::distance(f,e);
        std::cout << std::string(i+1,' ') <<  "^---- here"  << std::endl;
    }
};

px::function<error_handler_> error_handler;

template<typename Iterator>
struct annotation_state
{
  typedef boost::iterator_range<Iterator> annotation_iterator;
  typedef std::vector<annotation_iterator> annotation_iterators;

  annotation_iterators annotations;
};

template<typename Iterator>
struct annotate_
{
    typedef void result_type;

    annotation_state<Iterator> & as;
    annotate_(annotation_state<Iterator> & as) : as(as) {}

    template<typename Val, typename First, typename Last>
    void operator()(Val v, First f, Last l) const
    {
      v.id = as.annotations.size();
      as.annotations.push_back(boost::make_iterator_range(f,l));
      std::cout << std::string(f,l) << std::endl;
    }
};



template <typename Iterator, typename Skipper>
struct grammar : qi::grammar<Iterator,namespace_descriptor(),Skipper>
{
    grammar(annotation_state<Iterator> & as) 
        : grammar::base_type(namespace_descriptor_),
          annotation_state_(as),
          annotate(as)

    {
        using namespace qi;

        atomic_type.add
            ("int4", RBL_INT4)
            ("int8", RBL_INT8)
            ("string", RBL_STRING);

        event_entry_qualifier.add
            ("optional", ENTRY_OPTIONAL)
            ("required", ENTRY_REQUIRED)
            ("repeated", ENTRY_REPEATED);

        oid_ = ordinal  > ':' > identifier;
        ordinal = uint_parser<boost::uint32_t>();
        identifier = +(char_("a","z") | char_("A","Z") | char_('_'));
        type_descriptor_ = atomic_type_ | compound_type_;
        atomic_type_ = no_case[atomic_type] > attr("");

        compound_type_ = 
            no_case[lit("event")] 
            > attr(RBL_EVENT) 
            > '(' 
            > identifier  
            > ')';

        event_entry_ = 
            no_case[event_entry_qualifier] 
            > oid_ 
            > type_descriptor_ 
            > ';';

        event_descriptor_ = 
            no_case[lit("event")] 
            > oid_ 
            > '{' 
            > *(event_entry_) 
            > '}'; 

        namespace_descriptor_ = 
            no_case[lit("namespace")] 
            > identifier 
            > '{' 
            > * (event_descriptor_) 
            > '}'; 

        identifier.name("identifier");
        oid_.name("ordinal-identifier pair");
        ordinal.name("ordinal");

        on_error<fail>(namespace_descriptor_, ::error_handler(_1,_2,_3,_4));
        on_success(oid_, annotate(_val,_1,_3));
        on_success(type_descriptor_, annotate(_val,_1,_3));
        on_success(event_entry_, annotate(_val,_1,_3));
        on_success(event_descriptor_, annotate(_val,_1,_3));
    }

    annotation_state<Iterator> & annotation_state_;
    px::function<annotate_<Iterator> > annotate;

    qi::rule< Iterator, oid()> oid_;
    qi::rule< Iterator, boost::uint32_t()> ordinal;
    qi::rule< Iterator, std::string()> identifier;
    qi::rule< Iterator, type_descriptor()> type_descriptor_;
    qi::rule< Iterator, type_descriptor()> atomic_type_;
    qi::rule< Iterator, type_descriptor()> compound_type_; 

    qi::rule< Iterator, event_entry(), Skipper> event_entry_;
    qi::rule< Iterator, event_descriptor(), Skipper> event_descriptor_;
    qi::rule< Iterator, namespace_descriptor(), Skipper> namespace_descriptor_;

    qi::symbols<char, int> atomic_type;
    qi::symbols<char, int> event_entry_qualifier;
};

int main()
{
    std::string test = "namespace ns { event 1:sihan { OpTIONAL 1:hassan event(haSsan);} }";
    typedef std::string::iterator it;

    it beg = test.begin();
    it end = test.end();

    annotation_state<it> as;
    skipper<it> skip;
    grammar<it, skipper<it> > g(as);


    bool r = qi::phrase_parse(beg,end,g,skip);
    if(r)
        ;
    else
    {
        std::cout << "parsing failed" << std::endl;
    }
}

Ah. I didn't see this answer earlier. I tend to agree (as do the Spirit devs on the mailing list, by the way) — sehe, Oct 26 '13 at 23:24

boost-sprit-lex unifying multiple tokens into a single token in lex differentiated by the id

1 Answers1

Linked