4

I have a parser in which I want to capture certain types of whitespace as enum values and preserve the spaces for the "text" values.

My whitespace parser is pretty basic (Note: I've only added the pipe character here for test/dev purposes):

// Symbol table that maps each special token to its Whitespace enum value.
// The pipe entry is only there for test/dev purposes.
struct whitespace_p : x3::symbols<Whitespace>
{
    whitespace_p()
    {
        // Register every recognized token together with the value it yields.
        add
        ("\n", Whitespace::NEWLINE)
        ("\t", Whitespace::TAB)
        ("|", Whitespace::PIPE)
        ;
    }
} whitespace; // the parser instance referenced by the grammar

And I want to capture everything either into my enum or into std::strings:

// One parsed item: either a Whitespace enum value or a run of text.
struct Element : x3::variant<Whitespace, std::string>
{
    // Re-use the variant's constructors and assignment operators.
    using base_type::base_type;
    using base_type::operator=;
};

And to parse my input I use something like this:

// Parses a single Element: a run of one or more characters that are not
// whitespace symbols (matched inside no_skip), or else one whitespace symbol.
const auto contentParser
    = x3::rule<class ContentParserID, Element, true> { "contentParser" }
    = x3::no_skip[+(x3::char_ - (whitespace))]
        | whitespace // note: this alternative is not wrapped in no_skip
    ;

// The parse result: the elements in the order they appear in the input.
using Elements = std::vector<Element>;
// Parses one or more consecutive elements into an Elements vector.
// The rule carries its own tag type (ElementsParserID): X3 identifies a rule
// by its ID, so the tag must not be shared with contentParser.
const auto elementsParser
    = x3::rule<class ElementsParserID, Elements, true> { "elementsParser" }
    = contentParser >> *(contentParser);

The problem though is that the parser stops at the first tab or newline it hits.

Code: http://coliru.stacked-crooked.com/a/d2cda4ce721279a4

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iostream>

namespace x3 = boost::spirit::x3;

// Tokens that are reported as enum values rather than as text.
enum Whitespace
{
    NEWLINE,
    TAB,
    PIPE // not real whitespace; added for test/dev purposes
};

// Symbol table: each special token is paired with the Whitespace value it produces.
struct whitespace_p : x3::symbols<Whitespace>
{
    whitespace_p()
    {
        // One registration per statement.
        add("\n", Whitespace::NEWLINE);
        add("\t", Whitespace::TAB);
        add("|", Whitespace::PIPE);
    }
} whitespace;

// One parsed item: either a Whitespace enum value or a run of text.
struct Element : x3::variant<Whitespace, std::string>
{
    // Re-use the variant's constructors and assignment operators.
    using base_type::base_type;
    using base_type::operator=;
};

// Parses a single Element: a run of one or more characters that are not
// whitespace symbols (matched inside no_skip), or else one whitespace symbol.
const auto contentParser
    = x3::rule<class ContentParserID, Element, true> { "contentParser" }
    = x3::no_skip[+(x3::char_ - (whitespace))]
        | whitespace // note: this alternative is not wrapped in no_skip
    ;

// The parse result: the elements in the order they appear in the input.
using Elements = std::vector<Element>;
// Parses one or more consecutive elements into an Elements vector.
// The rule carries its own tag type (ElementsParserID): X3 identifies a rule
// by its ID, so the tag must not be shared with contentParser.
const auto elementsParser
    = x3::rule<class ElementsParserID, Elements, true> { "elementsParser" }
    = contentParser >> *(contentParser);

// Visitor that turns the active alternative of an Element into printable text.
struct print_visitor
    : public boost::static_visitor<std::string>
{
    // NEWLINE and PIPE get their own names; every other value is reported as "tab".
    std::string operator()(const Whitespace& ws) const
    {
        switch (ws)
        {
            case Whitespace::NEWLINE:
                return "newline";
            case Whitespace::PIPE:
                return "pipe";
            default:
                return "tab";
        }
    }

    // Text is returned unchanged.
    std::string operator()(const std::string& content) const
    {
        return content;
    }
};

// Parses a fixed sample string and prints each element, or a diagnostic when
// parsing fails or leaves part of the input unconsumed.
int main()
{
    const std::string text = "Hello \n World";
    auto cursor = text.cbegin();
    const auto finish = text.cend();

    Elements elements;

    // phrase_parse advances `cursor` to the position where parsing stopped.
    const bool ok =
        phrase_parse(cursor, finish, elementsParser, x3::ascii::space, elements);

    if (!ok)
    {
        std::cout << "failed to parse!\n";
        return 0;
    }

    if (cursor != finish)
    {
        // Show the remainder of the input that was not consumed.
        std::cout << "unparsed: " << std::string{cursor, finish} << '\n';
        return 0;
    }

    for (const auto& element : elements)
    {
        std::cout << "element: [" << boost::apply_visitor(print_visitor{}, element) << "]\n";
    }
    return 0;
}

If I parse the text Hello | World then I get the results I'm expecting. But if I instead use Hello \n World the whitespace after the \n is swallowed and the World is never parsed. Ideally I'd like to see this output:

element: [Hello ]
element: [newline]
element: [ World]

How can I accomplish this? Thank you!

sehe
  • 374,641
  • 47
  • 450
  • 633
Addy
  • 2,414
  • 1
  • 23
  • 43
  • I was able to get this work if I wrapped the `| whitespace` in the `contentParser` definition with `x3::no_skip`. In other words: `| x3::no_skip[whitespace]`. I'm not entirely sure I understand why that works, but it's functional. I'd love to see an expert's solution and insight. – Addy Jun 14 '20 at 23:01

2 Answers

3

My go-to reference on skipper issues: Boost spirit skipper issues

In this case you made it work with no_skip[]. That's correct.

no_skip is like lexeme, except that it doesn't pre-skip. From the source (boost/spirit/home/x3/directive/no_skip.hpp):

// same as lexeme[], but does not pre-skip

Alternative Take

In your case I would flip the logic: just adjust the skipper itself.

Also, don't supply the skipper with phrase_parse, because your grammar is highly sensitive to the correct value of the skipper.

Your whole grammar could be:

// The skipper discards ordinary blanks only; the symbols in `whitespace` stay
// significant. lexeme[] keeps each word contiguous (otherwise blanks between
// words are skipped and the words are merged into one string), and
// subtracting `whitespace` ends a word at '|', which is itself a graphic
// character and would otherwise never be reported as PIPE.
const auto p  = x3::skip(x3::space - whitespace) [
        *(x3::lexeme[+(x3::graph - whitespace)] | whitespace)
    ];

Here's a Live Demo On Coliru

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iostream>
#include <iomanip>

namespace x3 = boost::spirit::x3;

// Tokens that are reported as enum values rather than as text.
enum Whitespace { NEWLINE, TAB, PIPE };

// Symbol table that maps each special token to its Whitespace enum value.
struct whitespace_p : x3::symbols<Whitespace> {
    whitespace_p() {
        // Register every recognized token together with the value it yields.
        add
            ("\n", Whitespace::NEWLINE)
            ("\t", Whitespace::TAB)
            ("|", Whitespace::PIPE)
        ;
    }
} static const whitespace; // the parser instance referenced by the grammar

// One parsed item: either a Whitespace enum value or a run of text.
struct Element : x3::variant<Whitespace, std::string> {
    // Re-use the variant's constructors and assignment operators.
    using base_type::base_type;
    using base_type::operator=;
};

// The parse result: the elements in the order they appear in the input.
using Elements = std::vector<Element>;

// Streams an Element: whitespace tokens as bracketed labels, text as a quoted string.
static inline std::ostream& operator<<(std::ostream& os, Element const& el) {
    struct print_visitor {
        std::ostream& os;

        // Picks the label for the token; values outside the enum print "?".
        auto& operator()(Whitespace ws) const {
            char const* label = "?";
            if (ws == Whitespace::NEWLINE) {
                label = "[newline]";
            } else if (ws == Whitespace::PIPE) {
                label = "[pipe]";
            } else if (ws == Whitespace::TAB) {
                label = "[tab]";
            }
            return os << label;
        }

        // Text is written with std::quoted so that surrounding blanks stay visible.
        auto& operator()(const std::string& content) const {
            return os << std::quoted(content);
        }
    } vis{os};
    return boost::apply_visitor(vis, el);
}

// Parses a fixed sample string, prints one element per line and reports any
// input that was left unconsumed.
int main() {
    std::string const text = "\tHello \n World";
    auto start = begin(text), stop = end(text);

    // The skipper discards ordinary blanks only; the symbols in `whitespace`
    // stay significant. lexeme[] keeps each word contiguous (otherwise blanks
    // between words are skipped and the words are merged), and subtracting
    // `whitespace` ends a word at '|', which is itself a graphic character.
    const auto p  = x3::skip(x3::space - whitespace) [
            *(x3::lexeme[+(x3::graph - whitespace)] | whitespace)
        ];

    Elements elements;

    if (!parse(start, stop, p, elements)) {
        std::cout << "failed to parse!\n";
    } else {
        // A plain loop avoids std::copy/std::ostream_iterator, whose headers
        // (<algorithm>, <iterator>) this listing never includes.
        for (auto const& el : elements) {
            std::cout << el << "\n";
        }
    }

    if (start != stop) {
        std::cout << "unparsed: " << std::quoted(std::string(start, stop)) << '\n';
    }
}

Prints

[tab]
"Hello"
[newline]
"World"

Even Simpler?

It doesn't seem like you'd need any skipper here at all. Why not:

// No skipper: a run of characters other than '\n', '\t' and '|' becomes text; otherwise `whitespace` is tried.
const auto p  = *(+~x3::char_("\n\t|") | whitespace);

While we're at it, there's no need for symbols to map enums:

// Variant over char and std::string: the special character itself is stored
// instead of an enum value.
struct Element : x3::variant<char, std::string> {
    // ...
};
// The parse result: the elements in the order they appear in the input.
using Elements = std::vector<Element>;

And then

// One element: a run of characters other than '\n', '\t' and '|', or else any single character.
const auto p
    = x3::rule<struct ID, Element> {}
    = +~x3::char_("\n\t|") | x3::char_;

Live On Coliru

#include <boost/spirit/home/x3.hpp>
#include <boost/spirit/home/x3/support/ast/variant.hpp>
#include <iostream>
#include <iomanip>

namespace x3 = boost::spirit::x3;

// One parsed item: either a single special character or a run of text.
struct Element : x3::variant<char, std::string> {
    using variant = x3::variant<char, std::string>;
    using variant::variant;
    using variant::operator=;

    // Streams special characters as bracketed labels and text as a quoted string.
    friend std::ostream& operator<<(std::ostream& os, Element const& el) {
        struct print_visitor {
            std::ostream& os;

            // Each label names the character it stands for ('\t' is the tab,
            // '|' is the pipe); any other character prints "?".
            auto& operator()(char ws) const {
                switch(ws) {
                    case '\n': return os << "[newline]";
                    case '\t': return os << "[tab]";
                    case '|': return os << "[pipe]";
                }
                return os << "?";
            }

            // Text is written with std::quoted so that surrounding blanks stay visible.
            auto& operator()(const std::string& str) const { return os << std::quoted(str); }
        } vis{os};
        return boost::apply_visitor(vis, el);
    }
};
using Elements = std::vector<Element>;

// Parses a fixed sample string without a skipper, prints one element per line
// and reports any input that was left unconsumed.
int main() {
    std::string const text = "\tHello \n World";
    auto start = begin(text);
    auto const stop = end(text);

    Elements elements;
    // One element: a run of characters other than '\n', '\t' and '|', or else
    // any single character.
    const auto p
        = x3::rule<struct ID, Element> {}
        = +~x3::char_("\n\t|") | x3::char_;

    if (!parse(start, stop, *p, elements)) {
        std::cout << "failed to parse!\n";
    } else {
        // A plain loop avoids std::copy/std::ostream_iterator, whose headers
        // (<algorithm>, <iterator>) this listing never includes.
        for (auto const& el : elements) {
            std::cout << el << "\n";
        }
    }

    if (start != stop) {
        std::cout << "unparsed: " << std::quoted(std::string(start, stop)) << '\n';
    }
}

Prints

[pipe]
"Hello "
[newline]
" World"
sehe
  • 374,641
  • 47
  • 450
  • 633
  • Thank you @sehe. Your answers are always easy to follow and full of insight. – Addy Jun 15 '20 at 00:26
  • Hilariously I mismatched the case labels (e.g. `\t` -> `[pipe]`) in the last example. I'm gonna leave it as the idea was clear ;) – sehe Jun 15 '20 at 01:15
1

The problem is that you are calling phrase_parse instead of parse (line 76 of the linked code). Try something like

// This call passes no skipper argument, unlike the phrase_parse call it replaces.
bool result =
        parse(start, stop, elementsParser, elements);

Your phrase_parse call was instructed to skip spaces, which is not what you want.

Look at the first answer to How to use boost::spirit to parse a sequence of words into a vector?

David Kennedy
  • 370
  • 2
  • 12
  • 1
    Note that OP might want to skip spaces (just not tabs and newlines). There may be other space characters as well, depending on the encoding/[locale](https://en.cppreference.com/w/cpp/locale/isspace). There's some subtle interaction with pre-skipping when parsing inside no_skip with an active skipper. – sehe Jun 15 '20 at 01:12