What you want the parser to produce is just the sequence of "word ids" (let's call them atoms).
Only the functor that will fuel your semantic action needs to "know about" the mappings.
I'm going to simplify your data-structure a bit here:
// Numeric identifier handed out to each distinct word ("atom").
using AtomId = std::size_t;

// Non-owning view of a word's characters.
// or boost::string_view
using Atom = std::string_view;

// Two-way lookup table between words and the ids assigned to them.
// Both maps store views only, so the viewed text has to be kept alive by
// whoever owns it.
struct mapping {
    std::map<Atom, AtomId> by_word;  // word -> id
    std::map<AtomId, Atom> by_id;    // id   -> word
};
About That Semantic Action
You can read about the Anatomy Of Spirit Semantic Actions.
If you want to use the synthesized, local, exposed or inherited attributes, you will have to decode the context parameter. The best treatment of this is still this answer: boost spirit semantic action parameters
However, if you've looked at it, you'll find it's not very convenient. Instead, I'd suggest staying in the Phoenix domain (where things like `_1`, `_val`, `_pass`, `_r1` and `_a` magically have the intended meanings, without having to know how to address them in the context).
In that case, you will want your function to be like:
// Functor behind the semantic action: it interns a matched word, returning
// the id already recorded for it or registering it under the next free id.
struct convert_f {
    mapping &m_ref;  // table owned by the caller; updated in place

    using Range = boost::iterator_range<It>;

    // text: source range matched by the parser. It must be non-empty and
    //       refer to contiguous storage, because the string_view is built
    //       from the address of its first character plus its length.
    // Returns the id of the word (freshly assigned when the word is new).
    AtomId operator()(Range const& text) const {
        Atom const atom{&*text.begin(), text.size()};

        // One lookup instead of find + emplace: try_emplace inserts only
        // when the word is new. The size argument is evaluated before the
        // insertion happens, so a new word gets the next sequential id.
        auto const [pos, inserted] =
            m_ref.by_word.try_emplace(atom, m_ref.by_word.size());
        if (inserted)
            m_ref.by_id.emplace(pos->second, pos->first);  // keep reverse map in sync
        return pos->second;
    }
};
// Lazy (Phoenix) wrapper so the functor can be used inside semantic actions.
boost::phoenix::function<convert_f> convert;
You could have made `Range` just `std::string`, but I was thinking ahead: since you read the full file into a vector, you can use a `string_view` based on the raw source iterator range, to avoid copying anything. This also removes the creepy redundancy of storing the same `std::string` inside two maps¹.
¹ but see the new "Bonus" section
Some Varied Problem Points
- BUG: if you expect `+char_` to match only contiguous chars, make sure you wrap it in a `lexeme[]` (so it cannot skip whitespace silently) OR of course make the rule implicitly lexeme (see Boost spirit skipper issues).
- BUG: don't use `+char_` unless you mean to parse /anything/. In your case, you want contiguous stretches of non-space, so at least make it `+qi::graph`.
- BUG: when reading the data from `std::cin` you already skip whitespace, so all input will become one big word again. Use `std::noskipws` first OR use `std::istreambuf_iterator` instead of `std::istream_iterator`. Subtle, I know.
- don't expose your skipper unless you mean for the caller to change it
I probably forgot some more steps, but for now, let's forget about that and just drop a demo:
DEMO
Live On Coliru
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <string_view> // or <boost/utility/string_view.hpp>
#include <iostream>
#include <map>
// Numeric identifier handed out to each distinct word ("atom").
using AtomId = std::size_t;

// Non-owning view of a word's characters.
// or boost::string_view
using Atom = std::string_view;

// Sequence of ids, one per parsed word.
using Atoms = std::vector<AtomId>;

// Two-way lookup table between words and the ids assigned to them.
// Both maps store views only, so the viewed text has to be kept alive by
// whoever owns it.
struct mapping {
    std::map<Atom, AtomId> by_word;  // word -> id
    std::map<AtomId, Atom> by_id;    // id   -> word
};
namespace qi = boost::spirit::qi;
// Grammar that turns whitespace-separated words into a sequence of atom ids,
// recording every newly seen word in the mapping supplied by the caller.
template <typename It>
struct parser : qi::grammar<It, Atoms()> {
    // r: table that receives the words. It is held by reference (inside the
    //    conversion functor), so it has to outlive this parser.
    parser(mapping &r) : parser::base_type(start), convert({r}) {
        using namespace qi;

        // we don't expose the skipper anymore, so we specify it at toplevel
        start = skip(ascii::space)[ *name ];

        // raw[] hands the matched source range to the action, which
        // converts it into the word's id.
        name = raw[ +graph ] [_val = convert(_1)];
    }

  private:
    qi::rule<It, Atoms()> start;
    qi::rule<It, AtomId()> name;

    // Functor behind the semantic action: it interns a matched word,
    // returning the id already recorded for it or registering it under the
    // next free id.
    struct convert_f {
        mapping &m_ref;  // table owned by the caller; updated in place

        using Range = boost::iterator_range<It>;

        // text: source range matched by `name`. It must be non-empty and
        //       refer to contiguous storage, because the string_view is
        //       built from the address of its first character plus its
        //       length.
        // Returns the id of the word (freshly assigned when the word is new).
        AtomId operator()(Range const& text) const {
            Atom const atom{&*text.begin(), text.size()};

            // One lookup instead of find + emplace: try_emplace inserts only
            // when the word is new. The size argument is evaluated before
            // the insertion happens, so a new word gets the next sequential
            // id.
            auto const [pos, inserted] =
                m_ref.by_word.try_emplace(atom, m_ref.by_word.size());
            if (inserted)
                m_ref.by_id.emplace(pos->second, pos->first);  // keep reverse map in sync
            return pos->second;
        }
    };

    // Lazy (Phoenix) wrapper so the functor can be used in the action above.
    boost::phoenix::function<convert_f> convert;
};
// Reads all of standard input, parses it into atoms and prints every unique
// word together with its id. Returns 1 when parsing fails.
int main() {
    using It = std::string::const_iterator;

    // istreambuf_iterator reads the raw characters, whitespace included.
    std::string const input { std::istreambuf_iterator<char>(std::cin), {} };

    mapping sMapping;
    parser<It> const sParser(sMapping);

    bool const parsed = qi::parse(input.begin(), input.end(), sParser);
    if (!parsed) {
        std::cout << "Parse failed\n";
        return 1;
    }

    std::cout << "Parsed " << sMapping.by_id.size() << " unique atoms\n";
    // by_word is an ordered map, so the words come out sorted.
    for (auto const& entry : sMapping.by_word) {
        std::cout << entry.first << "(" << entry.second << ")\n";
    }
    std::cout << "\n";
}
Prints (for the current post text):
Parsed 282 unique atoms
!=(153)
"know(34)
"word(19)
##(63)
&m_ref;(135)
(atom,(161)
(it(152)
(let's(21)
(see(230)
(so(220)
(where(111)
**<kbd>[Live(279)
,(78)
//(50)
/anything/(236)
0.(208)
=(46)
About(64)
Action(67)
Actions](http://boost-spirit.com/home/2010/03/03/the-anatomy-of-semantic-actions-in-qi/).(75)
Atom(48)
Atom>(60)
AtomId(45)
AtomId>(57)
BUG:(209)
Coliru]()</kbd>**(281)
DEMO(278)
However,(92)
I(174)
I'd(105)
I'm(37)
If(76)
In(129)
Instead,(104)
OR(225)
Of(73)
On(280)
Only(25)
Phoenix(109)
Points(207)
Problem(206)
Range(136)
Semantic(66)
Some(204)
Spirit(74)
Still(86)
Subtle,(261)
That(65)
There(0)
This(193)
Use(255)
Varied(205)
What(11)
You(68)
[Anatomy(72)
`+char_`(211)
`+qi::graph`(241)
`Range`(171)
`_1`,(114)
`_a`(119)
`_pass`,(116)
`_r1`(117)
`_val`,(115)
`lexeme[]`(219)
`std::cin`(246)
`std::istream_iterator`.(260)
`std::istreambuf_iterator`(258)
`std::noskipws`(256)
`std::string`(200)
`std::string`,(172)
`string_view`(183)
a(40)
about(71)
about"(35)
action(32)
address(127)
again.(254)
ahead,(177)
all(249)
already(247)
also(194)
and(118)
answer:(90)
anything.(192)
at(96)
atom);(164)
atoms).(24)
atom{&*text.begin(),(142)
attribute(9)
attributes,(81)
auto(149)
auto&(144)
avoid(190)
based(184)
be(132)
become(251)
best(87)
big(252)
binding(4)
bit(41)
boost::iterator_range<It>;(137)
boost::phoenix::function<convert_f>(167)
boost::string_view(52)
but(173)
by_id;(61)
by_word;(58)
call(22)
caller(266)
can(69)
cannot(221)
case,(130)
change(267)
chars,(215)
complexity(42)
const(141)
const&(139)
context(84)
context).(128)
contiguous(214)
convenient.(103)
convert;(168)
convert_f(134)
copying(191)
could(169)
course(226)
creepy(196)
data(244)
decode(83)
demo:(277)
domain(110)
don't(232)
drop(276)
else(157)
expect(210)
expose(263)
exposed(8)
file(180)
find(99)
first(257)
for(265)
forget(275)
forgot(269)
from(245)
fuel(29)
full(179)
function(131)
functor(26)
going(38)
have(82)
having(124)
here:(43)
how(126)
https://stackoverflow.com/questions/17072987/boost-spirit-skipper-issues/17073965#17073965).(231)
https://stackoverflow.com/questions/3066701/boost-spirit-semantic-action-parameters/3067881#3067881(91)
iID(158)
iID);(162)
iID;(165)
ids"(20)
if(93)
implicitly(228)
in(108)
inherited(80)
input(250)
inside(201)
instead(259)
intended(121)
into(181)
is(1)
it(150)
it's(100)
it,(97)
it->second;(156)
iterator(188)
just(16)
know(125)
know.(262)
least(240)
left(145)
left.emplace(160)
left.end())(154)
left.find(atom);(151)
left.size();(159)
let's(274)
lexeme(229)
like(113)
like:(133)
little(2)
local,(79)
looked(95)
m_ref.by_id;(148)
m_ref.by_word;(146)
made(170)
magically(120)
make(216)
map(6)
mapping(54)
mappings.(36)
maps.(203)
match(212)
mean(234)
meanings,(122)
more(271)
needs(33)
non-space,(238)
not(101)
now,(273)
of(18)
on(185)
only(213)
operator()(Range(138)
or(51)
parameter.(85)
parse(235)
parser(14)
probably(268)
produce(15)
range,(189)
raw(186)
read(70)
reading(243)
redundancy(197)
removes(195)
return(155)
right(147)
right.emplace(iID,(163)
rule(227)
same(199)
semantic(31)
sequence(17)
sidestep(39)
silently)(224)
since(178)
size_t;(47)
skip(222)
skipper(264)
so(239)
some(270)
source(187)
stay(107)
std::map<Atom,(56)
std::map<AtomId,(59)
std::string_view;(49)
steps,(272)
storing(198)
stretches(237)
struct(53)
suggest(106)
sure(217)
synthesized(77)
text)(140)
text.size()};(143)
that(27)
the(5)
them(23)
things(112)
thinking(176)
this(89)
to(7)
treatment(88)
two(202)
type.(10)
unless(233)
use(3)
using(44)
vector,(182)
very(102)
want(13)
was(175)
when(242)
whitespace,(248)
whitespaces(223)
will(28)
without(123)
word(253)
wrap(218)
you(12)
you'll(98)
you've(94)
your(30)
{(55)
}(166)
};(62)
Oh, I forgot to actually store the `Atoms`:
Live On Coliru
// Receives the parsed ids, one per word, in input order.
Atoms idlist;
// Passing idlist as the attribute lets the grammar fill it while parsing.
if (qi::parse(input.begin(), input.end(), sParser, idlist)) {
std::cout << "Parsed " << sMapping.by_id.size() << " unique atoms\n";
// Translate every id back into its word through the reverse map.
for (AtomId id : idlist) {
std::cout << "'" << sMapping.by_id.at(id) << "' ";
}
std::cout << "\n";
} else {
// ...
Prints something starting like:
Parsed 282 unique atoms
'There' 'is' 'little' 'use' 'binding' 'the' 'map' 'to' 'the' 'exposed' 'attribute' 'type.' 'What' 'you' 'want' 'the' ...
BONUS
- Using Boost Bimap instead of handrolling the two maps. This keeps things always in sync and is about 15 lines of code shorter:
Live On Coliru
// One bidirectional container (left: word -> id, right: id -> word) in place
// of the two hand-maintained maps.
using mapping = boost::bimap<Atom, AtomId>;
// ...
// Returns the id of the matched word, inserting the word under the next free
// id when it is not in the bimap yet.
AtomId convert_f::operator()(Range const& text) const {
    Atom const atom{&*text.begin(), text.size()};

    // Candidate id for a new word; ignored by insert() if the word exists.
    auto const next_id = m_ref.size();
    auto const outcome = m_ref.left.insert({atom, next_id});
    return outcome.first->second;
}
And then in the usage:
// Report the number of distinct words, then echo the input word by word.
std::cout << "Parsed " << sMapping.size() << " unique atoms\n";
for (AtomId const id : idlist) {
    // The right view of the bimap maps an id back to its word.
    auto const& word = sMapping.right.at(id);
    std::cout << "'" << word << "' ";
}