I am trying to learn boost and some template programming in C++ but I am really having such an hard time to implement a simple class for iterating over Gzip files using mapped_file_source
. I essentially have an edge list in TSV format such that each line in the gzip file is of the format: <src:int><tab><dst:int>
. What I want is to implement a gz_file
class that exposes a begin and end iterator over which I can get an edge (std::pair<int,int>
) each time I query the iterator.
The problem is the copy constructor which is broken since I cannot known where I am positioned in the gzip file.
Here is the code I have so far:
class gz_graph {
public:
gz_graph(const char * filename)
{
m_file.open(filename);
if (!m_file.is_open()) {
throw std::runtime_error("Error opening file");
}
m_data = m_file.data();
m_data_size = m_file.size() / sizeof(m_data[0]);
auto ret = posix_madvise((void*)m_data, m_data_size, POSIX_MADV_SEQUENTIAL);
}
class iterator;
iterator begin() const
{
return iterator(this, false);
}
iterator end() const
{
return iterator(this, true);
}
class iterator : public std::iterator<std::forward_iterator_tag, Edge> {
public:
iterator(gz_graph const * ref, bool consumed)
: m_ref(ref),
m_cur_edge(-1, -1),
m_consumed(consumed)
{
if (!consumed) {
initialize();
advance();
}
}
iterator(const iterator& x)
: m_ref(x.m_ref),
m_cur_edge(x.m_cur_edge)
{
if (!x.m_consumed) {
initialize();
advance();
}
std::cout << "Copy constructor" << std::endl;
}
value_type const& operator*() const
{
return m_cur_edge;
}
value_type const* operator->() const
{
return &m_cur_edge;
}
iterator& operator++()
{
advance();
return *this;
}
bool operator==(iterator const& other) const
{
assert(m_ref == other.m_ref);
return m_cur_edge == other.m_cur_edge;
}
bool operator!=(iterator const& other) const
{
return !(*this == other);
}
private:
void initialize()
{
boost::iostreams::array_source source(m_ref->m_data, m_ref->m_data_size);
m_in.push(boost::iostreams::gzip_decompressor());
m_in.push(source);
}
void advance()
{
std::string line_str;
if (!getline(m_in, line_str)) {
m_consumed = true;
m_cur_edge = Edge(-1, -1);
return;
}
std::vector<std::string> strs;
boost::split(strs, line_str, boost::is_any_of("\t"));
if (strs.size() != 2)
throw std::runtime_error("Required 2 fields per line");
int src = boost::lexical_cast<int>(strs.at(0));
int dst = boost::lexical_cast<int>(strs.at(1));
m_cur_edge = Edge(src, dst);
// std::cout << "Read line " << line_str << std::endl;
}
gz_graph const * m_ref;
Edge m_cur_edge;
boost::iostreams::filtering_istream m_in;
bool m_consumed;
};
private:
boost::iostreams::mapped_file_source m_file;
char const* m_data;
size_t m_data_size;
};