The question is: What is fast?
In the demo below, I create a file with 591,000 lines. Its size is 74 MB.
Then I set a bigger input buffer for the std::ifstream, read all lines, parse them, and copy the first 3 entries of each line into the resulting vector. I ignore the rest.
To prevent the result from being optimized away, I print about 50 lines of output.
VS2019, C++17, Release Mode, all optimizations on.
Result: ~2.7s for reading and parsing all lines on my machine. (I must admit that I have 4 SSDs in RAID 0 via PCIe)
#include <algorithm>
#include <array>
#include <chrono>
#include <fstream>
#include <iostream>
#include <iterator>
#include <regex>
#include <string>
#include <vector>
// Benchmark demo: optionally writes a large semicolon-separated text file
// (disabled via `#if 0`), then measures how long it takes to read the file
// line by line and keep the first three fields of every line.
// A sample of roughly 50 stored records is printed at the end so that the
// parsing work has an observable effect.
// Returns 0 in all cases; a file that cannot be opened is reported on stderr.
int main() {
    // Put whatever filename you want
    static const std::string fileName{ "r:\\big.txt" };

    // steady_clock is monotonic, so measured intervals are not disturbed by
    // adjustments of the system wall clock.
    using Clock = std::chrono::steady_clock;

    // Start time measurement
    auto start = Clock::now();
#if 0
    // Write file with 591000 lines
    if (std::ofstream ofs(fileName); ofs) {
        for (size_t i = 0U; i < 591000U; ++i) {
            ofs << "invoiceNo_" << i << ";"
                << "stockCode_" << i << ";"
                << "description_" << i << ";"
                << "Field_4_" << i << ";"
                << "Field_5_" << i << ";"
                << "Field_6_" << i << ";"
                << "Field_7_" << i << ";"
                << "Field_8_" << i << "\n";
        }
    }
#endif
    auto end = Clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    // How long did it take?
    std::cout << "Time for writing the file: " << elapsed.count() << " ms\n";

    // We are just interested in 3 fields
    constexpr size_t NumberOfNeededFields = 3U;
    // We expect 591000 lines, give a little bit more
    constexpr size_t NumberOfExpectedLinesInFile = 600000U;
    // We will create a bigger input buffer for our stream
    constexpr size_t ifStreamBufferSize = 100000U;
    // static: keeps the large buffer off the stack and guarantees that it
    // outlives the stream that uses it.
    static char buffer[ifStreamBufferSize];
    // The delimiter for our csv
    static const std::regex delimiter{ ";" };

    // Main working variables
    using Fields3 = std::array<std::string, NumberOfNeededFields>;
    static std::vector<Fields3> fields{};
    // Reserve space to avoid reallocation
    fields.reserve(NumberOfExpectedLinesInFile);

    // Start timer
    start = Clock::now();
    {
        std::ifstream ifs;
        // Install the bigger buffer BEFORE opening the file. Whether a later
        // call is honoured is implementation-defined (some standard libraries
        // silently ignore it once the file is open).
        ifs.rdbuf()->pubsetbuf(buffer, ifStreamBufferSize);
        ifs.open(fileName);

        // Check, if the file is open
        if (ifs) {
            const std::sregex_token_iterator noMoreTokens{};
            // Read all lines
            for (std::string line{}; std::getline(ifs, line); ) {
                // Parse string: take at most NumberOfNeededFields tokens and
                // never step past the end of the token sequence. Fields that
                // are missing in a short line stay empty.
                Fields3 fields3{};
                std::sregex_token_iterator token(line.begin(), line.end(), delimiter, -1);
                for (size_t k = 0U; k < NumberOfNeededFields && token != noMoreTokens; ++k, ++token) {
                    fields3[k] = token->str();
                }
                // Store resulting 3 fields
                fields.push_back(std::move(fields3));
            }
        }
        else {
            std::cerr << "Could not open file: " << fileName << "\n";
        }
    } // The stream is closed here, before the timer is stopped
    end = Clock::now();
    elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
    std::cout << "Time for parsing the file: " << elapsed.count() << " ms\n";

    // Show some result: about 50 evenly spaced records. The step must be at
    // least 1, otherwise fewer than 50 records would yield a step of 0 and
    // the loop would never terminate.
    const size_t step = std::max<size_t>(1U, fields.size() / 50U);
    for (size_t i = 0U; i < fields.size(); i += step) {
        std::copy_n(fields[i].begin(), NumberOfNeededFields, std::ostream_iterator<std::string>(std::cout, " "));
        std::cout << "\n";
    }
    return 0;
}