Hm, basically 50MB is considered "small" nowadays. With taht small data, you can read the whole file into one std::string
and then do a linear search.
So, the algorithm is:
- Open files and check, if they could be opened
- Read complete file into a
std::string
- Do a linear search for the string "data-permalink=""
- Remember the start position of the permalink
- Search for the closing "
- Use the
std::string
s substr
function to create the output permalink string
- Write this to a file
- Goto 1.
I created a 70MB random test file with random data.
The whole procedure takes less than 1s. Even with slow linear search.
But caveat. You want to parse a HTML file. This will most probably not work, because of potential nested structures. For this you should use existing HTML parsers.
Anyway. Here is one of many possible solutions.
#include <iostream>
#include <fstream>
#include <string>
#include <random>
#include <iterator>
#include <algorithm>
std::string randomSourceCharacters{ " abcdefghijklmnopqrstuvwxyz" };
const std::string sourceFileName{ "r:\\test.txt" };
const std::string linkFileName{ "r:\\links.txt" };
void createRandomData() {
std::random_device randomDevice;
std::mt19937 randomGgenerator(randomDevice());
std::uniform_int_distribution<> randomCharacterDistribution(0, randomSourceCharacters.size() - 1);
std::uniform_int_distribution<> randomLength(10, 30);
if (std::ofstream ofs{ sourceFileName }; ofs) {
for (size_t i{}; i < 1000000; ++i) {
const int prefixLength{ randomLength(randomGgenerator) };
const int linkLength{ randomLength(randomGgenerator) };
const int suffixLength{ randomLength(randomGgenerator) };
for (int k{}; k < prefixLength; ++k)
ofs << randomSourceCharacters[randomCharacterDistribution(randomGgenerator)];
ofs << "data-permalink=\"";
for (int k{}; k < linkLength; ++k)
ofs << randomSourceCharacters[randomCharacterDistribution(randomGgenerator)];
ofs << "\"";
for (int k{}; k < suffixLength; ++k)
ofs << randomSourceCharacters[randomCharacterDistribution(randomGgenerator)];
}
}
else std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for writing\n";
}
int main() {
// Please uncomment if you want to create a file with test data
// createRandomData();
// Open source file for reading and check, if file could be opened
if (std::ifstream ifs{ sourceFileName }; ifs) {
// Open link file for writing and check, if file could be opened
if (std::ofstream ofs{ linkFileName }; ofs) {
// Read the complete 50MB file into a string
std::string data(std::istreambuf_iterator<char>(ifs), {});
const std::string searchString{ "data-permalink=\"" };
const std::string permalinkEndString{ "\"" };
// Do a linear search
for (size_t posBegin{}; posBegin < data.length(); ) {
// Search for the begin of the permalink
if (posBegin = data.find(searchString, posBegin); posBegin != std::string::npos) {
const size_t posStartForEndSearch = posBegin + searchString.length() ;
// Search fo the end of the perma link
if (size_t posEnd = data.find(permalinkEndString, posStartForEndSearch); posEnd != std::string::npos) {
// Output result
const size_t lengthPermalink{ posEnd - posStartForEndSearch };
const std::string output{ data.substr(posStartForEndSearch, lengthPermalink) };
ofs << output << '\n';
posBegin = posEnd + 1;
}
else break;
}
else break;
}
}
else std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
}
else std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
}
Edit
If you need unique links you may store the result in an std::unordered_set
and then output later.
#include <iostream>
#include <fstream>
#include <string>
#include <iterator>
#include <algorithm>
#include <unordered_set>
const std::string sourceFileName{ "r:\\test.txt" };
const std::string linkFileName{ "r:\\links.txt" };
int main() {
// Open source file for reading and check, if file could be opened
if (std::ifstream ifs{ sourceFileName }; ifs) {
// Open link file for writing and check, if file could be opened
if (std::ofstream ofs{ linkFileName }; ofs) {
// Read the complete 50MB file into a string
std::string data(std::istreambuf_iterator<char>(ifs), {});
const std::string searchString{ "data-permalink=\"" };
const std::string permalinkEndString{ "\"" };
// Here we will store unique results
std::unordered_set<std::string> result{};
// Do a linear search
for (size_t posBegin{}; posBegin < data.length(); ) {
// Search for the begin of the permalink
if (posBegin = data.find(searchString, posBegin); posBegin != std::string::npos) {
const size_t posStartForEndSearch = posBegin + searchString.length();
// Search fo the end of the perma link
if (size_t posEnd = data.find(permalinkEndString, posStartForEndSearch); posEnd != std::string::npos) {
// Output result
const size_t lengthPermalink{ posEnd - posStartForEndSearch };
const std::string output{ data.substr(posStartForEndSearch, lengthPermalink) };
result.insert(output);
posBegin = posEnd + 1;
}
else break;
}
else break;
}
for (const std::string& link : result)
ofs << link << '\n';
}
else std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
}
else std::cerr << "\nError: Could not open source file '" << sourceFileName << "' for reading\n";
}