I would like to read the extracted version of this Debian package index (http://ftp.debian.org/debian/dists/stable/main/binary-amd64/Packages.xz), which has ~10⁶ lines, to find out which version a certain package has. I am trying to do that with Node.js, but it takes ~2.5 s on my machine to find a package with this algorithm:
import { createReadStream } from 'fs';
import readline from 'readline';
/**
 * Scans an extracted Debian `Packages` index for a single package stanza.
 *
 * The file is read line by line; stanzas are separated by empty lines. Only
 * the `Package`, `Version` and `Homepage` fields are collected.
 *
 * @param {string} extractedFile - Path of the decompressed `Packages` file.
 * @param {string} packageName - Exact `Package:` value to search for.
 * @returns {Promise<{releases: {version: string}[], homepage: (string|undefined)}|null>}
 *   The version/homepage of the first matching stanza, or `null` when no
 *   stanza matches.
 */
async function probeExtractedPackage(
  extractedFile,
  packageName
) {
  const requiredPackageKeys = ['Package', 'Version', 'Homepage'];
  const CHUNK_SIZE = 10000000; // 10MB
  const input = createReadStream(extractedFile, { highWaterMark: CHUNK_SIZE });
  const rl = readline.createInterface({ input, terminal: false });

  const toResult = (stanza) => ({
    releases: [{ version: stanza.Version }],
    homepage: stanza.Homepage,
  });

  let pd = {};
  try {
    for await (const line of rl) {
      if (line === '') {
        // End of stanza: all fields of the current package have been seen.
        if (pd.Package === packageName) {
          return toResult(pd);
        }
        pd = {};
        continue;
      }
      for (const key of requiredPackageKeys) {
        // Match the field name including its colon so that e.g.
        // `Package-Type:` is not mistaken for `Package:`.
        if (line.startsWith(`${key}:`)) {
          pd[key] = line.substring(key.length + 1).trim();
          break;
        }
      }
    }
    // The file may end without a trailing blank line; check the last stanza.
    if (pd.Package !== undefined && pd.Package === packageName) {
      return toResult(pd);
    }
    return null;
  } finally {
    // Closing the interface does not destroy its input, so release the file
    // descriptor explicitly (matters on the early-return path).
    rl.close();
    input.destroy();
  }
}
Basically, you can see that I'm reading the file line by line using Node.js' readline.createInterface.
In C++, however, the same task takes only ~300 ms on the same machine with this algorithm:
#include <stdio.h>
#include <chrono>
#include <csignal>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
/// Extracts the value part of a `Key: value` line.
///
/// @param line Full line; the caller is expected to have checked that it
///             begins with @p key.
/// @param key  Field name including the trailing colon, e.g. "Version:".
/// @return The text following @p key with surrounding spaces, tabs and
///         carriage returns removed; empty if nothing follows the key or
///         the line is shorter than the key.
std::string getValue(const std::string &line, const std::string &key)
{
    // Guard first: searching from a position past the end would be pointless
    // and the original substr() call threw std::out_of_range in that case.
    if (key.size() >= line.size()) {
        return {};
    }
    // Skip the separator whitespace between "Key:" and the value.
    const std::size_t first = line.find_first_not_of(" \t\r", key.size());
    if (first == std::string::npos) {
        return {};
    }
    // `first` is a non-blank character, so `last >= first` always holds.
    const std::size_t last = line.find_last_not_of(" \t\r");
    return line.substr(first, last - first + 1);
}
/// Benchmark driver: reads a Debian `Packages` file line by line, collects the
/// `Package:`, `Homepage:` and `Version:` fields of each stanza, and prints
/// the elapsed time in milliseconds and the number of lines inspected.
///
/// @param argc Argument count.
/// @param argv Optional argv[1] is the file to read; when absent the
///             original default path "/workspace/thefile.txt" is used.
/// @return 0 on success, 1 if the input file cannot be opened.
int main(int argc, char *argv[])
{
    const char *path = argc > 1 ? argv[1] : "/workspace/thefile.txt";

    const auto start = std::chrono::steady_clock::now();
    std::ifstream infile(path);
    if (!infile) {
        // Without this check a bad path silently reports "Inspected 0".
        std::cerr << "Cannot open " << path << '\n';
        return 1;
    }

    const std::vector<std::string> interestedIn = {"Package:", "Homepage:", "Version:"};
    std::unordered_map<std::string, std::string> container;
    std::string line;
    std::size_t count = 0;

    while (std::getline(infile, line)) {
        ++count;
        if (line.empty()) {
            // Blank line ends a stanza; clear() reuses the bucket array
            // instead of building a fresh map for every package.
            container.clear();
            continue;
        }
        for (const auto &ii : interestedIn) {
            if (line.starts_with(ii)) {
                container[ii] = getValue(line, ii);
                break;
            }
        }
    }

    const auto end = std::chrono::steady_clock::now();
    std::cout << "Took: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << '\n';
    std::cout << "Inspected " << count << '\n';
    return 0;
}
I understand that C++ is more efficient in many respects, but does this really translate to a factor-of-10 improvement? My guess is rather that I'm doing something wrong in Node.js (I just started with it). I'm using Node.js 16.