I am doing a modified version of collecting word co-occurrences, so I wrote my own javascript, and I am tracking the occurrences in three objects. However, once the objects get large (~8 million, 3 million, and 172000) a function that took 5 seconds per 100000 sentences now takes minutes to do one sentence with 30 words (30 tokens). I am nowhere near my RAM cap (I have 12 more GBs of RAM it could be using, and the program is only using 2.2GB). Using Node.js v17.3.1.
Why does my function take so long when the objects get bigger (even though the sentences remain the same length)? Should I be using a different data structure instead of JavaScript's default object, or is there a way to improve the speed of reading from and writing to these objects when they are so big?
Code:
// Tallies per-word counts and pairwise co-occurrence counts for one token list.
//
// The counters are created without a prototype so that tokens such as
// "constructor", "toString" or "__proto__" behave as ordinary keys instead of
// colliding with members inherited from Object.prototype. Reading and writing
// with counts[key] works exactly as it does on a plain object.
//
// co-occurrence counts keyed "<token>-<token>"; which table is used depends on `condition`
let posCounts = Object.create(null);
let negCounts = Object.create(null);
// the number of times each word occurs
let wordCounts = Object.create(null);
let tokens = getTokens(); // placeholder: some function that gets tokens
for (let k = 0; k < tokens.length; k++) {
  const first = tokens[k];
  // count word occurrences
  wordCounts[first] = (wordCounts[first] ?? 0) + 1;

  // Starting at k + 1 visits every pair of positions once and never pairs a
  // position with itself, so no explicit self-co-occurrence check is needed.
  for (let tok = k + 1; tok < tokens.length; tok++) {
    const second = tokens[tok];

    // Check which form of the co-occurrence already exists in either table:
    // keep "first-second" if it is known, otherwise use "second-first".
    // NOTE(review): keys are ambiguous if a token itself contains "-".
    const forwardKey = `${first}-${second}`;
    const pairKey =
      forwardKey in posCounts || forwardKey in negCounts
        ? forwardKey
        : `${second}-${first}`;

    // condition set before this block of code
    const counts = condition ? posCounts : negCounts;
    counts[pairKey] = (counts[pairKey] ?? 0) + 1;
  }
}
Update: I've tried increasing the heap size via node train_matrices.js --max-old-space-size=12288
and node train_matrices.js --max_old_space_size=12288
(underscore instead of dash), and that didn't work either.