I'm trying to write a parser for STEP-files in javascript, that will be used primarily in the browser, but also in Node, and for now I use Node to debug.
It's going quite well, and it parses along for a while. But when I get to really large files with millions of lines (around 200 MB and more), it chokes and eventually crashes with a "JavaScript heap out of memory" error!
The files look something like this:
...
#10=ORGANIZATION('O0001','LKSoft','company');
#11=PRODUCT_DEFINITION_CONTEXT('part definition',#12,'manufacturing');
#12=APPLICATION_CONTEXT('mechanical design');
#13=APPLICATION_PROTOCOL_DEFINITION('','automotive_design',2003,#12);
#14=PRODUCT_DEFINITION('0',$,#15,#11);
#15=PRODUCT_DEFINITION_FORMATION('1',$,#16);
#16=PRODUCT('A0001','Test Part 1','',(#18));
#17=PRODUCT_RELATED_PRODUCT_CATEGORY('part',$,(#16));
#18=PRODUCT_CONTEXT('',#12,'');
...
#3197182=APPLIED_ORGANIZATION_ASSIGNMENT(#10,#20,(#16));
#3197183=ORGANIZATION_ROLE('id owner');
The files are a bit irregular, so I'm writing a fairly blunt parser that works character by character:
const fs = require('fs');
/**
 * Blunt, character-driven parser for STEP-style rows of the form
 * `#<id> = <ENTITY_NAME>(<properties>);`.
 *
 * Constructing an instance parses `data` immediately. The result is kept on
 * `this.rows`, a plain object keyed by the row id, where each value is
 * `{c: <entity name>, p: <array of lexed properties>}`.
 */
class bigObject {
  /**
   * @param {string|Object} data - File content; non-strings are converted
   *   with `toString()` (e.g. the Buffer handed over by `fs.readFile`).
   */
  constructor(data) {
    this.parse(data);
  }

  /**
   * Splits the property list of one row (the text between the outer
   * parentheses) into a flat array of values.
   *
   * Produced values:
   *  - `#123`       -> the number 123 (NaN when no digits follow the `#`)
   *  - `$`          -> '' (empty property)
   *  - `'text'`     -> the raw text between the quotes; doubled quotes (`''`)
   *                    are kept as written, not unescaped
   *  - `.AREAUNIT.` -> the run of letters, `_` and `.` as a string
   *  - `-1000.00`   -> the run of digits, `.`, `-` and `E` as a string
   * Every other character (parentheses, commas, blanks, ...) is skipped, so
   * nested lists are flattened into the result.
   *
   * @param {string} row - Property list of a single row.
   * @returns {Array<string|number>} The lexed properties in source order.
   */
  propertyLexer(row) {
    const refNrRE = /[-0-9]/;
    const floatNumberRE = /[.\-0-9E]/;
    const charsRE = /[_a-zA-Z.]/;
    const rowLen = row.length;
    const lexedRow = [];

    // Index just past the run of characters matching `charRE` that starts at
    // `from`. The bound is checked first so `row[end]` is never `undefined`
    // (which would be coerced to the text "undefined" by `test`).
    const runEnd = (charRE, from) => {
      let end = from;
      while (end < rowLen && charRE.test(row[end])) {
        end++;
      }
      return end;
    };

    let current = 0;
    while (current < rowLen) {
      const char = row[current];

      if (char === '#') {
        // Reference to another row, e.g. #32123
        const end = runEnd(refNrRE, current + 1);
        lexedRow.push(Number.parseInt(row.slice(current + 1, end), 10));
        current = end;
      } else if (char === '$') {
        // Empty property
        lexedRow.push('');
        current++;
      } else if (char === ',') {
        // Separator: move on to the next property
        current++;
      } else if (char === "'") {
        // Quoted string, e.g. 'Comments, blabla (more comments)'.
        // Scan in place for the closing quote instead of copying the rest of
        // the row for a regex; a doubled quote ('') is an escaped quote and
        // does not end the string.
        const start = current + 1;
        let end = row.indexOf("'", start);
        while (end !== -1 && row[end + 1] === "'") {
          end = row.indexOf("'", end + 2);
        }
        if (end === -1) {
          // Unterminated string: take the remainder of the row rather than
          // crashing on a missing match.
          end = rowLen;
        }
        lexedRow.push(row.slice(start, end));
        current = end + 1;
      } else if (charsRE.test(char)) {
        // Enumeration / keyword, e.g. .AREAUNIT.
        const end = runEnd(charsRE, current);
        lexedRow.push(row.slice(current, end));
        current = end;
      } else if (floatNumberRE.test(char)) {
        // Number, e.g. -1000.00 (kept as text, not converted)
        const end = runEnd(floatNumberRE, current);
        lexedRow.push(row.slice(current, end));
        current = end;
      } else {
        // Skip everything else for now
        current++;
      }
    }
    return lexedRow;
  }

  /**
   * Parses every `#id=NAME(...);` row found in `data`.
   *
   * Rows are consumed one at a time with a single global regex, so no
   * intermediate array holding every row string is built. When an id occurs
   * more than once, the last occurrence wins.
   *
   * NOTE: a row ends at the first `);`, even if that sequence sits inside a
   * quoted string.
   *
   * @param {string|Object} data - File content; non-strings are converted
   *   with `toString()`.
   * @returns {Object} Map of row id -> `{c, p}`; also stored on `this.rows`.
   *   Empty when `data` contains no rows.
   * @throws {TypeError} When `data` is not a string and cannot be converted.
   */
  parse(data) {
    if (typeof data !== 'string') {
      try {
        data = data.toString();
      } catch (e) {
        throw new TypeError(`Indata not string or not able to convert to string: ${e}`);
      }
    }

    // Groups: 1 = row id, 2 = entity name, 3 = raw property list.
    // `_` is part of the name class so STEP names such as
    // PRODUCT_DEFINITION_CONTEXT are recognised.
    const stepRowRE = /#(\d+)\s*=\s*([a-zA-Z0-9_]+)\s*\(([^)]*(?:\)(?!;)[^)]*)*)\);/g;
    const lineBreaksRE = /[\r\n]+/g;

    const rowsFromFile = {};
    let rowNr = 0;
    let matching = stepRowRE.exec(data);
    while (matching !== null) {
      const entry = {
        c: matching[2],
        p: this.propertyLexer(matching[3].replace(lineBreaksRE, '')),
      };
      rowsFromFile[matching[1]] = entry;

      // Progress output for very large files.
      if (rowNr % 200000 === 0) {
        console.log(`${rowNr}::${JSON.stringify(entry)}`);
      }
      rowNr++;
      matching = stepRowRE.exec(data);
    }

    this.rows = rowsFromFile;
    return rowsFromFile;
  }
}
//// Start here ////
// Read the whole file into memory and hand it to the parser.
fs.readFile('./ifc-files/A-40-V-00252.ifc', (err, data) => {
  // On a failed read `data` is undefined; report the real I/O error here
  // instead of letting the parser fail on the missing content.
  if (err) {
    console.error(`Could not read STEP file: ${err.message}`);
    process.exitCode = 1;
    return;
  }
  const newObject = new bigObject(data);
});
I get this error:
<--- Last few GCs --->
[11348:000002D4A6E72260] 81407 ms: Mark-sweep 1403.2 (1458.8) ->
1403.2 (1458.8) MB, 2428.1 / 0.0 ms allocation failure GC in old space requested [11348:000002D4A6E72260] 83836 ms: Mark-sweep
1403.2 (1458.8) -> 1403.2 (1428.8) MB, 2429.0 / 0.0 ms last resort gc [11348:000002D4A6E72260] 86282 ms: Mark-sweep 1403.2 (1428.8) ->
1403.1 (1428.8) MB, 2446.3 / 0.0 ms last resort gc
<--- JS stacktrace --->
==== JS stack trace =========================================
Security context: 00000384656C0D51 <JS Object>
1: parse [C:\Users\user\Projects\parser\index.js:~95] [pc=000000525FB71B18](this=000001EE5F96DE19 <a bigObject with map 0000036221B1B7A9>,data=0000034357F04201 <Very long string[190322237]>)
2: new bigObject [C:\Users\user\Projects\parser\index.js:8] [pc=000000525FB48737](this=000001EE5F96DE19 <a bigObject with map 0000036221B1B7...
FATAL ERROR: CALL_AND_RETRY_LAST Allocation failed - JavaScript heap out of memory
I've been trying to find the reason for this for days now but I can't see anything that looks like a memory leak or infinite loop.
My machine has 16 GB of memory and should easily be able to handle a 200 MB file, many times over!
Is there anybody who can help me with my problem? Thanks!
EDIT: Everything works just fine if I use Firefox or even Edge(!), and also when I use the --max_old_space_size=4096
flag to increase the memory available to Chrome/Node (V8). But regular users are unlikely to do this, so I still need to make the parser more memory-efficient, and I have no clue how.
EDIT2: It's not the JSON.stringify call or the fact that I read the whole file at once that causes the problem. Reading the whole file will become a problem if I try to read an even larger file than I currently do, but for now the cause seems to be that I'm storing too much in memory, or something similar.