I am trying to parse a PDF with more than 300 pages. I am using the pdf-parse npm package. The PDF has 300 pages, but my application crashes while parsing it. My question is: is there a way to parse one page at a time? Below is the code I have tried.
/**
 * Page render callback: extracts the text of a single page.
 *
 * Requests the page's text content and concatenates the `str` field of every
 * returned item, with no separator between items.
 *
 * @param {object} pageData - Page object exposing `getTextContent(options)`,
 *   which returns a thenable resolving to `{ items: [{ str }, ...] }`.
 * @returns {Promise<string>} Resolves to the concatenated page text.
 */
function render_page(pageData) {
  // Options forwarded to getTextContent; see https://mozilla.github.io/pdf.js/
  const textContentOptions = {
    // When enabled, all whitespace is replaced with standard spaces (0x20). The default value is `false`.
    normalizeWhitespace: false,
    // When enabled, same-line TextItems are not combined. The default value is `false`.
    disableCombineTextItems: false,
  };

  // Gather each item's string in order, then glue them together with no separator.
  const joinItemStrings = ({ items }) => {
    const pieces = [];
    for (const item of items) {
      pieces.push(item.str);
    }
    return pieces.join(''); // value page text
  };

  return pageData.getTextContent(textContentOptions).then(joinItemStrings);
}
//textContent.items.map
//.map(function (s) { return s.str; }).join('{newline}'); // value page text
// Load the entire PDF file into memory synchronously; the buffer is handed to pdf() below.
let dataBuffer = fs.readFileSync('male.pdf');
const options = {
  // internal page parser callback
  // you can set this option, if you need another format except raw text
  pagerender: render_page,
  // max page number to parse
  max: 4,
  // check https://mozilla.github.io/pdf.js/getting_started/
  version: 'v1.10.100',
};
pdf(dataBuffer, options)
  .then(function (data) {
    res.send(data);
  })
  .catch(function (err) {
    // Without a rejection handler, a parse failure surfaces as an unhandled
    // promise rejection and the request is never answered.
    console.error('Failed to parse PDF:', err);
    // If the failure came from res.send itself, a second reply would throw again.
    if (res.headersSent) {
      return;
    }
    // NOTE(review): assumes `res` is an Express-style response exposing
    // status(); confirm against the enclosing request handler.
    res.status(500).send({ error: 'Failed to parse PDF' });
  });