5

I am trying to parse a PDF with more than 300 pages, using the pdf-parse npm package. The PDF has 300 pages, but my application crashes while parsing it. My question is: is there a way to parse one page at a time? Below is the code I have tried.

/**
 * Page-render callback: resolves to the text of one page as a single string.
 *
 * Calls `pageData.getTextContent(...)` and concatenates the `str` field of
 * every returned item with no separator.
 *
 * @param {object} pageData - Object exposing `getTextContent(options)`;
 *   presumably a pdf.js page object (see https://mozilla.github.io/pdf.js/) — confirm against caller.
 * @returns {Promise<string>} Whatever `getTextContent(...).then(...)` yields; the joined page text.
 */
function render_page(pageData) {
    const textContentOptions = {
      // Replaces all occurrences of whitespace with standard spaces (0x20). The default value is `false`.
      normalizeWhitespace: false,
      // Do not attempt to combine same-line TextItems. The default value is `false`.
      disableCombineTextItems: false
    };

    // Collect each item's string, then glue them together without a delimiter.
    const joinItemStrings = (textContent) => {
      const fragments = textContent.items.map((item) => item.str);
      return fragments.join('');
    };

    return pageData.getTextContent(textContentOptions).then(joinItemStrings);
}
  //textContent.items.map
  //.map(function (s) { return s.str; }).join('{newline}'); // value page text 
  // Read the entire PDF file synchronously; the contents are handed to pdf() below.
  const dataBuffer = fs.readFileSync('male.pdf');
  const options = {
    // internal page parser callback
    // you can set this option, if you need another format except raw text
    pagerender: render_page,
    // max page number to parse
    max: 4,
    //check https://mozilla.github.io/pdf.js/getting_started/
    version: 'v1.10.100'
  };
  pdf(dataBuffer, options)
    .then(function (data) {
      res.send(data);
    })
    .catch(function (err) {
      // Without a rejection handler a parse failure becomes an unhandled
      // promise rejection and the request never receives a response.
      console.error('Failed to parse PDF:', err);
      // NOTE(review): assumes `res` is an Express-style response (it already
      // uses res.send above) — confirm res.status() is available here.
      res.status(500).send('Failed to parse PDF');
    });
  • Does it throw any error ? I can see the max option in the documentation too – Manos Kounelakis Jan 18 '20 at 10:41
  • @ManosKounelakis Fortunately it doesn't throw any error. The fact is that my application crashes when I extract text from 200+ pages. –  Jan 18 '20 at 10:44
  • @GeorgeAlvis Can you change max in the options to more than 300? I think the problem is there – Chuong Tran Jan 18 '20 at 11:17
  • @ChuongTran I have already done that, but my application still crashes. So my question is: how do I parse the PDF one page at a time? –  Jan 18 '20 at 11:20

0 Answers0