0

I'm trying to convert a PDF to a Google Doc using OCR, but my PDF is too long, so I'm looking for a way to manually set the page range that gets converted.

Here is my code:

/**
 * Walks every PDF in the Drive folder identified by FOLDER_ID, runs each one
 * through getTextFromPDF(), and hands the resulting text to extractSpecifics().
 *
 * Nothing is returned; the value produced by extractSpecifics() is not kept.
 */
function extractPdfData() {
  // The spreadsheet handle is not used below; the call itself is kept as-is.
  SpreadsheetApp.getActiveSpreadsheet();

  // Only files whose MIME type is PDF are considered.
  const pdfIterator = DriveApp.getFolderById(FOLDER_ID).getFilesByType("application/pdf");

  // Visit each PDF file in the folder, one at a time.
  for (; pdfIterator.hasNext(); ) {
    const pdfId = pdfIterator.next().getId();
    const converted = getTextFromPDF(pdfId);
    extractSpecifics(converted.text);
  }
}

/**
 * OCR-converts a PDF stored in Drive into a temporary Google Doc, reads the
 * document's text, and deletes the temporary document again.
 *
 * Uses `Drive.Files.insert` / `Drive.Files.remove`, i.e. the Drive advanced
 * service must be available to the script.
 *
 * @param {string} fileID - Drive ID of the PDF to convert.
 * @returns {{name: string, text: string}} Name and body text of the converted document.
 * @throws Any error raised while opening or reading the converted document is
 *     propagated to the caller after the temporary document has been removed.
 */
function getTextFromPDF(fileID) {
  const blob = DriveApp.getFileById(fileID).getBlob();
  const resource = {
    title: blob.getName(),
    mimeType: blob.getContentType(),
  };
  const options = {
    ocr: true,
    ocrLanguage: "en",
  };
  // Convert the PDF to a Google Doc with OCR.
  const file = Drive.Files.insert(resource, blob, options);

  try {
    // Read the text out of the newly created document.
    const doc = DocumentApp.openById(file.id);
    const text = doc.getBody().getText();
    const title = doc.getName();

    return {
      name: title,
      text: text,
    };
  } finally {
    // Always delete the temporary document, even when reading it failed,
    // so failed runs do not leave converted copies behind in Drive.
    Drive.Files.remove(file.id);
  }
}

Any assistance is greatly appreciated. (Please excuse my use of var instead of let).

  • I have to apologize for my poor English skill. Unfortunately, I cannot understand your expected situation from `I'm looking to manually set the page range`. Can I ask you about the detail of it? First, I would like to try to correctly understand your question. I apologize for this. – Tanaike Feb 24 '23 at 23:49
  • @K J Thank you for your support. From your comment, I would like to recommend posting it as an answer by including more information. I think that it will be useful for the owner of this question. – Tanaike Feb 25 '23 at 00:12
  • @Tanaike - I want to split my PDF before running OCR because right now it is too long. I want to OCR the first 50 pages, then the next 50 pages, to circumvent my current problem where only the first 80 pages of my PDF get converted. If you have a solution it would be greatly appreciated. – Francesco Russo Feb 27 '23 at 19:43

1 Answer

0

Workaround Suggestion

As per the discussion in the comments of your post, it is not possible to extract only specific pages of your PDF file using the Google Drive API alone.

I have found this related post, which describes a method you could use to work around this limitation. I have tweaked your script and done a quick test run:

Tweaked Script

// ConvertAPI secret sent with the split request; replace the placeholder with your own key.
const apiSecret = 'API_SECRET';

/**
 * For every PDF in the Drive folder, asks ConvertAPI's PDF split endpoint for
 * the selected pages, stores each returned page as a temporary PDF in the same
 * folder, OCRs it with getTextFromPDF(), logs the result, and removes the
 * temporary PDF again.
 *
 * Reads the module-level `apiSecret` for the ConvertAPI `Secret` query
 * parameter. Returns nothing.
 *
 * @throws Errors from the HTTP request, JSON parsing, or getTextFromPDF()
 *     propagate to the caller; the temporary PDF for the page being processed
 *     is removed first.
 */
function extractPdfData() {
  // Get all PDF files:
  const folder = DriveApp.getFolderById('DRIVE_FOLDER_ID');
  const files = folder.getFilesByType("application/pdf");
  // Pages to extract from each multi-page PDF, joined with a URL-encoded comma (%2C).
  const pages = [1, 3].join('%2C');
  // The request URL does not depend on the file, so it is built once.
  const url = `https://v2.convertapi.com/convert/pdf/to/split?Secret=${apiSecret}&ExtractPages=${pages}`; // Please set your secret key.

  // NOTE(review): temporary PDFs are created in the folder being iterated;
  // confirm the file iterator cannot pick them up before they are removed.
  // Iterate through each PDF file in the folder.
  while (files.hasNext()) {
    const file = files.next();

    const options = {
      method: "post",
      // The iterator already yields the file, so its blob is read directly.
      payload: { File: file.getBlob() },
    };
    const res = UrlFetchApp.fetch(url, options);
    const rawData = JSON.parse(res.getContentText());

    rawData.Files.forEach((curPDF) => {
      // Set up a temporary PDF file for the current extracted page.
      const bytes = Utilities.base64Decode(curPDF.FileData, Utilities.Charset.UTF_8);
      const blob = Utilities.newBlob(bytes, MimeType.PDF, curPDF.FileName);
      const currentPDF = folder.createFile(blob);

      try {
        const doc = getTextFromPDF(currentPDF.getId());
        //const data= extractSpecifics(doc.text);

        //Testing
        console.log(doc);
      } finally {
        // Remove the temporary PDF even when OCR fails, so a failed run
        // does not leave split pages behind in the source folder.
        Drive.Files.remove(currentPDF.getId());
      }
    });
  }
}

/**
 * OCR-converts a PDF stored in Drive into a temporary Google Doc, reads the
 * document's text, and deletes the temporary document again.
 *
 * Uses `Drive.Files.insert` / `Drive.Files.remove`, i.e. the Drive advanced
 * service must be available to the script.
 *
 * @param {string} fileID - Drive ID of the PDF to convert.
 * @returns {{name: string, text: string}} Name and body text of the converted document.
 * @throws Any error raised while opening or reading the converted document is
 *     propagated to the caller after the temporary document has been removed.
 */
function getTextFromPDF(fileID) {
  const blob = DriveApp.getFileById(fileID).getBlob();

  const resource = {
    title: blob.getName(),
    mimeType: blob.getContentType(),
  };
  const options = {
    ocr: true,
    ocrLanguage: "en",
  };
  // Convert the PDF to a Google Doc with OCR.
  const file = Drive.Files.insert(resource, blob, options);

  try {
    // Read the text out of the newly created document.
    const doc = DocumentApp.openById(file.id);
    const text = doc.getBody().getText();
    const title = doc.getName();

    return {
      name: title,
      text: text,
    };
  } finally {
    // Always delete the temporary document, even when reading it failed,
    // so failed runs do not leave converted copies behind in Drive.
    Drive.Files.remove(file.id);
  }
}

Demo

  • Say you want to extract page 1 and page 3 from a 4-page PDF file in your Drive folder:

enter image description here enter image description here

  • After running the script, here's a test console log result of your getTextFromPDF() function:

enter image description here

SputnikDrunk2
  • 3,398
  • 1
  • 5
  • 17
  • Thank you for your solution, however Split PDF API is too expensive for my application. I was really hoping to solve this problem for free. – Francesco Russo Feb 27 '23 at 19:44