I ended up using the Apps Script library ImgApp.
First, download the pdf:
// Download the PDF as a blob. Named `pdfBlob` because that is the name the
// later conversion step passes to convertPDFToPNG().
const pdfBlob = UrlFetchApp.fetch(url).getBlob()
Then I used a function taken from this answer to convert that PDF blob into an array of PNG blobs (one for each page):
/**
 * Converts each page of a PDF into an image blob via Drive thumbnails.
 *
 * Each page is copied into its own single-page PDF with pdf-lib, uploaded to
 * Drive as a temporary file, and the thumbnail that Drive reports for that
 * file is downloaded as the page image. Every temporary file is moved to the
 * trash before the function finishes, including when a page fails.
 *
 * Uses both DriveApp and the `Drive` advanced service (`Drive.Files.get`).
 *
 * @param {Blob} blob - PDF blob to convert.
 * @returns {Promise<Blob[]>} One blob per page, in page order, named
 *     `page<N>.png` with its content type set to `MimeType.PNG`.
 * @throws {Error} If Drive returns no thumbnail link for a page after the wait.
 */
async function convertPDFToPNG(blob) {
  const cdnjs = "https://cdn.jsdelivr.net/npm/pdf-lib/dist/pdf-lib.min.js"
  // NOTE(review): this executes code downloaded from a CDN at runtime, and the
  // URL is not pinned to a version. Consider pinning it or vendoring the
  // library. It must stay a *direct* eval so the evaluated code can see the
  // local `setTimeout` replacement declared below.
  eval(UrlFetchApp.fetch(cdnjs).getContentText())
  // Synchronous stand-in for setTimeout: sleep for the delay, then run the
  // callback immediately.
  const setTimeout = function (f, t) {
    Utilities.sleep(t)
    return f()
  }
  const data = new Uint8Array(blob.getBytes())
  const pdfData = await PDFLib.PDFDocument.load(data)
  const pageLength = pdfData.getPageCount()
  console.log(`Total pages: ${pageLength}`)
  const imageBlobs = []
  const fileIds = []
  try {
    for (let i = 0; i < pageLength; i++) {
      console.log(`Processing page: ${i + 1}`)
      // Build a one-page PDF containing only page i.
      const pdfDoc = await PDFLib.PDFDocument.create()
      const [page] = await pdfDoc.copyPages(pdfData, [i])
      pdfDoc.addPage(page)
      const bytes = await pdfDoc.save()
      // Int8Array turns the unsigned bytes into the signed values passed to newBlob.
      const pagePdfBlob = Utilities.newBlob([...new Int8Array(bytes)], MimeType.PDF, `sample${i + 1}.pdf`)
      const id = DriveApp.createFile(pagePdfBlob).getId()
      // Record the id right away so the file is trashed even if a later step throws.
      fileIds.push(id)
      // Give Drive time to produce a thumbnail for the new file.
      Utilities.sleep(3000)
      const link = Drive.Files.get(id, { fields: "thumbnailLink" }).thumbnailLink
      if (!link) {
        throw new Error("In this case, please increase the value of 3000 in Utilities.sleep(3000), and test it again.")
      }
      // Swap whatever follows the first "=" for "s5000" (presumably a size
      // hint asking for a larger thumbnail - confirm against Drive docs).
      const thumbnailUrl = `${link.split("=")[0]}=s5000`
      const imageBlob = UrlFetchApp.fetch(thumbnailUrl).getBlob().setName(`page${i + 1}.png`)
      imageBlob.setContentType(MimeType.PNG)
      imageBlobs.push(imageBlob)
    }
  } finally {
    // Always clean up the temporary single-page PDFs.
    fileIds.forEach(id => DriveApp.getFileById(id).setTrashed(true))
  }
  return imageBlobs
}
After that, add a delay of 1 second, then create an object used to crop a page in ImgApp (you can read how the parameters work in the GitHub README):
// Render every page of the PDF to an image blob.
var pngs = await convertPDFToPNG(pdfBlob)
// Pause for one second before building the crop request.
Utilities.sleep(1000)
// Only the first page is cropped; margins are expressed in pixels.
const [firstPage] = pngs
const cropMargins = { t: 100, b: 0, l: 0, r: 200 }
const object = { blob: firstPage, unit: "pixel", crop: cropMargins, outputWidth: 1600 }
Note: outputWidth is currently limited to 1600 because that is the largest thumbnail the Slides API provides. In my experience, trying to increase it can make the quality worse.
The goal is to crop the image so that only the column you want is left. Experiment with different values in crop: { t: 0, b: 0, l: 0, r: 0 }
until you get what you want.
Then, run the ImgApp.editImage() function and run OCR on the cropped image:
// Crop the page image using the settings assembled in `object`.
const croppedBlob = ImgApp.editImage(object)
// Metadata for the file Drive creates from the uploaded image.
var resource = {
  title: croppedBlob.getName(),
  mimeType: croppedBlob.getContentType()
}
// `ocr: true` asks Drive to run OCR on the upload so the recognised text ends
// up in a Google Doc. The original passed an `options` variable that was never
// defined, so the parameter is given inline here.
var docFile = Drive.Files.insert(resource, croppedBlob, { ocr: true })
// Open the resulting document and read the recognised text.
var doc = DocumentApp.openById(docFile.id)
var text = doc.getBody().getText()