
This code downloads metadata from a repository, writes that data to a file, downloads a PDF, converts the PDF to text, and then deletes the original PDF:

import os
import uuid

for record in records:
    record_data = []  # all non-empty metadata values are collected in record_data
    for name, metadata in record.metadata.items():
        for i, value in enumerate(metadata):
            if value:
                record_data.append(value)
    fulltext = ''
    file_path = ''
    file_path_metadata = ''
    unique_id = str(uuid.uuid4())
    for data in record_data:
        if 'Fulltext' in data:
            # the link to the pdf
            fulltext = data.replace('Fulltext ', '')
            # path where the pdf will be stored
            file_path = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '.pdf'
            # path where the metadata will be stored
            file_path_metadata = '/' + os.path.basename(data).replace('.pdf', '') + unique_id + '_metadata.txt'
            print fulltext, file_path

    # Write metadata to file
    if fulltext:
        try:
            with open(path_to_institute + file_path_metadata, 'w') as write_metadata:
                for i, data in enumerate(record_data):
                    write_metadata.write('MD_' + str(i) + ': ' + data.encode('utf8') + '\n')
        except Exception as e:
            # usually caused by a missing directory in the path
            print 'Exception when writing metadata: {}'.format(e)
            print fulltext, path_to_institute, file_path_metadata

        # Download the pdf
        download_pdf(fulltext, path_to_institute + file_path)

        # Convert it to text and delete the pdf
        pdf2text(path_to_institute + file_path)

Some measurements show that the download_pdf and pdf2text methods take quite a long time.

Here are those methods:

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import urllib2
import os


def remove_file(path):
    try:
        os.remove(path)
    except OSError as e:
        print ("Error: %s - %s." % (e.filename, e.strerror))


def pdf2text(path):
    string_handling = StringIO()
    parser = PDFParser(open(path, 'rb'))  # pdfs must be opened in binary mode

    try:
        document = PDFDocument(parser)
    except Exception as e:
        print '{} is not a readable document. Exception {}'.format(path, e)
        return

    if document.is_extractable:
        resource_manager = PDFResourceManager()
        device = TextConverter(resource_manager,
                               string_handling,
                               codec='ascii',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(resource_manager, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
        device.close()

        # write the extracted text next to the pdf, then delete the pdf
        save_file = open(path.replace('.pdf', '.txt'), 'w')
        save_file.write(string_handling.getvalue())
        save_file.close()
        remove_file(path)
    else:
        print path, "Warning: could not extract text from pdf file."

def download_pdf(url, path):
    try:
        f = urllib2.urlopen(url)
    except Exception as e:
        print e
        f = None

    if f:
        data = f.read()
        # the with statement closes the file by itself
        with open(path, "wb") as pdf_file:
            pdf_file.write(data)

So I'm thinking I should run those in parallel. I tried this, but it did not work:

    pool = mp.Pool(processes=len(process_data))
    for i in process_data:
        print i
        pool.apply(download_pdf, args=(i[0], i[1]))

    pool = mp.Pool(processes=len(process_data))
    for i in process_data:
        print i[1]
        pool.apply(pdf2text, args=(i[1],))

It takes just as long, though, and the printing happens as if the processes are run one at a time...
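
For reference, process_data here is a list of (url, path) tuples, collected in the main loop roughly like this:

    # illustrative sketch of how process_data is assembled
    process_data = []
    for record in records:
        # ... same metadata handling as above ...
        if fulltext:
            process_data.append((fulltext, path_to_institute + file_path))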


2 Answers


I finally found a way to run the code in parallel. Unbelievable how much faster it got.

    import multiprocessing as mp

    # start one download process per record
    jobs = []
    for i in process_data:
        p = mp.Process(target=download_pdf, args=(i[0], i[1]))
        jobs.append(p)
        p.start()

    # wait for each download to finish, then start its pdf-to-text conversion
    text_jobs = []
    for i, data in enumerate(process_data):
        print data
        jobs[i].join()
        p = mp.Process(target=pdf2text, args=(data[1],))
        text_jobs.append(p)
        p.start()

    # wait for the conversions as well
    for p in text_jobs:
        p.join()
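
For completeness: the original Pool.apply attempt ran serially because apply blocks until its result is ready. The non-blocking variants are apply_async and map, which also let you cap the number of workers instead of spawning one process per record. A minimal sketch, under the same assumption that process_data holds (url, path) pairs:

    import multiprocessing as mp

    pool = mp.Pool(processes=4)  # bounded worker count

    # apply_async returns immediately, so all downloads are queued at once
    download_results = [pool.apply_async(download_pdf, args=(url, path))
                        for url, path in process_data]
    for r in download_results:
        r.wait()  # block until every download has finished

    # convert the downloaded pdfs in parallel, then shut the pool down
    pool.map(pdf2text, [path for url, path in process_data])
    pool.close()
    pool.join()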
    There are probably more performance gains from using Threads as opposed to Process for file downloads. Threads are much cheaper and quick to make, and are usually the right choice for IO-bound tasks. – Giannis May 16 '19 at 15:33

Here is a great article on how to build stuff in parallel; it uses multiprocessing.dummy to run things in different threads.

Here is a little example:

from urllib2 import urlopen
from multiprocessing.dummy import Pool  # same API as multiprocessing.Pool, but backed by threads

urls = [url_a,
        url_b,
        url_c
       ]

pool = Pool()
res = pool.map(urlopen, urls)

pool.close()
pool.join()
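
Applied to the question's pipeline, the same thread-pool pattern might look like this (a sketch; it assumes process_data is the list of (url, path) pairs from the question and that download_pdf and pdf2text are in scope):

from multiprocessing.dummy import Pool  # thread pool, same API as multiprocessing.Pool

def download_and_convert(item):
    # download one pdf and immediately convert it to text
    url, path = item
    download_pdf(url, path)
    pdf2text(path)

pool = Pool(8)  # 8 worker threads; tune to your bandwidth
pool.map(download_and_convert, process_data)
pool.close()
pool.join()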

For Python >= 3.3 I suggest concurrent.futures.

example:

import concurrent.futures
import urllib.request

URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']

def load_url(url, timeout):
    return urllib.request.urlopen(url, timeout=timeout).read()

# a thread pool of 50 workers; submit schedules each call and returns a Future
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as executor:
    future_list = [executor.submit(load_url, url, 30) for url in URLS]
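
Inside the with block (or right after it, since leaving the block waits for all futures), the results can be read back as the downloads complete:

# iterate over futures in completion order and collect results
for future in concurrent.futures.as_completed(future_list):
    try:
        data = future.result()
    except Exception as e:
        print('a download failed: %s' % e)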

Example adapted from the concurrent.futures documentation.
