import requests
import time
from lxml import html

def parse_site():
    return str(memoryview(''.join([f'---! {link.text_content()} !---\n{parse_fandom(link.xpath(".//a/@href")[0])}\n' for link in
        html.fromstring(requests.get('https://archiveofourown.org/media').content).xpath('//*[@class="actions"]')]).encode('utf-8'))[:-1], 'utf-8')

def parse_fandom(url):
    return ''.join([' '.join(f'{item.text_content()} |—| {item.xpath(".//a/@href")[0]}'.split()) + '\n' for item in
        html.fromstring(requests.get(f'https://archiveofourown.org{url}').content).xpath('//*[contains(@class, "tags")]//li')])

if __name__ == '__main__':
    start_time = time.time()
    with open('test.txt', 'w+', encoding='utf-8') as f:
        f.write(parse_site())
    print("--- %s seconds ---" % (time.time() - start_time))

I'm scraping this site to collect fandom stats, but connecting to it with requests.get() can take 1-3 seconds per request, bringing the whole program to a slow 18-22 seconds. I want to make these requests on parallel threads somehow, but modules like grequests need an allocated pool to do so, and I haven't figured out a way to create such a pool within a list comprehension.
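
For reference, the grequests route I was looking at would be roughly the sketch below (untested; fandom_urls is just a stand-in for the category URLs gathered in parse_site(), and size is the pool that has to be allocated):

import grequests

# build one pending request per category page, then send them on a pool of 12
pending = (grequests.get(url) for url in fandom_urls)
responses = grequests.map(pending, size=12)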

The order of the list doesn't matter to me, as long as the hierarchy between each category (parsed in parse_site()) and its child links (parse_fandom(url)) is preserved. What I want to do is something like:

[parallel_parse_fandom(url), parallel_parse_fandom(url2), parallel_parse_fandom(url3)]
↓
[<All links within this fandom>, parallel_parse_fandom(url2), <All links within this fandom>]
↓
return [<All links within this fandom>, <All links within this fandom>, <All links within this fandom>]
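
In other words, a rough sketch of the shape I'm after (parallel_parse_fandom is only a hypothetical threaded version of parse_fandom, and fandom_urls is a stand-in for the category URLs; executor.map returns results in input order, so each category keeps its own block of links):

from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=12) as executor:
    # one result per fandom URL, in the same order as fandom_urls
    all_links = list(executor.map(parallel_parse_fandom, fandom_urls))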

Solution based on @Aditya's answer:

import requests
import time
from lxml import html
from concurrent.futures import ThreadPoolExecutor, as_completed

def parse_site():
    # Gather [name, href] pairs for every fandom category on the media page
    page = html.fromstring(requests.get('https://archiveofourown.org/media').content)
    categories = [[link.text_content(), link.xpath(".//a/@href")[0]]
                  for link in page.xpath('//*[@class="actions"]')]
    # Parse each category on its own worker thread
    with ThreadPoolExecutor(max_workers=12) as executor:
        futures = [executor.submit(parse_fandom, category) for category in categories]
        results = [future.result() for future in as_completed(futures)]
    # Join the per-category blocks and drop the trailing newline
    return ''.join(results)[:-1]

def parse_fandom(data):
    # data is a [category name, category href] pair from parse_site()
    page = html.fromstring(requests.get(f'https://archiveofourown.org{data[1]}').content)
    return f'---! {data[0]} !---\n' + ''.join([' '.join(f'{item.text_content()} |—| {item.xpath(".//a/@href")[0]}'.split()) + '\n'
        for item in page.xpath('//*[contains(@class, "tags")]//li')])

if __name__ == '__main__':
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(parse_site())

1 Answer


You can try the approach below; it will easily allow you to make a lot of requests in parallel, provided the server can handle them as well:

# thread_map is just a wrapper around concurrent.futures.ThreadPoolExecutor with a nice tqdm progress bar!
from tqdm.contrib.concurrent import thread_map

def chunk_list(lst, size):
    """
    Adapted from SO; yield successive size-sized chunks from lst.
    """
    for i in range(0, len(lst), size):
        yield lst[i:i + size]

for idx, my_chunk in enumerate(chunk_list(huge_list, size=2**12)):
    for response in thread_map(<which_func_to_call>, my_chunk, max_workers=your_cpu_cores + 6):
        # <which_func_to_call> -> the function that makes the request and returns the parsed response
        # do something with the response here..
        # make sure to cache the chunk results as well
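
Applied to your case, it would look roughly like the below (a sketch, not tested against the site; fandom_data stands in for the list of [name, href] pairs you collect in parse_site(), and parse_fandom is your own function):

from tqdm.contrib.concurrent import thread_map

# One parse_fandom call per [name, href] pair, spread across worker threads;
# thread_map returns the results in the same order as fandom_data
results = thread_map(parse_fandom, fandom_data, max_workers=12)
output = ''.join(results)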