I currently have this class for making requests to an API and caching the JSON response:
import os
import pathlib
import json
import hashlib
import time
import requests
class NoJSONResponseError(Exception):
    """Raised when an HTTP response body cannot be decoded as JSON."""
class JSONRequestCacher(object):
    """Manage a JSON object through the cache.

    Download the associated resource from the provided URL
    when need be and retrieve the JSON from a cached file
    if possible.
    """

    def __init__(self, duration=86400, cachedir=None):
        # duration: cache lifetime in seconds (default: one day).
        # cachedir: directory for cached files; str or Path, or None
        # to use $CUSTOM_CACHEDIR / ~/.custom_cache/.
        self.duration = duration
        self.cachedir = self._get_cachedir(cachedir)
        self._validate_cache()

    def _get_cachedir(self, cachedir):
        """Resolve the cache directory, always returning a pathlib.Path.

        Falls back to the CUSTOM_CACHEDIR environment variable, then to
        ~/.custom_cache/.
        """
        if cachedir is None:
            cachedir = os.environ.get(
                'CUSTOM_CACHEDIR',
                pathlib.Path(pathlib.Path.home(), '.custom_cache/')
            )
        # BUG FIX: os.environ.get returns a plain str (and callers may
        # pass one too), but the rest of the class calls Path methods
        # (.mkdir, .joinpath, .is_file, .stat) on self.cachedir.
        # Normalize to Path here; Path(Path(...)) is a no-op.
        return pathlib.Path(cachedir)

    def _validate_cache(self):
        """Create the cache directory if it doesn't exist."""
        self.cachedir.mkdir(parents=True, exist_ok=True)

    def _request(self, url):
        """Perform the retrieval of the requested JSON data."""
        return requests.get(url)

    def save(self, raw, cachefile):
        """Save the provided raw JSON data into the cached file."""
        # Explicit encoding: JSON interchange is UTF-8 regardless of locale.
        with open(cachefile, 'w', encoding='utf-8') as out:
            json.dump(raw, out)

    def load(self, cachefile):
        """Retrieve the saved JSON data from the cached file."""
        with open(cachefile, encoding='utf-8') as cached:
            return json.load(cached)

    def cache_is_valid(self, cachefile):
        """Check if cache exists and is more recent than the cutoff."""
        if cachefile.is_file():
            cache_age = time.time() - cachefile.stat().st_mtime
            return cache_age < self.duration
        return False

    def request(self, url, refresh=False):
        """The JSON data associated to the given URL.

        Either read from the cache or fetch from the web.

        Returns a 3-tuple: (data, from_cache, elapsed_seconds).
        Raises NoJSONResponseError if the response body is not JSON,
        and requests.HTTPError on a bad status code.
        """
        # Hash the URL so arbitrary URLs map to safe, fixed-length filenames.
        urlhash = hashlib.md5(url.encode()).hexdigest()
        cachefile = self.cachedir.joinpath(urlhash)
        start = time.time()
        if not refresh and self.cache_is_valid(cachefile):
            return self.load(cachefile), True, time.time() - start
        resp = self._request(url)
        resp.raise_for_status()
        try:
            raw = resp.json()
        except ValueError as err:
            # Chain the original decode error for easier debugging.
            raise NoJSONResponseError() from err
        self.save(raw, cachefile)
        return raw, False, resp.elapsed.total_seconds()
I then have other classes and code which call the request
method of this code like so:
class APIHelper():
    """Convenience wrapper that fetches API responses through the cache."""

    def __init__(self):
        self.cache = JSONRequestCacher()

    # BUG FIX: 'self' was missing from the signature, so calling
    # helper.fetch(val) raised TypeError (val bound to self).
    def fetch(self, val):
        """Fetch the cached/fresh response tuple for a single value."""
        url = 'my/url/{}'.format(val)
        return self.cache.request(url)

    # BUG FIX: 'self' was missing here too, and the accumulator was
    # misspelled ('repsonses' assigned, 'responses' used -> NameError).
    def fetchall(self, vals):
        """Fetch responses for every value in vals, preserving order."""
        responses = []
        for val in vals:
            responses.append(self.fetch(val))
        return responses
For a small number of vals
this is fine and it's really no big deal to wait 10 mins. However I am now looking at making 30,000+ hits to this endpoint. In the past I have used threadpools (multiprocessing.dummy.Pool
) to achieve some parallelism, however from my reading it seems like async/await
and aiohttp
is a better way to go. Unfortunately try as I might I cannot wrap my head around how to translate that to this code. I am using Python 3.8.
EDIT I tried making this change:
# NOTE(review): this snippet is the asker's intentionally failing attempt,
# kept as-is to reproduce the errors described below it.
class JSONRequestCacher():
    # NOTE(review): __init__ is missing 'self' (and the original
    # duration/cachedir parameters), so assigning self.http here
    # cannot work as written.
    def __init__():
        # NOTE(review): aiohttp's docs recommend creating a ClientSession
        # inside a running event loop / coroutine, not in __init__.
        self.http = aiohttp.ClientSession()

    async def _request(self, url):
        # response.read() yields the raw body bytes, not a parsed object;
        # and because _request is now a coroutine function, calling it
        # without 'await' returns a coroutine object — which is why the
        # caller's resp.json() raises
        # AttributeError: 'coroutine' object has no attribute 'json'.
        async with self.http.get(url) as response:
            return await response.read()
Got the error: AttributeError: 'coroutine' object has no attribute 'json'
from my raw = resp.json()
line
Tried then adding resp = await self._request(url)
but that raises SyntaxError: 'await' outside async function
. Then if I make request
an async function then calling it just seems to return me a coroutine object that doesn't give me the expected response.
And this is just trying to make the _request
call async. I can't even start to understand how I am meant to make multiple calls to it via another class (APIHelper
).