I have a huge list of urls that I need to send request and retrieve a json data.But the problem is Since the list with the urls is too big to load it at once, I would like to read the urls one by one, and each time the url is loaded, it should start a request. My code work for small list(~20k) with no problem but I got stuck with a huge list.
It would be great if you could tell me how to change my code, to get it to send asynchronous requests for each url of the urls list. Thank you in advance.
Here is my code:
import json
import urllib
from urllib.parse import quote
import time
import asyncio
import aiohttp
import json
from json.decoder import JSONDecodeError
urls = ["url_1", "url_2". "url_3"........"url_3,000,000"]
START = time.monotonic()
class RateLimiter:
RATE = 20
MAX_TOKENS = 10
def __init__(self, client):
self.client = client
self.tokens = self.MAX_TOKENS
self.updated_at = time.monotonic()
async def get(self, *args, **kwargs):
await self.wait_for_token()
now = time.monotonic() - START
print(f'{now:.0f}s: ask {args[0]}')
return self.client.get(*args, **kwargs)
async def wait_for_token(self):
while self.tokens < 1:
self.add_new_tokens()
await asyncio.sleep(0.1)
self.tokens -= 1
def add_new_tokens(self):
now = time.monotonic()
time_since_update = now - self.updated_at
new_tokens = time_since_update * self.RATE
if self.tokens + new_tokens >= 1:
self.tokens = min(self.tokens + new_tokens, self.MAX_TOKENS)
self.updated_at = now
async def fetch_one(client, url):
# Watch out for the extra 'await' here!
async with await client.get(url) as resp:
for response in resp:
try:
results = await response.json()
try:
answer = results['results'][0]['locations']
output = {
"Provided location" : results['results'][0]['providedLocation'].get('location'),
"City": answer[0].get('adminArea5'),
"State" : answer[0].get('adminArea3'),
"Country": answer[0].get('adminArea1')
}
json_results.append(output)
except (IndexError,JSONDecodeError):
output = {
"Provided location": 'null',
"City": 'null',
"State" : 'null',
"Country":'null'
}
json_results.append(output)
except:
output = {
"Provided location": None,
"City": 'null',
"State" : 'null',
"Country":'null'
}
json_results.append(output)
now = time.monotonic() - START
async def main():
async with aiohttp.ClientSession() as client:
client = RateLimiter(client)
tasks = [asyncio.ensure_future(fetch_one(client, url)) for url in urls]
await asyncio.gather(*tasks)
if __name__ == '__main__':
asyncio.run(main())