I am bulding a python script that uses asyncio and 3 queues. I am processing data from different sources in 4 steps and the idea is to use the queues to save the results from one step to be used in the next step as soons as it possible. The script is doing what it should but for some reason I am not figuring out when all the data has been processed the script don't finish. To try to understant the problem I build a simplified version of the script where I do simple math operations.
First I populate the first queue with 50 randon number between 0 and 10. Next I get the numbers stored in queue1, square it and put the result on queue2. Next I get the squared number stored in queue2, double it and store the result in queue3. Finally I get the final result stored in queue3 and append it to a dataframe and save the result to a file.
As I said. The procedure described above works but when I finish processing all the elemens queue3 I was expecting that the procedure would finish.
This my first version of the toy code I build to demonstrate my problem
import os
import warnings
import asyncio
import random
import pandas as pd
from datetime import datetime
os.environ['PYTHONASYNCIODEBUG'] = '1'
warnings.resetwarnings()
class asyncio_toy():
def __init__(self):
self.df = pd.DataFrame(columns=['id','final_value'])
async def generate_random_number(self,i:int,queue):
for k in range(50):
r=random.randint(0,10)
#await asyncio.sleep(r)
await queue.put((k,r))
async def square_it(self,n,queue1,queue2):
while True:
print(f'{datetime.now()} - START SQUARE IT task {n} q1: {str(queue1.qsize()).zfill(2)} - q2:{str(queue2.qsize()).zfill(2)}')
r=await queue1.get()
await asyncio.sleep(5)
await queue2.put((r[0],r[1]*r[1]))
queue1.task_done()
print(f'{datetime.now()} - END SQUARE IT task {n} q1: {str(queue1.qsize()).zfill(2)} - q2:{str(queue2.qsize()).zfill(2)}')
async def double_it(self,n,queue2,queue3):
while True:
print(f'{datetime.now()} - START DOUBLE IT task {n} q2: {str(queue2.qsize()).zfill(2)} - q3:{str(queue3.qsize()).zfill(2)}')
r=await queue2.get()
await asyncio.sleep(10)
await queue3.put((r[0],2*r[1]))
queue2.task_done()
print(f'{datetime.now()} - END DOUBLE IT task {n} q2: {str(queue2.qsize()).zfill(2)} - q3:{str(queue3.qsize()).zfill(2)}')
async def save_it(self,n,queue3):
while True:
print(f'{datetime.now()} - START SAVE IT task {n} q3: {str(queue3.qsize()).zfill(2)}')
r=await queue3.get()
await asyncio.sleep(1)
self.df.loc[len(self.df)]=[r[0],r[1]]
self.df.to_csv('final_result.csv')
queue3.task_done()
print(f'{datetime.now()} - END SAVE IT task {n} q3: {str(queue3.qsize()).zfill(2)}')
async def main(self):
queue1 = asyncio.Queue() # store the randon number
queue2 = asyncio.Queue() # stores the squared number
queue3 = asyncio.Queue() # store the final result
rand_gen = [asyncio.create_task(self.generate_random_number(n,queue1)) for n in range(1)]
square_scan = [asyncio.create_task(self.square_it(k,queue1,queue2)) for k in range(5)]
double_scan = [asyncio.create_task(self.double_it(k,queue2,queue3)) for k in range(5)]
save_scan = [asyncio.create_task(self.save_it(k,queue3)) for k in range(5)]
await asyncio.gather(*rand_gen)
await asyncio.gather(*square_scan)
await asyncio.gather(*double_scan)
await asyncio.gather(*save_scan)
await queue1.join()
await queue2.join()
await queue3.join()
for a in square_scan:
a.cancel()
for b in square_scan:
b.cancel()
for c in save_scan:
c.cancel()
### testing
if __name__ == '__main__':
toy=asyncio_toy()
asyncio.run(toy.main())
Doing some research about this problem I found this another question
[1]: Using Multiple Asyncio Queues Effectively which suggest not use of queue.join and use of sentinel shutdonw.
import os
import warnings
import asyncio
import random
import pandas as pd
from datetime import datetime
os.environ['PYTHONASYNCIODEBUG'] = '1'
warnings.resetwarnings()
class asyncio_toy():
def __init__(self):
self.df = pd.DataFrame(columns=['id','final_value'])
async def generate_random_number(self,i:int,queue1):
for k in range(50):
r=random.randint(0,10)
queue1.put_nowait((k,r))
queue1.put_nowait(None)
async def square_it(self,n,queue1,queue2):
while True:
print(f'{datetime.now()} - START SQUARE IT task {n} q1: {str(queue1.qsize()).zfill(2)} - q2:{str(queue2.qsize()).zfill(2)}')
r=await queue1.get()
if r is None:
await queue2.put(None)
break
await asyncio.sleep(5)
await queue2.put((r[0],r[1]*r[1]))
queue1.task_done()
print(f'{datetime.now()} - END SQUARE IT task {n} q1: {str(queue1.qsize()).zfill(2)} - q2:{str(queue2.qsize()).zfill(2)}')
async def double_it(self,n,queue2,queue3):
while True:
print(f'{datetime.now()} - START DOUBLE IT task {n} q2: {str(queue2.qsize()).zfill(2)} - q3:{str(queue3.qsize()).zfill(2)}')
r=await queue2.get()
if r is None:
await queue3.put(None)
break
await asyncio.sleep(10)
await queue3.put((r[0],2*r[1]))
queue2.task_done()
print(f'{datetime.now()} - END DOUBLE IT task {n} q2: {str(queue2.qsize()).zfill(2)} - q3:{str(queue3.qsize()).zfill(2)}')
async def save_it(self,n,queue3):
while True:
print(f'{datetime.now()} - START SAVE IT task {n} q3: {str(queue3.qsize()).zfill(2)}')
r=await queue3.get()
if r is None:
break
await asyncio.sleep(1)
self.df.loc[len(self.df)]=[r[0],r[1]]
self.df.to_csv('final_result.csv')
queue3.task_done()
print(f'{datetime.now()} - END SAVE IT task {n} q3: {str(queue3.qsize()).zfill(2)}')
async def main(self):
queue1 = asyncio.Queue() # store the randon number
queue2 = asyncio.Queue() # stores the squared number
queue3 = asyncio.Queue() # store the final result
rand_gen = [asyncio.create_task(self.generate_random_number(n,queue1)) for n in range(1)]
square_scan = [asyncio.create_task(self.square_it(k,queue1,queue2)) for k in range(5)]
double_scan = [asyncio.create_task(self.double_it(k,queue2,queue3)) for k in range(5)]
save_scan = [asyncio.create_task(self.save_it(k,queue3)) for k in range(5)]
await asyncio.gather(*rand_gen)
await asyncio.gather(*square_scan)
await asyncio.gather(*double_scan)
await asyncio.gather(*save_scan)
for a in square_scan:
a.cancel()
for b in square_scan:
b.cancel()
for c in save_scan:
c.cancel()
### testing
if __name__ == '__main__':
toy=asyncio_toy()
asyncio.run(toy.main())
But it didn't solve the problem. I have also try to remove the functions from the class definition but it didn't work as well.
I am starting working with asyncio module and I think I am doing some basic mistake that I am not able to see. Any tips will be welcome.
UPDATE
I have simplified the problem even further and got some interesting effect that can lead to the answer. I created another toy code that use just one queue where I store the initial randon number. The code get the number from this queue squared it and print in termial. This peace of code finish. So I think that maybe the problem is related, in some way, to the fact that I am using more than one queue.
import asyncio
import random
class asyncio_toy():
def __init__(self):
...
async def generate_random_number(self,i:int,queue):
for _ in range(i):
r=random.randint(0,5)
await asyncio.sleep(r)
await queue.put((i,r))
async def square_scan(self,k,queue):
while True:
(i,r) = await queue.get()
print(f'prod {i} - cons {k} - {r} - {r*r}')
queue.task_done()
async def main(self):
queue = asyncio.Queue()
prod = [asyncio.create_task(self.generate_random_number(n,queue)) for n in range(5)]
cons = [asyncio.create_task(self.square_scan(k,queue)) for k in range(4)]
await asyncio.gather(*prod)
await queue.join()
for c in cons:
c.cancel()
### testing
if __name__ == '__main__':
toy=asyncio_toy()
asyncio.run(toy.main())