I have a simple concurrency script that uses Oracle as the main database. I'm querying it with python-oracledb and have used individual threads, each acquiring its own database connection from a pool. However, I have noticed that this becomes very slow for larger tables. For example, for a table of 1.2 million rows the program hangs and my IDE crashes, whereas with around 80,000 rows it completes in a few seconds. What is causing this blow-up, and how can I correct it?
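For reference on how the work is split: with MAX_ROWS = 1,200,000 and NUM_THREADS = 12 (so batchsize = 12), each thread should only be fetching 1,200,000 / 12 = 100,000 rows, starting at its own offset of tn * 100,000, so I did not expect any single thread to be handling an unmanageable amount of data.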
import threading
import time

import oracledb
import pandas as pd

# Paged SELECT: each thread fetches its own slice via OFFSET ... FETCH NEXT
SQL = 'SELECT /*+ ENABLE_PARALLEL_DML PARALLEL(AUTO) */ * FROM USER_TABLE OFFSET :rowoffset ROWS FETCH NEXT :maxrows ROWS ONLY'
MAX_ROWS = 1200000
NUM_THREADS = 12


def start_workload(fn):
    # Decorator: run fn once per thread, passing the thread index, then wait for all threads.
    def wrapped(self, threads, *args, **kwargs):
        assert isinstance(threads, int)
        assert threads > 0
        ts = []
        for i in range(threads):
            new_args = (self, i, *args)
            t = threading.Thread(target=fn, args=new_args, kwargs=kwargs)
            t.start()
            ts.append(t)
        for t in ts:
            t.join()
    return wrapped


class TEST:
    def __init__(self, batchsize, maxrows, *args):
        # Connection pool sized to match the number of worker threads.
        self._pool = oracledb.create_pool(
            user=args[0], password=args[1], port=1521, host="localhost",
            service_name="service_name", min=NUM_THREADS, max=NUM_THREADS)
        self._batchsize = batchsize
        self._maxrows = maxrows

    @start_workload
    def do_query(self, tn):
        with self._pool.acquire() as connection:
            with connection.cursor() as cursor:
                max_rows = self._maxrows
                row_iter = int(max_rows / self._batchsize)
                cursor.arraysize = 10000
                cursor.prefetchrows = 1000000
                # Each thread reads its own window of row_iter rows.
                cursor.execute(SQL, dict(rowoffset=(tn * row_iter), maxrows=row_iter))
                columns = [col[0] for col in cursor.description]
                cursor.rowfactory = lambda *args: dict(zip(columns, args))
                # Fetch the whole slice and write it out per thread.
                pd.DataFrame(cursor.fetchall()).to_csv(f'TH_{tn}_customer.csv')


if __name__ == '__main__':
    result = TEST(NUM_THREADS, MAX_ROWS, username, password)
    start = time.time()
    Make = result.do_query(NUM_THREADS)
    end = time.time()
    print('Total Time: %s' % (end - start))
    print(Make)