I have a data frame, and I want to calculate the distance from each row to every other row. It needs to be very fast, of course, so I added some parallelism. However, for some reason it runs faster with a single thread.
# Shared result table: dist[i][j] holds the distance computed between
# rows i and j; worker code fills it in.
dist = dict()
# Guards the writes that worker threads make into ``dist``.
dist_lock = threading.Lock()
def calculate_dist_threaded(data_frame, num_threads):
    """Fill the module-level ``dist`` table with pairwise row distances.

    The rows are split round-robin across ``num_threads`` worker threads,
    each of which runs ``calculate_dist`` on its share. The call returns
    only after every worker has finished, so ``dist`` is complete on return.

    NOTE: the per-pair work is pure-Python and CPU-bound. Under CPython's
    GIL, threads do not execute such code in parallel, so more threads
    should not be expected to make this faster; process-based parallelism
    or a vectorised computation is the usual remedy.

    Args:
        data_frame: pandas DataFrame whose rows are compared.
            NOTE(review): ``dist`` is keyed by ``range(n_rows)`` while the
            workers key by the row's index label, so this presumably
            expects a default 0..n-1 index -- confirm.
        num_threads: requested number of worker threads; clamped to the
            range ``1..n_rows``.

    Returns:
        None. Results are written into the global ``dist``.
    """
    num_rows = data_frame.shape[0]

    # Calculate the set of row indexes for each thread. Never start more
    # workers than there are rows, and always start at least one.
    num_workers = max(1, min(int(num_threads), num_rows))
    indexes = [
        range(start, num_rows, num_workers) for start in range(num_workers)
    ]

    tuple_data_frame = list(data_frame.itertuples())

    # ``sys.maxint`` only exists on Python 2; ``sys.maxsize`` is the
    # Python 3 counterpart. Values equal to it are left out of the range
    # computation (presumably a sentinel for missing data -- confirm).
    excluded_value = getattr(sys, "maxint", sys.maxsize)

    # r[k] is the value range (max - min) of tuple position k, or None when
    # no range is computed. Position 0 of each tuple is the row index, hence
    # the leading None.
    r = [None]
    for attr in list(data_frame):
        column = data_frame[attr]
        # NOTE(review): only int32/float64 columns get a range; other
        # numeric dtypes (e.g. int64) end up with None -- confirm intended.
        if column.dtype in ['int32', 'float64']:
            dft = column[column != excluded_value].dropna()
            r.append(dft.max() - dft.min())
        else:
            r.append(None)

    for i in range(num_rows):
        dist[i] = {}

    # Run calculate_dist once per thread, then wait for all of them.
    workers = [
        threading.Thread(
            target=calculate_dist, args=(tuple_data_frame, chunk, r)
        )
        for chunk in indexes
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
def calculate_dist(tuples, indexes, r):
    """Compute distances from each row in ``indexes`` to every row.

    Results are stored symmetrically in the module-level ``dist`` table
    (``dist[i][j]`` and ``dist[j][i]``). Pairs already present in
    ``dist[i]`` are skipped, so work done for the mirrored pair is reused.

    Distances for one row are collected locally and published under
    ``dist_lock`` once per row, rather than taking the lock for every
    single pair inside the innermost loop.

    Args:
        tuples: list of row tuples; element 0 of each tuple is used as the
            row's key in ``dist``. ``tuples[i]`` is taken to be the row
            whose key is ``i``.
        indexes: iterable of row keys this call is responsible for.
        r: per-position value ranges, passed through to ``dist_tuples``.
    """
    for i in indexes:
        logger.debug("working on index={0}".format(i))
        known = dist[i]
        pending = {}
        for other in tuples:
            j = other[0]
            # Skip pairs already stored or already computed for this row.
            if j in known or j in pending:
                continue
            if i == j:
                pending[j] = 0.0
            else:
                pending[j] = dist_tuples(tuples[i], other, r)
        # Publish the whole row in one critical section.
        with dist_lock:
            for j, d in pending.items():
                known[j] = d
                dist[j][i] = d
def dist_tuples(x, y, r):
    """Return the distance between two row tuples.

    Position 0 (the row index) is ignored. For every other position:

    * equal values contribute 0;
    * unequal numeric values with a usable range contribute
      ``abs(x[i] - y[i]) / r[i]``;
    * anything else contributes 1 (plain mismatch).

    A range of ``None`` or ``0`` is not usable for scaling, so unequal
    values at such a position count as a plain mismatch instead of raising
    ``TypeError`` / ``ZeroDivisionError``.

    Args:
        x: first row tuple.
        y: second row tuple, same layout as ``x``.
        r: sequence indexed like the tuples; ``r[i]`` is the value range
            used to scale position ``i``, or ``None``.

    Returns:
        The summed per-position distance as a float.
    """
    d = 0.0
    for i in range(1, len(x)):
        a, b = x[i], y[i]
        if a == b:
            continue
        span = r[i]
        if isinstance(a, numbers.Number) and span:
            # float() forces true division even for int operands on
            # Python 2, where ``/`` would otherwise floor.
            d += abs(a - b) / float(span)
        else:
            d += 1.0
    return d
if __name__ == "__main__":
    # Script entry point: request one worker per CPU reported by
    # multiprocessing.cpu_count().
    calculate_dist_threaded(data, num_threads=multiprocessing.cpu_count())
When I run it with 4 threads (the number of CPUs on my laptop), I see that it takes 15 seconds to calculate 4 separate indexes at the same time. But if I run the same code with just one thread, it takes just 2 seconds to calculate a single index, so it's almost 2 seconds faster than with 4 threads. Am I missing something and have a blocking piece of code here, or is it just my lame laptop?