I'm trying to Tim sort an hdf5 file. I converted the data in the hdf5 file into a 2D numpy array, and I'm trying to sort it through a stable algorithm. The insertion step goes well.
However, when I checked by debugging, in the merge step, when the left or right array (arrays holding the pre-sorted values) elements are assigned to the original array one by one, the left and right arrays are also changed, which messes up the rest of the sorting as only one element is repeated throughout that entire chunk.
This is what I'm working with:
from collections import deque
import h5py
import sys
filename = sys.argv[1]
with h5py.File(filename, "r") as f:
a_group_key = list(f.keys())[0]
ds_arr = f[a_group_key][()]
MINIMUM= 32
def find_minrun(n):
r = 0
while n >= MINIMUM:
r |= n & 1
n >>= 1
return n + r
def insertion_sort(array, left, right):
for i in range(left+1,right+1):
#element = array[i]
j = i
while array[j-1][11]>array[j][11] and j>left :
array[[j, j-1]] = array[[j-1, j]]
j -= 1
#array[j+1] = element
return array
def merge(array, l, m, r):
array_length1= m - l + 1
array_length2 = r - m
left = []
right = []
for i in range(0, array_length1):
left.append(array[l + i])
for i in range(0, array_length2):
right.append(array[m + 1 + i])
i=0
j=0
k=l
while j < array_length2 and i < array_length1:
if left[i][11] <= right[j][11]:
array[k] = left[i]
i += 1
else:
array[k] = right[j]
j += 1
k += 1
while i < array_length1:
array[k] = left[i]
k += 1
i += 1
while j < array_length2:
array[k] = right[j]
k += 1
j += 1
def tim_sort(array):
n = len(array)
minrun = find_minrun(n)
for start in range(0, n, minrun):
end = min(start + minrun - 1, n - 1)
insertion_sort(array, start, end)
size = minrun
while size < n:
for left in range(0, n, 2 * size):
mid = min(n - 1, left + size - 1)
right = min((left + 2 * size - 1), (n - 1))
merge(array, left, mid, right)
size = 2 * size
tim_sort(ds_arr)
Here is what's happening:
E.g., for chunk size = 38 (calculated during run)
ds_arr:
[(0, 127.062386, 233.88705, 2043.5269, 0.9048116, 0.91757613, 40.91537, 0.03138579, 0.03187117, 0.01391112, 7452.246, 16),
(0, 185.04272, 327.6841, 673.2232, 0.94436234, 0.8363992, 34.514076, 0.0662952, 0.05692514, 0.11432385, 2303.1208, 24),
(0, 215.6241, 213.80653, 1756.7979, 0.89432126, 0.9836176, 36.561146, 0.03355443, 0.03731155, 0.0907836, 6127.373, 123),
... (+16 elements),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 252.34537, 376.03607, 1285.4608, 0.93303615, 0.9969181, 34.071854, 0.0423173, 0.04572778, 0.06407942, 4192.847, 18),
(0, 343.2354, 207.10603, 859.55945, 0.93482053, 0.9706468, 35.455666, 0.05554156, 0.05821768, 0.03690968, 2773.9412, 34),
... (+16 elements),
other chunks]
left:
[(0, 127.062386, 233.88705, 2043.5269, 0.9048116, 0.91757613, 40.91537, 0.03138579, 0.03187117, 0.01391112, 7452.246, 16),
(0, 185.04272, 327.6841, 673.2232, 0.94436234, 0.8363992, 34.514076, 0.0662952, 0.05692514, 0.11432385, 2303.1208, 24),
(0, 215.6241, 213.80653, 1756.7979, 0.89432126, 0.9836176, 36.561146, 0.03355443, 0.03731155, 0.0907836, 6127.373, 123),
... (+16 elements)]
right:
[(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 252.34537, 376.03607, 1285.4608, 0.93303615, 0.9969181, 34.071854, 0.0423173, 0.04572778, 0.06407942, 4192.847, 18),
(0, 343.2354, 207.10603, 859.55945, 0.93482053, 0.9706468, 35.455666, 0.05554156, 0.05821768, 0.03690968, 2773.9412, 34),
... (+16 elements)]
sorted ds_arr:
[(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
(0, 244.40192, 115.47513, 1171.4187, 0.9789816, 0.9536665, 26.107262, 0.04582449, 0.04447512, 0.02585861, 3681.2678, 6),
... (+35 times),
similarly other chunks]
instead of getting a sorted list.