Here is a partial solution;
reorder
is functional, but as I am not very familiar with pandas I have just used Python's built-in data structures.
Edit: I've replaced partition_by_sum
with a greedy implementation; it tries to find equal sums, but pays no attention to the number of items per bin. Suggestions for a better algorithm?
This should give you a pretty good head-start.
from collections import defaultdict
import csv
VALUE_COL = 1
NUM_BINS = 3
inp = [
["foo", 569],
["bar", 9842],
["asdasd", 98],
["poiqweu", 7840],
["oiasd", 4],
["poeri", 145],
["sacodiw", 55],
["aosdwr", 855],
["9523", 60],
["a52sd", 5500],
["sdcw", 415],
["0932", 317]
]
def load_csv(fname, **kwargs):
with open(fname, "rb") as inf:
for row in csv.reader(inf, **kwargs):
yield row
def save_csv(fname, rows, **kwargs):
with open(fname, "wb") as outf:
csv.writer(outf, **kwargs).writerows(rows)
def make_index(lst, col):
"""
Index a table by column;
return list of column-values and dict of lists of rows having that value
"""
values, index = [], defaultdict(list)
for row in lst:
val = row[col]
values.append(val)
index[val].append(row)
return values, index
def min_index(lst):
"""
Return index of min item in lst
"""
return lst.index(min(lst))
def partition_by_sum(values, num_bins, key=None):
"""
Try to partition values into lists having equal sum
Greedy algorithm, per http://en.wikipedia.org/wiki/Partition_problem#Approximation_algorithm_approaches
"""
values.sort(key=key, reverse=True) # sort descending
bins = [[] for i in xrange(num_bins)]
sums = [0] * num_bins
for value in values:
index = min_index(sums)
bins[index].append(value)
sums[index] += value
return bins
def reorder(lst, key=None):
"""
Return [highest, lowest, second-highest, second-lowest, ...]
"""
lst.sort(key=key, reverse=True) # sort in descending order
halflen = (len(lst) + 1) // 2 # find midpoint
highs, lows = lst[:halflen], lst[halflen:][::-1] # grab [high half descending], [low half ascending]
lst[0::2], lst[1::2] = highs, lows # reassemble
return lst
def main():
# load data
data = inp # load_csv("input_file.csv")
# solve partitioning
values, index = make_index(data, VALUE_COL)
bins = partition_by_sum(values, NUM_BINS)
# rearrange for output
bins = [[index[val].pop() for val in reorder(bin)] for bin in bins]
# write output
for i,bin in enumerate(bins, 1):
save_csv("output_file_{}.csv".format(i), bin)
if __name__=="__main__":
main()