I need an efficient Python algorithm to get to the following outcome(s):
example 1:
l = [[1, 2, 3, 4], [2, 1, 4], [3, 1], [4, 1, 2]]
r = [[1, 2, 4], [3]]
example 2:
l = [[1, 2, 3, 4], [2, 1], [3, 1], [4, 1]]
r = [[1, 2], [3], [4]]
example 3:
l = [[1], [2, 3, 4], [3, 2, 5, 6], [4, 2] , [5, 3], [6, 3]]
r = [[2, 3], [1], [4], [5], [6]]
In these examples l is a list of lists, where:
- the first element of each list is always the index of the element
- the relation is symmetric: if the list for element 1 contains the value 3, then the list for element 3 will also contain 1.
What I need is to find an efficient way to group matching subsets using 3 criteria: length of subset, frequency of sub-elements and the fact that a sub-element can only be part of 1 resulting subset.
In the 1st example the sub-element [1]
would be the most frequent subset, but the fact that the longer subset [1, 2, 4]
was found in 3 of the 4 elements is more important: length trumps frequency.
In the 2nd example a grouping such as [1, 3]
has the same length and frequency as [1, 2]
, but the 1 is "taken" by the first-found subset.
Later edit:
What I did so far is:
- I turned my list of lists into a dictionary
- then I built a function which repeatedly builds square matrices of matched and not-matched values, based on all possible permutations of the unique keys of my dictionary
- then in the square matrices I search for the largest squares along the main diagonal (based on a code provided here: Python find the largest square in the matrix dynamic programming)
- then I eliminate the largest squares which overlap and start all over again
My code is totally inefficient because the number of permutations grows exponentially with the size of my initial dictionary, therefore I am looking for a new idea, a new approach.
Here is what I have done so far:
from itertools import chain, permutations
def group_matches(my_dict, matched=None):
    """Find the largest groups of mutually related keys in ``my_dict``.

    ``my_dict`` maps each key to the list of keys it is related to; the
    relation is reflexive (each list contains its own key) and symmetric.
    One call returns the best grouping found at the current "level".

    Args:
        my_dict: dict mapping key -> list of related keys.
        matched: keys already claimed by an earlier call; they (and any
            references to them) are stripped before searching.  ``None``
            on the first call.

    Returns:
        (matches, matched, unmatched) where ``matches`` is a list of
        sorted groups (all of the maximal size found), ``matched`` the
        keys covered by some group and ``unmatched`` the rest.

    NOTE(review): brute force -- every permutation of the keys is
    examined, so the cost grows factorially with ``len(my_dict)``.
    """

    def update_my_dict(my_dict, matched):
        # Drop already-matched keys and strip them out of the remaining
        # adjacency lists.  NOTE: mutates the value lists in place.
        ret_val = {}
        for k, v in my_dict.items():
            if k not in matched:
                for unique_ind in matched:
                    if unique_ind in v:
                        v.remove(unique_ind)
                ret_val[k] = v
        return ret_val

    def get_matches(unique_ind_permutation, my_dict):
        # For one ordering of the keys, find the largest all-True squares
        # sitting on the main diagonal of the adjacency matrix; each such
        # square is a run of consecutive, pairwise-related keys.

        def create_matrix(unique_ind_permutation, my_dict):
            # Boolean adjacency matrix in the order of the permutation.
            matrix = []
            for k in unique_ind_permutation:
                r = [True if f in my_dict[k] else False
                     for f in unique_ind_permutation]
                matrix += [r]
            return matrix

        matrix = create_matrix(unique_ind_permutation, my_dict)
        # Classic DP: dp[ri][ci] is the side length of the largest
        # all-True square whose bottom-right corner is at (ri, ci).
        dp = [[0] * len(matrix) for _ in range(len(matrix))]
        max_squares = [(0, None, None)]  # (size, row, col) of the current best squares
        for ri, r in enumerate(matrix):
            for ci, c in enumerate(r):
                dp[ri][ci] = 0 if not c \
                    else (1 if ri == 0 or ci == 0
                          else min(dp[ri - 1][ci], dp[ri][ci - 1],
                                   dp[ri - 1][ci - 1]) + 1)
                # Keep every square of the current maximum size.
                max_squares = [(dp[ri][ci], ri, ci)] if dp[ri][ci] > max_squares[0][0] \
                    else (max_squares + [(dp[ri][ci], ri, ci)]
                          if dp[ri][ci] == max_squares[0][0]
                          else max_squares)
        matches = []
        if max_squares[0][0] != 0:
            for max_square in max_squares:
                rows = [r for r in range(max_square[1] + 1 - max_square[0],
                                         max_square[1] + 1)]
                columns = [c for c in range(max_square[2] + 1 - max_square[0],
                                            max_square[2] + 1)]
                # Only squares on the main diagonal (rows == columns)
                # describe a group of keys; off-diagonal squares are noise.
                if rows == columns:
                    matches += [tuple(rows)]
            matches = eliminate_common_matches(matches)
        matches_to_unique_ind = []
        l = 0
        if len(matches) > 0:
            l = len(matches[0])
            for m in matches:
                # Translate matrix positions back into dictionary keys.
                m_unique_ind = sorted([unique_ind_permutation[x] for x in m])
                matches_to_unique_ind += [m_unique_ind]
        return matches_to_unique_ind, l

    def eliminate_common_matches(matches):
        # When two groups overlap, keep the one whose smallest member
        # comes first ("first found wins").
        # NOTE(review): removes from `matches` while iterating over it,
        # which can skip elements -- confirm this is acceptable.
        for m in matches:
            aux = matches.copy()
            aux.remove(m)
            for a in aux:
                common = (set(m) & set(a))
                if len(common) > 0:
                    min_m = min(m)
                    min_a = min(a)
                    if min_m <= min_a:
                        matches.remove(a)
                    else:
                        matches.remove(m)
        return matches

    def find_matched(unique_indexes, matches):
        # Split the keys into those covered by a group and the rest.
        # NOTE(review): the else-branch fires once per group that does
        # NOT contain the key, so with several groups a key can appear
        # in `unmatched` repeatedly (and in `matched` as well); the
        # caller recomputes `unmatched` later -- verify this is intended.
        matched = []
        unmatched = []
        for unique_ind in unique_indexes:
            for m in matches:
                if unique_ind in m:
                    matched += [unique_ind]
                else:
                    unmatched += [unique_ind]
        return matched, unmatched

    if matched is not None:
        my_dict = update_my_dict(my_dict, matched)
    unique_indexes = list(my_dict.keys())
    p_unique_indexes = list(permutations(unique_indexes))
    matches = []
    last_l = 0
    for p in p_unique_indexes:
        m, l = get_matches(p, my_dict)
        if last_l < l:
            # A longer group beats everything found so far:
            # length trumps frequency.
            matches.clear()
            last_l = l
        if last_l == l and l > 0:
            matches += m
    matches = set(tuple(l) for l in matches)  # deduplicate across permutations
    # Resolve overlaps using positions in the original key order, so the
    # "firstly found" group wins ties.
    matches_order = []
    for m in matches:
        mx = sorted([unique_indexes.index(unique_ind_x) for unique_ind_x in m])
        matches_order += [mx]
    matches_order = eliminate_common_matches(matches_order)
    matches = []
    for mo in matches_order:
        mx = [unique_indexes[x] for x in mo]
        matches += [mx]
    matched, unmatched = find_matched(unique_indexes, matches)
    return matches, matched, unmatched
# --- Driver: repeatedly extract the best groups until every key is used ---
my_dict = {1: [1, 2, 3, 4],
           2: [2, 1, 4],
           3: [3, 1],
           4: [4, 1, 2]}
unique_indexes = list(my_dict.keys())
matches = []
matched = None
while True:
    # Each pass searches only the keys not yet claimed by earlier groups.
    instance_matches, matched, unmatched = group_matches(my_dict, matched)
    if len(instance_matches) > 0:
        matches += instance_matches
    # Stop once everything is grouped or no further group can be found.
    if len(unmatched) == 0 or len(instance_matches) == 0:
        break
# Keys that never made it into any group.
# NOTE(review): these are appended as bare keys, not single-element
# lists -- confirm that is the desired output shape.
unmatched = list(set(unique_indexes) - set(list(chain(*matches))))
# Deduplicate while preserving the order in which groups were found.
matches_unique = []
for i, x in enumerate(matches):
    if x not in matches[:i]:
        matches_unique += [x]
matches = matches_unique + unmatched
print(matches)
Another more complicated example:
# Second, larger test case: a symmetric adjacency mapping over the keys
# 'a'..'i' (each key's list starts with the key itself).
my_dict = {
'a':['a', 'b', 'c', 'h'],
'b':['b', 'a', 'c', 'i'],
'c':['c', 'a', 'b', 'd', 'e'],
'd':['d', 'c', 'e', 'f'],
'e':['e', 'c', 'd', 'f'],
'f':['f', 'd', 'e', 'g', 'h'],
'g':['g', 'f', 'h'],
'h':['h', 'a', 'f', 'g'],
'i':['i', 'b']
}
# expected outcome:
# (groups of size 3 win first; members they claim are unavailable
# to later, equally long candidates)
res = [['c', 'd', 'e'], ['f', 'g', 'h'], ['a', 'b'], ['i']]
The subset ['d', 'e', 'f'] is not part of the expected outcome, because the 'd' and the 'e' are already taken by the first subset.