There must be a better way to solve this, but here is a naive attempt at resolving your question(s). This solution presents results you may not realize.
We build a dictionary of tuples with some relationship with 1) the key or 2) the key and each other in a set of tuples.
Given
import copy
import itertools as it
import collections as ct
import nose.tools as nt
a = ["01", "02", "03"]
b = ["02", "03", "04"]
c = ["04", "05", "03"]
d = ["05", "07", "06"]
e = ["08", "06", "07"]
# Helpers
def _is_quasi_unique(by=3, item=None, seen=None):
"""Return True if `item` differs with all `seen` items `by` some number of elements."""
if seen is None:
seen = set()
return all((len(set(s) - set(item)) >= by) for s in seen)
def _has_repeats(iterable):
"""Return True if repeated elements are in `iterable`."""
return not (len(iterable) == len(set(iterable)))
def _has_permutes(x, iterable):
"""Return True if `x` equates to any permutated item in `iterable`."""
for item in iterable:
if set(x) == set(item):
return True
return False
def _clean_subsets(subsets):
"""Yield quasi-unique, non-repeated subsets."""
seen = set()
for subset in subsets:
if _has_permutes(subset, seen) or _has_repeats(subset):
continue
seen.add(subset)
yield subset
Code
# Secondary Functions
def get_cluster(subsets, seed, n=2, exclude=True):
"""Return a set of quasi-unique tuples; permit `n` similar elements.
`seed` is a set, a starting pool of related items.
"""
if exclude:
set_ = seed
else:
set_ = copy.copy(seed)
for subset in subsets:
if _is_quasi_unique(n, subset, seed):
set_.add(subset)
return set_
def get_clean_subsets(*iterables):
"""Yield clean subsets."""
iterables = it.product(*iterables)
yield from _clean_subsets(iterables)
# Primary Functions
def get_clusters(subsets, n=2, exclude=True):
"""Return a dict of clusters of unique items."""
clusters = {}
everseen = set()
for subset in subsets:
if subset not in everseen:
clusters[subset] = get_cluster(subsets, {subset}, n=n, exclude=exclude)
everseen |= clusters[subset]
return clusters
def find_related_items(*iterables, n=2, exclude=True):
"""Return a dict of key-cluster pairs.
Parameters
----------
iterables : tuple
Input lists
n : int
Number of allowed similar elements between subsets.
exclude: bool
Make clusters with subsets unique within each cluster.
"""
subsets = list(get_clean_subsets(*iterables))
return get_clusters(subsets, n=n, exclude=exclude)
Demo
Find sets of tuples that are unique to all other tuples (subsets)
>>> find_related_items(a, b, c, d, e, n=2, exclude=False)
{('01', '02', '04', '05', '06'):
{('01', '02', '03', '05', '08'),
('01', '02', '03', '06', '08'),
('01', '02', '03', '07', '06'),
('01', '02', '03', '07', '08'),
('01', '02', '04', '05', '06'),
('01', '02', '04', '07', '08'),
('01', '02', '05', '07', '08'),
('01', '03', '04', '05', '07'),
('01', '03', '04', '05', '08'),
('01', '03', '04', '06', '08'),
('01', '03', '04', '07', '06'),
('01', '03', '04', '07', '08'),
('01', '03', '05', '06', '08'),
('01', '03', '05', '07', '06'),
...},
...}
Find sets of tuples that are unique to other tuples in the each set (cluster)
>>> find_related_items(a, b, c, d, e, n=2)
{('01', '02', '03', '05', '07'):
{('01', '02', '03', '05', '07'),
('01', '02', '03', '06', '08'),
('01', '02', '04', '05', '08'),
('01', '02', '04', '07', '06'),
('01', '03', '04', '05', '06'),
('01', '03', '04', '07', '08')},
('01', '02', '04', '05', '06'):
{('01', '02', '03', '05', '08'),
('01', '02', '03', '07', '06'),
('01', '02', '04', '05', '06'),
('01', '02', '04', '07', '08'),
('01', '03', '04', '05', '07'),
('01', '03', '04', '06', '08')},
...}
Details
Terms
- subsets: tuples generated from the Cartesian product of input items.
- clusters: groups of quasi-unique tuples.
- quasi-unique: two tuples are quasi-unique if all elements are disjoint except for
n
similar elements shared between them.
Primary Functions
find_related_items()
: this function returns clean, key-cluster pairs. Subsets are cleaned by discaring tuples with permutated or repeated elements (via get_clean_subsets()
). Clusters are built by calling get_clusters()
.
get_clusters()
: this function produces a dictionary of key-cluster pairs by iterating the subsets. All observed subsets are recorded internally in the everseen
set. Only unique observations are added as keys to the dictionary.
Two Results
These tuples in each cluster are "unique" with respect to either:
- the key (
A
), e.g. A - B
, A - C
, (exclude=False
)
- the key and all other members in the cluster, e.g.
A - B
, B - C
, C - A
, ...
, (exclude=True
)
Notes
- Clusters are built from
_get_cluster()
, which relies on an initial starting tuple. This starting tuple is needed to compared against all other subsets in order to return a cluster of quasi-unique tuples. Starting tuples are recorded as keys of the dictionary only to indicate what other tuples in a cluster were compared against. Thus, keys are arbitrary and less critical than the clusters.
- While the general patterns should be consistent, the final results may be contingent on the starting tuples.