Assuming that you want this to work for an arbitrary number of sequences, a direct (though likely not the most efficient — the `others` set could be built incrementally from the previous iteration) way to solve this would be:
def deep_unique_set(*seqs):
    """Yield, for each input sequence, the items absent from every other sequence.

    Items repeated *within* a single sequence are kept (and repeated), as long
    as they do not also occur in one of the other sequences.
    """
    for idx in range(len(seqs)):
        # Everything that appears in any sequence other than the current one.
        rest = {item for j, other in enumerate(seqs) if j != idx for item in other}
        yield [item for item in seqs[idx] if item not in rest]
or the slightly faster but less memory efficient and otherwise identical:
def deep_unique_preset(*seqs):
    """Same contract as deep_unique_set, but flattens all input up-front.

    Slightly faster, at the cost of holding one flat copy of every item.
    """
    pile = [item for seq in seqs for item in seq]
    offset = 0
    for seq in seqs:
        end = offset + len(seq)
        # All items outside the current sequence's slice of the pile.
        others = set(pile[:offset]) | set(pile[end:])
        yield [item for item in seq if item not in others]
        offset = end
Testing it with the provided input:
# Exercise both variants on the input from the question.
demo = (['a', 'b', 'c'], ['a', 'potato', 'd'], ['a', 'b', 'h'])
print(list(deep_unique_set(*demo)))
# [['c'], ['potato', 'd'], ['h']]
print(list(deep_unique_preset(*demo)))
# [['c'], ['potato', 'd'], ['h']]
Note that if the input contains duplicates within one of the lists, they are not removed, i.e.:
# Duplicates inside a single list survive both set-based variants.
with_dups = (['a', 'b', 'c', 'c'], ['a', 'potato', 'd'], ['a', 'b', 'h'])
print(list(deep_unique_set(*with_dups)))
# [['c', 'c'], ['potato', 'd'], ['h']]
print(list(deep_unique_preset(*with_dups)))
# [['c', 'c'], ['potato', 'd'], ['h']]
If all duplicates should be removed, a better approach is to count the values. The method of choice for this is collections.Counter
, as proposed in @Kasramvd's answer:
def deep_unique_counter(*seqs):
    """Yield, for each sequence, only the items that occur exactly once overall.

    Unlike the set-based variants, this also drops items duplicated within
    a single sequence.
    """
    # One global tally over every item of every sequence.
    occurrences = collections.Counter(item for seq in seqs for item in seq)
    for seq in seqs:
        yield [item for item in seq if occurrences[item] == 1]
# With counting, the within-list duplicate 'c' is gone as well.
sample = (['a', 'b', 'c', 'c'], ['a', 'potato', 'd'], ['a', 'b', 'h'])
print(list(deep_unique_counter(*sample)))
# [[], ['potato', 'd'], ['h']]
Alternatively, one could keep track of repeats, e.g.:
def deep_unique_repeat(*seqs):
    """Yield, for each sequence, the items that appear exactly once across all.

    Tracks repeats explicitly instead of keeping per-item counts.
    """
    seen = set()
    repeated = set()
    for seq in seqs:
        for item in seq:
            if item in seen:
                # Second (or later) sighting anywhere marks it as repeated.
                repeated.add(item)
            else:
                seen.add(item)
    for seq in seqs:
        yield [item for item in seq if item not in repeated]
which will have the same behavior as the collections.Counter
-based approach:
# Same result as the Counter-based approach on the duplicate-bearing input.
sample = (['a', 'b', 'c', 'c'], ['a', 'potato', 'd'], ['a', 'b', 'h'])
print(list(deep_unique_repeat(*sample)))
# [[], ['potato', 'd'], ['h']]
but is slightly faster, since it does not need to keep track of unused counts.
Another, highly inefficient, approach makes use of list.count()
for counting instead of a global counter:
def deep_unique_count(*seqs):
    """Quadratic variant: re-scans the whole flattened pile for every item.

    Same output as deep_unique_counter / deep_unique_repeat, shown only to
    illustrate why list.count() in a loop is slow.
    """
    flattened = []
    for seq in seqs:
        flattened.extend(seq)
    for seq in seqs:
        yield [item for item in seq if flattened.count(item) == 1]
These last two approaches are also proposed in @AlainT.'s answer.
Some timings for these are provided below:
# Benchmark setup: m lists of n random integers drawn from a range wide
# enough (10 * n * m) that most values occur only once.
n = 100
m = 100
s = tuple([random.randint(0, 10 * n * m) for _ in range(n)] for _ in range(m))
# NOTE(review): `funcs` is presumably an iterable of the five functions
# defined above — it is not defined in this snippet; confirm before running.
# `%timeit` is IPython magic, so this loop only runs in an IPython/Jupyter
# session, not in plain Python.
for func in funcs:
    print(func.__name__)
    %timeit list(func(*s))
    print()
# deep_unique_set
# 10 loops, best of 3: 86.2 ms per loop
# deep_unique_preset
# 10 loops, best of 3: 57.3 ms per loop
# deep_unique_count
# 1 loop, best of 3: 1.76 s per loop
# deep_unique_repeat
# 1000 loops, best of 3: 1.87 ms per loop
# deep_unique_counter
# 100 loops, best of 3: 2.32 ms per loop