We can implement a class that keeps track of the top-k values, as I don't believe the standard library has this built-in. This would be kept up to date in parallel with the main dictionary object (probably a Counter
). You could also use this as an attribute of a subclass of the main dictionary object.
Implementation
class MostCommon(object):
"""Keep track the top-k key-value pairs.
Attributes:
top: Integer representing the top-k items to keep track of.
store: Dictionary of the top-k items.
min: The current minimum of any top-k item.
min_set: Set where keys are counts, and values are the set of
keys with that count.
"""
def __init__(self, top):
"""Create a new MostCommon object to track key-value paris.
Args:
top: Integer representing the top-k values to keep track of.
"""
self.top = top
self.store = dict()
self.min = None
self.min_set = defaultdict(set)
def _update_existing(self, key, value):
"""Update an item that is already one of the top-k values."""
# Currently handle values that are non-decreasing.
assert value > self.store[key]
self.min_set[self.store[key]].remove(key)
if self.store[key] == self.min: # Previously was the minimum.
if not self.min_set[self.store[key]]: # No more minimums.
del self.min_set[self.store[key]]
self.min_set[value].add(key)
self.min = min(self.min_set.keys())
self.min_set[value].add(key)
self.store[key] = value
def __contains__(self, key):
"""Boolean if the key is one of the top-k items."""
return key in self.store
def __setitem__(self, key, value):
"""Assign a value to a key.
The item won't be stored if it is less than the minimum (and
the store is already full). If the item is already in the store,
the value will be updated along with the `min` if necessary.
"""
# Store it if we aren't full yet.
if len(self.store) < self.top:
if key in self.store: # We already have this item.
self._update_existing(key, value)
else: # Brand new item.
self.store[key] = value
self.min_set[value].add(key)
if value < self.min or self.min is None:
self.min = value
else: # We're full. The value must be greater minimum to be added.
if value > self.min: # New item must be larger than current min.
if key in self.store: # We already have this item.
self._update_existing(key, value)
else: # Brand new item.
# Make room by removing one of the current minimums.
old = self.min_set[self.min].pop()
del self.store[old]
# Delete the set if there are no old minimums left.
if not self.min_set[self.min]:
del self.min_set[self.min]
# Add the new item.
self.min_set[value].add(key)
self.store[key] = value
self.min = min(self.min_set.keys())
def __repr__(self):
if len(self.store) < 10:
store = repr(self.store)
else:
length = len(self.store)
largest = max(self.store.itervalues())
store = '<len={length}, max={largest}>'.format(length=length,
largest=largest)
return ('{self.__class__.__name__}(top={self.top}, min={self.min}, '
'store={store})'.format(self=self, store=store))
Example usage
>>> common = MostCommon(2)
>>> common
MostCommon(top=2, min=None, store={})
>>> common['a'] = 1
>>> common
MostCommon(top=2, min=1, store={'a': 1})
>>> 'a' in common
True
>>> common['b'] = 2
>>> common
MostCommon(top=2, min=1, store={'a': 1, 'b': 2})
>>> common['c'] = 3
>>> common
MostCommon(top=2, min=2, store={'c': 3, 'b': 2})
>>> 'a' in common
False
>>> common['b'] = 4
>>> common
MostCommon(top=2, min=3, store={'c': 3, 'b': 4})
Access after updating values is indeed O(1)
>>> counter = Counter()
>>> for x in permutations(xrange(10), 10):
counter[x] += 1
>>> common = MostCommon(1)
>>> for key, value in counter.iteritems():
common[key] = value
>>> common
MostCommon(top=1, min=1, store={(9, 7, 8, 0, 2, 6, 5, 4, 3, 1): 1})
>>> timeit('repr(common)', 'from __main__ import common', number=1)
1.3251570635475218e-05
Access is O(1), but when the minimum changes during a set-item call that is a O(n) operation, where n
is the number of top values. This is still better than Counter
, which is O(n) during every access, where n
is the size of the entire vocabulary!