Function calls and global-namespace lookups are expensive.

Your stupid function makes two function calls per element in the word list. The second, inside your max call, is entirely avoidable: iterating the keys of a dict and then looking up each value with dict.get is a flagrant inefficiency when you can iterate over key-value pairs instead.

def stupid(words):
    freqs = {}
    for w in words:
        freqs[w] = freqs.get(w, 0) + 1
    return max(freqs, key=freqs.get)

def most_frequent(words):
    ## Build the frequency dict
    freqs = {}
    for w in words:
        if w in freqs:
            freqs[w] += 1
        else:
            freqs[w] = 1
    ## Search the frequency dict
    m_k = None
    m_v = 0
    for k, v in freqs.iteritems():
        if v > m_v:
            m_k, m_v = k, v
    return m_k, m_v

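To put a rough number on the keys-plus-dict.get pattern versus iterating pairs, here's a minimal timing sketch on a throwaway dict (the dict and the counts here are illustrative only, not the word sample used below):

import timeit

setup = "d = dict((i, i % 997) for i in range(100000))"

# Pattern 1: iterate keys, then pay an extra d.get call per key.
via_get = timeit.Timer("""
m_k, m_v = None, 0
for k in d:
    v = d.get(k)
    if v > m_v:
        m_k, m_v = k, v
""", setup)

# Pattern 2: iterate key-value pairs directly; no per-key lookup call.
via_items = timeit.Timer("""
m_k, m_v = None, 0
for k, v in d.items():
    if v > m_v:
        m_k, m_v = k, v
""", setup)

print(via_get.timeit(100))
print(via_items.timeit(100))
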
Using user1952500's single-pass suggestion, how does this fare on your large sample sets?

def faster(words):
    freq = {}
    m_k = None
    m_v = 0
    for w in words:
        if w in freq:
            v = freq[w] + 1
        else:
            v = 1
        freq[w] = v
        if v > m_v:
            m_k = w
            m_v = v
    return m_k, m_v

This has the slight advantage of being stable when there are multiple most-frequent values, since the first word to reach the top count is the one kept.
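For example, when there's a tie for first place the word that reached the top count earliest in the input is reported, run after run:

>>> faster(['b', 'a', 'b', 'a'])
('b', 2)
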
Comparison of all suggestions, using nltk.book to produce a sample:
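The exact sample isn't reproduced here; one way to assemble a comparable words list, assuming the NLTK book corpora are installed (nltk.download('book')), would be something like:

from nltk.book import text1, text2, text3

# Concatenate token lists from a few of the book texts into one large sample.
# Which texts to use is a guess on my part; the n and m printed below reflect
# whatever sample was actually timed.
words = list(text1) + list(text2) + list(text3)
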
def word_frequency_version1(words):
    """Petar's initial"""
    freqs = {}
    for w in words:
        freqs[w] = freqs.get(w, 0) + 1
    return max(freqs, key=freqs.get)

def word_frequency_version2(words):
    """Matt's initial"""
    ## Build the frequency dict
    freqs = {}
    for w in words:
        if w in freqs:
            freqs[w] += 1
        else:
            freqs[w] = 1
    ## Search the frequency dict
    m_k = None
    m_v = 0
    for k, v in freqs.iteritems():
        if v > m_v:
            m_k, m_v = k, v
    return m_k, m_v

def word_frequency_version3(words):
    """Noting max as we go"""
    freq = {}
    m_k = None
    m_v = 0
    for w in words:
        if w in freq:
            v = freq[w] + 1
        else:
            v = 1
        freq[w] = v
        if v > m_v:
            m_k = w
            m_v = v
    return m_k, m_v

from collections import Counter

def word_frequency_version4(words):
    """Built-in Counter"""
    c = Counter(words)
    return c.most_common()[0]

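Side note: most_common() with no argument sorts the entire table, whereas most_common(1) picks just the top entry via a heap (heapq.nlargest), which may shave a little off version 4. A variant reusing the Counter import above (version4b is my label; it wasn't part of the timings below):

def word_frequency_version4b(words):
    """Counter, top entry only (heap instead of a full sort)"""
    return Counter(words).most_common(1)[0]
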
from multiprocessing import Pool

def chunked(seq, count):
    # // for explicit integer division; the last chunk takes any remainder
    # so trailing words aren't silently dropped when len(seq) isn't a
    # multiple of count.
    v = len(seq) // count
    for i in range(count):
        if i == count - 1:
            yield seq[i*v:]
        else:
            yield seq[i*v:(i+1)*v]

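A quick check of the chunking (Python 2 shown, where range returns a sliceable list), with the last chunk absorbing the remainder:

>>> list(chunked(range(10), 3))
[[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
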
def frequency_map(words):
    freq = {}
    for w in words:
        if w in freq:
            freq[w] += 1
        else:
            freq[w] = 1
    return freq

def frequency_reduce(results):
    freq = {}
    for result in results:
        for k, v in result.iteritems():
            if k in freq:
                freq[k] += v
            else:
                freq[k] = v
    m_k = None
    m_v = 0
    for k, v in freq.iteritems():
        if v > m_v:
            m_k = k
            m_v = v
    return m_k, m_v

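The map/reduce pair also composes with the built-in map, which is handy for checking correctness without any pool overhead (assuming words is the sample list):

# Single-process sanity check of the same map/reduce pipeline; not timed below.
result = frequency_reduce(map(frequency_map, chunked(words, 4)))
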
# def word_frequency_version5(words, chunks=5, pool_size=5):
#     pool = Pool(processes=pool_size)
#     result = frequency_reduce(pool.map(frequency_map, chunked(words, chunks)))
#     pool.close()
#     return result

def word_frequency_version5(words, chunks=5, pool=Pool(processes=5)):
    """multiprocessing Matt's initial suggestion"""
    return frequency_reduce(pool.map(frequency_map, chunked(words, chunks)))

def word_frequency_version6(words):
    """Petar's one-liner"""
    return max(set(words), key=words.count)

import timeit
freq1 = timeit.Timer('func(words)','from __main__ import words, word_frequency_version1 as func; print func.__doc__')
freq2 = timeit.Timer('func(words)','from __main__ import words, word_frequency_version2 as func; print func.__doc__')
freq3 = timeit.Timer('func(words)','from __main__ import words, word_frequency_version3 as func; print func.__doc__')
freq4 = timeit.Timer('func(words)','from __main__ import words, word_frequency_version4 as func; print func.__doc__')
freq5 = timeit.Timer('func(words,chunks=chunks)','from __main__ import words, word_frequency_version5 as func; print func.__doc__; chunks=10')
freq6 = timeit.Timer('func(words)','from __main__ import words, word_frequency_version6 as func; print func.__doc__')
Results:
>>> print "n={n}, m={m}".format(n=len(words),m=len(set(words)))
n=692766, m=34464
>>> freq1.timeit(10)
"Petar's initial"
3.914874792098999
>>> freq2.timeit(10)
"Matt's initial"
3.8329160213470459
>>> freq3.timeit(10)
"Noting max as we go"
4.1247420310974121
>>> freq4.timeit(10)
"Built-in Counter"
6.1084718704223633
>>> freq5.timeit(10)
"multiprocessing Matt's initial suggestion"
9.7867341041564941
Notes:
- I'm cheating by passing the multiprocessing.Pool instance in as a kwarg, purely for timing purposes: I wanted to exclude the pool start-up cost, and timeit doesn't let you specify clean-up code. This was run on a "quad"-CPU VM; I'm sure there are input sizes and CPU counts for which multiprocessing comes out faster.
- For the most part these return the highest-frequency word; which word that is may be arbitrary when there's a tie for first place.
- Approximations of the highest frequency (e.g. by sampling) may be faster, but will be approximate.
- Version 6 (the one-liner) should be ignored for large values of n*m: words.count rescans the whole list once per unique word, making it O(n*m) overall.