Assuming you'd like to count the files per site, I slightly modified your code to reset file_list to an empty list on each iteration:
import os
import random
import shutil
import tempfile
from pathlib import Path

from performance_measurement import run_performance_comparison


def count_doc_files(paths):
    """Count doc and docx files for Live sites."""
    doc_type = "*.doc*"
    doc_list = [".doc", ".docx"]
    file_dict = {}
    for dir in paths:
        file_list = []
        dir_parts = dir.parts
        site = dir_parts[-1]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict
def direct_glob(paths):
    """Replace the for loops and list construction with a direct dictionary comprehension."""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"
    file_dict = {
        dir.parts[-1]: sum(1 for file in dir.rglob(doc_type) if file.suffix in doc_list)
        for dir in paths
    }
    return file_dict
We shave off a little by constructing the dictionary directly with a comprehension, which avoids the overhead of appending to a list one item at a time. However, this doesn't make much of a difference.
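For reference, both variants map the last path component (the site) to a count. A minimal smoke test, with a hypothetical site name in a throwaway temp directory:

import tempfile
from pathlib import Path

tmp = Path(tempfile.mkdtemp())
site_dir = tmp / "site_a"  # hypothetical site name
site_dir.mkdir()
(site_dir / "a.doc").touch()
(site_dir / "b.docx").touch()
(site_dir / "c.txt").touch()  # must not be counted

paths = list(tmp.glob("*"))
assert count_doc_files(paths) == direct_glob(paths) == {"site_a": 2}

Both approaches agree; the harness at the bottom runs the same equality check before timing anything.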
from collections import deque
from itertools import count

# Avoid constructing a deque each time; this reduces fixed overhead enough
# that it beats the sum solution for all but length 0-1 inputs.
consumeall = deque(maxlen=0).extend


def ilen(it):
    # Taken from ShadowRanger's answer at https://stackoverflow.com/a/34404546/4850343
    # Make a stateful counting iterator
    cnt = count()
    # zip it with the input iterator, then drain until input exhausted at C level
    consumeall(zip(it, cnt))  # cnt must be second zip arg to avoid advancing too far
    # Since count is 0-based, the next value is the count
    return next(cnt)
def fast_counting_dictionary_comprehension(paths):
    """Count doc and docx files for Live sites."""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"
    file_dict = {
        dir.parts[-1]: ilen(
            file for file in dir.rglob(doc_type) if file.suffix in doc_list
        )
        for dir in paths
    }
    return file_dict
Here we speed up counting the length of the iterator, but that doesn't make a difference either.
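The counting idiom is easier to compare in isolation, away from the filesystem. A minimal sketch (pure-Python iterators, sizes picked arbitrarily):

import timeit

n = 100_000
sum_time = timeit.timeit(lambda: sum(1 for _ in range(n)), number=100)
ilen_time = timeit.timeit(lambda: ilen(x for x in range(n)), number=100)
print(f"sum(1 for ...): {sum_time:.3f}s  ilen: {ilen_time:.3f}s")

Per the note above, ilen should win on anything but tiny inputs; in the directory benchmark, however, the glob dominates so thoroughly that the counting idiom barely registers.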
def multi_glob(paths):
    """Glob once per extension, skipping the if statement."""
    doc_list = [".doc", ".docx"]
    file_dict = {
        dir.parts[-1]: sum(1 for extension in doc_list for file in dir.rglob(f'*{extension}'))
        for dir in paths
    }
    return file_dict
The globbing itself seems to be the performance hog, because replacing the filter in the comprehension with explicit repeated glob calls slows things down significantly.
Profiling shows there is not much room for improvement. I also tried globbing with glob.iglob, but that was even worse.
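If you want to reproduce that, the standard-library cProfile is enough. A minimal sketch, assuming everything lives in one script and reusing the setup/teardown helpers defined below (tree size picked arbitrarily):

import cProfile

paths = setup(1_000)  # hypothetical tree size
cProfile.run("count_doc_files(*paths)", sort="cumulative")
teardown(*paths)

The cumulative column should point straight at rglob, in line with the observation above.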
def setup(subdirs=3, num_dirs_with_files=10):
    """Generate a directory tree structure with doc and docx files."""
    doc_list = ["doc", "docx"]
    temp_dir = tempfile.mkdtemp()
    for i in range(subdirs):
        level_dir = os.path.join(temp_dir, f"level_{i}")
        os.mkdir(level_dir)
        if i < num_dirs_with_files:
            for doc_type in doc_list:
                for j in range(random.randint(1, 5)):  # j avoids shadowing the outer loop variable
                    doc_file = os.path.join(level_dir, f"file_{j}.{doc_type}")
                    open(doc_file, "a").close()
    # Wrapped in a list so the harness can unpack it as *data.
    return [list(Path(temp_dir).glob("*"))]
def teardown(paths):
    for path in paths:
        shutil.rmtree(path)
approaches = [count_doc_files, direct_glob, fast_counting_dictionary_comprehension]

run_performance_comparison(
    approaches,
    [100, 200, 500, 1_000, 3_000, 10_000, 20_000],  # 50_0000, 100_000, 300_000, 500_000
    title="Performance Comparison",
    setup=setup,
    teardown=teardown,
    number_of_repetitions=1,
)

Benchmarking code (the performance_measurement module imported above):
import timeit
from contextlib import contextmanager
from functools import partial
from typing import Callable, Dict, List

import matplotlib.pyplot as plt


@contextmanager
def data_provider(data_size, setup=lambda N: [N], teardown=lambda *data: None):
    data = setup(data_size)
    try:
        yield data
    finally:
        teardown(*data)  # clean up even if the measured code raises


def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               setup=lambda N: [N],
                               teardown=lambda *data: None,
                               number_of_repetitions=5, title='Performance Comparison', data_name='N'):
    # First we show that all approaches return the same result
    with data_provider(100, setup, teardown) as data:
        for approach in approaches[1:]:
            result, expected = approach(*data), approaches[0](*data)
            assert result == expected, f'{approach.__name__} returned {result} instead of {expected}'

    approach_times: Dict[Callable, List[float]] = {approach: [] for approach in approaches}
    for N in data_size:
        with data_provider(N, setup, teardown) as data:
            print(f'Running performance comparison for {data_name}={N}')
            for approach in approaches:
                function = partial(approach, *data)
                approach_time = min(timeit.Timer(function).repeat(repeat=number_of_repetitions, number=1))
                approach_times[approach].append(approach_time)

    for approach in approaches:
        plt.plot(data_size, approach_times[approach], label=approach.__name__)
    plt.yscale('log')
    plt.xscale('log')
    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()