
Is threading going to be my best bet?

I have the following and it works, but there are several folders it has to comb through, and the grand total of files is just north of 500,000. Is there any way to speed this up? Multiprocessing?

from pathlib import Path


def build_paths():
    """Build list of all site Uploaded Files dirs"""
    return list(Path(r"K:\BigDog").glob(r'*\Data\Files'))


def count_doc_files(doc_type, doc_list):
    """Count doc and docx files for Live sites."""
    file_dict = {}
    file_list = []
    for dir in build_paths():
        dir_parts = dir.parts
        site = dir_parts[3]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict


doc_list = ['.doc', '.docx']
print(count_doc_files('*.doc*', doc_list))
  • this seems more suitable for https://codereview.stackexchange.com/ .. that said, you never open the files, so is it sufficient to just glob and see how many there are? – ti7 Jul 13 '23 at 15:53
  • @ti7 Thanks. Can you elaborate on your final question? Sorry I posted in wrong spot. I'll move but want to know what you mean by final question. – FancyPantsFlower Jul 13 '23 at 15:58
  • @FancyPantsFlower You really should post an answer, not edit a question and add your answer to the original OP by editing the code with no explanation or notice to others reading. If you want to point out flaws, and add improvements to the code (other than simple formatting) you should post an answer. Thanks. – FancyPantsFlower Jul 13 '23 at 16:01
  • @SebastianWozny I don't know what you mean. It loops through a list of folders and counts how many files in each. `{'SiteZulu': 377}` – FancyPantsFlower Jul 13 '23 at 17:04
  • @SebastianWozny I'd need an example to understand what you're saying. Count appears fine. – uncrayon Jul 13 '23 at 17:23
  • @SebastianWozny I think you're trying to tell me to `clear()` the list. That or add it inside the loop so it resets. – uncrayon Jul 13 '23 at 17:28
  • No `file_list=[]` isn't the solution on its own. I feel like you're intentionally being unclear. Say what you mean and mean what you say. it really helps cut down the chatter. What you "mean" is add `file_list=[]` inside the loop. By simply saying `file_list=[]` you're intentionally being misleading because you know that is already in the code. So someone would read this, look at the code, say - Hey that is already there - I'm good. – uncrayon Jul 13 '23 at 17:29

1 Answer


Assuming you'd like to count the files per site, I slightly modified your code to reset `file_list` to an empty list on each iteration:

import os
import random
import shutil
import tempfile
from pathlib import Path

from performance_measurement import run_performance_comparison  # the profiling harness shown at the end of this answer



def count_doc_files(paths):
    """Count doc and docx files for Live sites."""
    doc_type = "*.doc*"
    doc_list = [".doc", ".docx"]
    file_dict = {}
    for dir in paths:
        file_list = []

        dir_parts = dir.parts
        site = dir_parts[-1]
        for file in dir.rglob(doc_type):
            if file.suffix in doc_list:
                file_list.append(file.name)
        file_dict.update({site: len(file_list)})
    return file_dict




def direct_glob(paths):
    """Replace the for loops and list construction with a direct dictionary comprehension"""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: sum(1 for file in dir.rglob(doc_type) if file.suffix in doc_list)
        for dir in paths
    }
    return file_dict

We shave off a little by constructing the dictionary directly with a comprehension, which avoids the overhead of building the list one item at a time. However, this doesn't make much of a difference.

from collections import deque
from itertools import count

# Avoid constructing a deque each time, reduces fixed overhead enough
# that this beats the sum solution for all but length 0-1 inputs
consumeall = deque(maxlen=0).extend


def ilen(it):
    # Taken from ShadowRanger's answer at https://stackoverflow.com/a/34404546/4850343
    # Make a stateful counting iterator
    cnt = count()
    # zip it with the input iterator, then drain until input exhausted at C level
    consumeall(zip(it, cnt))  # cnt must be second zip arg to avoid advancing too far
    # Since count 0 based, the next value is the count
    return next(cnt)


def fast_counting_dictionary_comprehension(paths):
    """Count doc and docx files for Live sites."""
    doc_list = [".doc", ".docx"]
    doc_type = "*.doc*"

    file_dict = {
        dir.parts[-1]: ilen(
            file for file in dir.rglob(doc_type) if file.suffix in doc_list
        )
        for dir in paths
    }
    return file_dict

Here we speed up counting the length of the iterator, but that doesn't make a difference either.
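
For reference, `ilen` just counts any iterable without materializing it into a list; a quick illustration:

print(ilen(x for x in range(10) if x % 2))  # 5: the odd numbers 1, 3, 5, 7, 9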

def multi_glob(paths):
    """Glob once per extension, skipping the if statement."""
    doc_list = [".doc", ".docx"]

    file_dict = {
        dir.parts[-1]: sum(
            1 for extension in doc_list for file in dir.rglob(f"*{extension}")
        )
        for dir in paths
    }
    return file_dict

The globbing itself seems to be the performance hog: replacing the filter in the comprehension with explicit repeated glob calls slows things down significantly.
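
If glob itself is the hog, a cheaper traversal might help. As a sketch only (I haven't benchmarked this variant, so treat it as an assumption rather than a measured win), walking with os.scandir avoids building a Path object per file and reuses the cached DirEntry data:

import os


def scandir_count(paths):
    """Sketch: count .doc/.docx files per site with an explicit os.scandir walk."""
    doc_suffixes = (".doc", ".docx")
    file_dict = {}
    for dir in paths:
        total = 0
        stack = [str(dir)]
        while stack:
            with os.scandir(stack.pop()) as entries:
                for entry in entries:
                    if entry.is_dir(follow_symlinks=False):
                        # Descend into subdirectories without recursion
                        stack.append(entry.path)
                    elif entry.name.endswith(doc_suffixes):
                        total += 1
        file_dict[dir.parts[-1]] = total
    return file_dict

The endswith check stands in for the suffix filter above; str.endswith accepts a tuple of suffixes, so no separate if chain is needed.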

Profiling shows not much improvement is possible. I also tried globbing with glob.iglob, but that was even worse.
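
As for the threading question in the original post: the traversal is I/O-bound, so a thread pool over the per-site directories is the natural first experiment. A minimal sketch (untested at the 500,000-file scale, the worker count is a guess, and on a single network share the drive may remain the bottleneck):

from concurrent.futures import ThreadPoolExecutor


def count_one_site(dir):
    """Count the matching files under a single site directory."""
    doc_list = [".doc", ".docx"]
    return dir.parts[-1], sum(
        1 for file in dir.rglob("*.doc*") if file.suffix in doc_list
    )


def threaded_count(paths, max_workers=8):
    """Fan the per-site counts out over a thread pool."""
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return dict(pool.map(count_one_site, paths))

The setup and teardown used for the benchmark above: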

def setup(subdirs=3, num_dirs_with_files=10):
    """Generate a directory tree structure with doc and docx files."""
    doc_list = ["doc", "docx"]
    temp_dir = tempfile.mkdtemp()

    for i in range(subdirs):
        level_dir = os.path.join(temp_dir, f"level_{i}")
        os.mkdir(level_dir)

        if i < num_dirs_with_files:
            for doc_type in doc_list:
                for j in range(random.randint(1, 5)):
                    doc_file = os.path.join(level_dir, f"file_{j}.{doc_type}")
                    open(doc_file, "a").close()

    return [list(Path(temp_dir).glob("*"))]


def teardown(paths):
    # Remove the whole temporary tree, including the mkdtemp parent.
    if paths:
        shutil.rmtree(paths[0].parent)

approaches = [count_doc_files, direct_glob, fast_counting_dictionary_comprehension]

run_performance_comparison(
    approaches,
    [100, 200, 500, 1_000, 3_000, 10_000, 20_000],  # 50_000, 100_000, 300_000, 500_000
    title="Performance Comparison",
    setup=setup,
    teardown=teardown,
    number_of_repetitions=1,
)

[Performance comparison plot: execution time (seconds) vs. N, log-log scale]

Profiling code:

import timeit
from functools import partial

import matplotlib.pyplot as plt
from typing import List, Dict, Callable

from contextlib import contextmanager


@contextmanager
def data_provider(data_size, setup=lambda N: [N], teardown=lambda *data: None):
    data = setup(data_size)
    try:
        yield data
    finally:
        teardown(*data)


def run_performance_comparison(approaches: List[Callable],
                               data_size: List[int],
                               setup=lambda N: [N],
                               teardown=lambda *data: None,
                               number_of_repetitions=5, title='Performance Comparison', data_name='N'):
    # First we show that all approaches return the same result
    with data_provider(100, setup, teardown) as data:
        for approach in approaches[1:]:
            result, expected = approach(*data), approaches[0](*data)
            assert result == expected, f'{approach.__name__} returned {result} instead of {expected}'
    approach_times: Dict[Callable, List[float]] = {approach: [] for approach in approaches}
    for N in data_size:
        with data_provider(N, setup, teardown) as data:
            print(f'Running performance comparison for {data_name}={N}')
            for approach in approaches:
                function = partial(approach, *data)
                approach_time = min(timeit.Timer(function).repeat(repeat=number_of_repetitions, number=1))
                approach_times[approach].append(approach_time)

    for approach in approaches:
        plt.plot(data_size, approach_times[approach], label=approach.__name__)
    plt.yscale('log')
    plt.xscale('log')

    plt.xlabel(data_name)
    plt.ylabel('Execution Time (seconds)')
    plt.title(title)
    plt.legend()
    plt.show()

Sebastian Wozny