I am currently successfully using a Python 2.7 script that recursively walks a huge directory tree, collects the paths of all files, and gets the mtime of each file as well as the mtime of the corresponding PDF file (same path and name, but with a pdf extension) for comparison. I use scandir.walk() in the Python 2.7 script and os.walk() in Python 3.7, which has recently been updated to also use the scandir algorithm (no additional stat() calls).
However, the Python 3 version of the script is still significantly slower! This is not due to the scandir/walk part of the algorithm, but apparently due either to the getmtime call (which, however, is the same call in Python 2 and 3) or to the processing of the huge list (we are talking about ~500,000 entries in this list).
Any idea what might cause this and how to solve this issue?
#!/usr/bin/env python3
#
# Imports
#
import sys
import time
from datetime import datetime
import os
import re
#
# MAIN THREAD
#
# Extensions (compared in lower case) of the documents that are collected.
DOC_EXTENSIONS = ('.msg', '.doc', '.docx', '.xls', '.xlsx')

# Spellings of the PDF extension that are probed next to every document.
PDF_EXTENSIONS = ('.pdf', '.PDF')

# Expected file name format: seven digits, an optional "A<n>" and/or "_<n>"
# suffix (one or two digits each), a dot and at least one more character.
# Compiled once here instead of being handed to re.match() for every file.
DATABASE_NAME_RE = re.compile(
    r'[0-9][0-9][0-9][0-9][0-9][0-9][0-9](([Aa][0-9][0-9]?)?|(_[0-9][0-9]?)?|([Aa][0-9][0-9]?_[0-9][0-9]?)?)\...?.?'
)


def collect_documents(source_dir):
    """Return the paths of all documents below *source_dir*, reverse-sorted.

    A file is collected when its lower-cased name ends with one of
    DOC_EXTENSIONS and does not start with '~'.
    """
    files_list = []
    for root, _directories, filenames in os.walk(source_dir):
        for filename in filenames:
            # Lower-case once; the original did it twice per file.
            lowered = filename.lower()
            if lowered.endswith(DOC_EXTENSIONS) and not lowered.startswith('~'):
                files_list.append(os.path.join(root, filename))
    files_list.sort(reverse=True)
    return files_list


def is_database_name(name):
    """Return True if *name* (file name incl. extension) has the expected format."""
    return DATABASE_NAME_RE.match(name) is not None


def newest_pdf_mtime(filepathname_abs):
    """Return the newest mtime of ``<filepathname_abs>.pdf`` / ``.PDF``.

    Returns 0 when neither file can be stat'ed.  When both exist the larger
    timestamp wins; equal timestamps simply yield that timestamp.  (The
    original left the result unset in the equal case, which is what happens
    when both spellings resolve to the same file.)
    """
    timestamps = []
    for extension in PDF_EXTENSIONS:
        try:
            timestamps.append(os.path.getmtime(filepathname_abs + extension))
        except OSError:
            # Same exception class as the document probe in main(): any
            # failed stat (missing file, no permission, ...) counts as
            # "no PDF" instead of aborting the whole run.
            pass
    return max(timestamps, default=0)


def needs_printing(pdffile_timestamp, docfile_timestamp):
    """Return True if the PDF is missing (timestamp 0) or older than the document."""
    return pdffile_timestamp < docfile_timestamp or pdffile_timestamp == 0


def main(source_dir, ignore_subdirs=()):
    """Report for every document below *source_dir* whether its PDF is current.

    For each collected document the path is printed, followed by
    'Not in database' if the name has an unexpected format and the path
    contains none of the *ignore_subdirs* fragments, followed by the
    print / up-to-date verdict.
    """
    # Enable this to force a certain period to be printed.
    # Loop-invariant, so computed once here rather than once per file;
    # the values are not used by the comparison below yet.
    date_begin = time.mktime(time.strptime('01/02/2017', "%d/%m/%Y"))
    date_end = time.mktime(time.strptime('01/03/2017', "%d/%m/%Y"))

    for docfile_abs in collect_documents(source_dir):
        print('\n' + docfile_abs)

        # Files with an unexpected name are reported unless they live in an
        # ignored sub-directory.
        if not is_database_name(os.path.basename(docfile_abs)):
            if not any(expression in docfile_abs for expression in ignore_subdirs):
                print('Not in database')

        # Timestamp of the PDF counterpart (0 if there is none) ...
        filepathname_abs = os.path.splitext(docfile_abs)[0]
        pdffile_timestamp = newest_pdf_mtime(filepathname_abs)

        # ... and of the document itself.
        try:
            docfile_timestamp = os.path.getmtime(docfile_abs)
        except OSError:
            docfile_timestamp = 0

        # Compare timestamps and report.
        if needs_printing(pdffile_timestamp, docfile_timestamp):
            print('\tPDF should be printe.')
        else:
            print('\tPDF is up to date.')


if __name__ == '__main__':
    source_dir = '/path_to_data/'
    # NOTE(review): ignore_subdirs was referenced but never defined in the
    # original script (NameError on the first unexpected file name).  It is
    # assumed to be a collection of path fragments to skip -- fill it in.
    ignore_subdirs = ()
    main(source_dir, ignore_subdirs)
    # Exit
    sys.exit(0)