I am trying to process a 10,000+ line spreadsheet, and even when I narrow it to ~3,000 lines, Python is running slowly (about 1 line per second). I'm comparing each entry in the third column; if I find a match, I check the value in the fifth column. If both match, I have an interesting record.
I know this is probably something simple, or that there's an easier, more pythonic way, but am suffering from coder's block.
I'm using python 3.8 and openpyxl 3.0.0 under Windows 10. Here's the code:
#! python3
''' Copy specific rows from a spreadsheet and write them to a
temporary spreadsheet.
Most of this was copied from
https://stackoverflow.com/questions/44593705/
how-to-copy-over-an-excel-sheet-to-another-workbook-in-python'''
import logging
import os
import tkinter as tk
from tkinter import filedialog
import openpyxl
from openpyxl.utils import column_index_from_string, get_column_letter
logging.basicConfig(level=logging.DEBUG, \
format=' %(asctime)s - %(levelname)s- %(message)s')
logging.disable(logging.DEBUG)
logging.info('=== STARTING ===')
root = tk.Tk()
root.withdraw() # Hide the Tk window.
# Get the source spreadsheet.
src_file = filedialog.askopenfilename()
logging.debug(src_file)
src_file_path = os.path.split(src_file)
logging.debug('Path = ' + src_file_path[0] + '; File = ' + src_file_path[1])
# Open the source spreadsheet.
src_wb = openpyxl.load_workbook(src_file, read_only=True)
src_ws = src_wb.worksheets[0]
# CONSTANTS
KEYDATE_TARGET = "PM's ESTIM."
MANAGER_TARGET = '055'
SRC_CONTRACT_COL = 'D'
SRC_PROJ_COL = 'F'
SRC_KEYDATE_COL = 'E'
SRC_MAX_ROWS = src_ws.max_row
SRC_MGR_COL = 'C'
SRC_SUBJ_COL = 'J'
# Initializations
src_row_num = 7274
temp_row_num = 0
src_col = 3
for src_row in src_ws.iter_rows(min_row=src_row_num, max_row = 9001):
# Build cell coordinates
src_row_num += 1
src_cell_coord = SRC_MGR_COL + str(src_row_num)
logging.debug(src_cell_coord)
src_cell = src_ws[src_cell_coord]
if src_cell.value == MANAGER_TARGET or src_cell.value == '053':
logging.info('Found manager ' + str(src_cell.value))
# Save row number
mgr_row = src_cell.row
keydate_coord = SRC_KEYDATE_COL + str(mgr_row)
keydate_cell = src_ws[keydate_coord]
if KEYDATE_TARGET in keydate_cell.value:
logging.info('keydate_coord = ' + str(keydate_coord))
logging.info('=== DONE ===')