I tried to follow what was suggested here. But it seems that only a few columns get effected when I tried either of the following functions:
def get_col_widths(dataframe):
# First we find the maximum length of the index column
idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
# Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
return [idx_max] + [max([len(str(s)) for s in dataframe[col].values] + [len(col)]) for col in dataframe.columns]
for i, width in enumerate(get_col_widths(df)):
worksheet.set_column(i, i, width)
def set_column_width(df):
length_list = [len(x) for x in df.columns]
for i, width in enumerate(length_list):
worksheet.set_column(i, i, width)
I may not be using the functions above correctly. Thus here is my entire code:
import openpyxl
import sql_utils as sql
from datetime import date
from datetime import timedelta
from dateutil.relativedelta import relativedelta
from pandas.tseries.holiday import USFederalHolidayCalendar
import pyodbc
import pandas as pd
import datetime
import numpy as np
import xlsxwriter
from openpyxl import load_workbook
import sys
import win32com.client as win32
from win32com.client import Dispatch
cal = USFederalHolidayCalendar()
# pylint: disable=no-member
cnxn = pyodbc.connect(sql.connection_string)
# Set dates for the queries
cutoff_date = datetime.date(2019, 6, 30)
output_file = ".\\pa-jpm-reporting\\output\\jpm_morgans_report.xlsx"
if len(sys.argv) > 1:
cutoff_date = datetime.datetime.strptime(sys.argv[1], '%Y-%m-%d').date()
output_file = sys.argv[2]
writer = pd.ExcelWriter(output_file)
def get_first_business_day_of_month(start_date, end_date):
return [
get_business_day(d).date()
for d in pd.date_range(start_date, end_date, freq="BMS")
]
def get_business_day(date):
while date.isoweekday() > 5 or date in cal.holidays():
date += timedelta(days=1)
return date
# Get as_of_date and archive_date
archive_date = get_first_business_day_of_month(cutoff_date, cutoff_date + relativedelta(days=+10))[0]
as_of_date = datetime.date(archive_date.year, 1, 1)
# Pull date
def get_sql(cutoff_date, archive_date, as_of_date):
return sql.jpm_rate_query.format(cutoff_date, archive_date, as_of_date)
def get_sql2(utoff_date, archive_date, as_of_date):
return sql.jpm_query.format(cutoff_date, archive_date, as_of_date)
# Get data into a pandas dataframe
def get_dataframe(cutoff_date, archive_date, as_of_date):
cnxn.execute(get_sql(cutoff_date, archive_date, as_of_date))
data = pd.read_sql(get_sql2(cutoff_date, archive_date, as_of_date), cnxn)
return data
df = get_dataframe(cutoff_date, archive_date, as_of_date)
df['ModEffectiveDate'] = pd.to_datetime(df['ModEffectiveDate'])
df['StepDate1'] = pd.to_datetime(df['StepDate1'])
# Fix step date
def date_check(date1, date2):
if date1 > date2:
return 'X'
else:
return ' '
df['Step date prior to mod'] = df.apply(lambda x: date_check(x['ModEffectiveDate'], x['StepDate1']), axis=1)
# Fix step check
cols = df.filter(regex='StepRate').columns
df['Later Step Rate not higher than previous'] = ' '
for i, col in enumerate(cols):
if i <= len(cols) - 2:
df['Later Step Rate not higher than previous'] = np.where(df[col] > df[cols[i+1]],'X',df['Later Step Rate not higher than previous'])
else:
break
# Format the Dates
df['CutoffDate'].astype('datetime64[ns]')
df['CutoffDate'] = pd.to_datetime(df['CutoffDate']).dt.strftime("%m/%d/%Y")
df['ModEffectiveDate'].astype('datetime64[ns]')
df['ModEffectiveDate'] = pd.to_datetime(df['ModEffectiveDate']).dt.strftime("%m/%d/%Y")
df['StepDate1'].astype('datetime64[ns]')
df['StepDate1'] = pd.to_datetime(df['StepDate1']).dt.strftime("%m/%d/%Y")
df = df.replace('NaT', '', regex=True)
# clean up (remove) zero step rates that follow the last true step
cols = df.filter(regex='StepRate').columns
for i, col in enumerate(cols):
df[col] = df[col].replace(0, '', regex=True)
# remove repeated loan number shown immediately before the step dates
df = df.drop(['loanid'], axis=1)
# Add color to column
def highlight(s):
same = s == df['CutoffDate']
return ['background-color: lightblue' if x else '' for x in same]
df.style.highlight_null(null_color='green')
df.to_excel(writer, index=False)
workbook = writer.book
worksheet = writer.sheets['Sheet1']
# Freeze 1st row
worksheet.freeze_panes(1,0)
# Place dollar amounts where relevant
num_format = workbook.add_format({'num_format': '#,##0.00'})
# Find the columns (numbers) with float64 type and set the number format
nametypemap = df.dtypes.apply(lambda x: x.name).to_dict()
for i, (k, v) in enumerate(nametypemap.items()):
# print("index: {}, key: {}, value: {}".format(i, k, v))
if v == 'float64':
worksheet.set_column(i, i, 12, num_format)
def get_col_widths(dataframe):
# First we find the maximum length of the index column
idx_max = max([len(str(s)) for s in dataframe.index.values] + [len(str(dataframe.index.name))])
# Then, we concatenate this to the max of the lengths of column name and its values for each column, left to right
return [idx_max] + [max([len(str(s)) for s in dataframe[col].values] + [len(col)]) for col in dataframe.columns]
for i, width in enumerate(get_col_widths(df)):
worksheet.set_column(i, i, width)
def set_column_width(df):
length_list = [len(x) for x in df.columns]
for i, width in enumerate(length_list):
worksheet.set_column(i, i, width)
set_column_width(df)
writer.save()
Please let me know if there is a way of doing this. I am also having a hard time trying to highlight a column in the dataframe. But I will make another separate post for that.