I tried many functions and tried to apply existing solutions to get the output I want, yet I seems not to be able to get an excel output at the end that keeps the formatting I try to apply.
It seems that all the function existing in pandas uses only identically labelled indexes, or files of the same shape, in my situation the shape of the two files are (757,26) for let's say file1 and (688,39) for file 2, the first 26 columns are labelled the same way for file1 and file2.
is there a way to merge these two files, highlight the differences as indicated in the title, and create an excel output with the formatting still present?
Here is what I tried:
import pandas as pd
import numpy as np
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook
import pandas.io.formats.style as style
dfcurr=pd.read_excel(r'IPH4201P2 - DRATracker_Current.xlsx')
dfprev=pd.read_excel(r'IPH4201P2 - DRATracker_Previous.xlsx')
dfprev=dfprev.loc[(dfprev['Subject'].str.contains('M'))|(dfprev['Subject'].str.contains('S'))]
dfprev=dfprev.reset_index()
dfprev=dfprev.drop(columns='index')
df_diff=pd.merge(dfcurr,dfprev,how='left',indicator=True)
common_columns = df_diff.columns.intersection(dfprev.columns)
compare_df = df_diff[common_columns].eq(dfprev[common_columns])
compare_df.to_excel('comp.xlsx')
# Convert dataframe to string
df_diff = df_diff.astype(str)
def highlight_diff(data, compare):
if type(data) != pd.DataFrame:
data = pd.DataFrame(data)
if type(compare) != pd.DataFrame:
compare = pd.DataFrame(compare)
result = []
for col in data.columns:
if col in compare.columns and (data[col] != compare[col]).any():
result.append('background-color: #DAEEF3')
elif col not in compare.columns:
result.append('background-color: #E4DFEC')
else:
result.append('background-color: white')
return result
# Create a new workbook and add a new worksheet
wb = Workbook()
ws = wb.active
# Write the dataframe to the worksheet
for r in dataframe_to_rows(df_diff.style.apply(highlight_diff, compare=compare_df).data, index=False, header=True):
ws.append(r)
# Save the workbook
wb.save('Merged_style.xlsx')
However, I do not get an output with the style applied; no cells are highlighted in the color I want them to be highlighted in.
Edit: I tried a different approach to highlight the cells in the excel, the function used for this approach comes from here:
import pandas as pd
import numpy as np
import openpyxl
import pandas.io.formats.style as style
dfcurr=pd.read_excel(r'IPH4201P2 - DRATracker_Current.xlsx')
dfprev=pd.read_excel(r'IPH4201P2 - DRATracker_Previous.xlsx')
dfprev=dfprev.loc[(dfprev['Subject'].str.contains('M'))|(dfprev['Subject'].str.contains('S'))]
dfprev=dfprev.reset_index()
dfprev=dfprev.drop(columns='index')
new='background-color: #DAEEF3'
change='background-color: #E4DFEC'
df_diff=pd.merge(dfcurr,dfprev,on=['Subject','Visit','Visit Date','Site\nID','Cohort','Pathology','Clinical\nStage At\nScreening','TNMBA at\nScreening'],how='left',indicator=True)
for col in df_diff.columns:
if '_y' in col:
del df_diff[col]
elif 'Unnamed: 1' in col:
del df_diff[col]
elif '_x' in col:
df_diff.columns=df_diff.columns.str.rstrip('_x')
def highlight_diff(data, other, color='#DAEEF3'):
# Define html attribute
attr = 'background-color: {}'.format(color)
# Where data != other set attribute
return pd.DataFrame(np.where(data.ne(other), attr, ''),
index=data.index, columns=data.columns)
# Set axis=None so it passes the entire frame
df_diff=df_diff.style.apply(highlight_diff, axis=None, other=dfprev)
print(type(df_diff))
df_diff.to_excel('Diff.xlsx',engine='openpyxl',index=0)
This new method provides me with an excel file where the style is applied, how can I update it to apply the color #DAEEF3 to rows in df_diff where if the Subject, Visit and Visit Date are not present in the dataframe dfprev, and apply the color #E4DFEC to cells that differs between the two files for matching Subject, Visit and Visit Date?