I am trying to create an Excel sheet with a large amount of data and save it to disk. I use the pyexcelerate module because it is faster than the alternatives, and I also apply some styling, so converting to CSV is not an option for me. Because the sheet is large, pyexcelerate internally hits this ZIP64 file-limit error. From what I have learned, the limit is 2 GB; however, I estimate the file in my case should not come to more than 500 MB, so I am wondering why I am getting this error at all.
Is there any workaround to overcome this limitation?
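From what I can tell, the check itself comes from Python's zipfile module rather than from pyexcelerate. This standalone sketch (file names arbitrary) reproduces the same RuntimeError once a single member crosses 2 GiB of uncompressed data:

import zipfile

# A member opened for streaming gets 32-bit size headers unless force_zip64=True
# is passed, so crossing 2 GiB of *uncompressed* data raises the same
# "File size unexpectedly exceeded ZIP64 limit" RuntimeError when the handle
# closes, even though the archive itself allows ZIP64. pyexcelerate opens the
# member internally, so I don't see a way to pass that flag through myself.
with zipfile.ZipFile('demo.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    with zf.open('sheet1.xml', mode='w') as member:
        chunk = b'<row></row>' * (8 * 1024 * 1024)   # ~88 MiB of repetitive pseudo-XML
        for _ in range(25):                          # ~2.2 GiB uncompressed in total
            member.write(chunk)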
My actual code is, roughly, this:
import math
import pandas as pd
import pyexcelerate

# sheet (the base sheet name), total_row (summary-row flag), hex_to_rgb and
# excel_file (the output path) are defined elsewhere in my code.
dfM = pd.read_csv('largefile.csv')
df_format = pd.read_csv('format.csv')

# Remove duplicate columns from the dataframe, if any.
dfM = dfM.loc[:, ~dfM.columns.duplicated()]
totRows = len(dfM.index)
totCols = len(dfM.columns)

# Split into multiple sheets if more than 100K rows
rows_per_sheet = 100000
number_of_sheets = math.ceil(totRows / rows_per_sheet)
start_index = 0
end_index = rows_per_sheet

workbook = pyexcelerate.Workbook()  # Creating a new workbook
for sheet_num in range(number_of_sheets):
    df = dfM.iloc[start_index:end_index]  # Splitting based on index
    sheet_nme = sheet + '_' + str(sheet_num + 1)  # Adding the sheet number to the sheet name
    numRows = len(df.index)
    # Preparing the excel data
    excel = [df.columns] + list(df.values)
    worksheet = workbook.new_sheet(sheet_nme, data=excel)  # Creating a new sheet
    # df_format --> ['C_Header', 'P_Color', 'P_Color_Val', 'P_Format']
    df_format = df_format[df_format.C_Header.isin(df.columns.to_list())]
    # loop1: iterate through the columns to apply the column formats
    for i in range(totCols):
        format_obj = pyexcelerate.Format(df_format.iloc[i, 3])
        if df_format.iloc[i, 1] == 'E' and df_format.iloc[i, 2] > ' ':
            r, g, b = hex_to_rgb(df_format.iloc[i, 2])
            fill_obj = pyexcelerate.Fill(background=pyexcelerate.Color(r, g, b))
            worksheet.set_col_style(i + 1, pyexcelerate.Style(fill=fill_obj, format=format_obj))
        else:
            worksheet.set_col_style(i + 1, pyexcelerate.Style(format=format_obj))
    # loop2: iterate through the columns to style the header row alone
    for i in range(totCols):
        font_obj = pyexcelerate.Font(bold=True)
        if df_format.iloc[i, 2] > ' ':  # Excel cells start from (1,1), the dataframe from (0,0)
            r, g, b = hex_to_rgb(df_format.iloc[i, 2])
            fill_obj = pyexcelerate.Fill(background=pyexcelerate.Color(r, g, b))
            worksheet.set_cell_style(1, i + 1, pyexcelerate.Style(fill=fill_obj, font=font_obj))
        else:
            worksheet.set_cell_style(1, i + 1, pyexcelerate.Style(font=font_obj))
    # loop3: highlight the summary row, if any
    if total_row:  # if there is a summary row
        for i in range(totCols):
            format_obj = pyexcelerate.Format(df_format.iloc[i, 3])
            font_obj = pyexcelerate.Font(bold=True, color=pyexcelerate.Color(255, 0, 0))
            fill_obj = pyexcelerate.Fill(background=pyexcelerate.Color(255, 255, 0))
            worksheet.set_cell_style(numRows + 1, i + 1, pyexcelerate.Style(fill=fill_obj, format=format_obj, font=font_obj))
    start_index = end_index
    end_index = end_index + rows_per_sheet
    df = ''  # clearing memory
workbook.save(excel_file)  # Saving the file
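(For reference, hex_to_rgb used above just converts a hex color string such as 'FF0000' into an (r, g, b) tuple; a minimal sketch of such a helper:)

def hex_to_rgb(value):
    # Minimal sketch of the hex-to-RGB helper used above; my real version may
    # differ slightly (e.g. in how it handles a leading '#').
    value = value.lstrip('#')
    return tuple(int(value[i:i + 2], 16) for i in (0, 2, 4))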
At the final workbook.save call, it throws this error:
Traceback (most recent call last):
File "/usr/local/lib64/python3.6/site-packages/pyexcelerate/Writer.py", line 82, in save
f.write(s.encode("utf-8"))
File "/usr/lib64/python3.6/zipfile.py", line 1043, in close
raise RuntimeError('File size unexpectedly exceeded ZIP64 '
RuntimeError: File size unexpectedly exceeded ZIP64 limit
I tried using a BytesIO buffer as well, but no luck:
from io import BytesIO
buff = BytesIO()  # BytesIO is used to try to avoid the zip archive writing error for large files
workbook.save(buff)  # Saving the file to a BytesIO buffer in memory
with open(excel_file, "wb") as f:
    f.write(buff.getvalue())
It doesn't matter whether I save to disk or to memory; it gives the error either way. Below is the full error message. (FYI: I use Redis RQ for task serialization.)
Traceback (most recent call last):
File "/usr/local/lib64/python3.6/site-packages/pyexcelerate/Writer.py", line 82, in save
f.write(s.encode("utf-8"))
File "/usr/lib64/python3.6/zipfile.py", line 1043, in close
raise RuntimeError('File size unexpectedly exceeded ZIP64 '
RuntimeError: File size unexpectedly exceeded ZIP64 limit
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/rq/worker.py", line 1061, in perform_job
rv = job.perform()
File "/usr/local/lib/python3.6/site-packages/rq/job.py", line 821, in perform
self._result = self._execute()
File "/usr/local/lib/python3.6/site-packages/rq/job.py", line 844, in _execute
result = self.func(*self.args, **self.kwargs)
File "./app.py", line 763, in fulfillRequest
wb.save(xlsx_file) # Saving the file to bytesio memory
File "/usr/local/lib64/python3.6/site-packages/pyexcelerate/Workbook.py", line 105, in save
self._save(filename_or_filehandle)
File "/usr/local/lib64/python3.6/site-packages/pyexcelerate/Workbook.py", line 97, in _save
self._writer.save(file_handle)
File "/usr/local/lib64/python3.6/site-packages/pyexcelerate/Writer.py", line 89, in save
zf.write(tfn, "xl/worksheets/sheet%s.xml" % (index))
File "/usr/lib64/python3.6/zipfile.py", line 1614, in write
"Can't write to ZIP archive while an open writing handle exists"
ValueError: Can't write to ZIP archive while an open writing handle exists
As per Jan's suggestion, I tried logging the lengths of the s.encode("utf-8") chunks, and the total comes to more than 3 GB. Is there a way I can reduce this size? This file is failing with the file-limit issue. Or is there any other way out?
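This is roughly how I counted (patched around the write loop in pyexcelerate/Writer.py that the traceback points at; sheet_stream and sheet_index here are approximate stand-ins for pyexcelerate's internal names):

stream_count = 0
total_len = 0
for s in sheet_stream:                     # each rendered chunk of sheet XML
    stream_count += 1
    total_len += len(s.encode("utf-8"))    # bytes that would go into the zip member
print("sheet=%d : Number of sheetStream=%d : Total length=%d"
      % (sheet_index, stream_count, total_len))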
Dataframe shape: (163165, 305)
Dataframe size : 3.4339 GB
sheet=1 : Number of sheetStream=26117755 : Total length=2687100303
sheet=2 : Number of sheetStream=844 : Total length=43625
sheet=3 : Number of sheetStream=328447 : Total length=31528899
sheet=4 : Number of sheetStream=12172832 : Total length=997473023
Number of sheet = 4
Total Number of sheetStream = 38619878
Grand Total length = 3716145850 (3.4609 GB)
One more thing to mention: even though the size in memory is huge, the xlsx written to disk is significantly smaller. For example, the case below runs successfully and creates a 125 MB xlsx file.
Dataframe shape: (116123, 305)
Dataframe size : 2.4296 GB
sheet=1 : Number of sheetStream=17904832 : Total length=1840807057
sheet=2 : Number of sheetStream=844 : Total length=43625
sheet=3 : Number of sheetStream=314923 : Total length=30162834
sheet=4 : Number of sheetStream=9091957 : Total length=744388260
Number of sheet = 4
Total Number of sheetStream = 27312556
Grand Total length = 2615401776 (2.4357 GB)
So the point is: even though zipfile treats it as a huge file, as an xlsx on disk it is a small one.
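My guess is that the limit is checked against the uncompressed size of each sheet's XML, while the file on disk is what remains after deflate compression. A quick zlib check shows how well this kind of repetitive XML compresses, which would explain 3.4 GB of XML turning into a ~125 MB xlsx:

import zlib

# Repetitive sheet XML deflates extremely well; the exact ratio depends on the
# data, but two orders of magnitude is plausible for cells like this.
sample = b'<c r="A1" s="1"><v>12345.67</v></c>' * 1000000
print(len(sample), '->', len(zlib.compress(sample, 6)))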
As per Charlie's suggestion, I tried again with openpyxl, but it is awfully slow, and it failed with a serialization error (lxml.etree._raiseSerialisationError / lxml.etree.SerialisationError: IO_WRITE) after about 43% of the processing, i.e. before even saving the workbook.
Here is the code I tried:
import math
from io import BytesIO

from openpyxl import Workbook, LXML
from openpyxl.cell import WriteOnlyCell
from openpyxl.styles import Font, PatternFill
from openpyxl.utils.dataframe import dataframe_to_rows

# dfM, df_format, sheet, total_row and excel_file are prepared the same way as above.
# Remove duplicate columns from the dataframe, if any.
dfM = dfM.loc[:, ~dfM.columns.duplicated()]
totRows = len(dfM.index)
totCols = len(dfM.columns)

# Split into multiple sheets if more than 100K rows
rows_per_sheet = 100000
number_of_sheets = math.ceil(totRows / rows_per_sheet)
if number_of_sheets == 0:
    number_of_sheets = 1  # At least one sheet should be written, even if empty.
start_index = 0
end_index = rows_per_sheet

assert LXML is True  # lxml is required for fast write-only mode
wb = Workbook(write_only=True)  # Creating the openpyxl workbook
for sheet_num in range(number_of_sheets):
    df = dfM.iloc[start_index:end_index]  # Splitting based on index
    sheet_nme = sheet + '_' + str(sheet_num + 1)  # Adding the sheet number to the sheet name
    num_rows = len(df.index)
    worksheet = wb.create_sheet(sheet_nme)
    # df_format --> ['C_Header', 'P_Color', 'P_Color_Val', 'P_Format']
    df_format = df_format[df_format.C_Header.isin(df.columns.to_list())]
    # Writing the dataframe to excel row by row
    row_num = -1  # To start indexing from zero
    for row in dataframe_to_rows(df, index=False, header=True):
        row_num += 1
        col_num = -1  # To start indexing from zero
        styled_row = []
        # Styling cell by cell
        for value in row:
            col_num += 1
            cell = WriteOnlyCell(worksheet, value=value)
            if row_num == 0:  # Logic only for the header row
                cell.font = Font(bold=True)
                if df_format.iloc[col_num, 2] > ' ':  # coloring for the header
                    colorFill = PatternFill(start_color=df_format.iloc[col_num, 2], end_color=df_format.iloc[col_num, 2], fill_type='solid')
                    cell.fill = colorFill
            if row_num > 0:  # Logic for the other rows
                if df_format.iloc[col_num, 1] == 'E':
                    cell.number_format = df_format.iloc[col_num, 3]  # Number format for the cells
                if df_format.iloc[col_num, 2] > ' ':  # coloring for the other cells
                    colorFill = PatternFill(start_color=df_format.iloc[col_num, 2], end_color=df_format.iloc[col_num, 2], fill_type='solid')
                    cell.fill = colorFill
            if total_row and row_num == num_rows:  # Logic for the summary row
                cell.number_format = df_format.iloc[col_num, 3]
                cell.font = Font(bold=True, color='FF0000')
                colorFill = PatternFill(start_color='FFFF00', end_color='FFFF00', fill_type='solid')
                cell.fill = colorFill
            styled_row.append(cell)
        worksheet.append(styled_row)
    start_index = end_index
    end_index = end_index + rows_per_sheet
    df = ''  # clearing memory

# Clearing dataframes that are no longer needed
dfM = ''  # clearing memory
df_format = ''  # clearing memory

# Finally writing the workbook
buff = BytesIO()  # BytesIO is used to try to avoid the zip archive writing error for large files
wb.save(buff)  # Saving the file to a BytesIO buffer in memory
wb.close()
with open(excel_file, "wb") as f:
    f.write(buff.getvalue())
Any other suggestions?