Problem
My problem is that I want to extract tables from several PDFs. I can get the data out. Only the writing into a csv file does not work.
I get this out: as it should not be
How I want it to look How it should look
i am Importing pdfminer, os and pandas
My Code
path='My_Path'
df_results = pd.DataFrame()
for file_name in os.listdir(path): #Loop on Files
print(file_name)
fp = open(path + file_name, 'rb')
rsrcmgr = PDFResourceManager()
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
pages = PDFPage.get_pages(fp)
for page in pages:
print('Processing next page...')
interpreter.process_page(page)
layout = device.get_result()
for lobj in layout:
if isinstance(lobj, LTTextBox):
x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
#print('At %r is text: %s' % ((x, y), text))
#data= pd.Series(text)
if x==50.520000749999994 and y==200.30424779999996: #x and y from console from print from line 39
collected_data = [text]
data_list = collected_data
#data = pd.Series(data_list)
print(data_list)
data= pd.DataFrame([data_list], columns=list('c'), )
df_results = df_results.append(data,ignore_index=False)
if x==405.599991 and y==187.82423730000002: #x and y from console from print from line 39
collected_data = [text]
data_list = collected_data
#data = pd.Series(data_list)
print(data_list)
data= pd.DataFrame([data_list], columns=list('d'), )
df_results = df_results.append(data,ignore_index=False)
if x==562.4399872500001 and y==187.82423730000002: #x and y from console from print from line 39
collected_data = [text]
data_list = collected_data
#data = pd.Series(data_list)
print(data_list)
data= pd.DataFrame([data_list], columns=list('f'), )
df_results = df_results.append(data,ignore_index=False)
#print(collected_data)
print(df_results)
df_results.to_csv('coordinates_data.csv', index = False, sep=';', )