Updated answer:
After some discussion in the comments below, my original answer isn't going to cut it.
The structure of the file `Test.csv` is not something that `DictReader` from the `csv` module can parse. This is easily solved by writing a simple, purpose-built file parser.
The part below the two functions has not changed much: instead of parsing the results of `DictReader` from the `csv` module, we parse the results of the function `readcsv`.
Updated code:
import html2text
# One shared HTML-to-text converter, reused for every file converted below.
h = html2text.HTML2Text()
# Per html2text's options: leave link formatting out of the converted text.
h.ignore_links = True
# Per html2text's options: False means tables are converted rather than
# passed through as raw HTML (NOTE(review): confirm against html2text docs).
h.bypass_tables = False
def cleanline(instring: str) -> list:
    """
    Strip the table decoration from one line and split it into fields.

    Every '|' character and every 'file:///' prefix is removed, the line is
    split on ',' and each resulting field is stripped of surrounding
    whitespace.

    :param instring: one raw line read from the input file
    :return: list of cleaned field strings (``['']`` for an empty line)
    """
    undecorated = instring.replace('|', '').replace('file:///', '')
    # Strip per field, not just the line as a whole: otherwise "a, b" yields
    # " b", and a header such as " FilePath" would not match the key
    # 'FilePath' used when the rows are consumed.
    return [field.strip() for field in undecorated.split(',')]
def readcsv(filename: str) -> list:
    """
    Read the table-like CSV file and return its rows as a list of dicts.

    The result is similar to what DictReader from the csv module produces,
    but tailored to the specific file formatting being processed: the first
    line holds the headers, the second line is discarded unconditionally
    (it is expected to be the "| -------- |" separator), and every later
    line that contains a comma becomes one dict keyed by the headers.

    :param filename: path of the file to read
    :return: list of dicts, one per data line
    """
    result = []
    with open(filename) as csv_infile:
        # get headers & clean the line
        header_list = cleanline(csv_infile.readline())
        # skip the line "| -------- |" by reading it and not processing it
        csv_infile.readline()
        # process the rest of the lines
        for line in csv_infile:
            # lines without the comma separator are empty / decoration only
            if ',' not in line:
                continue
            # Pair headers with values via zip: a row with more fields than
            # the header no longer raises IndexError (extras are dropped),
            # and a shorter row simply yields fewer keys, as before.
            result.append(dict(zip(header_list, cleanline(line))))
    return result
# Convert every HTML file listed in Test.csv to a text file.
for row in readcsv('Test.csv'):
    print(row)
    infilename = row['FilePath']
    # create a filename based on the File column
    outfilename = f"{row['File']}.txt"
    with open(infilename) as html_infile:
        # handle() needs the document as one string; readlines() would hand
        # it a list of lines and fail inside html2text.
        text = h.handle(html_infile.read())
    with open(outfilename, 'w') as html_outfile:
        html_outfile.write(text)
Original answer:
You are missing the final part, taken from the docs.
Note the change in the assignment to the content variable, from `content = r.content` to `content = r.text`.
I also added a print statement so you can see the difference between `content` and `text`.
from csv import DictReader
import requests
import html2text
# Build the converter once; its settings do not change between rows.
h = html2text.HTML2Text()
h.ignore_links = True
h.bypass_tables = False

# newline='' is what the csv module documentation recommends for files
# handed to its readers.
with open('Test.csv', 'r', newline='') as read_obj:
    csv_dict_reader = DictReader(read_obj)
    for row in csv_dict_reader:
        r = requests.get(row['FilePath'])
        # r.text is the decoded body (str); r.content would be raw bytes
        content = r.text
        print(content)
        text = h.handle(content)
        print(text)
        # save the converted text to a file
        # for the filename I'm using the url, with some stripping
        # (this naming scheme is untested - verify it before relying on it)
        with open(row['FilePath'].replace('/', '_').replace(':', ''), 'w') as outfile:
            outfile.write(text)
The above answer was based on the false assumption that HTTP requests were being performed. Below is the adjusted answer after the discussion in the comments.
from csv import DictReader
import html2text
# One converter instance, reused for every row.
h = html2text.HTML2Text()
h.ignore_links = True
h.bypass_tables = False

# newline='' is what the csv module documentation recommends for files
# handed to its readers.
with open('Test.csv', 'r', newline='') as read_obj:
    csv_dict_reader = DictReader(read_obj)
    for row in csv_dict_reader:
        infilename = row['FilePath']
        outfilename = row['File']  # I'm using this as it seems this column is what this is meant for
        with open(infilename) as html_infile:
            # handle() needs the document as one string; readlines() would
            # hand it a list of lines and fail inside html2text.
            text = h.handle(html_infile.read())
        with open(outfilename, 'w') as html_outfile:
            html_outfile.write(text)