I am trying to convert html to text. Upon inspection, there are "amp;" and other non text items being converted. I used replace () to try to manually remove these non text items, but it does not seems to work.
Here are my codes:
import csv
from datetime import datetime, timedelta
import glob
import pandas as pd
from bs4 import BeautifulSoup
import lxml.html
import lxml.html.clean
filenames = glob.glob('C:\daily\d??????e.htm')
print(" ")
print(" Combine Daily...")
print(" ")
with open('combine.txt', 'w', newline='', encoding='utf-8-sig') as g:
writer = csv.writer(g)
for i in filenames:
mylines = [] # Declare an empty list named mylines.
print(i)
with open(i,'rt') as myfile: # Open txt for reading text data.
myfile = myfile.string.replace("amp;", "")
myfile = myfile.string.replace("</font></pre><pre><font size='1'>", "")
with open('combine.txt', 'a', newline='', encoding='utf-8-sig') as g:
writer = csv.writer(g)
writer.writerow([myfile])