My Python process that processes multiple pages on one website crashes on the line:
soup = BeautifulSoup(cleaned_html, "lxml")
Moreover, it crashes on a different page each time.
I use Python 2.7, bs4 0.0.1, and lxml 3.6.0.
Could you please help me? Thanks in advance!
My code:
def clean_html(self, html, document_format):
    """Clean and rearrange HTML and return a BeautifulSoup object.

    Args:
        html: Raw HTML string of the page.
        document_format: Source format of the record; 'abbyy' and 'sec'
            receive format-specific tag adjustments and are parsed with
            "html5lib", everything else is parsed with "lxml".

    Returns:
        A cleaned BeautifulSoup object, or None if the cleaned HTML
        cannot be parsed at all.
    """
    cleaned_html = html
    # Remove all unimportant tags, except for the ones used by Abbyy
    cleaned_html = self.remove_unimportant_tags_except_for_p_b_font_a(cleaned_html)
    # Replace HTML entities such as "&nbsp;" with plain characters
    cleaned_html = self.replace_html_symbols(cleaned_html)
    # Remove extra spaces
    cleaned_html = self.remove_extra_space(cleaned_html)
    # Adjust html for the files from Abbyy or SEC
    if document_format == 'abbyy':
        logger.info("Record is made by Abbyy")
        cleaned_html = self.adjust_abbyy_tags(cleaned_html)
    elif document_format == 'sec':
        logger.info("Record is a SEC document")
        cleaned_html = self.adjust_sec_tags(cleaned_html)
    # Remove the unimportant tags used by Abbyy
    cleaned_html = self.remove_p_b_font_a(cleaned_html)
    # Remove extra spaces introduced by the tag removal
    cleaned_html = self.remove_extra_space(cleaned_html)
    logger.info("HTML is cleaned before making soup")
    # Parser choice does not raise, so keep it outside the try block;
    # only the actual parse can fail.
    parser = "html5lib" if document_format in ("abbyy", "sec") else "lxml"
    try:
        soup = BeautifulSoup(cleaned_html, parser)
    except Exception as e:
        logger.warning("Beautiful soup cannot be made out of this page: {}".format(str(e)))
        return None
    logger.info("Soup is made")
    # Remove unwanted tag containers together with their content.
    # (A plain loop instead of a side-effect list comprehension, and one
    # pass over the tag names instead of eleven near-identical lines.)
    for tag_name in ('script', 'style', 'del', 's', 'strike', 'base',
                     'basefont', 'noscript', 'applet', 'embed', 'object'):
        for tag in soup(tag_name):
            tag.extract()
    logger.info("Soup is cleaned")
    return soup
If I do not specify "lxml", I get the following notification:
C:\Users\EERMIL~1\AppData\Local\Temp\2\_MEI38~1\bs4\__init__.py:166: UserWarning
: No parser was explicitly specified, so I'm using the best available HTML parse
r for this system ("lxml"). This usually isn't a problem, but if you run this co
de on another system, or in a different virtual environment, it may use a differ
ent parser and behave differently.
To get rid of this warning, change this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
If I use "html5lib" instead of "lxml", the Python process does not crash, but I cannot get all of the text out of the HTML page. Namely, I get the following error (which I catch, as you can see in the code below):
'NoneType' object has no attribute 'next_element'
when I execute the following code:
for child in soup.children:
# If it is an irregular tag, skip it
if str(type(child)) == "<class 'bs4.element.Tag'>":
# If name has strange symbols, skip it
if re.search('[^a-z0-9]', child.name):
continue
# If there is no text inside, skip it
try:
if not re.search('(\w|\d)', child.get_text()):
continue
except Exception as e:
logger.warning("Unexpected exception in getting text from tag {}: {}".format(str(child), str(e)))
continue