I've got this function that takes in the HTML code of Wikipedia pages. It takes forever to run — any ideas how to make it more efficient?
def wikipedia_distribution(file, clean=True):
    """Compute per-page text statistics from a bz2-compressed Wikipedia HTML dump.

    Args:
        file: Path to a bz2-compressed file containing the HTML of Wikipedia pages.
        clean: If True, extract visible text via ``get_text()`` and drop empty
            lines; if False, keep the raw HTML markup of each page.

    Returns:
        A tuple ``(char_list, lines_list, lines_lens)`` where
        ``char_list[i]`` is the total character count of page ``i``,
        ``lines_list[i]`` is the number of lines of page ``i``, and
        ``lines_lens`` is the flat list of every line length across all pages.
    """
    with bz2.open(file, mode='rt', encoding="utf8") as source_file:
        data = source_file.read()

    # 'html.parser' is the stdlib parser; the original 'html' is not a valid
    # parser name. (If lxml is installed, BeautifulSoup(data, 'lxml') is the
    # single biggest speedup available here.)
    soup = BeautifulSoup(data, 'html.parser')

    # NOTE(review): the original referenced an undefined `pages`. Wikipedia
    # dumps wrap each article in a <page> element — confirm against the data.
    pages = soup.find_all('page')

    if clean:
        # Visible text only, with empty lines dropped.
        pages_text = [
            [line for line in page.get_text().split('\n') if line]
            for page in pages
        ]
    else:
        pages_text = [str(page).split('\n') for page in pages]

    # Sum line lengths directly instead of len("".join(page)): avoids
    # materializing one huge throwaway string per page.
    char_list = [sum(len(line) for line in page) for page in pages_text]
    # Fixed: the original read undefined `pages_text_clean` (NameError).
    lines_list = [len(page) for page in pages_text]
    lines_lens = [len(line) for page in pages_text for line in page]
    return char_list, lines_list, lines_lens