I'm using BeautifulSoup to parse names from multiple HTML files, some files contain celebrities' names in other languages like Spanish and occasionally have accents.
I've tried using 2 functions to strip the accents that seem to be working properly ('Jesús' -> 'Jesus') but when I start calling the function with data gathered with beautifulsoup I don't get the same result ('Jesús' -> 'JesAos')
my code:
def strip_accents_1(text):
text = unicodedata.normalize('NFD', text)\
.encode('ascii', 'ignore')\
.decode("utf-8")
return str(text)
def strip_accents_2(text):
return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
# This works correctly
print(strip_accents_1('Jesús'))
def html_imp(h):
soup = BeautifulSoup(open(h), features = "lxml")
tdTags = []
values =[]
sort = []
for i in range(4,40):
for i in soup.find_all('td'):
tdTags.append(i.text)
for i in [7,25,9,15,5,11]:
values.append(tdTags[i])
# Original name with accent
sort.append(values[3])
# Strip accents
sort.append(strip_accents_1(values[3]))
sort.append(strip_accents_2(values[3]))
print(sort)
return sort
Output:
Jesus
['Jesús', 'JesAs', 'JesAos']
HTML fragment :
<TD WIDTH="80" ALIGN="center">Jesús</TD>
What's keeping the strip_accents functions from working while handling the HTML fragment?