I would like to translate the text of a column of my dataframe, the goal is to harmonize the data. I have text in Chinese, English, French, German, Spanish etc... I want to have all the text in English.
I have tried several things: with the googletrans API
1)naively try to do it
from googletrans import Translator
translator = Translator()
df["translated"] = df.apply(lambda row :translator.translate(row['name']).text,axis = 1)
Out:JSONDecodeError: ('Expecting value: line 1 column 1 (char 0)', 'occurred at index 1816997')
2) by resetting the API each time GoogleTrans API Error - Expecting value: line 1 column 1 (char 0) Using this link I ran this code: and I still have an error..
import copy
from googletrans import Translator
translatedList = []
for index, row in df.iterrows():
# REINITIALIZE THE API
translator = Translator()
newrow = copy.deepcopy(row)
try:
# translate the 'text' column
translated = translator.translate(row['name'], dest='en')
newrow['translated'] = translated.text
except Exception as e:
print(str(e))
continue
translatedList.append(newrow)
Out: Expecting value: line 1 column 1 (char 0)
3) I also tried to bypass the limit of the google API by changing IPs.
test with vpn: does not work
import random
listofservers = ["South Africa", "Egypt" , "Australia", "New Zealand", "South Korea", "Singapore", "Taiwan", "Vietnam", "Hong Kong", "Indonesia", "Thailand", "Japan", "Malaysia", "United Kingdom", "Netherlands", "Germany", "France", "Belgium", "Switzerland", "Sweden","Spain","Denmark", "Italy", "Norway", "Austria", "Romania", "Czech Republic", "Luxembourg", "Poland", "Finland", "Hungary", "Latvia", "Russia", "Iceland", "Bulgaria", "Croatia", "Moldova", "Portugal", "Albania", "Ireland", "Slovakia","Ukraine", "Cyprus", "Estonia", "Georgia", "Greece", "Serbia", "Slovenia", "Azerbaijan", "Bosnia and Herzegovina", "Macedonia","India", 'Turkey', 'Israel', 'United Arab Emirates', 'United States', 'Canada','Mexico'
,"Brazil", "Costa Rica", "Argentina", "Chile"]
def SelectServer(l):
return random.choice(l)
def translate_text(text, dest_language="en"):
# Used to translate using the googletrans library
translator = googletrans.Translator()
try:
translation = translator.translate(text=text, dest=dest_language)
except json.decoder.JSONDecodeError:
# api call restriction
print("exception !! déconection du VPN ")
process = subprocess.Popen(["nordvpn", "-d"], shell = True ,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
time.sleep(5)
srv = SelectServer(listofservers)
print("sélection du serveur : "+ srv + " et connexion")
process = subprocess.Popen(["nordvpn", "-c", "-g", srv ], shell = True ,stdout=subprocess.PIPE, stderr=subprocess.PIPE)
process.wait()
time.sleep(60)
return translate_text(text=text, dest_language=dest_language)
return translation.text
Out : ConnectionError: HTTPSConnectionPool(host='translate.google.com', port=443): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000027016006488>: Failed to establish a new connection: [WinError 10060]
I greatly appreciate your help,
Chris.