I'm a beginner in python so I have this program where it classifies tweets into different categories (sport,sante, culture...) using keywords and I would like to copy-paste every line of the JSON file that belongs to a certain category into a file named text1 and I did the following : but I guess I did it the wrong way since I keep receiving the same error please any suggestion on how to solve this problem!
import json
import mysql.connector
'''
python -m pip install unicode
'''
c = 0
n = 0
sportcount=0
religcount=0
santecount=0
educcount=0
cultcount=0
socicount=0
policount=0
covid = ['كوفيد','MEDECIN','كورونا','CORONA', 'COVID','VACCIN', 'PANDEMIE', 'CONFINEMENT', 'PANDEMIE', 'CHU', 'GEL','ﻣﺎﺳﻚ' ,'CHINA','ANTIVIRALES','LAVAGE DE MAINS','VIRUS','اﻟﺤﺮاﻙ','اﻟﺤﺠﺮ','CHLOROQUINE','FATIGUE','كماما','STAYSAFE','EPIDEMIE','STAYHOME','منظمة الصحة',' pas de prière']
sport=['مولودية','WORKOUT','بايرن','ارسنال','ليفربول','منتخب','تتويج','ميسي','PSG','FIFA', 'FOOT','FEKIR', 'BOUGER', 'DANSER', 'STADE','بونجاح','JOUEUR', 'COMPETION', 'SPORT', 'SALLE', 'SPONSOR', 'PISCINE', 'PUMA', 'GYM', 'TEN', 'MATCH', 'CHAMPION', 'BASKET', 'NOVAK', 'DJOKOVIC', 'MESSI', 'OLYMPIQUE']
religion=['ALLAH','المساجد','مصل','HAMDULILAH','المسلم','فتوى','لله','EID','الله','MOSQ','دين']
sante=['controle','إصاب','OXYGENE','بوناطيرو','حالات','مؤكد','IMMUNIT','CAS','صح','DOCTEUR','مخبر','حصيلة','صحة','أطباء','تسجل','FATIG','مستشف','HOPITAUX','سعال','لقاح','SOUCHES','MALADE','حصيلة','FUMEURS', 'DIABETE', 'EPIDEMIE', 'DEPISTAGE', 'SOIGNANT', 'INJECTION','GEL','SANTE', 'FIEVRE', 'KAWASAKI', 'RESPIRATOIRE', 'PATIENT', 'TEST', 'TRAITEMENT','فحص','كماما', 'CHU','منظمة الصحة', 'MEDECINE', 'POSITIF', 'PHARMACE', 'INFECTES', 'IMMUN', 'VACCIN', 'PFIZER', 'PCR', 'PANDÉM', 'PANADEMI', 'ÉPIDÉMI', 'EPIDEMIC', 'MASQUE', 'BAVETTE', 'MASK', 'MÉDICAL', 'MEDICAL', 'HÔPITAL', 'HOSPITAL', 'INFECT', 'TRANSMISSION', 'SURVIVANT', 'SURVIVORS', 'DIAGNOSTIC', 'DIAGNOSIS', 'SANTÉ', 'HEALTH', 'MÉDECIN', 'DOCTOR', 'MÉDICAMENT', 'MEDICIN', 'AMBULANCE', 'DÉPISTAGE', 'DEPISTAGE', 'STATISTI', 'MALAD', 'SICK', 'CONFIN', 'PROPAGATION', 'PRÉVENTION', 'PREVENTION', 'CONTAGION', 'SYMPT', 'MESUR', 'MEASUR', 'MICROB', 'WASH', 'ISOL']
education=['سنة','collaboration','bac','EDUC','ÉCOLE','PROF','بتدائي','تعليم','أساتذة','دراس','طلبة']
culture=['LIVRE', 'BOOK', 'SHOP', 'FILM', 'MOVIE', 'MUSIC', 'TV', 'VOYAGE', 'CINEMA', 'ART', 'BLOG', 'SONG']
social=['الشباب','TWITTER','تغريد','متابعة','SOCI','NETFLIX','YOUTUBE','JOURNAL','solde','liquidationy']
politique=['manifestation','AFFAIRE', 'PUBLIQUE', 'AMBASSADE', 'CIRCONSPECTION', 'CIVI', 'COMBINAISON', 'DÉMAGOGIE', 'DÉMOCRATIE', 'DIPLOMATE', 'ÉCONOMIE', 'ÉTAT', 'FÉDÉRALISME', 'GOUVERNEMENT', 'LEGATION', 'MACHIAVÉLIQUE', 'MACHIAVÉLISME', 'MANŒUVRIER', 'NÉGOCI', 'POLICE', 'POUVOIR', 'PRUDE', 'PUBLIC', 'STRATÉGIE', 'TRACTATION', 'RÉPUBLIQUE', 'SONDAGE', 'OPINION', 'PARLEMENT', 'CITO', 'DÉPUTÉ', 'DIRIGEANT', 'MAIRE', 'MINIST', 'SECRÉTAIRE', 'SÉNATEUR', 'CONSEILLER', 'MAIRE', 'COMMUNES', 'MEMBRE', 'DU', 'PARLEMENT', 'CONGRÈS', 'SÉNAT', 'PROTESTATION', 'PROCURATION', 'POUVOIR', 'FRAUDE','النفط','وزير','HOLLANDE','خدعة','تبون','الغلق','BORIS JOHNSON','وزار','رئيس','DROITS','مؤامرة','والي','TRUMP','GOUVERNEMEN', 'POLITI', 'OUYAHIA', 'ERAK', 'IRAK', 'REINE', 'MACRON', 'MINIST', 'AMBASSAD', 'MANIFEST', 'PRESIDENT', 'SELLAL', 'NATIONAL', 'مظاهر','MILITAIRE', 'DICTATEUR','اﻟﺤﺮاﻙ']
myJsonFile = open('tweet.json', encoding="utf-16")
resultat = open('texte.txt', 'w')
for line in myJsonFile:
data = json.loads(line)
c = c+1
text = data['raw_text'].upper()
tweet = any(ele in text for ele in covid )
if tweet == True:
n=n+1
#print(str(n) + " the " +str(c)+" tweet---------------------------------------------------------------------------------------")
in_sport = any(ele in text for ele in sport )
if in_sport:
data["cat"]='sport'
resultat.write(line)
sportcount=sportcount+1
print(data)
else :
in_sante = any(ele in text for ele in sante )
if in_sante :
data["cat"]='sante'
resultat.write(line)
santecount=santecount+1
print(data)
else :
in_politique = any(ele in text for ele in politique)
if in_politique:
data["cat"]='politique'
policount=policount+1
resultat.write(line)
print(data)
else:
in_culture = any(ele in text for ele in culture )
if in_culture:
data["cat"]='culture'
cultcount=cultcount+1
resultat.write(line)
print(data)
else:
in_religion = any(ele in text for ele in religion)
if in_religion:
data["cat"]='religion'
religcount=religcount+1
resultat.write(line)
print(data)
else:
in_education = any(ele in text for ele in education )
if in_education:
data["cat"]='education'
educcount=educcount+1
resultat.write(line)
print(data)
else:
in_social = any(ele in text for ele in social)
if in_social:
data["cat"]='social'
socicount=socicount+1
resultat.write(line)
print(data)
else:
print(" the tweet---------------------------------------------------------------------------------------")
print("a partir de",c," le nombre de tweets concernant le covid sont ",n)
print("sport ",sportcount," sante ",santecount," politique ",policount," culture ",cultcount," religion ",religcount," education ",educcount,"social",socicount)
print("les tweets non classifies",n-(educcount+religcount+cultcount+policount+santecount+sportcount+socicount))
error:
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/NIHAD/PycharmProjects/pythonProject3/classification.py", line 51, in <module>
resultat.write(line)
File "C:\Users\NIHAD\AppData\Local\Programs\Python\Python39\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 451-454: character maps to <undefined>