
Home-made algorithm (not sure how good it is): please don't be too harsh, it's barely my third real algorithm in Python. I'm fully aware that some people might find it quite long for what it actually does. First, I wrote it step by step (a beginner mistake, I believe; I should have thought the whole process through before starting); second, I explain at the end.

import os
import re
import string
import math
import operator
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

lemmy = WordNetLemmatizer()

# Read the index file

index = open("C:\\Users\\Ayman\\projet python\\imdb\\title_index", "r").read()

# Build a dictionary of the form dico[movie id] = [list of the ids of
# the associated reviews].
# Limited with k = the number of reviews taken into account;
# the size of dico_index is therefore the number of movies considered.

dico_index = {}
k=0
for i in index.splitlines():
    if k<2000 :
        k=k+1
        info = i.split(":")
        if info[1] in dico_index.keys():
            dico_index[info[1]].append(info[0])
        else:
            dico_index[info[1]] = [info[0]]
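A minimal sketch of the same loop using dict.setdefault, which drops the explicit key check and stops reading once the first 2000 lines are processed (assuming the same "review_id:movie_id" line format):

dico_index = {}
for k, ligne in enumerate(index.splitlines()):
    if k >= 2000:
        break  # no need to keep iterating over the remaining lines
    parts = ligne.split(":")
    # setdefault creates the empty list the first time a movie is seen
    dico_index.setdefault(parts[1], []).append(parts[0])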

# Walk the reviews folder and record each review's rating in a
# dictionary

# Path of the reviews folder (inferred from the open() calls further down)
ch_crit = "C:\\Users\\Ayman\\projet python\\imdb\\comments"
critiques = os.listdir(ch_crit)
notes_critiques = {}
chemin_critiques ={}
for i in critiques:
    info = i.split("_")
    # The rating is whatever precedes the dot: "10.txt" -> "10", "7.txt" -> "7"
    notes_critiques[info[0]] = info[1][:2 if info[1][1:2] != "." else 1]
    chemin_critiques[info[0]] = os.path.join(ch_crit, i)
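The conditional slice that extracts the rating is hard to read. Assuming filenames of the form reviewid_rating.txt (e.g. 123_10.txt or 456_7.txt), a sketch of a more explicit equivalent:

for i in critiques:
    review_id, reste = i.split("_", 1)
    # "10.txt" -> "10", "7.txt" -> "7": everything before the first dot
    notes_critiques[review_id] = reste.split(".")[0]
    chemin_critiques[review_id] = os.path.join(ch_crit, i)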

# Create one file per movie and write its associated reviews into it

for i in dico_index.keys():
    fichier = open(
        "C:\\Users\\Ayman\\projet python\\imdb\\movies corpus\\{0}.txt".format(i),
        "x", encoding="utf-8")
    for j in dico_index[i]:
        crit = open(
            "C:\\Users\\Ayman\\projet python\\imdb\\comments\\{0}_{1}.txt".format(
                j, notes_critiques[j]), "r", encoding="utf-8")
        fichier.writelines(crit)
    fichier.close()
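A sketch of the same concatenation step using context managers; with guarantees the files are closed even if an error occurs, and mode "w" avoids the FileExistsError that mode "x" raises when the script is re-run (assuming overwriting the per-movie files is acceptable):

for i in dico_index.keys():
    chemin = "C:\\Users\\Ayman\\projet python\\imdb\\movies corpus\\{0}.txt".format(i)
    with open(chemin, "w", encoding="utf-8") as fichier:
        for j in dico_index[i]:
            # chemin_critiques already stores each review's full path
            with open(chemin_critiques[j], "r", encoding="utf-8") as crit:
                fichier.write(crit.read())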

# Build a dictionary containing the tokenized text of each movie

token_films={}
token_clean = {}
for i in dico_index.keys():
    print("Ouverture du NOUVEAU FILM :::::",i, "                ", len(token_films))
    fichier = open(
        "C:\\Users\\Ayman\\projet python\\imdb\\movies corpus\\{0}.txt".format(i),
        "r", encoding="utf-8")
    token_films[i]= word_tokenize(fichier.read())
    fichier.close()
    token_clean[i]=[]
    for j in token_films[i]:
        if (j.isalpha() and j not in stopwords.words('english') and not re.fullmatch('[' + string.punctuation + ']+', j) and j not in ['i', 'I', 'br']):
            print("adding:", j.lower())
            token_clean[i].append(lemmy.lemmatize(j.lower(), pos='v'))
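A well-known slowdown here: stopwords.words('english') rebuilds the list on every token, and membership tests against a list are linear. Caching it once as a set makes each test O(1). The punctuation regex is also redundant, since isalpha() already rejects punctuation. A sketch of the cleaning loop with those two changes (it additionally catches capitalized stopwords like "The", which the list-based test misses):

stop_set = set(stopwords.words('english'))  # built once, not once per token
for i in dico_index.keys():
    token_clean[i] = []
    for j in token_films[i]:
        mot = j.lower()
        # isalpha() already excludes punctuation, so no regex check is needed
        if j.isalpha() and mot not in stop_set and mot not in ('i', 'br'):
            token_clean[i].append(lemmy.lemmatize(mot, pos='v'))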



# Compute the frequency of a word within a text:

def compte_mot(texte, mot):
    compte=0
    for i in texte :
        if i==mot:
            compte = compte + 1
    return(compte/len(texte))

# Compute and store, for each text, its words and their frequencies.

def compte_texte(texte):
    compte_dico = {}
    for i in texte:
        if i not in compte_dico.keys() :
            compte_dico[i]= compte_mot (texte, i)
    return(compte_dico)
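compte_texte calls compte_mot once per distinct word, and each call rescans the whole text, so this step is roughly quadratic in the text length. collections.Counter produces the same relative frequencies in a single pass; a sketch (compte_texte_counter is a hypothetical name):

from collections import Counter

def compte_texte_counter(texte):
    # One pass over the text instead of one pass per distinct word
    n = len(texte)
    return {mot: compte / n for mot, compte in Counter(texte).items()}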

# Store this information for a whole set of texts in a dictionary

def compte_corpus(corpus):
    compte_corpus = {}
    for i in corpus.keys():
        compte_corpus[i]=compte_texte(corpus[i])
    return(compte_corpus)

base_compte = compte_corpus(token_clean)

# Compute the inverse document frequency (IDF) of a word across the set of texts

def compte_freq(base, mot):
    freq = 0
    for i in base.keys() :
        if mot in base[i].keys():
            freq = freq + 1
    return(math.log10(len(base)/(1+freq)))

# Compute the TF-IDF of a word with respect to a movie

def compte_tf_idf(mot, corpus, base, film):
    a = compte_freq(base, mot)
    b2 = compte_mot(corpus[film], mot)
    return(a*b2)
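compte_freq rescans every document each time it is called, and compte_tf_idf calls it once per (word, movie) pair, even though the IDF of a word is fixed for a given corpus. A sketch that computes each word's IDF once and looks it up afterwards (table_idf and compte_tf_idf_cache are hypothetical names; the log formula is the same as in compte_freq):

def table_idf(base):
    # Count in how many documents each word appears, then apply the same log formula
    df = {}
    for film in base.keys():
        for mot in base[film].keys():
            df[mot] = df.get(mot, 0) + 1
    n = len(base)
    return {mot: math.log10(n / (1 + d)) for mot, d in df.items()}

idf = table_idf(base_compte)

def compte_tf_idf_cache(mot, base, film, idf):
    # base[film] already stores the term frequency computed by compte_texte
    return idf.get(mot, math.log10(len(base))) * base[film].get(mot, 0)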

# Part 3: select the most relevant words and compare the movies

# NB: the n argument is never used inside this function; the actual
# top-n selection happens after the sort below.
def n_pluspertinents(base, n):
    liste = {}
    for i in base.keys() :
        print("NOUVEAU film", i)
        for j in base[i].keys():
            print("TEST MOT", j)
            if j not in liste.keys():
                print("ADD")
                liste[j]=compte_freq(base, j)
    return(liste)

n_plus = n_pluspertinents(base_compte, 10)
sorted_n_plus = sorted(n_plus.items(), key=operator.itemgetter(1))
n_plus_ready = [i for i in sorted_n_plus if i[1] not in [sorted_n_plus[31694][1]]]  # 31694 is a hard-coded index; every word sharing that score is dropped
n_best = [mot for mot, score in n_plus_ready[len(n_plus_ready) - 2000:]]  # keep only the words: passing (word, score) tuples into vecteur() would make every TF-IDF lookup miss and every vector all-zero
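Since n_pluspertinents never uses its n argument, the top-n selection has to happen afterwards, as above. A sketch doing the cut directly with a descending sort (n_meilleurs is a hypothetical helper; it keeps the same compte_freq scoring):

def n_meilleurs(base, n):
    scores = n_pluspertinents(base, n)
    # Sort by score, highest first, and keep the n best words
    return [mot for mot, score in sorted(scores.items(), key=operator.itemgetter(1), reverse=True)[:n]]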


def vecteur(film, corpus, base, n_plus):
    vect = []
    for i in n_plus :
        vect.append(compte_tf_idf(i, corpus, base, film))
    return(vect)

def distance_cosinus(film1, film2, corpus, base, n_plus):
    a = vecteur(film1, corpus, base, n_plus)
    b = vecteur(film2, corpus, base, n_plus)
    c = [0 for i in range(len(n_plus))]
    print("computing distance")
    if a != c and b != c:
        num = sum(i * j for i, j in zip(a, b))
        return num / (math.sqrt(sum(i * i for i in a)) * math.sqrt(sum(i * i for i in b)))
    else:
        return 0

vec = vecteur("0030007", token_clean, base_compte, n_best)
distance_cosinus("0028944", "0030007", token_clean, base_compte, n_best)

distances = {}
for i in dico_index.keys():
    distances[i]=[]
    for j in dico_index.keys():
        if i != j :
            distances[i].append(distance_cosinus(i,j, token_clean, base_compte, n_best))
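As written, distance_cosinus rebuilds both TF-IDF vectors on every call, and each pair (i, j) is computed twice. A sketch that builds each movie's vector once and compares the cached vectors (vecteurs and cosinus are hypothetical names):

# Build every movie's TF-IDF vector a single time
vecteurs = {i: vecteur(i, token_clean, base_compte, n_best) for i in dico_index.keys()}

def cosinus(a, b):
    num = sum(x * y for x, y in zip(a, b))
    norme_a = math.sqrt(sum(x * x for x in a))
    norme_b = math.sqrt(sum(x * x for x in b))
    return num / (norme_a * norme_b) if norme_a and norme_b else 0

distances = {}
for i in dico_index.keys():
    distances[i] = [cosinus(vecteurs[i], vecteurs[j]) for j in dico_index.keys() if i != j]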

I am open to ALL suggestions that don't COMPLETELY change the structure of the algorithm. I mean, I want this to stay my own work, improved by whoever has an idea. I don't want a ready-made solution, except perhaps as an example.

  • I want to be sure I understand your advice, not just paste it into my code.

Thank you very much

  • Which part is the worst you ask? Have you heard of profiling? – bejado Apr 02 '17 at 09:21
  • not at all, should I have a look ? ;) – Ayman Makki Apr 02 '17 at 09:30
  • Yeah, I'd say that's pretty much what you're after – bejado Apr 02 '17 at 09:32
  • As for your code there are plenty of improvements for you to learn about, but you should ask on codereview.stackexchange.com. Reviewing working code is off-topic for this site. – alexis Apr 02 '17 at 15:56
  • http://stackoverflow.com/questions/582336/how-can-you-profile-a-script I like the one with pycallgraph a lot, easy to see which functions you are calling perhaps more times than you should, and which take the longest. – Igor Apr 03 '17 at 14:20
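Following the profiling pointers in the comments above, a minimal sketch using the standard-library cProfile and pstats; the profiled call is only an example, any expensive piece of the script works:

import cProfile
import pstats

# Profile one expensive call and list the 10 functions with the largest cumulative time
cProfile.run('vecteur("0030007", token_clean, base_compte, n_best)', 'profil.out')
pstats.Stats('profil.out').sort_stats('cumulative').print_stats(10)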

0 Answers