My algorithm for tokenization/lemmatization and cosine similarity is way too slow.. Which part is the worst?

Question

Home-made algorithm (not sure how good that is) : Please don't be too harsh, it's barely my third real algorithm in python I am totally aware that some people might find it quite long for what it really is... first I did it step by step (beginner mistake I believe, should have thought of the whole process before starting), second : I explain at the end.

import os
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
lemmy = WordNetLemmatizer()
import string
import re
from nltk.corpus import stopwords
import math
import operator

# Lecture de l'index

index = open("C:\\Users\\Ayman\\projet python\\imdb\\title_index", "r").read()

# Création d'un dictionnaire de type dico[numéro de film]=[liste des
# numéros de critiques associées]
#lIMITATION AVEC K = LE NOMBRE DE CRITIQUES QU'ON PREND EN COMPTE
#DU COUP SAIT QUE LA TAILLE DE DICO_INDEX EST LE NOMBRE DE FILMS QUE L'ON PREND EN COMPTE

dico_index = {}
k=0
for i in index.splitlines():
    if k<2000 :
        k=k+1
        info = [j for j in i.split(":")]
        if info[1] in dico_index.keys():
            dico_index[info[1]].append(info[0])
        else:
            dico_index[info[1]] = [info[0]]

# Exploration du dossier des critiques et référencement des notes dans une
# dictionnaire

critiques = os.listdir(ch_crit)
notes_critiques = {}
chemin_critiques ={}
for i in critiques:
    info = [j for j in i.split("_")]
    notes_critiques[info[0]] = info[1][:2 if info[1][1:2] != "." else 1]
    chemin_critiques[info[0]] = os.path.join(ch_crit, i)

# Création d'un fichier pour chaque film et insertion des critiques associées

for i in dico_index.keys():
    fichier = open(
        "C:\\Users\\Ayman\\projet python\\imdb\\movies corpus\\{0}.txt".format(i),
        "x", encoding="utf-8")
    for j in dico_index[i]:
        crit = open(
            "C:\\Users\\Ayman\\projet python\\imdb\\comments\\{0}_{1}.txt".format(
                j, notes_critiques[j]), "r", encoding="utf-8")
        fichier.writelines(crit)
    fichier.close()

# Création d'un dictionnaire qui contient le texte tokenizé pour chaque film

token_films={}
token_clean = {}
for i in dico_index.keys():
    print("Ouverture du NOUVEAU FILM :::::",i, "                ", len(token_films))
    fichier = open(
        "C:\\Users\\Ayman\\projet python\\imdb\\movies corpus\\{0}.txt".format(i),
        "r", encoding="utf-8")
    token_films[i]= word_tokenize(fichier.read())
    fichier.close()
    token_clean[i]=[]
    for j in token_films[i] :
        if (j.isalpha()==True and not j in stopwords.words('english') and not re.fullmatch('[' + string.punctuation + ']+', j) and not j in ['i', 'I', 'br']) :
            print("ajout de :",j.lower())
            token_clean[i].append(lemmy.lemmatize(j.lower(), pos='v'))



# calculer la fréquence d'apparition d'un mot dans un texte : 

def compte_mot(texte, mot):
    compte=0
    for i in texte :
        if i==mot:
            compte = compte + 1
    return(compte/len(texte))

# Calculer et stocker pour chaque texte les mots et leur fréquence.

def compte_texte(texte):
    compte_dico = {}
    for i in texte:
        if i not in compte_dico.keys() :
            compte_dico[i]= compte_mot (texte, i)
    return(compte_dico)

# Stocker les informations pour un ensemble de textes dans un dictionnaire

def compte_corpus(corpus):
    compte_corpus = {}
    for i in corpus.keys():
        compte_corpus[i]=compte_texte(corpus[i])
    return(compte_corpus)

base_compte = compte_corpus(token_clean)

# Calculer la fréquence d'apparition d'un mot dans un ensemble de texte

def compte_freq(base, mot):
    freq = 0
    for i in base.keys() :
        if mot in base[i].keys():
            freq = freq + 1
    return(math.log10(len(base)/(1+freq)))

#Calcul du TF-IDF pour un mot par rapport à un film

def compte_tf_idf(mot, corpus, base, film):
    a = compte_freq(base, mot)
    b2 = compte_mot(corpus[film], mot)
    return(a*b2)

# Partie 3

def n_pluspertinents(base, n):
    liste = {}
    for i in base.keys() :
        print("NOUVEAU film", i)
        for j in base[i].keys():
            print("TEST MOT", j)
            if j not in liste.keys():
                print("ADD")
                liste[j]=compte_freq(base, j)
    return(liste)

n_plus = n_pluspertinents(base_compte, 10)
sorted_n_plus = sorted(n_plus.items(), key=operator.itemgetter(1))
n_plus_ready = [i for i in sorted_n_plus if i[1] not in [sorted_n_plus[31694][1]] ]
n_best = n_plus_ready[len(n_plus_ready) - 2000: len(n_plus_ready)]


def vecteur(film, corpus, base, n_plus):
    vect = []
    for i in n_plus :
        vect.append(compte_tf_idf(i, corpus, base, film))
    return(vect)

def distance_cosinus(film1, film2, corpus, base, n_plus):
    a = vecteur(film1, corpus, base, n_plus)
    b = vecteur(film2, corpus, base, n_plus)
    c =[0 for i in range(len(n_plus))]
    print("calcul distance")
    if (a != c and b != c) :
        return sum([i*j for i,j in zip(a, b)])/(math.sqrt(sum([i*i for i in a]))* math.sqrt(sum([i*i for i in b])))
    else :
        return(0)

vec = vecteur("0030007", token_clean, base_compte, n_best)
distance_cosinus("0028944", "0030007", token_clean, base_compte, n_best)

distances = {}
for i in dico_index.keys():
    distances[i]=[]
    for j in dico_index.keys():
        if i != j :
            distances[i].append(distance_cosinus(i,j, token_clean, base_compte, n_best))

I am open to ALL solutions which does not COMPLETELY change the structure of the algorithm. I mean, I want this to be made by me, and improved by whoever has an idea. I don't want to have an all-made solution, or maybe just as an example.

I want to be sure to understand your advices, not just pasting it in my code.

Thank you very much

Which part is the worst you ask? Have you heard of profiling? — bejado, Apr 02 '17 at 09:21
As for your code there are plenty of improvements for you to learn about, but you should ask on codereview.stackexchange.com. Reviewing working code is off-topic for this site. — alexis, Apr 02 '17 at 15:56
http://stackoverflow.com/questions/582336/how-can-you-profile-a-script I like the one with pycallgraph a lot, easy to see which functions you are calling perhaps more times than you should, and which take the longest. — Igor, Apr 03 '17 at 14:20

My algorithm for tokenization/lemmatization and cosine similarity is way too slow.. Which part is the worst?

0 Answers0