I've implemented an algorithm that returns a list of similar records from bibliographic data.
def Find_top(rank, structure, cp):
    """Group records of *structure* that look like duplicates of *rank* entries.

    Every entry of *rank* whose ``"Analizado"`` flag is 0 is compared against
    every still-unanalysed record of *structure* that has a different
    ``"Name"``.  The score is the mean of ``jaccard_similarity`` and
    ``jellyfish.jaro_winkler``; Jaro-Winkler is only computed when the
    Jaccard score is above 0.4.  A record is accepted when:

    * ``cp == "Authors"``: score > 0.8, ``Comparador`` returns 1 for the two
      names, and ``Verificar_Key`` on the two ``"Afiliation"`` values is
      above 0.7;
    * ``cp == "Afiliation"``: score >= 0.983;
    * any other ``cp``: score >= 0.93.

    Accepted records get ``"Analizado"`` set to 1 (the dicts are mutated in
    place).  The group is then extended with ``BuscarParecidos`` and the
    collected names are handed to ``BuscarT``.

    Args:
        rank: iterable of record dicts with ``"Name"``, ``"Analizado"`` and,
            for ``cp == "Authors"``, ``"Afiliation"`` keys.
        structure: iterable of record dicts with the same keys.
        cp: comparison mode, see the thresholds above.

    Returns:
        A list of groups.  Each group is a list of the matched records with
        the originating *rank* entry appended last.  Entries without any
        match produce no group.

    Side effects:
        Increments the module globals ``contadorGrupos`` (once per group)
        and ``contadorValor`` (once per *rank* entry).
    """
    global contadorGrupos
    global contadorValor

    text_type = type(u"")

    def as_text(value):
        # jellyfish wants text, not UTF-8 bytes.  Decoding only byte strings
        # keeps this working on Python 2 (str -> unicode) and on Python 3
        # (str is already text), where the ``unicode`` builtin is gone.
        if isinstance(value, text_type):
            return value
        return value.decode("utf-8")

    datos = []
    for clave in rank:
        if clave["Analizado"] == 0:
            aux = []
            vectorName = []
            clave_name = clave["Name"]
            clave_text = None  # decoded lazily, at most once per entry

            for x in structure:
                if clave_name == x["Name"] or x["Analizado"] != 0:
                    continue

                # Cheap pre-filter: skip the Jaro-Winkler call for pairs
                # that are clearly unrelated.
                jac = jaccard_similarity(clave_name, x["Name"])
                if not jac > 0.4:
                    continue

                if clave_text is None:
                    clave_text = as_text(clave_name)
                jar = jellyfish.jaro_winkler(clave_text, as_text(x["Name"]))
                valor = (jac + jar) / 2
                if not valor > 0.5:
                    continue

                if cp == "Authors":
                    matched = (
                        valor > 0.8
                        and Comparador(clave_name, x["Name"]) == 1
                        and Verificar_Key(clave["Afiliation"],
                                          x["Afiliation"]) > 0.7
                    )
                elif cp == "Afiliation":
                    matched = valor >= 0.983
                else:
                    matched = valor >= 0.93

                if matched:
                    aux.append(x)
                    vectorName.append(x["Name"])
                    x["Analizado"] = 1

            # NOTE(review): the original paste had lost its indentation; the
            # post-processing below is assumed to run once per unanalysed
            # entry -- confirm against the real file.
            clave["Analizado"] = 1
            aux.extend(BuscarParecidos(vectorName, structure, cp))
            BuscarT(vectorName, rank)
            if aux:
                contadorGrupos = contadorGrupos + 1
                aux.append(clave)
                datos.append(aux)

        # NOTE(review): assumed to count every rank entry visited, analysed
        # or not -- confirm the intended nesting level.
        contadorValor = contadorValor + 1
    return datos
Where BuscarParecidos is:
def BuscarParecidos(vector, lista, cp):
    """Collect further records of *lista* that resemble any name in *vector*.

    Each name in *vector* is compared with every record of *lista* whose
    ``"Analizado"`` flag is 0 and whose ``"Name"`` is not already known.
    The score is the mean of ``jaccard_similarity`` and
    ``jellyfish.jaro_winkler``.  A record is accepted when:

    * ``cp == "Authors"``: score > 0.8 and ``Comparador`` returns 1;
    * ``cp == "Afiliation"``: score >= 0.983;
    * any other ``cp``: score >= 0.93.

    Accepted records get ``"Analizado"`` set to 1 (mutated in place).

    Args:
        vector: list of names (hashable strings) to search around.  The
            caller's list is not modified.
        lista: iterable of record dicts with ``"Name"`` and ``"Analizado"``.
        cp: comparison mode, see the thresholds above.

    Returns:
        The list of newly accepted records, in the order they were found.
    """
    text_type = type(u"")

    def as_text(value):
        # jellyfish wants text, not UTF-8 bytes.  Decoding only byte strings
        # keeps this working on Python 2 (str -> unicode) and on Python 3
        # (str is already text), where the ``unicode`` builtin is gone.
        if isinstance(value, text_type):
            return value
        return value.decode("utf-8")

    vectorAuxiliar = []
    # A set makes the "name already known" test O(1); the list scan it
    # replaces ran once per (name, record) pair.
    known_names = set(vector)

    for ax in vector:
        ax_text = None  # decoded lazily, at most once per name
        found_names = []

        for i in lista:
            if i["Analizado"] != 0 or i["Name"] in known_names:
                continue

            vx = jaccard_similarity(ax, i["Name"])
            if ax_text is None:
                ax_text = as_text(ax)
            vy = jellyfish.jaro_winkler(ax_text, as_text(i["Name"]))
            vt = (vx + vy) / 2

            if cp == "Authors":
                # NOTE(review): unlike Find_top, no Verificar_Key affiliation
                # check is applied here -- confirm this is intentional.
                matched = vt > 0.8 and Comparador(ax, i["Name"]) == 1
            elif cp == "Afiliation":
                matched = vt >= 0.983
            else:
                matched = vt >= 0.93

            if matched:
                i["Analizado"] = 1
                vectorAuxiliar.append(i)
                found_names.append(i["Name"])

        # NOTE(review): the original paste had lost its indentation; the
        # known-name list is assumed to grow after each searched name, so
        # names accepted here are excluded while processing later names.
        # They are not searched around themselves (the loop still walks the
        # caller's original list only).
        known_names.update(found_names)

    return vectorAuxiliar
`jaccard_similarity` is my own implementation, and `jaro_winkler` is provided by the jellyfish library.
The problem is that the algorithm runs fine under Python 3.4, finishing in about 40 seconds on approximately 3310 records, but under Python 2.7 it takes a little over 4 minutes to finish. I don't understand why this happens.