0

I am calculating the edit distance between two dataframes. Both dataframes consist of ~30 lakh (about 3 million) rows, and because they are so large the calculation is taking a lot of time. Is there any way to improve the performance?

# For each target, find the candidate in sr1 with the smallest edit distance.
# Candidates are pre-filtered to those whose sr2 key equals targets2[i] and
# whose length (len_sr2) lies within +/-10 of len_targets2[i].
#
# Result rows are gathered in a plain list and attached to output_final with a
# single pd.concat after the loop. Calling DataFrame.append once per iteration
# copies the whole frame every time (quadratic in the number of targets), and
# DataFrame.append no longer exists in pandas >= 2.0.
#
# NOTE(review): `sr`, `targets`, `output_final` and `dt` are defined outside
# this snippet -- presumably `sr`/`targets` are the unprocessed counterparts of
# `sr1`/`targets1`; verify. `.str.edit_distance` is not a pandas string method,
# so `sr1` is presumably a cuDF (or similar) Series in the real job -- confirm.
n_targets = len(targets1)
rows = []
for i in range(n_targets):
    if i % 100 == 0:
        # Progress report every 100 targets.
        pct = (i / n_targets) * 100
        print("(" + str(dt.datetime.now()) + ") completed: " + str(round(pct, 2)) + "%")

    key_match = sr2 == targets2[i]
    len_match = (len_sr2 >= (len_targets2[i] - 10)) & (len_sr2 <= (len_targets2[i] + 10))
    sr1_new = sr1[key_match & len_match]

    if len(sr1_new) > 0:
        ee = sr1_new.str.edit_distance(targets1[i])
        ee = ee.sort_values()
        best = ee.index[0]  # label of the closest candidate
        score = ee[best]
        total_len = len(sr[best]) + len(targets[i])
        rows.append({
            'Name': targets[i],
            'Matched_Name': sr[best],
            'score': score,
            # Similarity as a percentage of the combined string length.
            'score_final': (total_len - score) / total_len * 100,
        })
    else:
        # No candidate survived the filter: emit a placeholder row. The extra
        # 'Matched_REF' key is kept exactly as in the original row layout.
        rows.append({
            'Name': targets[i],
            'Matched_Name': '',
            'Matched_REF': "0",
            'score': 0,
            'score_final': 0,
        })

if rows:
    output_final = pd.concat([output_final, pd.DataFrame(rows)], ignore_index=True)
# Sample data for the matching loop.
#
# targets1 / sr1 hold the strings that are compared by edit distance.
# targets2 / sr2 hold the first two characters of the corresponding string and
# act as the blocking key; len_targets2 / len_sr2 hold the string lengths of
# targets1 / sr1 respectively.

# Strings to look up (9 entries).
targets1 = pd.Series([
    "ABBSHHCH",
    "ABBSAJSJAHDKAJKJ",
    "BASJBASJASH",
    "KJSAKASJAS",
    "KJSAIUBDAKS",
    "KAJSNDSAX",
    "JASANXAJSKJ",
    "NASNXHY",
    "AIUSSHXBAHSJASHJ",
])

# Two-character key of each entry in targets1.
targets2 = pd.Series(["AB", "AB", "BA", "KJ", "KJ", "KA", "JA", "NA", "AI"])

# Candidate strings to match against (18 entries).
sr1 = pd.Series([
    "ABBSHHSJAKX",
    "ABBMNASASJKKLASAHDKAJKJ",
    "BASSAMSAJASH",
    "KJSMSANMAASJAS",
    "KJSSMNASBDAKS",
    "KASKJADSAX",
    "JASAKJKJSKJ",
    "NASAKXHY",
    "AIUSSANMASSJASHJ",
    "NSAASJNCXA",
    "ABBSASMNKAJKJ",
    "ASNASNXJASH",
    "KJSKJSAKSJAS",
    "KJASKJSDAKS",
    "KAJSAKJSAX",
    "JAKJASXAJSKJ",
    "NADADHY",
    "AIUSNASSASJASHJ",
])

# Two-character key of each entry in sr1.
sr2 = pd.Series([
    "AB", "AB", "BA", "KJ", "KJ", "KA", "JA", "NA", "AI",
    "NS", "AB", "AS", "KJ", "KJ", "KA", "JA", "NA", "AI",
])

# Length of each string in sr1.
len_sr2 = pd.Series([
    11, 23, 12, 14, 13, 10, 11, 8, 16,
    10, 13, 11, 12, 11, 10, 12, 7, 15,
])

# Length of each string in targets1.
len_targets2 = pd.Series([8, 16, 11, 10, 11, 9, 11, 7, 16])
A14
  • 111
  • 11

0 Answers0