I have vendor list that i need to match with my internal client names and extract some other internal metrics.
Below is the sample vendor data hat has around 400 client names
Peter Silver & Partners
Patric+Kennedy
WBTA/Worldwide
Equivalence
Mart Gang
Cornard Scofi + Treno
my internal data has around 70k client names. I need to match the vendor list to internal large data.
I have tried below and I get error as ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all(). Also I wanted to output 2 columns from vendor file and 3 columns from internal table together for the matched results, can u advise how i output those??
import os
os.chdir("C:\\instantclient_19_10")
import cx_Oracle
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
con = cx_Oracle.connect("xxxnm/pass123@PRD:1522/prd.bnk.com")
dft = pd.read_sql("select c1, c2, c3 from Table1",con)
#print(dft)
dfs = pd.read_excel(r"C:\\Users\\nm\\vendor_List.xlsx", sheet_name=None)
dfv = pd.concat([sheet.assign(src_sheet=sheet_name) for sheet_name,sheet in dfs.items()])
matched_vendors = []
for row in dfv.index:
vendor_name = dfv.at[row,"Company Name"]
for columns in dft.index:
internal_vendor_name=dft.at[columns,"Internal_NM"]
matched_token=fuzz.partial_ratio(vendor_name,internal_vendor_name)
if matched_token>80:
matched_vendors.append([vendor_name,internal_vendor_name,matched_token])
df = DataFrame (matched_vendors).transpose()
df.columns = ['vendor_name','internal_vendor_name','matched_token']
#print(matched_vendors)
matched_vendors.to_csv(r"C:\\Users\\nm\\vendor_matched.csv", index=False)
Ok Shifted to https://github.com/maxbachmann/rapidfuzz based on the Vectorizing or Speeding up Fuzzywuzzy String Matching on PANDAS Column and kind of no performance gain.. I guess I'm doing something wrong
import os
os.chdir("C:\\instantclient_19_10")
import cx_Oracle
import pandas as pd, numpy as np
from rapidfuzz import process, utils
con = cx_Oracle.connect("xxxnm/pass123@PRD:1522/prd.com")
dft = pd.read_sql("select c1, c2, c3 from Table1",con)
dft.apply(lambda x: x.astype(str).str.upper())
#print(dft)
dfs = pd.read_excel(r"C:\\Users\\nm\\vendor_List.xlsx", sheet_name=None)
dfv = pd.concat([sheet.assign(src_sheet=sheet_name) for sheet_name,sheet in dfs.items()])
dfv.apply(lambda x: x.astype(str).str.upper())
vendor = list(dfv['Company Name'])
entity_name = list(dft.INTERNAL_NM)
for i in range(0,len(vendor)):
vendor1 = vendor[i]
for j in range(0,len(INTERNAL_NM)):
entity1 = INTERNAL_NM[j]
match = process.extractOne(vendor1, entity1, processor=None, score_cutoff=93)
# Vendor_list.append(vendor1)
# entity_vendor_list.append(entity1)
# match_score.append(match)
print(match)
df = DataFrame (match).transpose()
#df.columns = ['vendor_name','internal_vendor_name','matched_token']
#print(matched_vendors)
# matched_vendors.to_csv(r"C:\\Users\\nm\\vendor_matched.csv", index=False)