So I am looking to modify this code to reduce the runtime of the fuzzywuzzy matching. At present it takes about an hour for a dataset with 800 rows, and when I used it on a dataset with 4.5K rows it kept running for almost 6 hours with still no result, so I had to stop the kernel.
I need to use this code on data with at least 20K rows. Can anyone please suggest any edits to this code to get the results faster? This is the code -
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz,process
df = pd.read_csv(r'path')
df.head()
data = df['Body']
print(data)
clean = []
threshold = 80
for row in data:
    # score each sentence against every other sentence
    # returns [('string', score), ...]
    scores = process.extract(row, data, scorer=fuzz.token_set_ratio)
    # basic idea is if there is a close second match we want to evaluate it
    # and keep the longer of the two
    if scores[1][1] > threshold:
        clean.append(max([x[0] for x in scores[:2]], key=len))
    else:
        clean.append(scores[0][0])
# remove dupes
clean = list(set(clean))  # convert back to a list so pandas accepts it
#converting 'clean' list to dataframe and giving the column name for the cleaned column
clean_data = pd.DataFrame(clean, columns=['Body'])
clean_data.to_csv(r'path')
This is what my data looks like -
https://docs.google.com/spreadsheets/d/1p9RC9HznhdJFH4kFYdE_TgnHdoRf8P6gTEAkB3lQWEE/edit?usp=sharing
So if you notice, rows 14 & 15 and rows 19 & 20 are partial duplicates; I want the code to identify such sentences and drop the shorter ones.
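Just to make the intended behaviour concrete (the two sentences below are made up for illustration, not taken from my data): token_set_ratio scores a sentence very highly against a longer sentence that contains most of its words, and in that case only the longer one should survive -

from fuzzywuzzy import fuzz

short_sent = "Your order has been shipped"
long_sent = "Your order has been shipped and will arrive in 3-5 business days"

# token_set_ratio ignores word order and extra tokens, so these
# partial duplicates score close to 100, i.e. above the threshold of 80
print(fuzz.token_set_ratio(short_sent, long_sent))

# keep only the longer of the two, as in the loop above
print(max([short_sent, long_sent], key=len))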
Update -
I made a minor change to the rapidfuzz solution given by @Darryl G, and now the code looks like this -
import pandas as pd
import numpy as np
import openpyxl
from rapidfuzz.fuzz import token_set_ratio as rapid_token_set_ratio
from rapidfuzz import process as process_rapid
from rapidfuzz import utils as rapid_utils
import time
df = pd.read_excel(r'path')
data = df['Body']
print(data)
def excel_sheet_to_dataframe(path):
    '''
    Loads sheet from Excel workbook using openpyxl
    '''
    wb = openpyxl.load_workbook(path)
    ws = wb.active
    data = ws.values
    # Get the first line in the file as the header line
    columns = next(data)[0:]
    return pd.DataFrame(data, columns=columns)
clean_rapid = []
threshold = 80
def process_rapid_fuzz(data):
    '''
    Process using rapidfuzz rather than fuzzywuzzy.
    Matches are appended to the module-level clean_rapid list.
    '''
    # Pre-process to make lower-case and remove non-alphanumeric
    # characters (generator)
    series = (rapid_utils.default_process(d) for d in data)
    processed_data = pd.Series(series)
    for query in processed_data:
        scores = process_rapid.extract(query, processed_data, scorer=rapid_token_set_ratio, score_cutoff=threshold)
        if len(scores) > 1 and scores[1][1] > threshold:
            m = max(scores[:2], key=lambda k: len(k[0]))  # Of up to two matches above threshold, take the longest
            clean_rapid.append(m[0])  # Keep the longer matched (pre-processed) string
        else:
            clean_rapid.append(query)
################ Testing
t0 = time.time()
df = excel_sheet_to_dataframe(r'path') # Using Excel file in working folder
# Desired data in body column
data = df['Body'].dropna() # Dropping None rows (few None rows at end after Excel import)
result_fuzzy_rapid = process_rapid_fuzz(data)
print(f'Elapsed time {time.time() - t0}')
# remove dupes
clean_rapid = list(set(clean_rapid))  # convert back to a list so pandas accepts it
#converting 'clean_rapid' list to dataframe and giving the column name for the cleaned column
clean_data = pd.DataFrame(clean_rapid, columns=['Body'])
#exporting the cleaned data
clean_data.to_excel(r'path')
Now the issue is that in the output file all the full stops, etc. are getting dropped. How can I retain them?
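I understand the punctuation disappears because clean_rapid stores the output of rapid_utils.default_process, which lower-cases the text and strips non-alphanumeric characters. What I have in mind is something like the rough sketch below - match on the processed text but append the original, untouched row, looked up by its index. The process_rapid_fuzz_keep_original helper and the index handling are my own guess at a fix, not part of @Darryl G's answer, and I have not verified it on the full dataset -

def process_rapid_fuzz_keep_original(data):
    '''
    Same matching as process_rapid_fuzz above, but append the original
    (unprocessed) text so full stops etc. survive in the output file.
    Results are appended to the module-level clean_rapid list.
    '''
    originals = list(data)  # raw strings, punctuation intact
    processed_data = pd.Series([rapid_utils.default_process(d) for d in originals])
    for idx, query in enumerate(processed_data):
        scores = process_rapid.extract(query, processed_data,
                                       scorer=rapid_token_set_ratio,
                                       score_cutoff=threshold)
        if len(scores) > 1 and scores[1][1] > threshold:
            # rapidfuzz returns (match, score, index) tuples, so the index of
            # the longer match can be used to look up the original string
            m = max(scores[:2], key=lambda k: len(k[0]))
            clean_rapid.append(originals[m[2]])
        else:
            clean_rapid.append(originals[idx])

Would something along these lines be the right way to keep the original text, or is there a cleaner option?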