Right now I have a dataset of around 70,000 tweets, and I'm trying to remove tweets that are overly similar. I've decided to treat any pair with a Levenshtein ratio greater than 0.9 as too similar. However, my current code (below) essentially compares every tweet to every tweet kept so far, which is O(N^2) and painfully slow: it takes over 12 hours to run, which is just too much. I tried parallelizing it, which is what I'm running right now, but I don't know whether that will speed things up to an acceptable degree. Is there some way I can accomplish this in O(N)?
import json
import multiprocessing as mp
from pathlib import Path

import Levenshtein


def check_ratio(similar_removed, try_tweet):
    # Compare the candidate tweet against every tweet kept so far.
    for tweet in similar_removed:
        if Levenshtein.ratio(try_tweet, tweet[1]) > 0.9:
            return 1
    return 0


def main():
    path = Path('C:/Data/Python/JobLoss')
    orig_data = []
    with open(path / 'Processed.json') as f:
        data = json.load(f)
        for tweet in data:
            orig_data.append(tweet[2])
    pool = mp.Pool(mp.cpu_count())
    # Seed the de-duplicated list with the first tweet, keeping its index.
    similar_removed = [(0, orig_data[0])]
    for i in range(1, len(orig_data)):
        print('%s / %s' % (i, len(orig_data)))
        try_tweet = orig_data[i]
        too_similar = pool.apply(check_ratio, args=(similar_removed, try_tweet))
        if not too_similar:
            similar_removed.append((i, try_tweet))
    pool.close()
    # Map the surviving indices back to the original records and save them.
    final = [data[ind[0]] for ind in similar_removed]
    with open(path / 'ProcessedSimilarRemoved.json', 'w') as f:
        json.dump(final, f)


if __name__ == '__main__':
    main()
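
For context, one direction I've been considering is locality-sensitive hashing, so that each tweet is only compared against a few candidate near-duplicates instead of the whole kept list. Below is a rough sketch of what I have in mind, assuming the datasketch library's MinHash / MinHashLSH API; note that it estimates Jaccard similarity over character shingles, which is not the same measure as the Levenshtein ratio, so the 0.9 threshold would not carry over directly.

from datasketch import MinHash, MinHashLSH


def shingles(text, k=4):
    # Character k-grams of the tweet; k=4 is an arbitrary choice here.
    return {text[i:i + k] for i in range(max(len(text) - k + 1, 1))}


def make_minhash(text, num_perm=128):
    m = MinHash(num_perm=num_perm)
    for s in shingles(text):
        m.update(s.encode('utf8'))
    return m


def remove_near_duplicates(tweets, threshold=0.9, num_perm=128):
    # Accepted tweets are inserted into the LSH index, so each new tweet is
    # only checked against candidates that share hash buckets with it,
    # not against every tweet kept so far.
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    kept = []
    for i, tweet in enumerate(tweets):
        m = make_minhash(tweet, num_perm)
        if not lsh.query(m):
            lsh.insert(str(i), m)
            kept.append((i, tweet))
    return kept

The trade-off is that hashing only approximates similarity, so a few near-duplicates that the exact Levenshtein check would catch could slip through (or vice versa), but the per-tweet work no longer grows with the size of the kept list.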