I have a list of suppliers and I want to check them to see if there are any possible duplicates.
Let's take some example supplier names:
1. The Supplier GmbH
2. Trading Company LLC & Co. KG
3. DHL Express
4. DHL-Express Inc.
5. Supplier GmbH
6. Trading S.a.r.l.
I created two regex functions:
StripNonAlpha
that removes all non alpha characters and two letter words and replaces "-" with a space and WordMatch
that takes two arguments and returns True
if this specific word exists in the company name (I want to check for whole words, not partial, this is why I'm not using InStr
).
Taking the vendor names from above, I want to have for example supplier 1 and 5, 3 and 4 marked as possible duplicates but not 2 and 6.
I have around 100K suppliers to check, but the code is running very slow. Any clues how to optimize it?
Function StripNonAlpha(TextToReplace As String) As String
Dim ObjRegex As Object
Set ObjRegex = CreateObject("vbscript.regexp")
With ObjRegex
.Global = True
.Pattern = "[^a-zA-Z\s]+"
StripNonAlpha = .Replace(Replace(TextToReplace, "-", Chr(32)), vbNullString)
.Pattern = "\b.{2}\b"
StripNonAlpha = .Replace(StripNonAlpha, vbNullString)
End With
End Function
Function WordMatch(Source As String, MatchExprValue As String) As Boolean
Dim RE As Object
Set RE = CreateObject("vbscript.regexp")
RE.IgnoreCase = True
RE.Pattern = "\b" & MatchExprValue & "\b"
WordMatch = RE.test(Source)
End Function
Sub possible_duplicatev2()
Dim arr1() As String
Dim exclude(1 To 6) As String
Dim arr2() As String
Dim companyname As Range
Dim i As Long
Dim j As Long
Dim k As Long
Dim l As Long
Dim FoundCount As Long
Dim DuplicateCount As Long
Dim rc As Long
Dim scompanyname As String
Dim x As Long
Dim y As Long
exclude(1) = "sarl"
exclude(2) = "gmbh"
exclude(3) = "llc"
exclude(4) = "inc"
exclude(5) = "the"
exclude(6) = "sas"
rc = Range("A" & Rows.Count).End(xlUp).Row
For x = rc To 2 Step -1
If LCase(Range("B" & x).Text) Like "*zzz*" Or LCase(Range("B" & x).Text) Like "*xxx*" Or LCase(Range("B" & x).Text) Like "*yyy*" Then
Range("B" & x).EntireRow.Delete
End If
Next x
rc = Range("A" & Rows.Count).End(xlUp).Row - 1
ReDim arr1(1 To rc, 1 To 2)
For Each companyname In Range("B2", Range("B1").End(xlDown))
scompanyname = StripNonAlpha(LCase(companyname))
arr1(companyname.Row - 1, 1) = scompanyname
Next companyname
For i = 1 To UBound(arr1, 1)
For j = 1 To UBound(exclude)
If WordMatch(arr1(i, 1), exclude(j)) = True Then
Replace arr1(i, 1), exclude(j), ""
End If
Next j
arr2 = Split(arr1(i, 1), " ")
For k = 1 To UBound(arr1, 1)
For l = 0 To UBound(arr2)
If k = i Then
GoTo nextk
ElseIf WordMatch(arr1(k, 1), arr2(l)) = True Then
FoundCount = FoundCount + 1
End If
Next l
If UBound(arr2) = 1 And FoundCount = 1 Then
arr1(k, 2) = "Yes"
DuplicateCount = DuplicateCount + 1
ElseIf UBound(arr2) > 0 And FoundCount > 1 Then
arr1(k, 2) = "Yes"
DuplicateCount = DuplicateCount + 1
Else
arr1(k, 2) = "No"
End If
FoundCount = 0
nextk:
Next k
If DuplicateCount > 0 Then
arr1(i, 2) = "Yes"
Else
arr1(i, 2) = "No"
End If
DuplicateCount = 0
Next i
For y = 1 To UBound(arr1, 1)
Range("D" & y + 1) = arr1(y, 2)
Next y
End Sub