24

How would the Jaro–Winkler distance string comparison algorithm be implemented in C#?

leebickmtu
  • 1,565
  • 2
  • 12
  • 20

4 Answers4

49
public static class JaroWinklerDistance
{
    /* The Winkler modification will not be applied unless the 
     * percent match was at or above the mWeightThreshold percent 
     * without the modification. 
     * Winkler's paper used a default value of 0.7
     */
    private static readonly double mWeightThreshold = 0.7;

    /* Size of the prefix to be concidered by the Winkler modification. 
     * Winkler's paper used a default value of 4
     */
    private static readonly int mNumChars = 4;


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (perfect match) to 1 (no match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double distance(string aString1, string aString2) {
        return 1.0 - proximity(aString1,aString2);
    }


    /// <summary>
    /// Returns the Jaro-Winkler distance between the specified  
    /// strings. The distance is symmetric and will fall in the 
    /// range 0 (no match) to 1 (perfect match). 
    /// </summary>
    /// <param name="aString1">First String</param>
    /// <param name="aString2">Second String</param>
    /// <returns></returns>
    public static double proximity(string aString1, string aString2)
    {
        int lLen1 = aString1.Length;
        int lLen2 = aString2.Length;
        if (lLen1 == 0)
            return lLen2 == 0 ? 1.0 : 0.0;

        int  lSearchRange = Math.Max(0,Math.Max(lLen1,lLen2)/2 - 1);

        // default initialized to false
        bool[] lMatched1 = new bool[lLen1];
        bool[] lMatched2 = new bool[lLen2];

        int lNumCommon = 0;
        for (int i = 0; i < lLen1; ++i) {
            int lStart = Math.Max(0,i-lSearchRange);
            int lEnd = Math.Min(i+lSearchRange+1,lLen2);
            for (int j = lStart; j < lEnd; ++j) {
                if (lMatched2[j]) continue;
                if (aString1[i] != aString2[j])
                    continue;
                lMatched1[i] = true;
                lMatched2[j] = true;
                ++lNumCommon;
                break;
            }
        }
        if (lNumCommon == 0) return 0.0;

        int lNumHalfTransposed = 0;
        int k = 0;
        for (int i = 0; i < lLen1; ++i) {
            if (!lMatched1[i]) continue;
            while (!lMatched2[k]) ++k;
            if (aString1[i] != aString2[k])
                ++lNumHalfTransposed;
            ++k;
        }
        // System.Diagnostics.Debug.WriteLine("numHalfTransposed=" + numHalfTransposed);
        int lNumTransposed = lNumHalfTransposed/2;

        // System.Diagnostics.Debug.WriteLine("numCommon=" + numCommon + " numTransposed=" + numTransposed);
        double lNumCommonD = lNumCommon;
        double lWeight = (lNumCommonD/lLen1
                         + lNumCommonD/lLen2
                         + (lNumCommon - lNumTransposed)/lNumCommonD)/3.0;

        if (lWeight <= mWeightThreshold) return lWeight;
        int lMax = Math.Min(mNumChars,Math.Min(aString1.Length,aString2.Length));
        int lPos = 0;
        while (lPos < lMax && aString1[lPos] == aString2[lPos])
            ++lPos;
        if (lPos == 0) return lWeight;
        return lWeight + 0.1 * lPos * (1.0 - lWeight);

    }


}
leebickmtu
  • 1,565
  • 2
  • 12
  • 20
  • My understanding that it was Jaro-Winkler should return 0-1, where 1 is a perfect match, but this implementation returns zero when identical, and 1 if no match. – Dave Dec 31 '13 at 18:37
  • 5
    Dave, This code supports both actually. Calling proximity(str1, str2) returns 0-1, where 1 is perfect match. Calling distance(str1, str2) returns 0-1, where 0 is perfect match. – leebickmtu Jan 06 '14 at 01:18
  • 4
    I made some (useful) modifications to your implementation: https://gist.github.com/ronnieoverby/2aa19724199df4ec8af6 The most useful is the ability to specify character comparer. – Ronnie Overby May 06 '15 at 16:07
  • 2
    This matches mdq Microsoft.MasterDataServices.DataQuality assembly Jaro-Winkler score better than any of the Nuget packages available that also do Jaro-Winkler. – joezen777 Jul 15 '15 at 20:25
  • @MonsterMMORPG: I converted this from Java code I found somewhere to C#. Don't recall where the source was. – leebickmtu Sep 10 '15 at 20:08
  • 1
    I took the liberty of porting this to TSQL aswell, you can find the code at http://stackoverflow.com/a/34879177/1856391 in the event that anyone is interested :) – Maritim Jan 19 '16 at 14:25
  • 1
    Yeah. Some Nuget packages have bugs that give the wrong Jaro Winkler "similarity" and have infinite loop problems. https://fuzzystring.codeplex.com/workitem/list/basic – Jess Apr 13 '16 at 13:23
  • 2
    For more information about the failings of certain Jaro Winkler NuGet packages, see here: http://jessemon.blogspot.com/2016/05/comparison-and-review-of-net-fuzzy.html – Jess Sep 12 '17 at 15:18
2

You can take a look on Lucene.Net ,

it implement Jaro–Winkler distance algorithm ,

and its score is different from which leebickmtu post ,

you can take it as reference

the url is below :

http://lucenenet.apache.org/docs/3.0.3/db/d12/_jaro_winkler_distance_8cs_source.html

holmes2136
  • 477
  • 5
  • 14
  • 2
    I tried the code on Lucene.Net and it gives the same results as the solution I posted. It matches with the proximity() call of my code. The distance() call just reverses the interval. – leebickmtu Jan 16 '14 at 23:30
  • How did you implement this code, any example will be very very helpful for me. – Sayed Uz Zaman Oct 14 '21 at 07:18
0

Use below class to use jaro winkler. i have customized both algorithm jaro and jaro-winkler.

Visit on Github for DLL.

using System;
using System.Linq;

namespace Search
{
    public static class EditDistance
    {
        private struct JaroMetrics
        {
            public int Matches;

            public int Transpositions;
        }

        private static EditDistance.JaroMetrics Matches(string s1, string s2)
        {
            string text;
            string text2;
            if (s1.Length > s2.Length)
            {
                text = s1;
                text2 = s2;
            }
            else
            {
                text = s2;
                text2 = s1;
            }
            int num = Math.Max(text.Length / 2 - 1, 0);
            int[] array = new int[text2.Length];
            int i;
            for (i = 0; i < array.Length; i++)
            {
                array[i] = -1;
            }
            bool[] array2 = new bool[text.Length];
            int num2 = 0;
            for (int j = 0; j < text2.Length; j++)
            {
                char c = text2[j];
                int k = Math.Max(j - num, 0);
                int num3 = Math.Min(j + num + 1, text.Length);
                while (k < num3)
                {
                    if (!array2[k] && c == text[k])
                    {
                        array[j] = k;
                        array2[k] = true;
                        num2++;
                        break;
                    }
                    k++;
                }
            }
            char[] array3 = new char[num2];
            char[] ms2 = new char[num2];
            i = 0;
            int num4 = 0;
            while (i < text2.Length)
            {
                if (array[i] != -1)
                {
                    array3[num4] = text2[i];
                    num4++;
                }
                i++;
            }
            i = 0;
            num4 = 0;
            while (i < text.Length)
            {
                if (array2[i])
                {
                    ms2[num4] = text[i];
                    num4++;
                }
                i++;
            }
            int num5 = array3.Where((char t, int mi) => t != ms2[mi]).Count<char>();
            EditDistance.JaroMetrics result;
            result.Matches = num2;
            result.Transpositions = num5 / 2;
            return result;
        }

        

        public static float JaroWinkler(this string s1, string s2, float prefixScale, float boostThreshold)
        {
            prefixScale = ((prefixScale > 0.25f) ? 0.25f : prefixScale);
            prefixScale = ((prefixScale < 0f) ? 0f : prefixScale);
            float num = s1.Jaro(s2);
            int num2 = 0;
            for (int i = 0; i < Math.Min(s1.Length, s2.Length); i++)
            {
                if (s1[i] != s2[i])
                {
                    break;
                }
                num2++;
            }
            return (num < boostThreshold) ? num : (num + prefixScale * (float)num2 * (1f - num));
        }

        public static float JaroWinkler(this string s1, string s2, float prefixScale)
        {
            return s1.JaroWinkler(s2, prefixScale, 0.7f);
        }

        public static float JaroWinkler(this string s1, string s2)
        {
            return s1.JaroWinkler(s2, 0.1f, 0.7f);
        }

        public static float Jaro(this string s1, string s2)
        {
            EditDistance.JaroMetrics jaroMetrics = EditDistance.Matches(s1, s2);
            float num = (float)jaroMetrics.Matches;
            int transpositions = jaroMetrics.Transpositions;
            float result;
            if (num == 0f)
            {
                result = 0f;
            }
            else
            {
                float num2 = (num / (float)s1.Length + num / (float)s2.Length + (num - (float)transpositions) / num) / 3f;
                result = num2;
            }
            return result;
        }

        public static int LevenshteinDistance(this string source, string target)
        {
            int result;
            if (string.IsNullOrEmpty(source))
            {
                if (string.IsNullOrEmpty(target))
                {
                    result = 0;
                }
                else
                {
                    result = target.Length;
                }
            }
            else if (string.IsNullOrEmpty(target))
            {
                result = source.Length;
            }
            else
            {
                if (source.Length > target.Length)
                {
                    string text = target;
                    target = source;
                    source = text;
                }
                int length = target.Length;
                int length2 = source.Length;
                int[,] array = new int[2, length + 1];
                for (int i = 1; i <= length; i++)
                {
                    array[0, i] = i;
                }
                int num = 0;
                for (int j = 1; j <= length2; j++)
                {
                    num = (j & 1);
                    array[num, 0] = j;
                    int num2 = num ^ 1;
                    for (int i = 1; i <= length; i++)
                    {
                        int num3 = (target[i - 1] == source[j - 1]) ? 0 : 1;
                        array[num, i] = Math.Min(Math.Min(array[num2, i] + 1, array[num, i - 1] + 1), array[num2, i - 1] + num3);
                    }
                }
                result = array[num, length];
            }
            return result;
        }
    }
}
Peter
  • 37,042
  • 39
  • 142
  • 198
KARAN
  • 1,023
  • 1
  • 12
  • 24
0

You can use the below code which works very well for all the kind of strings.After getting the result you need to multiply with 100 to get the percentage of similarity. I hope it will solves your problem.

    public class JaroWinkler
{
    private const double defaultMismatchScore = 0.0;
    private const double defaultMatchScore = 1.0;

    /// <summary>
    /// Gets the similarity between two strings by using the Jaro-Winkler algorithm.
    /// A value of 1 means perfect match. A value of zero represents an absolute no match
    /// </summary>
    /// <param name="_firstWord"></param>
    /// <param name="_secondWord"></param>
    /// <returns>a value between 0-1 of the similarity</returns>
    /// 
    public static double RateSimilarity(string _firstWord, string _secondWord)
    {
        // Converting to lower case is not part of the original Jaro-Winkler implementation
        // But we don't really care about case sensitivity in DIAMOND and wouldn't decrease security names similarity rate just because
        // of Case sensitivity
        _firstWord = _firstWord.ToLower();
        _secondWord = _secondWord.ToLower();

        if ((_firstWord != null) && (_secondWord != null))
        {
            if (_firstWord == _secondWord)
                //return (SqlDouble)defaultMatchScore;
                return defaultMatchScore;
            else
            {
                // Get half the length of the string rounded up - (this is the distance used for acceptable transpositions)
                int halfLength = Math.Min(_firstWord.Length, _secondWord.Length) / 2 + 1;

                // Get common characters
                StringBuilder common1 = GetCommonCharacters(_firstWord, _secondWord, halfLength);
                int commonMatches = common1.Length;

                // Check for zero in common
                if (commonMatches == 0)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                StringBuilder common2 = GetCommonCharacters(_secondWord, _firstWord, halfLength);

                // Check for same length common strings returning 0 if is not the same
                if (commonMatches != common2.Length)
                    //return (SqlDouble)defaultMismatchScore;
                    return defaultMismatchScore;

                // Get the number of transpositions
                int transpositions = 0;
                for (int i = 0; i < commonMatches; i++)
                {
                    if (common1[i] != common2[i])
                        transpositions++;
                }

                int j = 0;
                j += 1;

                // Calculate Jaro metric
                transpositions /= 2;
                double jaroMetric = commonMatches / (3.0 * _firstWord.Length) + commonMatches / (3.0 * _secondWord.Length) + (commonMatches - transpositions) / (3.0 * commonMatches);
                //return (SqlDouble)jaroMetric;
                return jaroMetric;
            }
        }

        //return (SqlDouble)defaultMismatchScore;
        return defaultMismatchScore;
    }

    /// <summary>
    /// Returns a string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1.
    /// </summary>
    /// <param name="firstWord">string one</param>
    /// <param name="secondWord">string two</param>
    /// <param name="separationDistance">separation distance</param>
    /// <returns>A string buffer of characters from string1 within string2 if they are of a given
    /// distance seperation from the position in string1</returns>
    private static StringBuilder GetCommonCharacters(string firstWord, string secondWord, int separationDistance)
    {
        if ((firstWord != null) && (secondWord != null))
        {
            StringBuilder returnCommons = new StringBuilder(20);
            StringBuilder copy = new StringBuilder(secondWord);
            int firstWordLength = firstWord.Length;
            int secondWordLength = secondWord.Length;

            for (int i = 0; i < firstWordLength; i++)
            {
                char character = firstWord[i];
                bool found = false;

                for (int j = Math.Max(0, i - separationDistance); !found && j < Math.Min(i + separationDistance, secondWordLength); j++)
                {
                    if (copy[j] == character)
                    {
                        found = true;
                        returnCommons.Append(character);
                        copy[j] = '#';
                    }
                }
            }
            return returnCommons;
        }
        return null;
    }
}