137

I am looking for a JavaScript function that can compare two strings and return the likeliness that they are alike. I have looked at soundex but that's not really great for multi-word strings or non-names. I am looking for a function like:

    function compare(strA,strB){
    
    }
    
    compare("Apples","apple") = Some X Percentage.

The function would work with all types of strings, including numbers, multi-word values, and names. Perhaps there's a simple algorithm I could use?

Ultimately none of these served my purpose so I used this:

     function compare(c, u) {
            var incept = false;
            var ca = c.split(",");
            u = clean(u);
            //ca = correct answer array (Collection of all correct answer)
            //caa = a single correct answer word array (collection of words of a single correct answer)
            //u = array of user answer words cleaned using custom clean function
            for (var z = 0; z < ca.length; z++) {
                caa = $.trim(ca[z]).split(" ");
                var pc = 0;
                for (var x = 0; x < caa.length; x++) {
                    for (var y = 0; y < u.length; y++) {
                        if (soundex(u[y]) != null && soundex(caa[x]) != null) {
                            if (soundex(u[y]) == soundex(caa[x])) {
                                pc = pc + 1;
                            }
                        }
                        else {
                            if (u[y].indexOf(caa[x]) > -1) {
                                pc = pc + 1;
                            }
                        }
                    }
                }
                if ((pc / caa.length) > 0.5) {
                    return true;
                }
            }
            return false;
        }
        
        // create object listing the SOUNDEX values for each letter
        // -1 indicates that the letter is not coded, but is used for coding
        //  0 indicates that the letter is omitted for modern census archives
        //                              but acts like -1 for older census archives
        //  1 is for BFPV
        //  2 is for CGJKQSXZ
        //  3 is for DT
        //  4 is for L
        //  5 is for MN my home state
        //  6 is for R
        function makesoundex() {
            this.a = -1
            this.b = 1
            this.c = 2
            this.d = 3
            this.e = -1
            this.f = 1
            this.g = 2
            this.h = 0
            this.i = -1
            this.j = 2
            this.k = 2
            this.l = 4
            this.m = 5
            this.n = 5
            this.o = -1
            this.p = 1
            this.q = 2
            this.r = 6
            this.s = 2
            this.t = 3
            this.u = -1
            this.v = 1
            this.w = 0
            this.x = 2
            this.y = -1
            this.z = 2
        }
        
        var sndx = new makesoundex()
        
        // check to see that the input is valid
        function isSurname(name) {
            if (name == "" || name == null) {
                return false
            } else {
                for (var i = 0; i < name.length; i++) {
                    var letter = name.charAt(i)
                    if (!(letter >= 'a' && letter <= 'z' || letter >= 'A' && letter <= 'Z')) {
                        return false
                    }
                }
            }
            return true
        }
        
        // Collapse out directly adjacent sounds
        // 1. Assume that surname.length>=1
        // 2. Assume that surname contains only lowercase letters
        function collapse(surname) {
            if (surname.length == 1) {
                return surname
            }
            var right = collapse(surname.substring(1, surname.length))
            if (sndx[surname.charAt(0)] == sndx[right.charAt(0)]) {
                return surname.charAt(0) + right.substring(1, right.length)
            }
            return surname.charAt(0) + right
        }
        
        // Collapse out directly adjacent sounds using the new National Archives method
        // 1. Assume that surname.length>=1
        // 2. Assume that surname contains only lowercase letters
        // 3. H and W are completely ignored
        function omit(surname) {
            if (surname.length == 1) {
                return surname
            }
            var right = omit(surname.substring(1, surname.length))
            if (!sndx[right.charAt(0)]) {
                return surname.charAt(0) + right.substring(1, right.length)
            }
            return surname.charAt(0) + right
        }
        
        // Output the coded sequence
        function output_sequence(seq) {
            var output = seq.charAt(0).toUpperCase() // Retain first letter
            output += "-" // Separate letter with a dash
            var stage2 = seq.substring(1, seq.length)
            var count = 0
            for (var i = 0; i < stage2.length && count < 3; i++) {
                if (sndx[stage2.charAt(i)] > 0) {
                    output += sndx[stage2.charAt(i)]
                    count++
                }
            }
            for (; count < 3; count++) {
                output += "0"
            }
            return output
        }
        
        // Compute the SOUNDEX code for the surname
        function soundex(value) {
            if (!isSurname(value)) {
                return null
            }
            var stage1 = collapse(value.toLowerCase())
            //form.result.value=output_sequence(stage1);
        
            var stage1 = omit(value.toLowerCase())
            var stage2 = collapse(stage1)
            return output_sequence(stage2);
        
        }
        
        function clean(u) {
            var u = u.replace(/\,/g, "");
            u = u.toLowerCase().split(" ");
            var cw = ["ARRAY OF WORDS TO BE EXCLUDED FROM COMPARISON"];
            var n = [];
            for (var y = 0; y < u.length; y++) {
                var test = false;
                for (var z = 0; z < cw.length; z++) {
                    if (u[y] != "" && u[y] != cw[z]) {
                        test = true;
                        break;
                    }
                }
                if (test) {
        //Don't use & or $ in comparison
                    var val = u[y].replace("$", "").replace("&", "");
                    n.push(val);
                }
            }
            return n;
        }
Compositr
  • 717
  • 6
  • 18
Brad Ruderman
  • 2,053
  • 2
  • 15
  • 22
  • [Efficient way of calculating likeness scores of strings when sample size is large?](http://stackoverflow.com/questions/1609742/efficient-way-of-calculating-likeness-scores-of-strings-when-sample-size-is-larg) – Andreas May 06 '12 at 20:05
  • I am testing this out, still having trouble finding the perfect one. The classic example that breaks these. Say the question is "What are the first two presidents?" and someone answers "adams and washington". A string comparison to "george washington john adams" should be roughly 50%. – Brad Ruderman May 06 '12 at 20:52
  • 1
    oof, depends on jQuery? – Shawn Whinnery Jun 15 '18 at 18:10

11 Answers11

225

Here's an answer based on Levenshtein distance https://en.wikipedia.org/wiki/Levenshtein_distance

function similarity(s1, s2) {
  var longer = s1;
  var shorter = s2;
  if (s1.length < s2.length) {
    longer = s2;
    shorter = s1;
  }
  var longerLength = longer.length;
  if (longerLength == 0) {
    return 1.0;
  }
  return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
}

For calculating edit distance

function editDistance(s1, s2) {
  s1 = s1.toLowerCase();
  s2 = s2.toLowerCase();

  var costs = new Array();
  for (var i = 0; i <= s1.length; i++) {
    var lastValue = i;
    for (var j = 0; j <= s2.length; j++) {
      if (i == 0)
        costs[j] = j;
      else {
        if (j > 0) {
          var newValue = costs[j - 1];
          if (s1.charAt(i - 1) != s2.charAt(j - 1))
            newValue = Math.min(Math.min(newValue, lastValue),
              costs[j]) + 1;
          costs[j - 1] = lastValue;
          lastValue = newValue;
        }
      }
    }
    if (i > 0)
      costs[s2.length] = lastValue;
  }
  return costs[s2.length];
}

Usage

similarity('Stack Overflow','Stack Ovrflw')

returns 0.8571428571428571


You can play with it below:

function checkSimilarity(){
  var str1 = document.getElementById("lhsInput").value;
  var str2 = document.getElementById("rhsInput").value;
  document.getElementById("output").innerHTML = similarity(str1, str2);
}

function similarity(s1, s2) {
      var longer = s1;
      var shorter = s2;
      if (s1.length < s2.length) {
        longer = s2;
        shorter = s1;
      }
      var longerLength = longer.length;
      if (longerLength == 0) {
        return 1.0;
      }
      return (longerLength - editDistance(longer, shorter)) / parseFloat(longerLength);
    }

    function editDistance(s1, s2) {
      s1 = s1.toLowerCase();
      s2 = s2.toLowerCase();

      var costs = new Array();
      for (var i = 0; i <= s1.length; i++) {
        var lastValue = i;
        for (var j = 0; j <= s2.length; j++) {
          if (i == 0)
            costs[j] = j;
          else {
            if (j > 0) {
              var newValue = costs[j - 1];
              if (s1.charAt(i - 1) != s2.charAt(j - 1))
                newValue = Math.min(Math.min(newValue, lastValue),
                  costs[j]) + 1;
              costs[j - 1] = lastValue;
              lastValue = newValue;
            }
          }
        }
        if (i > 0)
          costs[s2.length] = lastValue;
      }
      return costs[s2.length];
    }
<div><label for="lhsInput">String 1:</label> <input type="text" id="lhsInput" oninput="checkSimilarity()" /></div>
<div><label for="rhsInput">String 2:</label> <input type="text" id="rhsInput" oninput="checkSimilarity()" /></div>
<div>Match: <span id="output">No Input</span></div>
David
  • 4,744
  • 5
  • 33
  • 64
overlord1234
  • 2,366
  • 1
  • 10
  • 8
  • 1
    An improvement for several words: var similarity2 = function(s1, s2){ var split1 = s1.split(' '); var split2 = s2.split(' '); var sum = 0; var max = 0; var temp = 0; for(var i=0; i – infinito84 May 09 '17 at 19:50
  • @overlord1234 Does the method above works for string like this: 9e27dbb9ff6eea70821c02b4457cbc6b7eb8e12a64f46c192c3a05f1bc1519acd101193dac157c6233d9d773f9b364ca210d6287f9efa00bfc656746782905be ? – QWERTY Oct 05 '17 at 13:53
  • It does work with strings without a semantic attached to it. Please try and run the in-line code snippet (thanks to David). I get a similarity of 0.17857142857142858 when I input the aforementioned strings. – overlord1234 Oct 12 '17 at 19:44
  • @hyperfkcb he's implementing the edit distance algorithm, which counts how many characters are in the wrong position (more or less), so for calculating the percentage he is taking the longer possible edit distance value (longerLength) and doing (longerLength - editDistance ) / longerLength. – infinito84 Feb 05 '18 at 04:29
  • 1
    However, it is too slow for long strings. – upupming Jan 17 '19 at 02:14
  • @overlord1234 If I would like to exclude chars like `!,;.` how could I do that? For instance matching `Help me!` should give 100% match of `Help me` – Peter Warbo Apr 30 '19 at 07:55
  • @PeterWarbo Remove any special characters from the base strings s1 and s2 before doing the comparison. Whether you want to remove the special characters completely or introduce a blank space, that depends on your use case – overlord1234 May 13 '19 at 22:14
  • Why does it think "aabb" and "bbaa" are 0% of a match? – KTibow May 30 '20 at 01:51
  • 2
    But it doesn't work for caps. It's not case sensitive! –  Aug 27 '21 at 13:36
  • Not so good from performance side of view, check my post in this thread. Better to use string-similarity library, it twice faster – Nigrimmist Feb 15 '22 at 20:41
26

Using this library for string similarity worked like a charm for me!

Here's the Example -

var similarity = stringSimilarity.compareTwoStrings("Apples","apple");    // => 0.88
Tushar Walzade
  • 3,737
  • 4
  • 33
  • 56
16

Here is a very simple function that does a comparison and returns a percentage based on equivalency. While it has not been tested for all possible scenarios, it may help you get started.

function similar(a,b) {
    var equivalency = 0;
    var minLength = (a.length > b.length) ? b.length : a.length;    
    var maxLength = (a.length < b.length) ? b.length : a.length;    
    for(var i = 0; i < minLength; i++) {
        if(a[i] == b[i]) {
            equivalency++;
        }
    }
    

    var weight = equivalency / maxLength;
    return (weight * 100) + "%";
}
alert(similar("test","tes"));   // 75%
alert(similar("test","test"));  // 100%
alert(similar("test","testt")); // 80%
alert(similar("test","tess"));  // 75%
shubhu
  • 123
  • 5
jamesmortensen
  • 33,636
  • 11
  • 99
  • 120
14

To Find degree of similarity between two strings; we can use more than one or two methods but I am mostly inclined towards the usage of 'Dice's Coefficient' . which is better! well in my knowledge than using 'Levenshtein distance'

Using this 'string-similarity' package from npm you will be able to work on what I said above.

some easy usage examples are

var stringSimilarity = require('string-similarity');

var similarity = stringSimilarity.compareTwoStrings('healed', 'sealed'); 

var matches = stringSimilarity.findBestMatch('healed', ['edward', 'sealed', 'theatre']);

for more please visit the link given above. Thankyou.

Rizwan Atta
  • 3,222
  • 2
  • 19
  • 31
Leon Grin
  • 889
  • 8
  • 10
  • 1
    A link to a solution is welcome, but please ensure your answer is useful without it: [add context around the link](https://meta.stackexchange.com/a/8259) so your fellow users will have some idea what it is and why it’s there, then quote the most relevant part of the page you're linking to in case the target page is unavailable. [Answers that are little more than a link may be deleted](https://stackoverflow.com/help/deleted-answers). – David Buck Dec 05 '19 at 19:19
8

Just one I quickly wrote that might be good enough for your purposes:

function Compare(strA,strB){
    for(var result = 0, i = strA.length; i--;){
        if(typeof strB[i] == 'undefined' || strA[i] == strB[i]);
        else if(strA[i].toLowerCase() == strB[i].toLowerCase())
            result++;
        else
            result += 4;
    }
    return 1 - (result + 4*Math.abs(strA.length - strB.length))/(2*(strA.length+strB.length));
}

This weighs characters that are the same but different case 1 quarter as heavily as characters that are completely different or missing. It returns a number between 0 and 1, 1 meaning the strings are identical. 0 meaning they have no similarities. Examples:

Compare("Apple", "Apple")    // 1
Compare("Apples", "Apple")   // 0.8181818181818181
Compare("Apples", "apple")   // 0.7727272727272727
Compare("a", "A")            // 0.75
Compare("Apples", "appppp")  // 0.45833333333333337
Compare("a", "b")            // 0
Paul
  • 139,544
  • 27
  • 275
  • 264
  • 11
    Not so accurate: Compare("Apple", "zApple") = 0.07 , while Compare("Apple", "Applez") = 0.84 –  Mar 05 '16 at 00:04
  • 3
    @Kousha , it's positional. "Apple" and "zApple" only have one letter in common (the second `p`). – Paul Mar 05 '16 at 00:13
  • 2
    @Paulpro Apple, and zApple have five letters in common logically. It is your implementation fault. Apple, zApple, Applez are similar. –  Mar 05 '16 at 17:52
  • 2
    @Kousha, zApple isn't similar according to this algorithm, since it's positional. That doesn't make the algorithm incorrect. – Paul Mar 05 '16 at 19:35
  • 11
    @Paulpro: That doesn't make your algorithm incorrect, but makes it a poor answer for this question... – MarcoS Jun 03 '17 at 11:55
6

How about function similar_text from PHP.js library?

It is based on a PHP function with the same name.

function similar_text (first, second) {
    // Calculates the similarity between two strings  
    // discuss at: http://phpjs.org/functions/similar_text

    if (first === null || second === null || typeof first === 'undefined' || typeof second === 'undefined') {
        return 0;
    }

    first += '';
    second += '';

    var pos1 = 0,
        pos2 = 0,
        max = 0,
        firstLength = first.length,
        secondLength = second.length,
        p, q, l, sum;

    max = 0;

    for (p = 0; p < firstLength; p++) {
        for (q = 0; q < secondLength; q++) {
            for (l = 0;
            (p + l < firstLength) && (q + l < secondLength) && (first.charAt(p + l) === second.charAt(q + l)); l++);
            if (l > max) {
                max = l;
                pos1 = p;
                pos2 = q;
            }
        }
    }

    sum = max;

    if (sum) {
        if (pos1 && pos2) {
            sum += this.similar_text(first.substr(0, pos2), second.substr(0, pos2));
        }

        if ((pos1 + max < firstLength) && (pos2 + max < secondLength)) {
            sum += this.similar_text(first.substr(pos1 + max, firstLength - pos1 - max), second.substr(pos2 + max, secondLength - pos2 - max));
        }
    }

    return sum;
}
VisioN
  • 143,310
  • 32
  • 282
  • 281
  • 1
    Is the similarity returned based on matching character? How does it evaluate the similarity – QWERTY Oct 05 '17 at 13:48
2

fuzzyset - A fuzzy string set for javascript. fuzzyset is a data structure that performs something akin to fulltext search against data to determine likely mispellings and approximate string matching. Note that this is a javascript port of a python library.

2

To some extent, I like the ideas of Dice's coefficient embedded in the string-similarity module. But I feel that considering the bigrams only and not taking into account their multiplicities is missing some important data. Below is a version that also handles multiplicities, and I think is a simpler implementation overall. I don't try to use their API, offering only a function which compares two strings after some manipulation (removing non-alphanumeric characters, lower-casing everything, and compressing but not removing whitespace), built atop one which compares them without that manipulation. It would be easy enough to wrap this back in their API, but I see little need.

const stringSimilarity = (a, b) =>
  _stringSimilarity (prep (a), prep (b))

const _stringSimilarity = (a, b) => {
  const bg1 = bigrams (a)
  const bg2 = bigrams (b)
  const c1 = count (bg1)
  const c2 = count (bg2)
  const combined = uniq ([... bg1, ... bg2]) 
    .reduce ((t, k) => t + (Math .min (c1 [k] || 0, c2 [k] || 0)), 0)
  return 2 * combined / (Math .max (bg1 .length + bg2 .length, 1))
}

const prep = (str) => // TODO: unicode support?
  str .toLowerCase () .replace (/[^\w\s]/g, ' ') .replace (/\s+/g, ' ')

const bigrams = (str) => 
  [...str] .slice (0, -1) .map ((c, i) => c + str [i + 1])

const count = (xs) => 
  xs .reduce ((a, x) => ((a [x] = (a [x] || 0) + 1), a), {})

const uniq = (xs) => 
  [... new Set (xs)]

console .log (stringSimilarity (
  'foobar', 
  'Foobar'
)) //=> 1

console .log (stringSimilarity (
  "healed", 
  "sealed"
))//=> 0.8

console .log (stringSimilarity (
  "Olive-green table for sale, in extremely good condition.",
  "For sale: table in very good  condition, olive green in colour."
)) //=> 0.7787610619469026

console .log (stringSimilarity (
  "Olive-green table for sale, in extremely good condition.",
  "For sale: green Subaru Impreza, 210,000 miles"
)) //=> 0.38636363636363635

console .log (stringSimilarity (
  "Olive-green table for sale, in extremely good condition.",
  "Wanted: mountain bike with at least 21 gears."
)) //=> 0.1702127659574468

console .log (stringSimilarity (
  "The rain in Spain falls mainly on the plain.",
  "The run in Spun falls munly on the plun.",
)) //=> 0.7560975609756098

console .log (stringSimilarity (
  "Fa la la la la, la la la la",
  "Fa la la la la, la la",
)) //=> 0.8636363636363636

console .log (stringSimilarity (
  "car crash",
  "carcrash",
)) //=> 0.8

console .log (stringSimilarity (
  "Now is the time for all good men to come to the aid of their party.",
  "Huh?",
)) //=> 0
.as-console-wrapper {max-height: 100% !important; top: 0}

Some of the test cases are from string-similarity, others are my own. They show some significant differences from that package, but nothing untoward. The only one I would call out is the difference between "car crash" and "carcrash", which string-similarity sees as identical and I report with a similarity of 0.8. My version finds more similarity in all the olive-green test-cases than does string-similarity, but as these are in any case fairly arbitrary numbers, I'm not sure how much difference it makes; they certainly position them in the same relative order.

Scott Sauyet
  • 49,207
  • 4
  • 49
  • 103
2

string-similarity lib vs Top answer (by @overloard1234) performance comparation you can find below

Based on @Tushar Walzade's advice to use string-similarity library, you can find, that for example

stringSimilatityLib.findBestMatch('KIA','Kia').bestMatch.rating 

will return 0.0

So, looks like better to compare it in lowerCase.

Better base usage (for arrays) :

findBestMatch(str, strArr) {
   const lowerCaseArr = strArr.map(element => element.toLowerCase());//creating lower case array
   const match = stringSimilatityLib.findBestMatch(str.toLowerCase(), lowerCaseArr).bestMatch; //trying to find bestMatch
   if (match.rating > 0) {
      const foundIndex = lowerCaseArr.findIndex(x => x === match.target); //finding the index of found best case
      return strArr[foundIndex]; //returning initial value from array
   }
    return null;
},

Performance

Also, i compared top answer here (made by @overloard1234) and string-similarity lib (v4.0.4).

The results you can find here : https://jsbench.me/szkzojoskq/1

Perf tests

Result : string-similarity is ~ twice faster

Just for fun : v2.0 of string-similarity library slower, than latest 4.0.4 about 2.2 times. So update it, if you are still using < 3.0 :)

Nigrimmist
  • 10,289
  • 4
  • 52
  • 53
0
  const str1 = " pARTH PARmar r  ";
  const str2 = "  parmar r par     ";

  function calculateSimilarity(str1 = "", str2 = "") {
    let longer = str1.trim();
    let shorter = str2.trim();

    let a1 = longer.toLowerCase().split(" ");
    let b1 = shorter.toLowerCase().split(" ");
    let result = a1.every((aa, i) => aa[0] === b1[i][0]);

    if (longer.length < shorter.length)  [longer,shorter] = [shorter,longer];

    var arr = [];
    let count = 0;
    for(var i = 0;i<longer.length;i++){
      if(shorter && shorter.includes(longer[i])) {
        shorter = shorter.replace(longer[i],"")
        count++
      };
    }

    return {
      score : (count*100)/longer.length,
      result
    }
  }

  console.log(calculateSimilarity(str1, str2));
  • 1
    Your answer could be improved with additional supporting information. Please [edit] to add further details, such as citations or documentation, so that others can confirm that your answer is correct. You can find more information on how to write good answers [in the help center](/help/how-to-answer). – Community Sep 09 '22 at 07:31
0

I used @overlord1234 function, but corrected ь: '', cuz English words don't have this letter, and next need return a[char] ?? char instead of return a[char] || char