I was looking to test the performance of MDLD for some in-browser string comparisions to be integrated into a web-app. The use-case involves comparing strings like, "300mm, Packed Wall" and "Packed Wall - 300mm", so I was looking for fuzzy string matching, that has some tolerance for punctuation and typos, as well as allowing block character transpositions.
I wasn't able to find an implementation online for Javascript. I found a version written for PL/SQL available at CSIRO's Taxamatch Wiki.
This was my attempt at converting the code in to JS; the results for the basic function seem fairly accurate, however, the block transposition calculation doesn't give the expected results. E.g. "Hi There" vs "There Hi" returns "6", regardless of what the block limit is set to.
If anyone knows of a working implementation, could you point me to it? Alternatively, what's the problem with my adaptation, or the source code itself? The only major change I made was to use "Math.ceil()" in two instances where the source appeared to use integer division, which would always take the floor-- It was causing odd issues for inputs that would result in 1 character strings-- but didn't seem to affect the behaviour of other cases I'd tested.
function mdld(str1, str2, block_lim)
{
mycol = [];
cost = 0;
len1 = str1.length;
len2 = str2.length;
if( str1 === str2 )
return 0;
else if ( len1 === 0 || len2 === 0 )
return Math.max(len1, len2);
else if ( len1 === 1 && len2 === 1 && str1 !== str2 )
return 1;
else
{
// Temporary strings which will be pre-processed
// Speeds up calcs & retains correct measurement
temp1 = str1;
temp2 = str2;
// Trim any common initial characters
while ( temp1.substr(0,1) === temp2.substr(0,1) )
{
temp1 = temp1.substr(1, temp1.length);
temp2 = temp2.substr(1, temp2.length);
}
// Trim any trailing characters
while ( temp1.substr(-1,1) === temp2.substr(-1,1) )
{
temp1 = temp1.substr(0,temp1.length-1);
temp2 = temp2.substr(0,temp2.length-1);
}
len1 = temp1.length;
len2 = temp2.length;
// Calc Levenshtein Distance
if (len1 === 0 || len2 === 0)
return Math.max(len1, len2);
else if (len1 === 1 && len2 === 1 && str1 !== str2)
return 1;
else
{
// Create columns
var s, t;
for(s = 0; s <= len1; s++)
mycol[s] = [];
// Enter values into leftmost column
for(t = 0; t <= len2; t++)
mycol[0][t] = t;
// Populate remaining columns
for(s = 1; s <= len1; s++)
{
mycol[s][0] = s;
// Populate first row (each cell of one column)
for(t = 1; t <= len2; t++)
{
//Calculate cost
if (temp1.substr(s-1,1) === temp2.substr(t-1,1))
cost = 0;
else
cost = 1;
// extension to cover multiple character transpositions
// that includes calculation of original Levenshtein distance when no transposition
tempBlockLen = Math.min( Math.ceil(len1/2), Math.ceil(len2/2), !block_lim ? 1 : block_lim );
while (tempBlockLen >= 1)
{
if ((s >= tempBlockLen * 2) &&
(t >= tempBlockLen * 2) &&
(temp1.substr(s-tempBlockLen*2, tempBlockLen) === temp2.substr(t-tempBlockLen, tempBlockLen)) &&
(temp1.substr(s-tempBlockLen, tempBlockLen) === temp2.substr(t-tempBlockLen*2, tempBlockLen)))
{
// Transposition found
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-tempBlockLen*2][t-tempBlockLen*2] + cost + tempBlockLen - 1 );
tempBlockLen = 0;
}
else if (tempBlockLen === 1)
{
// No Transposition
mycol[s][t] = Math.min( mycol[s][t-1] + 1,
mycol[s-1][t] + 1,
mycol[s-1][t-1] + cost );
}
tempBlockLen = tempBlockLen - 1;
}
}
}
}
return mycol[len1][len2];
}
}