How to optimize this edit distance code i.e. finding the number of bits changed between 2 values! e.g. word1 = '010000001000011111101000001001000110001' word2 = '010000001000011111101000001011111111111'
When i tried to run on Hadoop it takes ages to complete?
How to reduce the for loop and comparsions ?
#!/usr/bin/python
import os, re, string, sys
from numpy import zeros
def calculateDistance(word1, word2):
x = zeros( (len(word1)+1, len(word2)+1) )
for i in range(0,len(word1)+1):
x[i,0] = i
for i in range(0,len(word2)+1):
x[0,i] = i
for j in range(1,len(word2)+1):
for i in range(1,len(word1)+1):
if word1[i-1] == word2[j-1]:
x[i,j] = x[i-1,j-1]
else:
minimum = x[i-1, j] + 1
if minimum > x[i, j-1] + 1:
minimum = x[i, j-1] + 1
if minimum > x[i-1, j-1] + 1:
minimum = x[i-1, j-1] + 1
x[i,j] = minimum
return x[len(word1), len(word2)]