I have a C# function that compares two strings oldString
and newString
line-by-line. Each comparison takes case into account only within quotes. For example, these two lines are equal:
If Foo Then: Debug.Print "Hello World!"
If foo Then: Debug.print "Hello World!"
And these two are unequal:
If Foo Then: Debug.Print "Hello World!"
If Foo Then: Debug.Print "hello world!"
The function returns a new file constructed from newString
, but wherever there are unchanged lines, the lines from oldString
are used instead. So if this is oldString
:
If Foo Then
Debug.Print "Hello World!"
End If
Debug.Print "This line will be deleted soon."
And this is newString
:
If foo Then
Debug.print "Hello World!"
Debug.print "This is a new line."
End If
Then the function returns this:
If Foo Then
Debug.Print "Hello World!"
Debug.print "This is a new line."
End If
Currently I'm using this code based on the dynamic programming approach to the longest common substring problem, with some simple optimizations:
public static string FixCase(string oldString, string newString) {
if (string.Equals(oldString, newString, StringComparison.InvariantCulture)) {
return newString;
}
var oldLines = oldString.Split(new[] { "\r\n" }, StringSplitOptions.None).ToList();
var newLines = newString.Split(new[] { "\r\n" }, StringSplitOptions.None).ToList();
var headEqualLines = new List<string>();
var minLineCount = oldLines.Count < newLines.Count ? oldLines.Count : newLines.Count;
for (var i = 0; i < minLineCount; i++) {
if (VbaLineEqual(oldLines[i], newLines[i])) {
headEqualLines.Add(oldLines[i]);
} else {
break;
}
}
oldLines.RemoveRange(0, headEqualLines.Count);
newLines.RemoveRange(0, headEqualLines.Count);
minLineCount -= headEqualLines.Count;
var tailEqualLines = new List<string>();
if (oldLines.Count > 0 && newLines.Count > 0) {
for (var i = 1; i <= minLineCount; i++) {
if (VbaLineEqual(oldLines[oldLines.Count - i], newLines[newLines.Count - i])) {
tailEqualLines.Add(oldLines[oldLines.Count - i]);
} else {
break;
}
}
tailEqualLines.Reverse();
oldLines.RemoveRange(oldLines.Count - tailEqualLines.Count - 1, tailEqualLines.Count);
newLines.RemoveRange(newLines.Count - tailEqualLines.Count - 1, tailEqualLines.Count);
}
var lengths = new int[oldLines.Count + 1, newLines.Count + 1];
for (var i = 0; i < oldLines.Count; i++) {
for (var j = 0; j < newLines.Count; j++) {
if (VbaLineEqual(oldLines[i], newLines[j]))
lengths[i + 1, j + 1] = lengths[i, j] + 1;
else
lengths[i + 1, j + 1] = lengths[i + 1, j] > lengths[i, j + 1] ? lengths[i + 1, j] : lengths[i, j + 1];
}
}
var result = new List<string>();
var x = oldLines.Count;
var y = newLines.Count;
while (x != 0 && y != 0) {
if (lengths[x, y] == lengths[x - 1, y]) {
x--;
} else if (lengths[x, y] == lengths[x, y - 1]) {
result.Add(newLines[y - 1]);
y--;
} else {
result.Add(oldLines[x - 1]);
x--;
y--;
}
}
while (y != 0) {
result.Add(newLines[y - 1]);
y--;
}
result.Reverse();
return string.Join("\r\n", headEqualLines.Concat(result).Concat(tailEqualLines));
}
static bool VbaLineEqual(string oldLine, string newLine) {
if (string.Equals(oldLine, newLine, StringComparison.InvariantCulture))
return true;
if (!string.Equals(oldLine, newLine, StringComparison.InvariantCultureIgnoreCase))
return false;
var quoting = false;
for (var i = 0; i < oldLine.Length; i++) {
if (quoting && oldLine[i] != newLine[i])
return false;
if (oldLine[i] == '"')
quoting = !quoting;
}
return true;
}
But when I run this against a pair of strings with 5,000 or so lines and changes near the beginning and end, performance and memory usage are both pretty dreadful, probably due to allocating an array of 5,000 × 5,000 int
.
Is there any pre-built library that can handle this comparison, or a more efficient algorithm I can use? Note that I'd like to license my code under the zlib license, so GNU diffutils is no good.