Update I modified the sample data to include different types of tests, and modified the RemoveCommonPrefixAndSuffix
to handle these new tests.
I found that just comparing the first two strings for a common prefix and suffix can be a mistake if the first two books (or whatever the topic is) begin and/or end with the same words.
For example:
new List<string>()
{
"The book named Lord of the Rings 2 is a classic.",
"The book named Lord of the Flies 2 is a classic.",
"The book named This is pretty is a classic.",
"The book named War and Peace is a classic.",
"The book named The Three Musketeers is a classic.",
},
Here, if we only compare the first two sentences, we determine that the common prefix is "The book named Lord of the"
, which is incorrect. We also determine that the common suffix is "2 is a classic."
, which is also incorrect.
Here's a solution that addresses this by ensuring that all the sentences have the same prefix and suffix:
public static List<string> RemoveCommonPrefixAndSuffix(List<string> sentences,
int minSeqenceLength = 2)
{
if (sentences == null) return null;
if (sentences.Count < 2 ||
sentences.Any(s => s.Count(c => c == ' ') < minSeqenceLength - 1))
{
return sentences.ToList();
}
if (sentences.All(s => s == sentences[0]))
{
return sentences.Select(s => string.Empty).ToList();
}
var sentenceWords = sentences.Select(s => s.Split()).ToList();
var firstSentence = sentenceWords[0];
var length = sentenceWords.Min(s => s.Length);
var commonPrefix = new StringBuilder();
var commonSuffix = new StringBuilder();
var prefixDone = false;
var suffixDone = false;
for (var i = 0; i < length && !(prefixDone && suffixDone); i++)
{
if (!prefixDone && sentenceWords.All(s => s[i] == firstSentence[i]))
{
commonPrefix.Append(firstSentence[i] + " ");
}
else
{
prefixDone = true;
}
if (!suffixDone && sentenceWords.All(s =>
s[s.Length - i - 1] == firstSentence[firstSentence.Length - i - 1]))
{
commonSuffix.Insert(0, firstSentence[firstSentence.Length - i - 1] + " ");
}
else
{
suffixDone = true;
}
}
var prefix = commonPrefix.ToString().Count(c => c == ' ') >= minSeqenceLength - 1
? commonPrefix.ToString()
: string.Empty;
var suffix = commonSuffix.ToString().Count(c => c == ' ') >= minSeqenceLength - 1
? commonSuffix.ToString()
: string.Empty;
var commonLength = prefix.Length + suffix.Length;
return sentences
.Select(s => s.Length > commonLength
? s.Substring(prefix.Length, s.Length - prefix.Length - suffix.Length)
: string.Empty)
.ToList();
}
Here's the method to get the test data:
private static List<List<string>> GetTestSentences()
{
return new List<List<string>>
{
// Prefix-only test
new List<string>
{
"I went to The Home Depot",
"I went to Walgreens",
"I went to Best Buy",
},
// Suffix-only test
new List<string>
{
"Game of Thrones is a good TV series",
"Breaking Bad is a good TV series",
"The Office is a good TV series",
},
// Prefix / Suffix test
new List<string>
{
"The basketball team Los Angeles Lakers are my favorite",
"The basketball team New York Knicks are my favorite",
"The basketball team Chicago Bulls are my favorite",
},
// No prefix or suffix - all sentences are different
new List<string>
{
"I went to The Home Depot",
"Game of Thrones is a good TV series",
"The basketball team Los Angeles Lakers are my favorite",
},
// All sentences are the same - no "topic" between prefix and suffix
new List<string>()
{
"These sentences are all the same",
"These sentences are all the same",
"These sentences are all the same",
},
// Some sentences have no content between prefix and suffix
new List<string>()
{
"This sentence has no topic",
"This sentence [topic here] has no topic",
"This sentence has no topic",
"This sentence [another one] has no topic",
},
// First two topics have common beginnings
new List<string>()
{
"The book named Lord of the Rings is a classic",
"The book named Lord of the Flies is a classic",
"The book named This is pretty is a classic",
"The book named War and Peace is a classic",
"The book named The Three Musketeers is a classic",
},
// The first two topics have a common ending
new List<string>
{
"The movie named Matrix 2 is very good",
"The movie named Avatar 2 is very good",
"The movie named The Sound of Music is very good",
"The movie named Terminator 2 is very good",
}
};
}
Below is the example usage and output. I also included the results from the selected answer, along with some perf benchmarks for speed comparison:
private static void Main()
{
var sentenceLists = GetTestSentences();
var padLength = sentenceLists.Max(t => t.Max(s => s.Length)) + 2;
Console.WriteLine("\nComparison Results\n------------------\n");
// Rufus' solution
var sw = Stopwatch.StartNew();
foreach (var sentenceList in sentenceLists)
{
var trimmedSentences = RemoveCommonPrefixAndSuffix(sentenceList);
for (var j = 0; j < trimmedSentences.Count; j++)
{
Console.WriteLine("{0} {1}", sentenceList[j].PadRight(padLength, '.'),
trimmedSentences[j]);
}
Console.WriteLine();
}
sw.Stop();
Console.WriteLine($"Rufus' solution took {sw.ElapsedMilliseconds} ms\n");
Console.WriteLine(new string('-', Console.WindowWidth));
// Prateek's solution
sw.Restart();
foreach (var sentenceList in sentenceLists)
{
var prefix = FindMatchingPattern(sentenceList[0], sentenceList[1], true);
var suffix = FindMatchingPattern(sentenceList[0], sentenceList[1], false);
if (prefix.Length > 0) prefix = Regex.Escape(prefix);
if (suffix.Length > 0) suffix = Regex.Escape(suffix);
foreach (var item in sentenceList)
{
var result = Regex.Replace(item, prefix, string.Empty);
result = Regex.Replace(result, suffix, string.Empty);
Console.WriteLine($"{item.PadRight(padLength, '.')} {result}");
}
Console.WriteLine();
}
sw.Stop();
Console.WriteLine($"Prateek's solution took {sw.ElapsedMilliseconds} ms\n");
Console.WriteLine(new string('-', Console.WindowWidth));
GetKeyFromUser("\nDone!! Press any key to exit...");
}
Output
