Instead of using `Regex` to parse the XML document, you can use the tools in the `System.Xml.Linq` namespace to handle the parsing for you, which is inherently much faster and easier to use.
Here's an example program that processes a structure containing 35,000 nodes. I've kept your regex string to check for the bad characters, but I've constructed it with `RegexOptions.Compiled`, which should yield better performance, although admittedly it was not a huge increase when I compared the two. (See the `RegexOptions.Compiled` documentation for more info.)
This example uses `Descendants`, which returns references to all of the elements with the name you specify in the parameter that are nested within the element you call it on (in this case, we start from the document root). Those results are then filtered by the `ContainsBadCharacters` method.
For the sake of simplicity I haven't made the `foreach` loops DRY, but it's probably worth doing so.
On my machine, this runs in less than a second, but timings will vary based on machine performance and occurrences of bad characters.
using System;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Xml.Linq;
namespace ConsoleApplication2
{
    /// <summary>
    /// Demo program: builds an in-memory XML document with 35,000 nodes, strips
    /// unwanted characters from every Title and Country element using LINQ to XML,
    /// saves the result next to the executable as "test.txt" and prints how long
    /// the clean-up took.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Matches either a numeric character reference such as <c>&amp;#123;</c> or
        /// <c>&amp;#x1F;</c> (captured as group 1) or any single character that is not a
        /// word character, whitespace, or one of <c>/ ; &amp; . @ -</c>.
        /// Replacing a match with "$1" therefore keeps the references and removes
        /// every other matched character.
        /// </summary>
        static readonly Regex BadCharacterPattern = new Regex(@"(&#[xX]?[A-Fa-f\d]+;)|[^\w\s\/\;\&\.@-]", RegexOptions.Compiled);

        /// <summary>
        /// Program entry point.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            // Build the sample document: <Nodes> containing 35,000 <Node> children.
            var xmls = new StringBuilder("<Nodes>");
            for (int i = 0; i < 35000; i++)
            {
                xmls.Append(@"<Node>
<Title>Lorem~~~~</Title>
<Country>Ipsum!</Country>
</Node>");
            }
            xmls.Append("</Nodes>");

            var doc = XDocument.Parse(xmls.ToString());

            // Only the clean-up itself is timed, not document construction or saving.
            var sw = System.Diagnostics.Stopwatch.StartNew();

            // The two loops are intentionally left un-merged to keep the example simple.
            foreach (var element in doc.Descendants("Title").Where(ContainsBadCharacters))
            {
                element.Value = BadCharacterPattern.Replace(element.Value, "$1");
            }

            foreach (var element in doc.Descendants("Country").Where(ContainsBadCharacters))
            {
                element.Value = BadCharacterPattern.Replace(element.Value, "$1");
            }

            sw.Stop();

            // Resolve the executable's directory with Path.GetDirectoryName instead of
            // searching for a backslash, which throws when the path has none.
            string outputDirectory = Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);
            if (string.IsNullOrEmpty(outputDirectory))
            {
                // Location can be empty (e.g. assembly loaded from a byte array);
                // fall back to the application's base directory.
                outputDirectory = AppDomain.CurrentDomain.BaseDirectory;
            }

            var saveFile = new FileInfo(Path.Combine(outputDirectory, "test.txt"));

            // XDocument.Save creates the file (or overwrites it) on its own. Calling
            // FileInfo.Create() first would leave an undisposed FileStream holding the
            // file open and make this Save fail with a sharing violation.
            doc.Save(saveFile.FullName);

            Console.WriteLine(sw.Elapsed);

            // Keep the console window open until a key is pressed.
            Console.Read();
        }

        /// <summary>
        /// Tells whether the element's text contains anything matched by
        /// <see cref="BadCharacterPattern"/>.
        /// </summary>
        /// <param name="item">The element whose <see cref="XElement.Value"/> is tested.</param>
        /// <returns><c>true</c> if the pattern matches anywhere in the value; otherwise <c>false</c>.</returns>
        static bool ContainsBadCharacters(XElement item)
        {
            return BadCharacterPattern.IsMatch(item.Value);
        }
    }
}