Prompted by a comment from Konrad Rudolph on a related question, I wrote the following program to benchmark regular expression performance in F#:
open System.Text.RegularExpressions
let str = System.IO.File.ReadAllText "C:\\Users\\Jon\\Documents\\pg10.txt"
let re = System.IO.File.ReadAllText "C:\\Users\\Jon\\Documents\\re.txt"
// Three timed runs; each run builds a compiled regex and does four parallel split-and-sums.
for _ in 1..3 do
    let timer = System.Diagnostics.Stopwatch.StartNew()
    let re = Regex(re, RegexOptions.Compiled)
    let res = Array.Parallel.init 4 (fun _ -> re.Split str |> Seq.sumBy (fun m -> m.Length))
    printfn "%A %fs" res timer.Elapsed.TotalSeconds
and the equivalent in C++:
#include "stdafx.h"
#include <windows.h>
#include <regex>
#include <vector>
#include <string>
#include <fstream>
#include <cstdio>
#include <codecvt>
using namespace std;
// Read the whole file into a wide string, decoding it as UTF-8.
wstring load(wstring filename) {
    const locale empty_locale = locale::empty();
    typedef codecvt_utf8<wchar_t> converter_type;
    const converter_type* converter = new converter_type;
    const locale utf8_locale = locale(empty_locale, converter);
    wifstream in(filename);
    in.imbue(utf8_locale); // apply the UTF-8 facet; without this it is never used
    wstring contents;
    if (in)
    {
        in.seekg(0, ios::end);
        contents.resize(in.tellg());
        in.seekg(0, ios::beg);
        in.read(&contents[0], contents.size());
        in.close();
    }
    return contents;
}
// Split s on the regex and sum the lengths of the pieces between matches.
int count(const wstring &re, const wstring &s) {
    static const wregex rsplit(re);
    auto rit = wsregex_token_iterator(s.begin(), s.end(), rsplit, -1);
    auto rend = wsregex_token_iterator();
    int count = 0;
    for (auto it = rit; it != rend; ++it)
        count += it->length();
    return count;
}
int _tmain(int argc, _TCHAR* argv[])
{
    wstring str = load(L"pg10.txt");
    wstring re = load(L"re.txt");
    __int64 freq, tStart, tStop;
    unsigned long TimeDiff;
    QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
    QueryPerformanceCounter((LARGE_INTEGER *)&tStart);
    vector<int> res(4);
    #pragma omp parallel num_threads(4)
    for (auto i = 0; i < res.size(); ++i)
        res[i] = count(re, str);
    QueryPerformanceCounter((LARGE_INTEGER *)&tStop);
    TimeDiff = (unsigned long)(((tStop - tStart) * 1000000) / freq);
    printf("(%d, %d, %d, %d) %fs\n", res[0], res[1], res[2], res[3], TimeDiff / 1e6);
    return 0;
}
Both programs load two files as Unicode strings (I'm using a copy of the Bible), construct a non-trivial Unicode regex \w?\w?\w?\w?\w?\w and split the string four times in parallel using that regex, returning the sum of the lengths of the split strings rather than the strings themselves, in order to avoid allocation.
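For concreteness, each of the four parallel tasks is doing essentially the following (a toy illustration with a made-up one-line input rather than pg10.txt):

open System.Text.RegularExpressions

// Toy illustration of the per-task work: split on the regex, then sum the
// lengths of the pieces between matches instead of keeping the substrings.
let demo () =
    let re = Regex(@"\w?\w?\w?\w?\w?\w", RegexOptions.Compiled)
    let text = "In the beginning God created the heaven and the earth."
    re.Split text |> Seq.sumBy (fun piece -> piece.Length)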
Running both in Visual Studio (with MP and OpenMP enabled for the C++) in a release build targeting 64-bit, the C++ takes 43.5s and the F# takes 3.28s (over 13x faster). This does not surprise me, because I believe the .NET JIT compiles the regex down to native code (that is what RegexOptions.Compiled is for) whereas the C++ standard library interprets it, but I'd like some peer review.

Is there a perf bug in my C++ code, or is this a consequence of compiled vs interpreted regular expressions?
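In case it helps reviewers: one way to isolate how much of the .NET advantage comes from RegexOptions.Compiled itself is to time the same split with and without the flag. A minimal sketch, reusing the str and re strings loaded at the top of the F# program:

// Minimal sketch: time one split-and-sum with a given set of regex options,
// so the interpreted (RegexOptions.None) and compiled (RegexOptions.Compiled)
// engines can be compared on the same input.
let timeSplit (options: RegexOptions) =
    let rex = Regex(re, options)
    let timer = System.Diagnostics.Stopwatch.StartNew()
    let total = rex.Split str |> Seq.sumBy (fun piece -> piece.Length)
    printfn "%A: %d in %fs" options total timer.Elapsed.TotalSeconds

timeSplit RegexOptions.None
timeSplit RegexOptions.Compiled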
EDIT: Billy ONeal has pointed out that .NET can interpret \w differently (by default it matches the full Unicode set of word characters), so I have made the character class explicit in a new regex:
[0-9A-Za-z_]?[0-9A-Za-z_]?[0-9A-Za-z_]?[0-9A-Za-z_]?[0-9A-Za-z_]?[0-9A-Za-z_]
This actually makes the .NET code substantially faster (the C++ time is unchanged), reducing the F# time from 3.28s to 2.38s (over 17x faster).
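As an aside, another way to get the narrower \w on the .NET side, without rewriting the pattern by hand, would be to request ECMAScript-compliant matching. A sketch (I have not re-benchmarked this variant):

// Sketch: under RegexOptions.ECMAScript, \w matches only [a-zA-Z_0-9],
// which is closer to the C++ interpretation; it can be combined with Compiled.
let reEcma = Regex(re, RegexOptions.Compiled ||| RegexOptions.ECMAScript)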