Starting from the cppreference examples
#include <iostream>
#include <regex>
void show_matches(const std::string& in, const std::string& re)
{
std::smatch m;
std::regex_search(in, m, std::regex(re));
if(m.empty()) {
std::cout << "input=[" << in << "], regex=[" << re << "]: NO MATCH\n";
} else {
std::cout << "input=[" << in << "], regex=[" << re << "]: ";
std::cout << "prefix=[" << m.prefix() << "] ";
for(std::size_t n = 0; n < m.size(); ++n)
std::cout << " m[" << n << "]=[" << m[n] << "] ";
std::cout << "suffix=[" << m.suffix() << "]\n";
}
}
int main()
{
// greedy match, repeats [a-z] 4 times
show_matches("abcdefghi", "(?:(?![aeiou])[a-z]){2,4}");
}
You can test and check the the details of the regular expression here.
The choice of using a non capturing group (?: ...)
is to prevent it from changing your groups in case you will use it in a bigger regular expression.
(?![aeiou])
will match without consuming the input if finds a character not matching [aeiou]
, the [a-z]
will match letters. Combining these two condition is equivalent to your character class subtraction.
The {2,4}
is a quantifier that says from 2 to 4, could also be +
for one or more, *
for zero or more.
Edit
Reading the comments in the other answer I understand that you want to support XMLSchema
.
The next program shows how to use ECMA regular expression to translate the "character class differences" to a ECMA compatible format.
#include <iostream>
#include <regex>
#include <string>
#include <vector>
std::string translated_regex(const std::string &pattern){
// pattern to identify character class subtraction
std::regex class_subtraction_re(
"\\[((?:\\\\[\\[\\]]|[^[\\]])*)-\\[((?:\\\\[\\[\\]]|[^[\\]])*)\\]\\]"
);
// translate the regular expression to ECMA compatible
std::string translated = std::regex_replace(pattern,
class_subtraction_re, "(?:(?![$2])[$1])");
return translated;
}
void show_matches(const std::string& in, const std::string& re)
{
std::smatch m;
std::regex_search(in, m, std::regex(re));
if(m.empty()) {
std::cout << "input=[" << in << "], regex=[" << re << "]: NO MATCH\n";
} else {
std::cout << "input=[" << in << "], regex=[" << re << "]: ";
std::cout << "prefix=[" << m.prefix() << "] ";
for(std::size_t n = 0; n < m.size(); ++n)
std::cout << " m[" << n << "]=[" << m[n] << "] ";
std::cout << "suffix=[" << m.suffix() << "]\n";
}
}
int main()
{
std::vector<std::string> tests = {
"Some text [0-9-[4]] suffix",
"([abcde-[ae]])",
"[a-z-[aei]]|[A-Z-[OU]] "
};
std::string re = translated_regex("[a-z-[aeiou]]{2,4}");
show_matches("abcdefghi", re);
for(std::string test : tests){
std::cout << " " << test << '\n'
<< " -- " << translated_regex(test) << '\n';
}
return 0;
}
Edit: Recursive and Named character classes
The above approach does not work with recursive character class negation. And there is no way to deal with recursive substitutions using only regular expressions. This rendered the solution far less straight forward.
The solution has the following levels
- one function scans the regular expression for a
[
- when a
[
is found there is a function to handle the character classes recursively when '-[` is found.
- The pattern
\p{xxxxx}
is handled separately to identify named character patterns. The named classes are defined in the specialCharClass
map, I fill two examples.
#include <iostream>
#include <regex>
#include <string>
#include <vector>
#include <map>
std::map<std::string, std::string> specialCharClass = {
{"IsDigit", "0-9"},
{"IsBasicLatin", "a-zA-Z"}
// Feel free to add the character classes you want
};
const std::string getCharClassByName(const std::string &pattern, size_t &pos){
std::string key;
while(++pos < pattern.size() && pattern[pos] != '}'){
key += pattern[pos];
}
++pos;
return specialCharClass[key];
}
std::string translate_char_class(const std::string &pattern, size_t &pos){
std::string positive;
std::string negative;
if(pattern[pos] != '['){
return "";
}
++pos;
while(pos < pattern.size()){
if(pattern[pos] == ']'){
++pos;
if(negative.size() != 0){
return "(?:(?!" + negative + ")[" + positive + "])";
}else{
return "[" + positive + "]";
}
}else if(pattern[pos] == '\\'){
if(pos + 3 < pattern.size() && pattern[pos+1] == 'p'){
positive += getCharClassByName(pattern, pos += 2);
}else{
positive += pattern[pos++];
positive += pattern[pos++];
}
}else if(pattern[pos] == '-' && pos + 1 < pattern.size() && pattern[pos+1] == '['){
if(negative.size() == 0){
negative = translate_char_class(pattern, ++pos);
}else{
negative += '|';
negative = translate_char_class(pattern, ++pos);
}
}else{
positive += pattern[pos++];
}
}
return '[' + positive; // there is an error pass, forward it
}
std::string translate_regex(const std::string &pattern, size_t pos = 0){
std::string r;
while(pos < pattern.size()){
if(pattern[pos] == '\\'){
r += pattern[pos++];
r += pattern[pos++];
}else if(pattern[pos] == '['){
r += translate_char_class(pattern, pos);
}else{
r += pattern[pos++];
}
}
return r;
}
void show_matches(const std::string& in, const std::string& re)
{
std::smatch m;
std::regex_search(in, m, std::regex(re));
if(m.empty()) {
std::cout << "input=[" << in << "], regex=[" << re << "]: NO MATCH\n";
} else {
std::cout << "input=[" << in << "], regex=[" << re << "]: ";
std::cout << "prefix=[" << m.prefix() << "] ";
for(std::size_t n = 0; n < m.size(); ++n)
std::cout << " m[" << n << "]=[" << m[n] << "] ";
std::cout << "suffix=[" << m.suffix() << "]\n";
}
}
int main()
{
std::vector<std::string> tests = {
"[a]",
"[a-z]d",
"[\\p{IsBasicLatin}-[\\p{IsDigit}-[89]]]",
"[a-z-[aeiou]]{2,4}",
"[a-z-[aeiou-[e]]]",
"Some text [0-9-[4]] suffix",
"([abcde-[ae]])",
"[a-z-[aei]]|[A-Z-[OU]] "
};
for(std::string test : tests){
std::cout << " " << test << '\n'
<< " -- " << translate_regex(test) << '\n';
// Construct a reegx (validate syntax)
std::regex(translate_regex(test));
}
std::string re = translate_regex("[a-z-[aeiou-[e]]]{2,10}");
show_matches("abcdefghi", re);
return 0;
}