As suggested by dyoo,
string(?:(?!endString).)*?endString
can be polyfilled as (where char n
means the n
th char of string
):
string
(?: # Match 0+ groups, each either
# doesn't start with the first char of string,
[^<char 1>]
|
# share the first, but not the second,
<char 1>[^<char 2>]
|
# share the first two, but not the third,
<char 1><char 2>[^<char 3>]
|
# ...
|
# share the first n - 1, but not the nth,
<char 1>...<char n-1>[^<char n>]
)*?
endString
Structure in non-ASCII art (where X
denotes a difference):
┌──┬──┬──┬─────────────┬────┬──┐
│C1│C2│C3│ ... │Cn-1│Cn│
└──┴──┴──┴─────────────┴────┴──┘
├──┼──┼──┼─────────────┼────┼X C1C2C3...Cn-1[^Cn]
├──┼──┼──┼─────────────┼X │ C1C2C3...[^Cn-1]
│ │ │ │ ... │ │ ...
├──┼──┼X │ │ │ C1C2[^C3]
├──┼X │ │ │ │ C1[^C2]
├X │ │ │ │ │ [^C1]
Another representation using nested non-capturing groups:
(?: # Match 0+ groups, each consists of
# the first char, followed by either
<char 1>
(?:
# the second char, followed by either
<char 2>
(?:
# the third char, followed by either
<char 3>
# ...
(?:
# the (n-1)th char, followed by
<char n-1>
(?:
# something that is not the nth char
[^<char n>]
)
|
# something that is not the (n-1)th char
[^<char n-1>]
)
# ...
|
# or something that is not the third char
[^<char 3>]
)
|
# or something that is not the second char
[^<char 2>]
)
|
# or something that is not the first char
[^<char1>]
)*?
Structure in non-ASCII art (where X
denotes a difference and ┬
a branch):
┌──┬──┬──┬─────────────┬────┬──┐
│C1│C2│C3│ ... │Cn-1│Cn│
└──┴──┴──┴─────────────┴────┴──┘
├┬─┼┬─┼┬─┼─────────────┼┬───┼─X C1C2C3...Cn-1[^Cn]
││ ││ ││ │ │└X C1C2C3...[^Cn-1]
... ...
││ ││ │└X│ C1C2[^C3]
││ │└X│ │ C1[^C2]
│└X│ │ │ [^C1]
These two do the same thing. For long strings, the second solution will be substantially shorter. For example, let <div>
and </div>
be the start and end mark, correspondingly. For example:
<div <span>Foo</span> div>
will be separated as (where _
denotes a space; with colors):
┌───────┬───┬────┬───┬───┬───┬───┬───┬───┬───┬────┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐
│ <div_ │ _ │ <s │ p │ a │ n │ > │ F │ o │ o │ </ │ s │ p │ a │ n │ > │ _ │ _ │ _ │ d │ i │ v │ > |
└───────┴───┴────┴───┴───┴───┴───┴───┴───┴───┴────┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┴───┘
As one can observe from the two structures above, there is a repeating pattern, which means the regex can be generated programmatically. Here's such a program, written in pseudocode:
function escape(string)
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&')
end
function escapeBracket(char)
return char == ']' ? '\\]' : char
end
function polyfill(string)
regex = []
for index from 0 to string.length do
# string[0] included, string[index] excluded
branch = escape(string.slice(0, index)) + '[^{escapeBracket(string[index])}]'
regex.append(branch)
end
return '(?:{regex.join('|')})*?'
end
polyfill('BBB') # '(?:[^B]|B[^B]|BB[^B])*?'
polyfill('<div>') # '(?:[^<]|<[^d]|<d[^i]|<di[^v]|<div[^>])*?'
...or:
function polyfill2(string)
reversedString = string.reverse()
regex = ''
for index from 0 to string.length do
char = reversedString[index]
firstBranch = regex ? escape(char) + regex + '|' : ''
secondBranch = '[^{escapeBracket(char)}]'
regex = '(?:{firstBranch}{secondBranch})'
end
return regex + '*?'
end
polyfill2('BBB') # (?:B(?:B(?:[^B])|[^B])|[^B])*?
polyfill2('<div>') # (?:<(?:d(?:i(?:v(?:[^>])|[^v])|[^i])|[^d])|[^<])*?
Try them on regex101:
polyfill
:
BBB
(PCRE2: 711 steps)
<div>
(PCRE2: 4021 steps)
polyfill2
:
BBB
(PCRE2: 814 steps)
<div>
(PCRE2: 4734 steps)
- Negative lookaheads:
BBB
(PCRE2: 991 steps)
<div>
(PCRE2: 5887 steps)
Playground:
const [p1, p2, p3] = document.querySelectorAll('div');
const [start, end] = document.querySelectorAll('input');
const data = document.querySelector('#data');
start.addEventListener('input', inputHandler);
end.addEventListener('input', inputHandler);
start.dispatchEvent(new Event('input'));
function inputHandler() {
const escaped = [escape(start.value), escape(end.value)];
[
[p1, polyfill],
[p2, polyfill2],
[p3, lookahead]
].forEach(([element, fn]) => {
element.textContent = escaped.join(fn(start.value));
});
data.textContent = generateData(start.value, end.value);
}
function generateData(start, end) {
const data = [];
const middleLength = 10 + Math.random() * 30 | 0;
const middle = [...Array(middleLength)].map(() => {
const original = (Math.random() > 0.5 ? end : start);
return original[Math.random() * original.length | 0];
}).join('');
for (let i = 0; i < start.length; i++) {
for (let j = 0; j < end.length; j++) {
data.push(
start + start.slice(0, i + 1) +
middle +
end.slice(-j - 1) + end
);
}
}
return data.join('\n');
}
function escape(string) {
return string.replace(/[-/\\^$*+?.()|[\]{}]/g, '\\$&');
}
function escapeBracket(char) {
return char === ']' ? '\\]' : char;
}
function polyfill(string) {
const regex = [];
for (let i = 0; i < string.length; i++) {
const branch = escape(string.slice(0, i)) + `[^${escapeBracket(string[i])}]`;
regex.push(branch);
}
return `(?:${regex.join('|')})*?`;
}
function polyfill2(string) {
const reversedString = [...string].reverse();
let regex = '';
for (let i = 0; i < string.length; i++) {
const char = reversedString[i];
const firstBranch = regex ? escape(char) + regex + '|' : '';
const secondBranch = `[^${escapeBracket(char)}]`;
regex = `(?:${firstBranch}${secondBranch})`;
}
return regex + '*?';
}
function lookahead(string) {
return `(?:(?!${escape(string)}).)*?`;
}
label {
display: flex;
align-items: start;
flex-direction: column;
gap: 1em;
margin-top: 2em;
}
label > :nth-child(n + 2) {
align-self: stretch;
font-family: monospace;
}
input {
padding: 1.5em 1em;
height: 0;
}
div {
word-break: break-all;
}
#data {
height: 5em;
white-space: pre;
}
<label>
<span>Start/end:</span>
<input type="text" value="[foo-bar(baz)qux]">
<input type="text" value="[foo-bar(baz)qux]">
</label>
<label>
<span>Polyfill:</span>
<div id="p1"></div>
</label>
<label>
<span>Polyfill2:</span>
<div id="p2"></div>
</label>
<label>
<span>Negative lookahead:</span>
<div id="p3"></div>
</label>
<label>
<span>Random text:</span>
<textarea id="data"></textarea>
</label>
(Disclaimer: I don't know Go. This answer was written as suggested by @markalex at q/76035959)