An approach with `str.translate`, without the use of regex or the `re` module:
from string import ascii_letters
# Map every ASCII letter to the empty string so that translate() drops it.
delete_dict = dict.fromkeys(ascii_letters, '')
table = str.maketrans(delete_dict)

text = 'I 77! need 1:5 this number inside my wor5d, but also this word3 and this 4word, but not this 1 and not this 555.'

# Collect the digits of every word that mixes letters and digits.
matches = []
for word in text.rstrip('.').split():
    stripped = word.rstrip(',')
    if stripped.isnumeric():
        # A purely numeric word is a stand-alone number: skip it.
        continue
    digits = stripped.translate(table)
    # Keep the word only if removing the letters leaves a non-empty number.
    if digits and digits.isnumeric():
        matches.append(digits)
print(matches)
Out:
['5', '3', '4']
Performance
I was curious, so I ran some benchmark tests to compare performance against other approaches. It looks like `str.translate` is faster even than the regex implementation.
Here is my benchmark code with `timeit`:
import re
from string import ascii_letters
from timeit import timeit
# Sample sentence that every benchmark snippet runs against.
text = 'I need this number inside my wor5d, but also this word3 and this 4word, but not this 1 and not this 555.'

# Pre-compiled pattern: digits directly after, or directly before, an ASCII letter.
_NUM_RE = re.compile(r'(?<=[a-zA-Z])\d+|\d+(?=[a-zA-Z])')

# Translation table that deletes every ASCII letter from a string.
delete_dict = dict.fromkeys(ascii_letters, '')
_TABLE = str.maketrans(delete_dict)
def main():
    """Time each digit-extraction approach on ``text`` and print the results.

    Each snippet is executed ``n`` times by ``timeit``. The module globals are
    passed in, so the snippets can refer to ``re``, ``text``, ``_NUM_RE`` and
    ``_TABLE``.
    """
    n = 100_000

    # Raw string: the statement embeds a regex, and ``\d`` inside a normal
    # string literal is an invalid escape sequence (a warning today).
    print('regex: ', timeit(r"re.findall(r'(?<=[a-zA-Z])\d+|\d+(?=[a-zA-Z])', text)",
                            globals=globals(), number=n))

    # Same pattern, but compiled once at module level.
    print('regex (opt): ', (timeit("_NUM_RE.findall(text)",
                                   globals=globals(), number=n)))

    # Character-by-character scan comparing each character with its neighbours.
    # The statement starts at column 0 so that timeit can re-indent it.
    print('iter_char: ', timeit("""
k=set()
for x in range(1,len(text)-1):
    if text[x-1].isdigit() and text[x].isalpha():
        k.add(text[x-1])
    if text[x].isdigit() and text[x+1].isalpha():
        k.add(text[x])
    if text[x-1].isalpha() and text[x].isdigit() and text[x+1].isalpha():
        k.add(text[x])
    if text[x-1].isalpha() and text[x].isdigit():
        k.add(text[x])
""", globals=globals(), number=n))

    # Split into words, delete the letters, keep what is left if it is a number.
    print('str.translate: ', timeit("""
[
    res for s in text.rstrip('.').split()
    if not (s2 := s.rstrip(',')).isnumeric() and (res := s2.translate(_TABLE)) and res.isnumeric()
]
""", globals=globals(), number=n))


if __name__ == '__main__':
    main()
Results (Mac OS X - M1):
regex: 0.5315765410050517
regex (opt): 0.5069837079936406
iter_char: 2.5037198749923846
str.translate: 0.37348733299586456