There are many options for extracting numbers from a list of strings.
A general list of strings is assumed as follows:
# Benchmark input: three sample strings (digits mixed with text, digits
# with separators only, and no digits at all), repeated 10000 times.
input_list = 10000 * [
    'abc.123def45, ghi67 890 12, jk345',
    '123, 456 78, 90',
    'abc def, ghi',
]
If conversion to integers is not required, the numbers can be extracted as strings:
def test_as_str(input_list):
output_list = []
for string in input_list:
output_list += re.findall(r'\d+', string)
return output_list
%timeit -n 10 -r 7 test_as_str(input_list)
> 37.6 ms ± 168 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_str(input_list):
output_list = []
[output_list.extend(re.findall(r'\d+', string)) for string in input_list]
return output_list
%timeit -n 10 -r 7 test_as_str(input_list)
> 39.5 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_str(input_list):
return list(itertools.chain(*[re.findall(r'\d+', string) for string in input_list]))
%timeit -n 10 -r 7 test_as_str(input_list)
> 40.4 ms ± 202 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_str(input_list):
return list(filter(None, [item for string in input_list for item in re.split('[^\d]+' , string)]))
%timeit -n 10 -r 7 test_as_str(input_list)
> 42.8 ms ± 372 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Conversion to integers can also be included:
def test_as_int(input_list):
output_list = []
for string in input_list:
output_list += re.findall(r'\d+', string)
return list(map(int, output_list))
%timeit -n 10 -r 7 test_as_int(input_list)
> 44.7 ms ± 232 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_int(input_list):
output_list = []
for string in input_list:
output_list += re.findall(r'\d+', string)
return [int(item) for item in output_list]
%timeit -n 10 -r 7 test_as_int(input_list)
> 47.8 ms ± 198 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_int(input_list):
return [int(item) for string in input_list for item in re.findall(r'\d+', string)]
%timeit -n 10 -r 7 test_as_int(input_list)
> 48.3 ms ± 101 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_int(input_list):
return [int(item) for string in input_list for item in re.split('[^\d]+' , string) if item]
%timeit -n 10 -r 7 test_as_int(input_list)
> 51.4 ms ± 150 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_int(input_list):
return [int(item) for string in input_list for item in re.split('[^\d]+' , string) if item.isdigit()]
%timeit -n 10 -r 7 test_as_int(input_list)
> 54.9 ms ± 210 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
def test_as_int(input_list):
return [int(item) for string in input_list for item in re.split('[^\d]+' , string) if len(item)]
%timeit -n 10 -r 7 test_as_int(input_list)
> 55.5 ms ± 175 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
These performance tests, which show little difference between the approaches, were run on Windows in a Python 3.8.8 virtual environment.