Please refer to the runnable demo code below. It uses sr.str.extract() to split
the "a" column into multiple columns, and then inserts those columns into the
original df immediately after the "a" column.
Please combine steps [1] and [2] in the code below in a better way.
import re
import pandas as pd
# Demo frame: three string columns sharing the index labels 1..3.
df = pd.DataFrame({
    "a": {1: 'a', 2: 'aa', 3: 'aaa'},
    "b": {1: 'b', 2: 'bb', 3: 'bbb'},
    "c": {1: 'b', 2: 'bb', 3: 'bbb'},
})
"""
df==
+----+-----+-----+-----+
| | a | b | c |
|----+-----+-----+-----|
| 1 | a | b | b |
| 2 | aa | bb | bb |
| 3 | aaa | bbb | bbb |
+----+-----+-----+-----+
"""
# step [1] sr.str.extract
# re.X (verbose mode) makes the regex engine ignore the literal space between
# the two groups, so the pattern is effectively (\w)(\w+): a value needs at
# least two word characters to match; otherwise both groups come back as NaN.
rex = re.compile(r'(?P<firstletter>\w) (?P<secondletter>\w+)', re.X)
# One output column per named group, carrying the same index as df['a'].
cols = df['a'].str.extract(rex)
# step [2] insert extracted columns back into the original df
# Split df right after column "a" and splice the extracted columns in between.
# No placeholder columns and no hard-coded column order are needed, so this
# keeps working when the regex groups or the other columns of df change.
insert_at = df.columns.get_loc('a') + 1
df = pd.concat([df.iloc[:, :insert_at], cols, df.iloc[:, insert_at:]], axis=1)
"""
# Or, a more concise step [2], but it is hard to read at a glance and to remember, and it is also error-prone:
for col in cols.columns[::-1]:
df.insert(df.columns.get_loc('a')+1, col, cols[col])
"""
# result:
"""
df==
+----+-----+---------------+----------------+-----+-----+
| | a | firstletter | secondletter | b | c |
|----+-----+---------------+----------------+-----+-----|
| 1 | a | nan | nan | b | b |
| 2 | aa | a | a | bb | bb |
| 3 | aaa | a | aa | bbb | bbb |
+----+-----+---------------+----------------+-----+-----+
"""