One way to do this is to:
- Split the string, using `re.findall`, into words that match the keywords (with or without the `XS` suffix) and the other, non-matching parts
- Sort the words which match according to their index in the keywords list
- Rebuild the words list using the sorted keywords
- Join the string back together
You can do that with this function:
def sizesorter(s, keywords):
    """Reorder the size keywords found in ``s`` to follow the order of ``keywords``.

    The string is split into tokens: keyword tokens (a keyword, optionally
    followed by one whitespace character and ``XS``) and everything else.
    Keyword tokens are sorted by the position of their keyword in
    ``keywords`` and written back into the slots the keyword tokens
    originally occupied; all other text stays where it was.

    Keywords are compared case-sensitively, so a word that differs from a
    keyword only by case is left untouched. The ``XS`` suffix is matched
    ignoring case.

    Args:
        s: The text to process.
        keywords: Keywords in the desired output order.

    Returns:
        ``s`` with its keyword tokens rearranged into ``keywords`` order.
    """
    # Position of each keyword; the first occurrence wins for duplicates,
    # which is what list.index() would report.
    rank = {}
    for position, keyword in enumerate(keywords):
        rank.setdefault(keyword, position)

    # Escape the keywords so regex metacharacters in them match literally.
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    pattern = (r'(?:\b(?:' + alternatives + r')\b)(?:\sXS)?'
               r'|(?:[^\s]*(?:\s|$))')
    tokens = re.findall(pattern, s, re.I)

    def leading_word(token):
        # The keyword part of a token, without any " XS" suffix.
        return token.split(' ')[0]

    ordered = iter(sorted(
        (token for token in tokens if leading_word(token) in rank),
        key=lambda token: rank[leading_word(token)],
    ))
    # Every keyword slot receives the next keyword token in sorted order.
    return ''.join(
        next(ordered) if leading_word(token) in rank else token
        for token in tokens
    )
You can then apply that function to the column. For example:
import pandas as pd
import re
# Sample data: one text column whose size words appear in arbitrary order.
df = pd.Series(
    [
        'The Small, Large, Medium',
        'The fast Medium, Small XS',
        'He was a Medium, Large or Small',
        'small, Large a metre',
    ],
    name='column',
).to_frame()
def sizesorter(s, keywords):
    """Reorder the size keywords found in ``s`` to follow the order of ``keywords``.

    The string is split into tokens: keyword tokens (a keyword, optionally
    followed by one whitespace character and ``XS``) and everything else.
    Keyword tokens are sorted by the position of their keyword in
    ``keywords`` and written back into the slots the keyword tokens
    originally occupied; all other text stays where it was.

    Keywords are compared case-sensitively, so a word that differs from a
    keyword only by case is left untouched. The ``XS`` suffix is matched
    ignoring case.

    Args:
        s: The text to process.
        keywords: Keywords in the desired output order.

    Returns:
        ``s`` with its keyword tokens rearranged into ``keywords`` order.
    """
    # Position of each keyword; the first occurrence wins for duplicates,
    # which is what list.index() would report.
    rank = {}
    for position, keyword in enumerate(keywords):
        rank.setdefault(keyword, position)

    # Escape the keywords so regex metacharacters in them match literally.
    alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
    pattern = (r'(?:\b(?:' + alternatives + r')\b)(?:\sXS)?'
               r'|(?:[^\s]*(?:\s|$))')
    tokens = re.findall(pattern, s, re.I)

    def leading_word(token):
        # The keyword part of a token, without any " XS" suffix.
        return token.split(' ')[0]

    ordered = iter(sorted(
        (token for token in tokens if leading_word(token) in rank),
        key=lambda token: rank[leading_word(token)],
    ))
    # Every keyword slot receives the next keyword token in sorted order.
    return ''.join(
        next(ordered) if leading_word(token) in rank else token
        for token in tokens
    )
# Rewrite every row so its size words follow Small -> Medium -> Large,
# then show the resulting frame.
df['column'] = df['column'].apply(
    sizesorter,
    keywords=['Small', 'Medium', 'Large'],
)
print(df)
Output:
column
0 The Small, Medium, Large
1 The fast Small XS, Medium
2 He was a Small, Medium or Large
3 small, Large a metre
Partial sorting of the list of words adapted from this answer.