Matching the separate groups and stitching them back together may be faster than a regex replace, although this would have to be benchmarked to be sure.
import re
#=== DESIRED ===================================================================
# aarhus(iof>city>thing,equ>arhus);aarhus;CAT(CATN),N(NP) ;
# abadan(iof>city>thing);abadan;CAT(CATN),N(NP) ;
# abandon(icl>leave>do,agt>person,obj>person);abandon;CAT(CATV),AUX(AVOIR),VAL1(GN) ;
#===============================================================================
# Sample dictionary entries in the input format, i.e. without the repeated
# headword field that the desired output carries.
data = [
    "abadan(iof>city>thing);CAT(CATN),N(NP) ;",
    "abandon(icl>leave>do,agt>person,obj>person);CAT(CATV),AUX(AVOIR),VAL1(GN) ;",
]

# Matching the separate groups and then stitching them together may be faster
# than a regex replace.
# Based on https://stackoverflow.com/questions/3850074/regex-until-but-not-including
# (?:(?!CAT).)* - match anything until the start of the word CAT.
# I.e.
#   (?:        # Match the following but do not capture it:
#     (?!CAT)  #   first assert that it's not possible to match "CAT" here,
#     .        #   then match any character
#   )*         # end of group, zero or more repetitions.
#
# Raw strings are used so that the backslash-free pattern text reaches the
# regex engine untouched (a plain "\(" is an invalid string escape).
p = "".join([
    r"^",                # Match start of string
    r"([^(\n]*)",        # Group 1: the headword, everything up to the first
                         # open paren (e.g. abadan or abandon)
    r"((?:(?!CAT).)*)",  # Group 2: everything after group 1, up to but not
                         # including "CAT"
    r"(.*$)",            # Group 3: the rest of the line
])

# Compiled once so the loop below does not look the pattern up per line.
_ENTRY_RE = re.compile(p)


def insert_headword(line: str) -> str:
    """Return *line* with its headword repeated just before the "CAT" field.

    The headword is the text before the first "(".  The result is built by
    stitching the matched groups back together:
    ``headword + middle + headword + ";" + rest``.

    A single trailing newline on *line* is not part of any group and is
    therefore not present in the result.

    Raises:
        ValueError: if *line* does not match the entry pattern (for example
            because it contains an embedded newline).
    """
    match = _ENTRY_RE.match(line)
    if match is None:
        raise ValueError("line does not look like a dictionary entry: %r" % (line,))
    headword, middle, rest = match.groups()
    return "".join((headword, middle, headword, ";", rest))


for line in data:
    print(insert_headword(line))
OUTPUT:
abadan(iof>city>thing);abadan;CAT(CATN),N(NP) ;
abandon(icl>leave>do,agt>person,obj>person);abandon;CAT(CATV),AUX(AVOIR),VAL1(GN) ;