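I don't know much about regular expressions. I need to extract every single word that is directly enclosed by a left and a right parenthesis in a string, i.e. a parenthesized group that contains just one word (optionally padded with whitespace) and no nested parentheses.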
For instance, in the string
(ROOT (S (NP (NNP Washington) (NNP (CNN)) (NNP Donald) (NNP Trump)) (VP (VBD was) (VP (VBN asked) (PP ( by) (NP (NP (DT a) (NN member)) (PP (IN of) (NP (DT a) (NNP Fox) (NNP News) (NN town) (NN hall) (NN audience))))) (NP-TMP (DT this) (NN week)) (SBAR (WHNP (WP what)) (S (NP (PRP he)) (VP (MD would) (VP (VB do) (S (VP (TO to) (VP (VB reduce) (NP (JJ violent) (NN crime)) (PP (IN in) (NP (DT the) (NNS country's) (JJ inner) (NN cities.)))))))))))))) (NN debate,)) (ADJP (JJ due) (S (VP (TO to) (VP (VB be) (VP (VBN broadcast) (S (NP (JJ live)) (ADJP (RBS most) (RB everywhere.))))))))))))))))
I need to get (CNN)
and ( by)
substrings.
EDIT
def fixeup_tree_string(T: str) -> str:
    """Give every untagged single-word leaf in a bracketed tree string an NN tag.

    A "bare leaf" is a parenthesized group that holds exactly one token and
    nothing else, optionally padded with whitespace, e.g. ``(CNN)`` or
    ``( by)``.  Each such group is rewritten as ``(NN <token>)``; all other
    text is copied through unchanged.

    Args:
        T: The bracketed tree string to repair.

    Returns:
        A new string with every bare leaf replaced by ``(NN <token>)``.  If
        there are no bare leaves the result is equal to ``T``.
    """
    # A token is a run of word characters and the listed punctuation.
    # The hyphen is placed last in the class so it is taken literally; in the
    # earlier pattern it sat between two '|' characters, was parsed as the
    # range '|'..'|', and therefore never matched (e.g. "(-LRB-)" was missed).
    # The literal '|' is kept so everything that matched before still matches.
    bare_leaf = r"\(\s*([\w|#~!@$%^&*<>.,;:`'+/-]+)\s*\)"

    # Capturing only the token drops *all* surrounding whitespace (tabs and
    # newlines too, not just spaces), and re.sub stitches the untouched text
    # between matches back together for us.
    return re.sub(bare_leaf, r"(NN \1)", T)