You can use rule-based matching with spaCy to take the information provided by the surrounding words into account.
I wrote some demo code below, which you can extend to cover more cases:
import spacy
from spacy.pipeline import EntityRuler
from spacy import displacy
from spacy.matcher import Matcher
# Demo inputs: every sentence contains a "'s" contraction to be expanded.
sentences = [
    "now she's a software engineer",
    "she's got a cat",
    "he's a tennis player",
    "He thinks that she's 30 years old",
]
# Small English pipeline; it provides the POS tags used by the matcher patterns.
nlp = spacy.load("en_core_web_sm")
def _build_contraction_matcher(vocab):
    """Return a Matcher that labels "'s" contractions as ``case_has`` or ``case_is``."""
    matcher = Matcher(vocab)
    # The list-of-patterns form of Matcher.add is accepted by spaCy >= 2.2.2
    # and is the only form accepted by spaCy 3.
    matcher.add(
        "case_has",
        [
            [{"POS": "PRON"}, {"LOWER": "'s"}, {"LOWER": "got"}],
            [{"POS": "PRON"}, {"LOWER": "'s"}, {"LOWER": "been"}],
        ],
    )
    matcher.add(
        "case_is",
        [
            [{"POS": "PRON"}, {"LOWER": "'s"}, {"POS": "DET"}],
            [{"POS": "PRON"}, {"LOWER": "'s"}, {"IS_DIGIT": True}],
        ],
    )
    # .. add more cases
    return matcher


def normalize(sentence):
    """Expand "'s" contractions after a pronoun into "has" or "is".

    The sentence is tokenized with the module-level ``nlp`` pipeline and the
    word following the contraction decides the expansion: "got"/"been" give
    "has", a determiner or a digit token gives "is".

    Args:
        sentence: Text to normalize.

    Returns:
        The token texts joined by single spaces, with every matched "'s"
        replaced by its expansion. Tokens that are not part of a match are
        kept unchanged, so a sentence without any match is returned as its
        space-joined tokens rather than as an empty string.
    """
    doc = nlp(sentence)
    matcher = _build_contraction_matcher(nlp.vocab)
    expansions = {"case_has": "has", "case_is": "is"}

    # Collect all replacements first, then emit each token exactly once.
    # Building the output inside the match loop would repeat the whole
    # sentence per match and yield nothing when there is no match.
    replacements = {}
    for match_id, start, _end in matcher(doc):
        expansion = expansions.get(nlp.vocab.strings[match_id])
        if expansion is not None:
            # In every pattern the "'s" token directly follows the pronoun.
            replacements.setdefault(start + 1, expansion)

    return " ".join(
        replacements.get(idx, token.text) for idx, token in enumerate(doc)
    )
# Print each demo sentence followed by its normalized form and a blank line.
for s in sentences:
    print(s)
    print(normalize(s))
    print()
output:
now she's a software engineer
now she is a software engineer
she's got a cat
she has got a cat
he's a tennis player
he is a tennis player
He thinks that she's 30 years old
He thinks that she is 30 years old