The answer is to create a rules file with additional TokensRegexNER rules.
I used a regex to pull out the pre-labeled names, built a temporary rules file from them, and passed that file to the CoreNLP jar with -ner.additional.regexner.mapping mytemprulesfile:
Alexander III of Macedon PERSON PERSON,LOCATION,ORGANIZATION,MISC
Aristotle PERSON PERSON,LOCATION,ORGANIZATION,MISC
Anatolia LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Alexander PERSON PERSON,LOCATION,ORGANIZATION,MISC
Persia LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Issus LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Gaugamela LOCATION PERSON,LOCATION,ORGANIZATION,MISC
Persian King Darius III PERSON PERSON,LOCATION,ORGANIZATION,MISC
Achaemenid Empire ORGANIZATION PERSON,LOCATION,ORGANIZATION,MISC
I have aligned this list for readability, but these are tab-separated values.
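If you want to drive the jar directly rather than through the NLTK wrapper used below, the same idea looks roughly like this; it is only a sketch, and the CoreNLP path, file names, and annotator list here are illustrative assumptions rather than values taken from my run.

# Rough sketch: write a tab-separated mapping file and hand it to CoreNLP
# via -ner.additional.regexner.mapping. Paths and file names are placeholders.
import subprocess

with open('myrules.tsv', 'w') as rulefile:
    rulefile.write('Aristotle\tPERSON\tPERSON,LOCATION,ORGANIZATION,MISC\n')
    rulefile.write('Anatolia\tLOCATION\tPERSON,LOCATION,ORGANIZATION,MISC\n')

subprocess.run(['java', '-Xmx4g',
                '-cp', '../stanford-corenlp-full-2018-10-05/*',
                'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                '-annotators', 'tokenize,ssplit,pos,lemma,ner',
                '-ner.additional.regexner.mapping', 'myrules.tsv',
                '-outputFormat', 'json',
                '-file', 'input.txt'])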
An interesting finding is that multi-word pre-labeled entities stay multi-word, exactly as labeled, whereas running CoreNLP without the rules file sometimes splits those tokens into separate entities.
I had wanted to identify the named-entity tokens individually, figuring it would make coreference resolution easier, but this will do for now. How often are entity names identical but unrelated within a single document, anyway?
Example (execution takes ~70 seconds)
import os, re, tempfile, json, nltk, pprint
from subprocess import PIPE
from nltk.internals import (
    find_jar_iter,
    config_java,
    java,
    _java_options,
    find_jars_within_path,
)
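
# PrintJavaOutput is called further down; this is a minimal stand-in (assumed
# here) that simply echoes whatever CoreNLP wrote to stdout/stderr.
def PrintJavaOutput( stdout, stderr ):
    if stdout:
        print( stdout.decode('utf-8') )
    if stderr:
        print( stderr.decode('utf-8') )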
def ExtractLabeledEntitiesByRegex( text, regex ):
    # Collect (entity, tag, matched-string) triples for every labeled span,
    # then strip the [TAG: ...] markup so CoreNLP sees plain text.
    rgx = re.compile(regex)
    nelist = []
    for mobj in rgx.finditer( text ):
        ne = mobj.group('ner')
        try:
            tag = mobj.group('tag')
        except IndexError:
            tag = 'PERSON'
        mstr = text[mobj.start():mobj.end()]
        nelist.append( (ne, tag, mstr) )
    cleantext = rgx.sub(r"\g<ner>", text)
    return (nelist, cleantext)
def GenerateTokensNERRules( nelist ):
    # One tab-separated TokensRegexNER line per entity:
    # <entity text> <tag> <tags it may overwrite>
    rules = ""
    for ne in nelist:
        rules += ne[0]+'\t'+ne[1]+'\tPERSON,LOCATION,ORGANIZATION,MISC\n'
    return rules
def GetEntities( origtext ):
    # Pull out the pre-labeled entities and strip the [TAG: ...] markup.
    nelist, cleantext = ExtractLabeledEntitiesByRegex( origtext, r'(\[(?P<tag>[a-zA-Z]+)\:\s*)(?P<ner>(\s*\w)+)(\s*\])' )

    # Write the cleaned text to a tempfile for CoreNLP's -file option.
    origfile = tempfile.NamedTemporaryFile(mode='r+b', delete=False)
    origfile.write( cleantext.encode('utf-8') )
    origfile.flush()
    origfile.seek(0)

    # Write the generated TokensRegexNER rules to a second tempfile.
    nerrulefile = tempfile.NamedTemporaryFile(mode='r+b', delete=False)
    nerrulefile.write( GenerateTokensNERRules(nelist).encode('utf-8') )
    nerrulefile.flush()
    nerrulefile.seek(0)
    # Run CoreNLP through NLTK's java() helper with 4 GB of heap.
    java_options = '-mx4g'
    config_java(options=java_options, verbose=True)

    stanford_jar = '../stanford-corenlp-full-2018-10-05/stanford-corenlp-3.9.2.jar'
    stanford_dir = os.path.split(stanford_jar)[0]
    _classpath = tuple(find_jars_within_path(stanford_dir))

    cmd = ['edu.stanford.nlp.pipeline.StanfordCoreNLP',
           '-annotators', 'tokenize,ssplit,pos,lemma,ner,parse,coref,coref.mention,depparse,natlog,openie,relation',
           '-ner.combinationMode', 'HIGH_RECALL',
           '-ner.additional.regexner.mapping', nerrulefile.name,
           '-coref.algorithm', 'neural',
           '-outputFormat', 'json',
           '-file', origfile.name
           ]
    stdout, stderr = java( cmd, classpath=_classpath, stdout=PIPE, stderr=PIPE )    # Couldn't get stdin=textfile working
    PrintJavaOutput( stdout, stderr )

    # CoreNLP writes <input basename>.json into the working directory.
    jsonfilename = os.path.split(origfile.name)[-1] + '.json'

    os.unlink( origfile.name )
    os.unlink( nerrulefile.name )
    origfile.close()
    nerrulefile.close()

    with open( jsonfilename ) as jsonfile:
        jsondata = json.load(jsonfile)
    # Keep only PERSON, LOCATION, and ORGANIZATION mentions from each sentence.
    currentid = 0
    entities = []
    for sent in jsondata['sentences']:
        for thisentity in sent['entitymentions']:
            tag = thisentity['ner']
            if tag in ('PERSON', 'LOCATION', 'ORGANIZATION'):
                entities.append( {
                    'id': currentid,
                    'label': thisentity['text'],
                    'tag': tag
                } )
                currentid += 1
    return entities
#### RUN ####
corpustext = "During his youth, [PERSON:Alexander III of Macedon] was tutored by [PERSON: Aristotle] until age 16. Following the conquest of [LOCATION: Anatolia], [PERSON: Alexander] broke the power of [LOCATION: Persia] in a series of decisive battles, most notably the battles of [LOCATION: Issus] and [LOCATION: Gaugamela]. He subsequently overthrew [PERSON: Persian King Darius III] and conquered the [ORGANIZATION: Achaemenid Empire] in its entirety."
entities = GetEntities( corpustext )
for thisent in entities:
    pprint.pprint( thisent )
Output
{'id': 0, 'label': 'Alexander III of Macedon', 'tag': 'PERSON'}
{'id': 1, 'label': 'Aristotle', 'tag': 'PERSON'}
{'id': 2, 'label': 'his', 'tag': 'PERSON'}
{'id': 3, 'label': 'Anatolia', 'tag': 'LOCATION'}
{'id': 4, 'label': 'Alexander', 'tag': 'PERSON'}
{'id': 5, 'label': 'Persia', 'tag': 'LOCATION'}
{'id': 6, 'label': 'Issus', 'tag': 'LOCATION'}
{'id': 7, 'label': 'Gaugamela', 'tag': 'LOCATION'}
{'id': 8, 'label': 'Persian King Darius III', 'tag': 'PERSON'}
{'id': 9, 'label': 'Achaemenid Empire', 'tag': 'ORGANIZATION'}
{'id': 10, 'label': 'He', 'tag': 'PERSON'}
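As for the coreference question above: within one document, mentions that share a surface form can simply be grouped together. A rough sketch over the entities list returned by GetEntities (note it will not merge pronoun mentions like 'his' and 'He'):

from collections import defaultdict

def GroupMentionsByLabel( entities ):
    # Bucket mention ids under each distinct (label, tag) pair.
    groups = defaultdict(list)
    for ent in entities:
        groups[(ent['label'], ent['tag'])].append(ent['id'])
    return dict(groups)

pprint.pprint( GroupMentionsByLabel( entities ) )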