How can I fix this n-gram extractor in Python?

Question

I've made an n-gram extractor that pulls organization's names from texts. However, the program only pulls the first letter of the first word and the last word. For example, if the phrase "Sprint International Corporation" appears in the text, the program will return "s corporation" as the n-gram. Do you know what I'm doing wrong? I've posted the code and output below. Thanks.

This is the code for the n-gram extractor.

def org_ngram(classified_text):
    orgs = [c for c in classified_text if (c[1]=="ORGANIZATION")]
    #print(orgs)

    combined_orgs = []
    prev_org = False
    new_org = ("", "ORGANIZATION")
    for i in range(len(classified_text)):
        if classified_text[i][1] != "ORGANIZATION":
            prev_org = False
        else:
            if prev_org:
                new_org = new_org[0] + " " + classified_text[i][0].lower()
            else:
                combined_orgs.append(new_org)
                new_org = classified_text[i][0].lower()
            prev_org = True

    combined_orgs.append(new_org)
    combined_orgs = combined_orgs[1:]
    return combined_orgs

Here is the text that I analyze and the program I use to analyze it.

from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize

st = StanfordNERTagger('C:\\path\\english.all.3class.distsim.crf.ser.gz',
                       'C:\\Users\\path\\stanford-ner.jar',
                       encoding='utf-8')

text = "Trump met with representatives from Sprint International Corporation, Nike Inc, and Wal-Mart Company regarding the trade war."

tokenized_text = word_tokenize(text)
classified_text = st.tag(tokenized_text)
orgs = org_ngram(classified_text)

print(orgs)

Here is the current output.

['s corporation', 'n inc', 'w company']

This is what I want to output to look like.

['sprint international corporation', 'nike inc', 'wal-mart company']

score 0 · Answer 1 · answered Feb 01 '20 at 00:06

Firstly, avoid the StanfordNERTagger, it would have been deprecated soon. Use this instead Stanford Parser and NLTK

>>> from nltk.parse import CoreNLPParser

# Lexical Parser
>>> parser = CoreNLPParser(url='http://localhost:9000')

>>> ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> list(ner_tagger.tag(('Rami Eid is studying at Stony Brook University in NY'.split())))
[('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'STATE_OR_PROVINCE')]

Once you have the list of tuples from with the tokens and NER tag, the task you want to achieve is to get continuous token-tag items in the list of tuples given a specific tag type, you can try the solution from https://stackoverflow.com/a/30666949/610569

from nltk import pos_tag
from nltk.chunk import conlltags2tree
from nltk.tree import Tree

def stanfordNE2BIO(tagged_sent):
    bio_tagged_sent = []
    prev_tag = "O"
    for token, tag in tagged_sent:
        if tag == "O": #O
            bio_tagged_sent.append((token, tag))
            prev_tag = tag
            continue
        if tag != "O" and prev_tag == "O": # Begin NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag == tag: # Inside NE
            bio_tagged_sent.append((token, "I-"+tag))
            prev_tag = tag
        elif prev_tag != "O" and prev_tag != tag: # Adjacent NE
            bio_tagged_sent.append((token, "B-"+tag))
            prev_tag = tag

    return bio_tagged_sent


def stanfordNE2tree(ne_tagged_sent):
    bio_tagged_sent = stanfordNE2BIO(ne_tagged_sent)
    sent_tokens, sent_ne_tags = zip(*bio_tagged_sent)
    sent_pos_tags = [pos for token, pos in pos_tag(sent_tokens)]

    sent_conlltags = [(token, pos, ne) for token, pos, ne in zip(sent_tokens, sent_pos_tags, sent_ne_tags)]
    ne_tree = conlltags2tree(sent_conlltags)
    return ne_tree

def extract_ner(ne_tagged_sent):
    ne_tree = stanfordNE2tree(ne_tagged_sent)

    ne_in_sent = []
    for subtree in ne_tree:
        if type(subtree) == Tree: # If subtree is a noun chunk, i.e. NE != "O"
            ne_label = subtree.label()
            ne_string = " ".join([token for token, pos in subtree.leaves()])
            ne_in_sent.append((ne_string, ne_label))
    return ne_in_sent

Then:

ne_tagged_sent = [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), 
('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), 
('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), 
('in', 'O'), ('NY', 'LOCATION')]

print(extract_ner(ne_tagged_sent))

[out]:

[('Rami Eid', 'PERSON'), ('Stony Brook University', 'ORGANIZATION'), ('NY', 'LOCATION')]

How can I fix this n-gram extractor in Python?

1 Answers1