I want to understand how to do unsupervised training for an HMM with the NLTK HiddenMarkovModelTrainer. As far as I know, an HMM trained with both supervised and unsupervised data should perform better than one trained on the supervised data alone.
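My mental model of the combined call, from reading HiddenMarkovModelTrainer.train in nltk/tag/hmm.py (just a sketch; trainer, labeled and unlabeled are placeholder names):

# What I believe trainer.train() does when given both kinds of data:
# a supervised pass first, then Baum-Welch re-estimation that uses the
# supervised model as its starting point.
initial = trainer.train_supervised(labeled)
refined = trainer.train_unsupervised(unlabeled, model=initial)

Here is my code: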
#!/usr/bin/env python3
import nltk
from nltk.corpus import brown
from nltk.util import unique_list

def hmmTrainer(sents):
    # build the state set (tags) and symbol set (words) from the data
    tag_set = unique_list(tag for sent in sents for (word, tag) in sent)
    symbols = unique_list(word for sent in sents for (word, tag) in sent)
    return nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)

if __name__ == '__main__':
    corpus = brown
    print(len(corpus.sents()))          # prints 57340

    train = corpus.tagged_sents()[:10000]
    devel = corpus.tagged_sents()[10000:11000]
    test = corpus.tagged_sents()[11000:13000]
    trainer = hmmTrainer(train + devel)

    # supervised training only
    hmm = trainer.train_supervised(train)
    print(hmm.evaluate(test))           # prints 0.3531

    # supervised + unsupervised (Baum-Welch) training
    tmp = [nltk.untag(sent) for sent in devel]
    hmm = trainer.train(labeled_sequences=train,
                        unlabeled_sequences=[tmp])
    print(hmm.evaluate(test))           # prints 0.0866

    # the same with more data; this needs ~40h for training
    train = corpus.tagged_sents()[:30000]
    devel = corpus.tagged_sents()[30000:33000]
    test = corpus.tagged_sents()[33000:36000]
    trainer = hmmTrainer(train + devel)

    hmm = trainer.train_supervised(train)
    print(hmm.evaluate(test))           # prints 0.59785

    tmp = [nltk.untag(sent) for sent in devel]
    hmm = trainer.train(labeled_sequences=train,
                        unlabeled_sequences=[tmp])
    print(hmm.evaluate(test))           # prints 0.1057
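One thing I was not sure about is the format of the unlabeled data: nltk.untag gives me sentences of plain strings, while the Baum-Welch demo in nltk/tag/hmm.py seems to feed sequences of (word, None) pairs and to pass the list of sentences directly. A sketch of the variant I considered, assuming that reading of the demo is correct:

# Variant (untested assumption): one (word, None) tuple per token, and the
# list of sentences passed as-is instead of wrapped in another list.
unlabeled = [[(word, None) for word in nltk.untag(sent)] for sent in devel]
hmm = trainer.train(labeled_sequences=train,
                    unlabeled_sequences=unlabeled)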
Why does adding the unsupervised training step make the accuracy drop so sharply? What's my mistake?