I am trying to create a dictionary to use for NLP, where the final output should be something like [{text: "blah blah blah"}, "positive"]
But when I try to create the dictionary of "text: blah blah blah"
, even though I'm dealing with a list, the output I get is only one entry long.
Here is the setup code.
training_text = []
training_tag = []
with open("training.csv", encoding="ISO-8859-1") as csvfile:
list_reader = csv.reader(csvfile)
for row in list_reader:
text=row[0]
tag=row[1]
training_text.append(text)
training_tag.append(tag)
training_text_stem = []
for doc in training_text[1:]: #skip first row, which is header
#tokenize text
tok = nltk.word_tokenize(doc)
text = nltk.Text(tok)
#normalize words
words = [w.lower() for w in text if w.isalpha()]
#build vocabulary
vocab = sorted(set(words))
#remove stopwords
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
vocab_redux = [w for w in vocab if w not in stopwords]
#stemming to reduce topically similar words to their root
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
vocab_stem = [p_stemmer.stem(i) for i in vocab_redux]
training_text_stem.append(vocab_stem)
This is where it's falling apart. I've tried it 2 ways, as a Dict-Zip comprehension, and as a for loop. In both cases, the output is just one entry, as opposed to the whole list.
key = ['text']*len(training_text_stem)
training_dictionary = dict(zip(training_text_stem, key))
def makeadictionary(document):
dictionarylist = []
for doc in document:
dictionarylist.append({'text': doc})
return(dictionarylist)
makeadictionary(training_text_stem)