I am implementing an lstm model where I have already trained my model with a dataset. When I am using my new dataset to predict the output, I am having errors because some words that are in my new dataset are not present in trained model. Is there any method to perform so that is the word is not found, it does not consider it?
Actually the words from the train model are saved in a dictionary as shown in my codes below:
df = pd.read_csv('C:/Users/User/Desktop/Coding/lstm emotion recognition/emotion.data/emotion.data')
#Preparing data for model traininng
#Tokenization-Since the data is already tokenized and lowecased, we just need to split the words
input_sentences = [text.split(" ") for text in df["text"].values.tolist()]
labels = df["emotions"].values.tolist()
#creating vocabulary(word index)
#Initialize word2id and label2id dictionaries that will be used to encode words and labels
word2id = dict() #creating the dictionary named word2id
label2id = dict() #creating a dictionary named label2id
max_words = 0 #maximum number of words in a sentence
#construction of word2id
for sentence in input_sentences:
for word in sentence:
#Add words to word2id if not exist
if word not in word2id:
word2id[word] = len(word2id)
#If length of the sentence is greater than max_words, update max_words
if len(sentence) > max_words:
max_words = len(sentence)
#Construction of label2id and id2label dictionaries
label2id = {l: i for i, l in enumerate(set(labels))}
id2label = {v: k for k, v in label2id.items()}
from keras.models import load_model
model = load_model('modelsave2.py')
print(model)
import keras
model_with_attentions = keras.Model(inputs=model.input,
output=[model.output,
model.get_layer('attention_vec').output])
#File I/O Open function for read data from JSON File
with open('C:/Users/User/Desktop/Coding/parsehubjsonfileeg/all.json', encoding='utf8') as file_object:
# store file data in object
data = json.load(file_object)
# dictionary for element which you want to keep
new_data = {'selection1': []}
print(new_data)
# copy item from old data to new data if it has 'reviews'
for item in data['selection1']:
if 'reviews' in item:
new_data['selection1'].append(item)
print(item['reviews'])
print('--')
# save in file
with open('output.json', 'w') as f:
json.dump(new_data, f)
selection1 = data['selection1']
for item in selection1:
name = item['name']
print ('>>>>>>>>>>>>>>>>>> ', name)
CommentID = item['reviews']
for com in CommentID:
comment = com['review'].lower() # converting all to lowercase
result = re.sub(r'\d+', '', comment) # remove numbers
results = (result.translate(
str.maketrans('', '', string.punctuation))).strip() # remove punctuations and white spaces
comments = remove_stopwords(results)
print('>>>>>>',comments)
encoded_samples = [[word2id[word] for word in comments]]
# Padding
encoded_samples = keras.preprocessing.sequence.pad_sequences(encoded_samples, maxlen=max_words)
# Make predictions
label_probs, attentions = model_with_attentions.predict(encoded_samples)
label_probs = {id2label[_id]: prob for (label, _id), prob in zip(label2id.items(), label_probs[0])}
# Get word attentions using attenion vector
print(label_probs)
print(max(label_probs))
my output is:
>>>>>> ['amazing', 'stay', 'nights', 'cleanliness', 'room', 'faultless']
{'fear': 0.26750156, 'love': 0.0044763167, 'joy': 0.06064613, 'surprise': 0.32365623, 'sadness': 0.03203068, 'anger': 0.31168908}
surprise
>>>>>> ['good', 'time', 'food', 'good']
Traceback (most recent call last):
File "C:/Users/User/PycharmProjects/Dissertation/loadandresult.py", line 96, in <module>
encoded_samples = [[word2id[word] for word in comments]]
File "C:/Users/User/PycharmProjects/Dissertation/loadandresult.py", line 96, in <listcomp>
encoded_samples = [[word2id[word] for word in comments]]
KeyError: 'everydaythe'
the error is because the word 'everydaythe' is not found my my trained dataset,..What should i do to correct this? please help me guys. please