I'm a newbie in Keras and I'm trying to solve the task of sentence similairty using NN in Keras. I use word2vec as word embedding, and then a Siamese Network to prediction how similar two sentences are. The base network for the Siamese Network is a LSTM, and to merge the two base network I use a Lambda layer with cosine similairty metric. As dataset I'm using SICK dataset, that gives a score to each pair of sentences, from 1(different) to 5(very similar).
I created the network and it runs, but I have a lot of doubts :
first of all I'm not sure if the way I feed the LSTM with sentences is fine. I take word2vec embedding for each word and I create only one array per sentence, padding it with zeros to seq_len in order to obtain same lenght arrays. And then I reshape it in this way : data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))
Besides I'm not sure if my Siamese Network is correct, beacuse a lot of predictionion for different pairs are equal and the loss doesn't change much (from 0.3300 to 0.2105 in 10 epochs, and it doesn't change much more in 100 epochs).
Someone can help me find and understand my mistakes? Thanks so much (and sorry for my bad english)
Interested part in my code
def cosine_distance(vecs):
#I'm not sure about this function too
y_true, y_pred = vecs
y_true = K.l2_normalize(y_true, axis=-1)
y_pred = K.l2_normalize(y_pred, axis=-1)
return K.mean(1 - K.sum((y_true * y_pred), axis=-1))
def cosine_dist_output_shape(shapes):
shape1, shape2 = shapes
print((shape1[0], 1))
return (shape1[0], 1)
def contrastive_loss(y_true, y_pred):
margin = 1
return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
def create_base_network(feature_dim,seq_len):
model = Sequential()
model.add(LSTM(100, batch_input_shape=(1,seq_len,feature_dim),return_sequences=True))
model.add(Dense(50, activation='relu'))
model.add(Dense(10, activation='relu'))
return model
def siamese(feature_dim,seq_len, epochs, tr_dataA, tr_dataB, tr_y, te_dataA, te_dataB, te_y):
base_network = create_base_network(feature_dim,seq_len)
input_a = Input(shape=(seq_len,feature_dim,))
input_b = Input(shape=(seq_len,feature_dim))
processed_a = base_network(input_a)
processed_b = base_network(input_b)
distance = Lambda(cosine_distance, output_shape=cosine_dist_output_shape)([processed_a, processed_b])
model = Model([input_a, input_b], distance)
adam = Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(optimizer=adam, loss=contrastive_loss)
model.fit([tr_dataA, tr_dataB], tr_y,
batch_size=128,
epochs=epochs,
validation_data=([te_dataA, te_dataB], te_y))
pred = model.predict([tr_dataA, tr_dataB])
tr_acc = compute_accuracy(pred, tr_y)
for i in range(len(pred)):
print (pred[i], tr_y[i])
return model
def padding(max_len, embedding):
for i in range(len(embedding)):
padding = np.zeros(max_len-embedding[i].shape[0])
embedding[i] = np.concatenate((embedding[i], padding))
embedding = np.array(embedding)
return embedding
def getAB(sentences_A,sentences_B, feature_dim, word2idx, idx2word, weights,max_len_def=0):
#from_sentence_to_array : function that transforms natural language sentences
#into vectors of real numbers. Each word is replaced with the corrisponding word2vec
#embedding, and words that aren't in the embedding are replaced with zeros vector.
embedding_A, max_len_A = from_sentence_to_array(sentences_A,word2idx, idx2word, weights)
embedding_B, max_len_B = from_sentence_to_array(sentences_B,word2idx, idx2word, weights)
max_len = max(max_len_A, max_len_B,max_len_def*feature_dim)
#padding to max_len
embedding_A = padding(max_len, embedding_A)
embedding_B = padding(max_len, embedding_B)
seq_len = int(max_len/feature_dim)
print(seq_len)
#rashape
data_A = embedding_A.reshape((len(embedding_A), seq_len, feature_dim))
data_B = embedding_B.reshape((len(embedding_B), seq_len, feature_dim))
print('A,B shape: ',data_A.shape, data_B.shape)
return data_A, data_B, seq_len
FEATURE_DIMENSION = 100
MIN_COUNT = 10
WINDOW = 5
if __name__ == '__main__':
data = pd.read_csv('data\\train.csv', sep='\t')
sentences_A = data['sentence_A']
sentences_B = data['sentence_B']
tr_y = 1- data['relatedness_score']/5
if not (os.path.exists(EMBEDDING_PATH) and os.path.exists(VOCAB_PATH)):
create_embeddings(embeddings_path=EMBEDDING_PATH, vocab_path=VOCAB_PATH, size=FEATURE_DIMENSION, min_count=MIN_COUNT, window=WINDOW, sg=1, iter=25)
word2idx, idx2word, weights = load_vocab_and_weights(VOCAB_PATH,EMBEDDING_PATH)
tr_dataA, tr_dataB, seq_len = getAB(sentences_A,sentences_B, FEATURE_DIMENSION,word2idx, idx2word, weights)
test = pd.read_csv('data\\test.csv', sep='\t')
test_sentences_A = test['sentence_A']
test_sentences_B = test['sentence_B']
te_y = 1- test['relatedness_score']/5
te_dataA, te_dataB, seq_len = getAB(test_sentences_A,test_sentences_B, FEATURE_DIMENSION,word2idx, idx2word, weights, seq_len)
model = siamese(FEATURE_DIMENSION, seq_len, 10, tr_dataA, tr_dataB, tr_y, te_dataA, te_dataB, te_y)
test_a = ['this is my dog']
test_b = ['this dog is mine']
a,b,seq_len = getAB(test_a,test_b, FEATURE_DIMENSION,word2idx, idx2word, weights, seq_len)
prediction = model.predict([a, b])
print(prediction)
Some of the results :
my prediction | true label
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.76
0.849908 0.66
0.849908 0.72
0.849908 0.64
0.849908 0.8
0.849908 0.78
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.74
0.849908 0.8
0.849908 0.8
0.849908 0.8
0.849908 0.66
0.849908 0.8
0.849908 0.66
0.849908 0.56
0.849908 0.8
0.849908 0.8
0.849908 0.76
0.847546 0.78
0.847546 0.8
0.847546 0.74
0.847546 0.76
0.847546 0.72
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.72
0.847546 0.8
0.847546 0.8
0.847546 0.78
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.46
0.847546 0.72
0.847546 0.8
0.847546 0.76
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.8
0.847546 0.74
0.847546 0.8
0.847546 0.72
0.847546 0.68
0.847546 0.56
0.847546 0.8
0.847546 0.78
0.847546 0.78
0.847546 0.8
0.852975 0.64
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.44
0.852975 0.72
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.78
0.852975 0.8
0.852975 0.8
0.852975 0.76
0.852975 0.8