This is my code for loading the pre-trained weights and embedding matrices:
from __future__ import print_function
import numpy as np
import pandas as pd
import csv, datetime, time, json
from zipfile import ZipFile
from os.path import expanduser, exists
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import (Input, TimeDistributed, Dense, Lambda,
                          concatenate, Dropout, BatchNormalization)
from keras.layers.embeddings import Embedding
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.data_utils import get_file
from keras import backend as K
from sklearn.model_selection import train_test_split
# Initialize global variables
KERAS_DATASETS_DIR = expanduser('~/.keras/datasets/')
QUESTION_PAIRS_FILE_URL = 'http://qim.ec.quoracdn.net/quora_duplicate_questions.tsv'
QUESTION_PAIRS_FILE = 'test.csv'
GLOVE_ZIP_FILE_URL = 'http://nlp.stanford.edu/data/glove.840B.300d.zip'
GLOVE_ZIP_FILE = 'glove.840B.300d.zip'
GLOVE_FILE = 'glove.840B.300d.txt'
Q1_TRAINING_DATA_FILE = 'q1_train.npy'
Q2_TRAINING_DATA_FILE = 'q2_train.npy'
LABEL_TRAINING_DATA_FILE = 'label_train.npy'
WORD_EMBEDDING_MATRIX_FILE = 'word_embedding_matrix.npy'
NB_WORDS_DATA_FILE = 'nb_words.json'
MAX_NB_WORDS = 200000
MAX_SEQUENCE_LENGTH = 25
EMBEDDING_DIM = 300
MODEL_WEIGHTS_FILE = 'question_pairs_weights.h5'
VALIDATION_SPLIT = 0.1
TEST_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 1
DROPOUT = 0.1
BATCH_SIZE = 32
OPTIMIZER = 'adam'
# Load the embedding matrix and vocabulary size saved during preprocessing
word_embedding_matrix = np.load(open(WORD_EMBEDDING_MATRIX_FILE, 'rb'))
with open(NB_WORDS_DATA_FILE, 'r') as f:
    nb_words = json.load(f)['nb_words']
print("Processing", QUESTION_PAIRS_FILE)
question1 = []
question2 = []
with open(KERAS_DATASETS_DIR + QUESTION_PAIRS_FILE, encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter=',')
    for row in reader:
        question1.append(row['question1'])
        question2.append(row['question2'])
print('Question pairs: %d' % len(question1))
T1 = len(question1)
print(T1)
# Build tokenized word index
questions = question1 + question2
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(questions)
question1_word_sequences = tokenizer.texts_to_sequences(question1)
question2_word_sequences = tokenizer.texts_to_sequences(question2)
word_index = tokenizer.word_index
print("Words in index: %d" % len(word_index))
# Prepare word embedding matrix (already loaded from word_embedding_matrix.npy above)
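For reference, the saved word_embedding_matrix.npy and nb_words.json came from an earlier preprocessing run roughly like this (a sketch from memory, assuming GLOVE_FILE was already downloaded and unzipped into KERAS_DATASETS_DIR):

# Sketch of the earlier preprocessing run that produced the two saved files
embeddings_index = {}
with open(KERAS_DATASETS_DIR + GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

nb_words = min(MAX_NB_WORDS, len(word_index))
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector  # OOV words stay all-zero

np.save(open(WORD_EMBEDDING_MATRIX_FILE, 'wb'), word_embedding_matrix)
with open(NB_WORDS_DATA_FILE, 'w') as f:
    json.dump({'nb_words': nb_words}, f)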
# Prepare training data tensors
q1_data = pad_sequences(question1_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
q2_data = pad_sequences(question2_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of question1 data tensor:', q1_data.shape)
print('Shape of question2 data tensor:', q2_data.shape)
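# pad_sequences uses 'pre' padding and 'pre' truncation by default, so every
# question ends up with exactly MAX_SEQUENCE_LENGTH entries, e.g.:
print(pad_sequences([[3, 7, 2]], maxlen=5))           # [[0 0 3 7 2]]
print(pad_sequences([[1, 2, 3, 4, 5, 6]], maxlen=5))  # [[2 3 4 5 6]]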
# Define the model
question1 = Input(shape=(MAX_SEQUENCE_LENGTH,))
question2 = Input(shape=(MAX_SEQUENCE_LENGTH,))
q1 = Embedding(nb_words + 1,
EMBEDDING_DIM,
weights=[word_embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False)(question1)
# Project each timestep, then max-pool over time to get one fixed-size vector per question
q1 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q1)
q1 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM,))(q1)
q2 = Embedding(nb_words + 1,
EMBEDDING_DIM,
weights=[word_embedding_matrix],
input_length=MAX_SEQUENCE_LENGTH,
trainable=False)(question2)
q2 = TimeDistributed(Dense(EMBEDDING_DIM, activation='relu'))(q2)
q2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(EMBEDDING_DIM, ))(q2)
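# The two Lambda layers above are just max-over-time pooling: a column-wise max
# across the timesteps. Numerically (NumPy sketch on a toy tensor):
x = np.array([[[1.0, 5.0],
               [4.0, 2.0],
               [3.0, 3.0]]])  # shape (batch=1, timesteps=3, dim=2)
print(x.max(axis=1))          # [[4. 5.]] -- one max per embedding dimension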
# Concatenate the two question vectors and classify through four dense blocks
merged = concatenate([q1, q2])
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
merged = Dense(200, activation='relu')(merged)
merged = Dropout(DROPOUT)(merged)
merged = BatchNormalization()(merged)
is_duplicate = Dense(1, activation='sigmoid')(merged)
model = Model(inputs=[question1, question2], outputs=is_duplicate)
model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER,
              metrics=['accuracy'])
# Load the trained weights and predict duplicate probabilities for every pair
model.load_weights(MODEL_WEIGHTS_FILE)
temp = model.predict([q1_data, q2_data])
df = pd.DataFrame(temp)
df.to_csv("hero.csv", header=['is_duplicate'])
It gives me output like this:
test id    is_duplicate
0          0.585984
1          0.13437697
2          0.7458109
3          0.6282846
4          0.44692168
But I need boolean values, like this:
test id    is_duplicate
0          1
1          0
2          1
3          1
Please don't tell me to round or threshold the values after prediction; I want the neural network itself to give me the boolean values.
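For example, I already know I can binarize the probabilities outside the network like this (the 0.5 cutoff is just illustrative), and this is exactly what I'm trying to avoid:

# Post-hoc thresholding -- NOT what I'm asking for
df = pd.DataFrame((temp > 0.5).astype(int))
df.to_csv("hero.csv", header=['is_duplicate'])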
Is it possible to train the network like that? If yes, please suggest what I can include in the code. Thanks in advance.