import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Load the labelled messages and encode each Category string as an integer id.
data = pd.read_csv('data/TrainingData_unsubscribe.csv')
data['labels'] = data['Category'].factorize()[0]

# Vectorize the features: TF-IDF over the message text.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    stop_words='english',
)
x_vectors = tfidf.fit_transform(data.msgContent)

# Split the data into train and test sets (80/20, fixed seed for the split).
x = x_vectors
y = data.labels
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Convert the sparse TF-IDF matrices to dense arrays before handing them to
# the model. `preds` is the full data set, used later to predict every row.
x_train = x_train.toarray()
x_test = x_test.toarray()
preds = x_vectors.toarray()

# Number of distinct label ids; sizes the softmax output layer.
num_classes = len(pd.unique(y))

# Early-stopping callback: stop once 'accuracy' has not improved for
# 10 consecutive epochs.
# NOTE(review): this monitors the training metric; no validation data is
# passed to fit(), so there is no validation metric to monitor instead.
stop = tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=10)

# Create the model: three hidden ReLU layers and a softmax over the classes.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

# Compile the model. Sparse categorical cross-entropy takes the integer
# label ids directly (no one-hot encoding).
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(),
    metrics=['accuracy'],
)

# Fit the model.
model.fit(x_train, y_train, epochs=500, verbose=0, callbacks=[stop])

# Evaluation on the held-out test set.
print('\nEvaluation: ')
model.evaluate(x_test, y_test)

# Predict a class id for every row of the original data.
predictions = model.predict(preds)
data["Prediction"] = predictions.argmax(axis=1)
output = data.drop(["labels"], axis=1)

# Lookup table of the distinct (Category, labels) pairs produced by factorize().
category_ids = data[["Category", "labels"]].drop_duplicates()

# Convert the predicted ids back to their category names. Series.map accepts
# a Series as the mapping: each value is looked up in that Series' index, so
# index the lookup table by label id and select the Category column.
output['Prediction'] = output['Prediction'].map(
    category_ids.set_index('labels')['Category']
)
I originally converted the string labels to numeric ones using factorize():
data['labels'] = data['Category'].factorize()[0]
Now I'm trying to convert the labels back to their original string values. I've created a DataFrame with the mapped values:
Category | labels |
---|---|
HardBounce | 0 |
SoftBounce | 1 |
etc...
Is it possible to map a DataFrame column using another DataFrame as the reference for the mapping? I've been unable to find any docs that show how to do this.