I have obtained the following code along with a dataset from a Kaggle notebook: https://www.kaggle.com/code/danofer/predicting-protein-classification/notebook
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import sys
# 1). ----- Import Datasets -----
# Both CSV paths are relative, so they are resolved against the current
# working directory.
seq_csv, char_csv = 'pdb_data_seq.csv', 'pdb_data_no_dups.csv'
df_seq = pd.read_csv(seq_csv)
df_char = pd.read_csv(char_csv)
print('Datasets have been loaded...')
# 2). ----- Filter and Process Dataset ------
# Keep only the rows whose macromoleculeType is exactly 'Protein'.
is_protein_char = df_char['macromoleculeType'] == 'Protein'
is_protein_seq = df_seq['macromoleculeType'] == 'Protein'
protein_char = df_char.loc[is_protein_char]
protein_seq = df_seq.loc[is_protein_seq]
print(protein_char.head())
print(protein_seq.describe(include="all"))
print(protein_char.columns)

# Columns carried into the joined table; structureId is the join key.
char_columns = [
    'structureId', 'classification', 'residueCount', 'resolution',
    'structureMolecularWeight', 'crystallizationTempK', 'densityMatthews',
    'densityPercentSol', 'phValue',
]
seq_columns = ['structureId', 'sequence']
protein_char = protein_char[char_columns]
protein_seq = protein_seq[seq_columns]
print(protein_seq.head())
print(protein_char.head())

# Join the sequence table onto the characteristics table by structureId.
# DataFrame.join defaults to a left join, so every characteristics row is
# kept, with NaN in `sequence` where no sequence row matches.
char_by_id = protein_char.set_index('structureId')
seq_by_id = protein_seq.set_index('structureId')
model_f = char_by_id.join(seq_by_id)
print(model_f.head())
print('%d is the number of rows in the joined dataset' % len(model_f))

# Report missing values per column, then drop every row that has a missing
# value in ANY of the selected columns (not only classification/sequence).
print(model_f.isna().sum())
model_f = model_f.dropna()
print('%d is the number of proteins that have a classification and sequence' % len(model_f))
# Look at classification type counts
counts = model_f.classification.value_counts()
print(counts)

# Classes with more than 1000 records; reused for both the plot and the filter.
frequent = counts[counts > 1000]

# Plot the distribution of those class sizes.
# sns.distplot is deprecated and scheduled for removal from seaborn;
# kdeplot draws the KDE-only curve that distplot(hist=False) used to draw.
plt.figure()
sns.kdeplot(frequent, color='purple')
plt.title('Count Distribution for Family Types')
plt.ylabel('% of records')
plt.show()

# Get classification types where counts are over 1000
types = np.asarray(frequent.index)
print(len(types))

# Keep only records of those classes and drop repeated
# (classification, sequence) pairs. One drop_duplicates call is enough: a
# second call with the same subset cannot remove anything further.
# .copy() makes `data` an independent frame, so adding columns to it later
# does not trigger pandas' chained-assignment warning.
data = (
    model_f[model_f.classification.isin(types)]
    .drop_duplicates(subset=["classification", "sequence"])
    .copy()
)
print(types)
print('%d is the number of records in the final filtered dataset' % data.shape[0])
print(data.shape)
## Could add n-grams
## https://stackoverflow.com/questions/18658106/quick-implementation-of-character-n-grams-using-python
# jump_size !=1 -> less overlap in n-grams.
def char_grams(text, n=3, jump_size=2):
    """Split *text* into character n-grams.

    Windows of length *n* start at index 0 and advance by *jump_size*
    characters, so a jump_size smaller than n yields overlapping n-grams.
    Only full-length windows are returned.

    Args:
        text: The sequence to slice (any sliceable object with a length).
        n: Length of each n-gram.
        jump_size: Step between the start positions of consecutive n-grams.

    Returns:
        A list of the n-gram slices, in order of appearance; empty when
        *text* is shorter than *n*.
    """
    return [text[i:i + n] for i in range(0, len(text) - n + 1, jump_size)]
# Preview the n-grams of the first three sequences. A bare expression is only
# displayed inside a notebook, so print it to make a script run show it too.
print(data.head(3).sequence.apply(char_grams))

# Work on an explicit copy before adding the column: `data` was produced by
# filtering another frame, and assigning into such a result can raise
# pandas' SettingWithCopyWarning.
data = data.copy()
data["3mers"] = data.sequence.apply(char_grams)
print(data.tail())

# Persist the filtered table, including the n-gram column, as gzipped CSV.
data.to_csv("protein_classification_46k_ngrams.csv.gz", compression="gzip")
# 3). ----- Train Test Split -----
# Hold out 20% of the records for testing; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['sequence'], data['classification'], test_size=0.2, random_state=1
)

# Count character 4-grams in each sequence ('char_wb' builds the n-grams
# from characters inside word boundaries).
vect = CountVectorizer(analyzer='char_wb', ngram_range=(4, 4))

# Learn the vocabulary from the training sequences only, then encode both
# splits with it. fit_transform is equivalent to fit followed by transform
# on the same data.
X_train_df = vect.fit_transform(X_train)
X_test_df = vect.transform(X_test)

# Print a few of the features
print(vect.get_feature_names_out()[-20:])

# NOTE: an unconditional sys.exit() used to sit here. It stopped the script
# before any model was trained, which made the whole modelling section below
# unreachable, so it has been removed.
# 4). ------ Machine Learning Models ------
# Test-set accuracy of each model, keyed by model name.
prediction = {}

# Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_df, y_train)
NB_pred = model.predict(X_test_df)
# accuracy_score is documented as (y_true, y_pred); pass the arguments in
# that order (the value is the same either way for plain accuracy).
prediction["MultinomialNB"] = accuracy_score(y_test, NB_pred)
print(prediction['MultinomialNB'])

# Adaboost
from sklearn.ensemble import AdaBoostClassifier
# Seeded like the train/test split so repeated runs give the same model.
model = AdaBoostClassifier(random_state=1)
model.fit(X_train_df, y_train)
ADA_pred = model.predict(X_test_df)
prediction["Adaboost"] = accuracy_score(y_test, ADA_pred)
print(prediction["Adaboost"])
# 5). ----- Plot Confusion Matrix for NB -----
# Rows are true classes and columns are predicted classes, both ordered as
# in `types`. normalize='true' divides each row by its total, so every row
# shows the fraction of that true class assigned to each prediction; a class
# with no test samples becomes a row of zeros instead of NaN.
conf_mat = confusion_matrix(y_test, NB_pred, labels=types, normalize='true')

# Plot Heat Map
fig, ax = plt.subplots()
fig.set_size_inches(13, 8)
sns.heatmap(conf_mat, ax=ax)
# Without show() the figure is never displayed when run as a script.
plt.show()
print(types[3])

# Print F1 score metrics.
# labels=types pins the row order of the report to `types`. Without it the
# rows follow the sorted label order while target_names follows `types`
# (ordered by class frequency), so names would be attached to the wrong rows.
print(classification_report(y_test, NB_pred, labels=types, target_names=types))
However, my dataset is different: it is a CSV file of sequences, and it has only one column in common with the train/test datasets.
I am seeking guidance on how to use this code/model to predict the classification of each of my sequences and store it in the column next to the sequence. Please provide assistance.