I have a Python script that classifies a piece of news as either fake or real. TfidfVectorizer is used to vectorize the text and a PassiveAggressiveClassifier is used to model the fake news detector. What code should I use to display the 30 most common words in the fake news and in the real news, and how do I draw a bar plot showing the frequency of these words?
%matplotlib inline
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import itertools
import json
import csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
df = pd.read_csv(r".\fake_news(1).csv", sep=',', header=0, engine='python', escapechar='\\')
#print(df)
#df.shape
df.head()
#df.head().to_dict()
headline1 = df.headline
headline1.head()
# is_sarcastic_1 is assumed to be the label column of the CSV (e.g. 0 = real, 1 = fake)
labels = df['is_sarcastic_1']
trainx, testx, trainy, testy = train_test_split(df['headline'], labels, test_size=0.2, random_state=7)
tvector = TfidfVectorizer(strip_accents='ascii', stop_words='english', max_df=0.5)
ttrain = tvector.fit_transform(trainx)
ttest = tvector.transform(testx)
pac = PassiveAggressiveClassifier(max_iter=100)
pac.fit(ttrain, trainy)
y_pred = pac.predict(ttest)
score = accuracy_score(testy, y_pred)  # the test labels are named testy above, not y_test
print(f'Accuracy: {round(score*100,2)}%')
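A minimal sketch of one way to get the 30 most common words per class and plot them. It uses a separate CountVectorizer (raw term counts rather than tf-idf weights) on the headlines, and assumes `labels` is the same 0/1 Series used in `train_test_split` above, with 1 marking fake news; adjust the column name and label meaning to match the actual data.

    from sklearn.feature_extraction.text import CountVectorizer

    def top_n_words(texts, n=30):
        # Raw term counts (not tf-idf weights), with the same basic cleaning as above.
        cv = CountVectorizer(strip_accents='ascii', stop_words='english')
        counts = cv.fit_transform(texts)
        freqs = counts.sum(axis=0).A1            # total occurrences of each term
        terms = cv.get_feature_names_out()
        return pd.Series(freqs, index=terms).nlargest(n)

    # Assumes labels is the 0/1 Series used in train_test_split (1 = fake, 0 = real).
    top_fake = top_n_words(df.loc[labels == 1, 'headline'])
    top_real = top_n_words(df.loc[labels == 0, 'headline'])

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    top_fake.plot.bar(ax=axes[0], color='tomato', title='30 most common words - fake news')
    top_real.plot.bar(ax=axes[1], color='steelblue', title='30 most common words - real news')
    for ax in axes:
        ax.set_xlabel('word')
        ax.set_ylabel('frequency')
    plt.tight_layout()
    plt.show()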
corpus = ['dem rep. totally nails why congress is falling short on gender, racial equality',
'eat your veggies: 9 deliciously different recipes',
'inclement weather prevents liar from getting to work',
"mother comes pretty close to using word 'streaming' correctly"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names_out())  # get_feature_names() was removed in newer scikit-learn versions
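To connect the feature names to actual scores, the tf-idf weight of each term can be summed over all documents and the terms ranked; a small sketch on the toy corpus above (the same idea works on `ttrain`, though summed tf-idf weights are not raw word counts):

    # Sum each term's tf-idf weight over all documents and rank the terms.
    term_scores = pd.Series(X.sum(axis=0).A1, index=vectorizer.get_feature_names_out())
    top_terms = term_scores.nlargest(10)   # use nlargest(30) on the full data
    print(top_terms)

    top_terms.plot.bar(title='Top terms by summed tf-idf weight')
    plt.xlabel('term')
    plt.ylabel('summed tf-idf weight')
    plt.show()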