I am following Dataflair for a fake news project and using Jupyter notebook. I am following along the code that is provided and have been able to fix some errors but I am having an issue with the tfidf_train, it gives me an error as well as np.nan is an invalid document, expected byte or unicode string.
import numpy as np
import pandas as pd
import itertools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer`
I imported the tools as above
Then i imported the database provided by Dataflair:
df=pd.read_csv('C:/Users/cmisl/OneDrive/Desktop/fake_news.csv')
df.shape
df.head()
the table does show NaN values. Is there a way to remove them?
Next I used:
x_train, x_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=7)
tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
count_vectorizer = CountVectorizer(stop_words='english')
tfidf_train= vectorize.fit_transform(x_train)
tfidf_test = vectorize.transform(x_test)
And these are the errors I encounter:
ValueError Traceback (most recent call last)
Cell In[28], line 3
1 tfidf_vectorizer=TfidfVectorizer(stop_words='english', max_df=0.7)
2 count_vectorizer = CountVectorizer(stop_words='english')
----> 3 tfidf_train= vectorize.fit_transform(x_train)
4 tfidf_test = vectorize.transform(x_test)
File ~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:2121, in TfidfVectorizer.fit_transform(self, raw_documents, y)
2114 self._check_params()
2115 self._tfidf = TfidfTransformer(
2116 norm=self.norm,
2117 use_idf=self.use_idf,
2118 smooth_idf=self.smooth_idf,
2119 sublinear_tf=self.sublinear_tf,
2120 )
-> 2121 X = super().fit_transform(raw_documents)
2122 self._tfidf.fit(X)
2123 # X is already a transformed view of raw_documents so
2124 # we set copy to False
File ~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:1377, in CountVectorizer.fit_transform(self, raw_documents, y)
1369 warnings.warn(
1370 "Upper case characters found in"
1371 " vocabulary while 'lowercase'"
1372 " is True. These entries will not"
1373 " be matched with any documents"
1374 )
1375 break
-> 1377 vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)
1379 if self.binary:
1380 X.data.fill(1)
File ~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:1264, in CountVectorizer._count_vocab(self, raw_documents, fixed_vocab)
1262 for doc in raw_documents:
1263 feature_counter = {}
-> 1264 for feature in analyze(doc):
1265 try:
1266 feature_idx = vocabulary[feature]
File ~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:106, in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
84 """Chain together an optional series of text processing steps to go from
85 a single document to ngrams, with or without tokenizing or preprocessing.
86
(...)
102 A sequence of tokens, possibly with pairs, triples, etc.
103 """
105 if decoder is not None:
--> 106 doc = decoder(doc)
107 if analyzer is not None:
108 doc = analyzer(doc)
File ~\anaconda3\lib\site-packages\sklearn\feature_extraction\text.py:239, in _VectorizerMixin.decode(self, doc)
236 doc = doc.decode(self.encoding, self.decode_error)
238 if doc is np.nan:
--> 239 raise ValueError(
240 "np.nan is an invalid document, expected byte or unicode string."
241 )
243 return doc
ValueError: np.nan is an invalid document, expected byte or unicode string.
Not sure if I should fill in the Nan values? Thanks for your help!