I am practicing using NLTK to remove certain features from raw tweets, and I subsequently hope to drop tweets that are (to me) irrelevant (e.g. empty or single-word tweets). However, it seems that some of the single-word tweets are not removed. I am also facing an issue where stopwords at the beginning or end of a tweet are never removed.
Any advice? At the moment, I want to pass back a sentence as the output rather than a list of tokenized words.
Any other comments on improving the code (processing time, elegance) are welcome.
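For context, the per-tweet output I am after would look roughly like this token-level pass (a minimal sketch with made-up names, separate from the code under review below): tokenize, drop stopwords wherever they occur, then join the survivors back into one string.

    import nltk
    from nltk.corpus import stopwords

    english_stopwords = set(stopwords.words('english'))  # set membership is O(1)

    def drop_stopwords(tweet):
        # Tokenize, filter stopwords regardless of their position, rebuild a sentence.
        tokens = nltk.word_tokenize(tweet)
        kept = [w for w in tokens if w.lower() not in english_stopwords]
        return ' '.join(kept)

Because this filters the token list instead of regex-substituting over the raw string, a stopword at the beginning or end of the tweet is treated like any other token. The full function I currently have is below.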
import string
import numpy as np
import nltk
from nltk.corpus import stopwords
cache_english_stopwords = stopwords.words('english')
# 'english_tweet' is a custom stopword file I placed in the NLTK stopwords corpus directory
cache_en_tweet_stopwords = stopwords.words('english_tweet')
# For clarity: df is a pandas DataFrame with a 'text' column among other columns.
def tweet_clean(df):
    temp_df = df.copy()
    # Remove hyperlinks
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('https?:\/\/.*\/\w*', '', regex=True)
    # Remove hashtags (the commented-out version drops the whole tag; the active one keeps the word)
    # temp_df.loc[:,"text"]=temp_df.loc[:,"text"].replace('#\w*', '', regex=True)
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('#', ' ', regex=True)
    # Remove citations
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\@\w*', '', regex=True)
    # Remove tickers
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\$\w*', '', regex=True)
    # Remove punctuation
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[' + string.punctuation + ']+', '', regex=True)
    # Remove stopwords
    for tweet in temp_df.loc[:, "text"]:
        tweet_tokenized = nltk.word_tokenize(tweet)
        for w in tweet_tokenized:
            if (w.lower() in cache_english_stopwords) | (w.lower() in cache_en_tweet_stopwords):
                temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[\W*\s?\n?]' + w + '[\W*\s?]', ' ', regex=True)
                # print("w in stopword")
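                # NOTE: the pattern above requires a character on both sides of w,
                # which I suspect is why stopwords at the very start or end of a
                # tweet are never matched.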
    # Remove HTML entity residue such as &amp; and &gt;
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\&*[amp]*\;|gt+', '', regex=True)
    # Remove RT (retweet marker)
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\s+rt\s+', '', regex=True)
    # Remove linebreaks, tabs, carriage returns
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('[\n\t\r]+', ' ', regex=True)
    # Remove 'via'
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('via+\s', '', regex=True)
    # Collapse runs of whitespace into a single space
    temp_df.loc[:, "text"] = temp_df.loc[:, "text"].replace('\s+\s+', ' ', regex=True)
    # Remove single-word tweets
    for tweet_sw in temp_df.loc[:, "text"]:
        tweet_sw_tokenized = nltk.word_tokenize(tweet_sw)
        if len(tweet_sw_tokenized) <= 1:
            # meant to blank out the offending tweet
            temp_df.loc["text"] = np.nan
    # Remove empty rows
    temp_df.loc[(temp_df["text"] == '') | (temp_df['text'] == ' ')] = np.nan
    temp_df = temp_df.dropna()
    return temp_df
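For reference, this is how I exercise tweet_clean on a toy frame (the sample tweets below are invented, and the custom english_tweet stopword file is assumed to be in place):

    import pandas as pd

    sample = pd.DataFrame({'text': ['RT @someone: see https://t.co/abc123 #nltk is great',
                                    'hello',
                                    '']})
    print(tweet_clean(sample))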