from sklearn.datasets import fetch_20newsgroups
from collections import Counter
news = fetch_20newsgroups(subset='all')
def clean_word(word):
return re.sub(r'[^\w\s]','',word).lower()
def word_not_in_stopwords(word):
return word not in ENGLISH_STOP_WORDS and word and word.isalpha()
def find_top_words(news):
cnt = Counter()
for text in news:
tokens_in_text = text.split()
tokens_in_text = map(clean_word, tokens_in_text)
tokens_in_text = filter(word_not_in_stopwords, tokens_in_text)
cnt.update(tokens_in_text)
return cnt.most_common(10)
%time find_top_words(news)
I get an error here:
%time find_top_words(news)
^
SyntaxError: invalid syntax