OK, this is my first ever attempt at answering a Stack Overflow question...
Your question is a bit vague, so I'll answer it as best I understand it. It sounds like you're asking how to prepare text prior to building SVM models: specifically, how to lemmatize the input text, compute word frequencies, and create n-grams from a given string.
import nltk
from collections import Counter
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
# lowercase, remove punctuation, and lemmatize string
def word_generator(str):
    """Yield the lowercased, lemmatized alphabetic tokens of ``str``.

    The text is split with ``nltk.word_tokenize``. Tokens that are not purely
    alphabetic (punctuation, numbers, ...) are dropped; every remaining token
    is lowercased and passed through ``WordNetLemmatizer.lemmatize``.

    Note: the parameter name shadows the ``str`` builtin; it is kept as-is so
    existing keyword callers keep working.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(str):
        # Skip anything that is not made up of letters only.
        if not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token.lower()))
    yield from lemmas
# create list of freqs
def freq_count(str):
    """Return ``(word, count)`` pairs for ``str``, most frequent first.

    Words are the normalized tokens produced by ``word_generator``. Words
    with equal counts stay in the order in which they were first seen.
    """
    # Counter tallies the stream directly; most_common() with no argument
    # returns every entry sorted by descending count.
    return Counter(word_generator(str)).most_common()
# create n-grams
def make_ngrams(str, n):
    """Return the list of ``n``-grams built from the normalized words of ``str``.

    The words come from ``word_generator``; each n-gram is a tuple of ``n``
    consecutive words, as produced by ``nltk.ngrams``.
    """
    tokens = list(word_generator(str))
    n_gram_iter = ngrams(tokens, n)
    return list(n_gram_iter)
Example output (word frequencies, then 4-grams):
>>> my_str = 'This is this string, not A great Strings not the greatest string'
>>> print(freq_count(my_str))
[('string', 3), ('this', 2), ('not', 2), ('is', 1), ('a', 1), ('great', 1), ('the', 1), ('greatest', 1)]
>>> print(make_ngrams(my_str, 4))
[('this', 'is', 'this', 'string'), ('is', 'this', 'string', 'not'), ('this', 'string', 'not', 'a'), ('string', 'not', 'a', 'great'), ('not', 'a', 'great', 'string'), ('a', 'great', 'string', 'not'), ('great', 'string', 'not', 'the'), ('string', 'not', 'the', 'greatest'), ('not', 'the', 'greatest', 'string')]
Then you can do whatever you want with this, such as creating vectors.