I am running Cloudera Spark 1.5.0 with Python 2.6.6
I have defined 3 functions like this
def tf(tokens):
""" Compute Term/Token Frequency
Args:
tokens (list of str): input list of tokens from tokenize
Returns:
dictionary: a dictionary of tokens to its TF values
"""
tokenCounts = {}
for t in tokens:
tokenCounts[t] = tokenCounts.get(t, 0) + (1.0/len(tokens))
return tokenCounts
def idfs(corpus):
""" Compute IDF
Args:
corpus (RDD): input corpus
Returns:
RDD: a RDD of (token, IDF value)
"""
uniqueTokens = corpus.flatMap(lambda x: list(set(x[1])))
tokenCountPairTuple = uniqueTokens.map(lambda x: (x, 1))
tokenSumPairTuple = tokenCountPairTuple.reduceByKey(lambda a,b: a+b)
N = corpus.count()
return (tokenSumPairTuple.map(lambda x: (x[0], float(N)/float(x[1]))))
def tfidf(tokens, idfs):
""" Compute TF-IDF
Args:
tokens (list of str): input list of tokens from tokenize
idfs (dictionary): record to IDF value
Returns:
dictionary: a dictionary of records to TF-IDF values
"""
tfs = tf(tokens)
tfIdfDict = {k: v*idfs[k] for k, v in tfs.items()}
return tfIdfDict
From other function, I make the call like this
w1 = tfidf(tokenize(string1),idfsDictionary)
I am getting an error like this
tfIdfDict = {k: v*idfs[k] for k, v in tfs.items()}
^
SyntaxError: invalid syntax
what is wrong with my syntax here. Worked well in databricks environment