I'm trying to tune the parameters of an ALS but always choose the first parameter as best option
from pyspark.sql import SQLContext
from pyspark import SparkConf, SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from math import sqrt
from operator import add
conf = (SparkConf()
.setMaster("local[4]")
.setAppName("Myapp")
.set("spark.executor.memory", "2g"))
sc = SparkContext(conf = conf)
sqlContext = SQLContext(sc)
def computeRmse(data):
return (sqrt(data.map(lambda x: (x[2] - x[3]) ** 2).reduce(add) / float(data.count())))
dfRatings = sqlContext.createDataFrame([(0, 0, 4.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 1, 1.0), (2, 2, 5.0)],
["user", "item", "rating"])
lr1 = ALS()
grid1 = ParamGridBuilder().addGrid(lr1.regParam, [1.0,0.005,2.0]).build()
evaluator1 = RegressionEvaluator(predictionCol=lr1.getPredictionCol(),labelCol=lr1.getRatingCol(), metricName='rmse')
cv1 = CrossValidator(estimator=lr1, estimatorParamMaps=grid1, evaluator=evaluator1, numFolds=2)
cvModel1 = cv1.fit(dfRatings)
a=cvModel1.transform(dfRatings)
print ('rmse with cross validation: {}'.format(computeRmse(a)))
for reg_param in (1.0,0.005,2.0):
lr = ALS(regParam=reg_param)
model = lr.fit(dfRatings)
print ('reg_param: {}, rmse: {}'.format(reg_param,computeRmse(model.transform(dfRatings))))
Output:
rmse with cross validation: 1.1820489116858794
reg_param: 1.0, rmse: 1.1820489116858794
reg_param: 0.005, rmse: 0.001573816765686575
reg_param: 2.0, rmse: 2.1056964491942787
Any help?
Thanks in advance,