I am using a 9 GB CSV file with 13 rows and 8 crore (80 million) columns. I want to apply the Naive Bayes / multinomial logistic formula, with class = 5 as the reference class.
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
def parseLine(line):
    """Convert one text record into a LabeledPoint.

    The record is split on ','. The first field is the label and the
    second field is a space-separated list of feature values, i.e. the
    layout is ``label,f1 f2 f3 ...``.

    Args:
        line: one line of the input file, as a string.

    Returns:
        A LabeledPoint holding the label and a dense feature vector.

    Raises:
        IndexError: if the line contains no ',' separator.
        ValueError: if the label or a feature value is not numeric.
    """
    parts = line.split(',')
    # Python has no ``integer`` builtin; ``float`` is used because MLlib
    # stores both labels and dense-vector entries as doubles.
    label = float(parts[0])
    features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)
# NOTE(review): `sc` is not defined in this snippet; presumably it is the
# SparkContext that the pyspark shell creates -- confirm before running as a
# standalone script.
training = sc.textFile('c:/ram/testpca2004.csv').map(parseLine)
test = sc.textFile('c:/ram/testtestpca2004.csv').map(parseLine)
# Train a naive Bayes model.
# NOTE: the second argument of NaiveBayes.train is the additive smoothing
# parameter (lambda_), not a class label or a number of classes.
model = NaiveBayes.train(training, 5)
# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p: (model.predict(p.features), p.label))
# Index into the (prediction, label) pair instead of unpacking it in the
# lambda signature: tuple parameters were removed in Python 3 (PEP 3113),
# while indexing works on both Python 2 and Python 3.
matches = predictionAndLabel.filter(lambda pl: pl[0] == pl[1])
accuracy = 1.0 * matches.count() / test.count()
# Save and load model
model.save(sc, "target/tmp/myNaiveBayesModel")
sameModel = NaiveBayesModel.load(sc, "target/tmp/myNaiveBayesModel")
Error: No module named pyspark.mllib.classification; likewise, no module named pyspark.mllib.linalg or pyspark.mllib.regression.