I am new using R and I don't quite understand what is happening here so would greatly appreciate any help to understand why my kmeans cluster has NA values on my test set.
This is process I am following:
#1 - Load data and libraries
library(caTools)
library(caret)
library(flexclust)
train = read.csv("test.csv", stringsAsFactors=FALSE)
#2 - Convert state vars to Factors
train$condition=as.factor(train$condition)
train$carrier=as.factor(train$carrier)
train$color=as.factor(train$color)
train$storage=as.factor(train$storage)
train$productline=as.factor(train$productline)
train$biddable=as.factor(train$biddable)
train$cellular=as.factor(train$cellular)
#3 - Split Train set to Train and Validation sets
set.seed(123)
split = sample.split(train$sold, SplitRatio = 0.7)
train = subset(train, split==TRUE)
test = subset(train, split==FALSE)
#4 - Remove redundant vars (UniqueID and description) to create clusters
trainLimit = train
testLimit = test
trainLimit$sold=NULL
testLimit$sold=NULL
#5 - Create clusters
#5.1 - convert values to numeric
trainLimit$biddable=as.numeric(trainLimit$biddable)
trainLimit$startprice=as.numeric(trainLimit$startprice)
trainLimit$condition=as.numeric(trainLimit$condition)
trainLimit$cellular=as.numeric(trainLimit$cellular)
trainLimit$carrier=as.numeric(trainLimit$carrier)
trainLimit$color=as.numeric(trainLimit$color)
trainLimit$storage=as.numeric(trainLimit$storage)
trainLimit$productline=as.numeric(trainLimit$productline)
testLimit$biddable=as.numeric(testLimit$biddable)
testLimit$startprice=as.numeric(testLimit$startprice)
testLimit$condition=as.numeric(testLimit$condition)
testLimit$cellular=as.numeric(testLimit$cellular)
testLimit$carrier=as.numeric(testLimit$carrier)
testLimit$color=as.numeric(testLimit$color)
testLimit$storage=as.numeric(testLimit$storage)
testLimit$productline=as.numeric(testLimit$productline)
#5.2 - Normalize variable values (to reduce the dominance from start price)
preprocess=preProcess(trainLimit)
trainNorm=predict(preprocess,trainLimit)
testNorm=predict(preprocess,testLimit)
#5.3 - Create clusters
k=6
set.seed(123)
km = kmeans(trainNorm,centers=k)
km.kcca = as.kcca(km, trainNorm)
clusterTrain = predict(km.kcca)
clusterValidation = predict(km.kcca, newdata=testNorm)
data for testing:
biddable startprice condition cellular carrier color storage productline sold
1 0.99 Used 1 Unknown White 64 iPad 1 1
0 99.99 Seller refurbished 0 None White 16 iPad 1 0
1 20 Used 1 AT&T Black 16 iPad 1 1
0 75 Used 0 None Black 16 iPad 1 1
0 95 Used 0 None Black 32 iPad 1 0
0 84.99 Seller refurbished 1 AT&T Unknown 64 iPad 1 1
0 90 Used 0 None Unknown 16 iPad 1 1
0 79 Used 0 None Black 32 iPad 1 0
0 115 Used 0 None Unknown 32 iPad 1 0
0 289.95 New 1 Unknown Black 64 iPad 1 0
0 69.99 Used 0 None Black 16 iPad 1 0
0 269.99 New 0 None Black 16 iPad 1 0
0 79.99 Seller refurbished 1 AT&T Black 16 iPad 1 1
1 75 Used 0 None Unknown 32 iPad 1 1
1 50 Used 0 None Black 16 iPad 1 1
0 169.95 New 0 None Black 16 iPad 1 0
1 62 Used 0 None Black 64 iPad 1 1
0 279.95 New 0 None Unknown 32 iPad 1 0
0 149.95 New other (see details) 1 Unknown Black 64 iPad 1 0
0 99 Seller refurbished 1 AT&T Black 64 iPad 1 0
1 100 Used 0 None Unknown 16 iPad 1 0
1 49.49 Used 0 None Unknown 32 iPad 1 1
1 25 Used 1 AT&T Unknown 16 iPad 1 1
1 40 Used 0 None Unknown 16 iPad 1 1
1 29.99 Used 1 AT&T Black 64 iPad 1 1
1 49.99 Used 1 AT&T Black 16 iPad 1 1
1 50 Used 0 None Black 16 iPad 1 1
1 80 Used 1 AT&T Black 64 iPad 1 1
Any idea why this is happenning?