I'm comparing the predictive power of two Support Vector Machine models in R. My response variable is categorical with 6 classes, and I have 24 predictor variables. One model is trained on my original data, in which the classes are imbalanced; the other is trained on over-sampled data in which every class has the same number of observations. I'm doing this for SVM, Random Forest and Naive Bayes classifier models, comparing over- and under-sampling methods. All of them gave different results, except for these two SVM models, whose results are exactly the same. Does anyone have any idea what might be going on? This is my script. The confusionMatrix outputs for both models are exactly the same.
library('e1071')
library(caret)
# Imbalanced: fit a radial-kernel C-classification SVM on the original
# training set, whose class counts differ.
#
# The svm() arguments are named `type` and `kernel`. The earlier spellings
# `method` and `kernal` match no formal argument of svm(), so they were
# silently absorbed by `...`; the fit only used these settings because they
# coincide with the defaults for a factor response.
svm1 <- svm(
  Comportamento ~ xg + yg + zg + mov.var + energy + entropy +
    roll + inclination + SMA + SVM + pitch + year + month +
    day + hour + minute + second + sensorid + air.temp + relat.u +
    wind.sp + wind.dir + solar.rad + max.raj,
  data = trainset_eli, type = "C-classification", kernel = "radial",
  gamma = 0.1, cost = 10
)

# Predict the validation set. predict() on an svm object already returns the
# predicted class labels for a classification model; it has no `type`
# argument, so the former `type = "class"` was ignored and is dropped here.
predsmvval <- predict(svm1, valiset_eli)

# Cross-tabulate predictions against the observed classes
# (confusionMatrix() is provided by the caret package).
confusionMatrix(predsmvval, valiset_eli$Comportamento)
# Oversample: fit the same radial-kernel C-classification SVM on the
# over-sampled training set, in which every class has the same count.
#
# The svm() arguments are named `type` and `kernel`. The earlier spellings
# `method` and `kernal` match no formal argument of svm(), so they were
# silently absorbed by `...`; the fit only used these settings because they
# coincide with the defaults for a factor response.
svm2 <- svm(
  Comportamento ~ xg + yg + zg + mov.var + energy + entropy +
    roll + inclination + SMA + SVM + pitch + year + month +
    day + hour + minute + second + sensorid + air.temp + relat.u +
    wind.sp + wind.dir + solar.rad + max.raj,
  data = uptrain_eli, type = "C-classification", kernel = "radial",
  gamma = 0.1, cost = 10
)

# Predict the same validation set as the imbalanced model so the two
# confusion matrices are directly comparable. predict() on an svm object has
# no `type` argument, so the former `type = "class"` was ignored and is
# dropped here.
predsmvval2 <- predict(svm2, valiset_eli)

# Cross-tabulate predictions against the observed classes
# (confusionMatrix() is provided by the caret package).
confusionMatrix(predsmvval2, valiset_eli$Comportamento)
The training datasets are different:
> table(trainset_eli$Comportamento)
1 2 4 5 6 7
14155 6872 41733 1040 5003 1997
> table(uptrain_eli$Comportamento)
1 2 4 5 6 7
41733 41733 41733 41733 41733 41733
Reproducible example trainset_eli (first 10 lines)
structure(list(air.temp = c(18.42, 32.63, 34.54, 26.42, 32.63,
34.44, 18.42, 35.45, 20.58, 18.17), relat.u = c(70, 30.45, 22.19,
50.69, 30.83, 25.67, 70, 21.44, 63.5, 69.97), wind.sp = c(1.136,
2.809, 1.512, 3.326, 2.171, 2.04, 1.136, 1.52, 0.756, 0.696),
wind.dir = c(79.1, 341.6, 350.1, 56.22, 294.9, 16.57, 79.1,
274.4, 84.4, 82.5), solar.rad = c(39.62, 741, 433.9, 621.1,
274.6, 847, 39.62, 266.4, 169.4, 24.11), max.raj = c(1.647,
5.247, 2.847, 6.047, 4.447, 4.447, 1.647, 2.847, 1.247, 2.047
), sensorid = c(67L, 65L, 66L, 70L, 70L, 70L, 69L, 68L, 69L,
65L), Comportamento = structure(c(6L, 3L, 3L, 5L, 2L, 2L,
1L, 1L, 2L, 1L), .Label = c("1", "2", "4", "5", "6", "7"), class = "factor"),
xg = c(-0.875, -0.765625, 0.234375, 0.546875, -0.0625, 0.421875,
-0.625, 0.515625, -0.453125, -0.734375), yg = c(-0.171875,
0, -0.0625, 0.375, 0.15625, 0.53125, -0.671875, 0.0625, -0.0625,
0.078125), zg = c(-0.421875, -0.578125, -0.875, -0.3125,
-0.25, -0.6875, -0.796875, -0.765625, -0.828125, -0.640625
), SMA = c(1.46875, 1.34375, 1.171875, 1.234375, 0.46875,
1.640625, 2.09375, 1.34375, 1.34375, 1.453125), SVM = c(0.986480882354037,
0.959380089563047, 0.907999389110477, 0.733044006608744,
0.30136408628103, 0.965847466282849, 1.21533978016438, 0.925179458942966,
0.946054718951288, 0.977655638185041), mov.var = c(0.0625,
0.109375, 0.046875, 1.015625, 1, 0.078125, 0.625, 0.8125,
0.203125, 0.015625), energy = c(0.947010278701782, 0.847154855728149,
0.679739058017731, 0.288748800754547, 0.00824832916259766,
0.870230257511139, 2.18167901039124, 0.732662439346313, 0.80105996131897,
0.913573801517487), entropy = c(0.252618304422212, 0.121902803377891,
0.0354050216019417, 0.817915633557388, 0.0171719387098626,
0.109209155417093, 2.0138239315557, 0.0228099706040058, 0.121902803377891,
0.0869466929526053), pitch = c(62.4975813343597, 52.9434718105904,
-14.9586823290351, -48.247900416119, 11.9694631246073, -25.8994130495892,
30.9479702502551, -33.8709569083463, 28.6176815553425, 48.6908889802574
), roll = c(-157.833654177918, 180, -175.914383220025, 129.805571092265,
147.994616791916, 142.305759533311, -139.864514437761, 175.333141628561,
-175.683972480134, 173.047042531826), inclination = c(-64.6810700998259,
-52.9434718105904, -15.4942996397858, -64.7667344528855,
-33.9462950277539, -44.6176169165428, -49.028705466841, -34.1529208122834,
-28.91407571407, -49.0601386023418), year = c(2019, 2019,
2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019), month = c(10,
9, 10, 10, 9, 10, 10, 10, 10, 10), day = c(1L, 24L, 1L, 1L,
24L, 1L, 1L, 1L, 1L, 1L), dayofweek = c(3, 3, 3, 3, 3, 3,
3, 3, 3, 3), hour = c(6L, 14L, 14L, 9L, 16L, 13L, 6L, 16L,
7L, 6L), minute = c(43L, 10L, 48L, 21L, 38L, 35L, 43L, 48L,
20L, 36L), second = c(48, 45, 50, 41, 45, 16, 36, 13, 43,
57)), row.names = c(NA, -10L), class = c("data.table",
"data.frame"))
Reproducible example uptrain_eli (first 10 lines)
structure(list(air.temp = c(18.42, 35.45, 18.17, 32.03, 33.83,
26.04, 35.12, 18.78, 17.81, 34.71), relat.u = c(70, 21.44, 69.97,
31.8, 29.69, 45.58, 22.07, 69.41, 68.5, 24.7), wind.sp = c(1.136,
1.52, 0.696, 1.177, 3.014, 3.902, 2.604, 0.68, 2.312, 3.222),
wind.dir = c(79.1, 274.4, 82.5, 279.8, 353.5, 36.06, 9.76,
72.9, 76.5, 10.84), solar.rad = c(39.62, 266.4, 24.11, 129.8,
877, 775, 706, 64.33, 235.5, 810), max.raj = c(1.647, 2.847,
2.047, 2.047, 5.647, 6.847, 5.647, 1.647, 4.047, 5.247),
sensorid = c(69L, 68L, 65L, 65L, 71L, 68L, 70L, 69L, 63L,
71L), xg = c(-0.625, 0.515625, -0.734375, -0.390625, -0.359375,
-0.4375, 0.03125, -0.546875, -0.8125, -0.34375), yg = c(-0.671875,
0.0625, 0.078125, -0.984375, -0.5, -0.53125, 1.265625, -0.15625,
-0.578125, -0.484375), zg = c(-0.796875, -0.765625, -0.640625,
-0.5625, -0.859375, -0.421875, -0.1875, -0.8125, -0.28125,
-0.671875), SMA = c(2.09375, 1.34375, 1.453125, 1.9375, 1.71875,
1.390625, 1.484375, 1.515625, 1.671875, 1.5), SVM = c(1.21533978016438,
0.925179458942966, 0.977655638185041, 1.19916149089687, 1.05720186400233,
0.807224459568093, 1.27982008623283, 0.991787567034897, 1.03609185313128,
0.896771553267609), mov.var = c(0.625, 0.8125, 0.015625,
0.640625, 0.328125, 1.1875, 1.28125, 0.203125, 0.125, 0.65625
), energy = c(2.18167901039124, 0.732662439346313, 0.913573801517487,
2.06781029701233, 1.2491991519928, 0.424597322940826, 2.68284565210342,
0.967552721500397, 1.1523728966713, 0.64673638343811), entropy = c(2.0138239315557,
0.0228099706040058, 0.0869466929526053, 1.3701855309152,
0.710199759881741, 0.164232090178216, 3.56989847819576, 0.317194938809455,
0.602066250006157, 0.294663749085412), pitch = c(30.9479702502551,
-33.8709569083463, 48.6908889802574, 19.0109259177363, 19.8726306940985,
32.8185691761192, -1.39915844134687, 33.4633429104771, 51.6464605002851,
22.5394507013279), roll = c(-139.864514437761, 175.333141628561,
173.047042531826, -119.744881296942, -149.808377039042, -128.453709216706,
98.4269690214807, -169.114472945341, -115.942295489872, -144.211026540817
), inclination = c(-49.028705466841, -34.1529208122834, -49.0601386023418,
-62.0255506627307, -35.6220253049926, -58.4915602831318,
-81.5755617584986, -34.9924265073308, -74.2492667720546,
-41.4775465392269), year = c(2019, 2019, 2019, 2019, 2019,
2019, 2019, 2019, 2019, 2019), month = c(10, 10, 10, 9, 10,
9, 10, 10, 9, 10), day = c(1L, 1L, 1L, 24L, 1L, 24L, 1L,
1L, 24L, 1L), dayofweek = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3),
hour = c(6L, 16L, 6L, 17L, 13L, 10L, 14L, 6L, 7L, 13L), minute = c(43L,
48L, 36L, 23L, 9L, 32L, 30L, 55L, 45L, 53L), second = c(36,
13, 57, 19, 36, 44, 49, 58, 16, 7), Comportamento = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "2",
"4", "5", "6", "7"), class = "factor")), row.names = c(NA,
10L), class = "data.frame")