0

I'm comparing the predictive power of two Support Vector Machine models in R. I have one categorical response variable with 6 classes and 24 predictor variables. In one model I train on the original data, in which the classes are imbalanced; in the other I train on an over-sampled version of the data in which every class has the same number of observations. I'm doing this for SVM, Random Forest and Naive Bayes classifiers, comparing over- and under-sampling methods. All of them gave different results, except for these two SVM models, whose results are exactly the same. Does anyone have any idea what might be going on? This is my script. The confusion matrices for both models are exactly the same.

library('e1071')

# Imbalanced training set: fit a radial-kernel C-classification SVM.
# The e1071 argument names are `type` and `kernel`. The earlier spellings
# `method=` and `kernal=` are not svm() arguments; they were absorbed by `...`
# and ignored, so the fit only worked because the defaults (radial kernel,
# C-classification for a factor response) happened to match the intent.
svm1 <- svm(
  Comportamento ~ xg + yg + zg + mov.var + energy + entropy +
    roll + inclination + SMA + SVM + pitch + year + month +
    day + hour + minute + second + sensorid + air.temp + relat.u +
    wind.sp + wind.dir + solar.rad + max.raj,
  data = trainset_eli, type = "C-classification", kernel = "radial",
  gamma = 0.1, cost = 10
)

# Predict class labels for the validation set. predict() on an svm object
# returns the predicted classes by default and has no `type` argument, so the
# ignored `type = "class"` has been dropped.
predsmvval <- predict(svm1, valiset_eli)

# confusionMatrix() is provided by caret, which this script never attaches;
# qualify the call so it does not depend on the session's search path.
caret::confusionMatrix(predsmvval, valiset_eli$Comportamento)

# Oversampled training set: fit a radial-kernel C-classification SVM with the
# same hyperparameters, so only the training data differs.
# The e1071 argument names are `type` and `kernel`. The earlier spellings
# `method=` and `kernal=` are not svm() arguments; they were absorbed by `...`
# and ignored, so the fit only worked because the defaults (radial kernel,
# C-classification for a factor response) happened to match the intent.
svm2 <- svm(
  Comportamento ~ xg + yg + zg + mov.var + energy + entropy +
    roll + inclination + SMA + SVM + pitch + year + month +
    day + hour + minute + second + sensorid + air.temp + relat.u +
    wind.sp + wind.dir + solar.rad + max.raj,
  data = uptrain_eli, type = "C-classification", kernel = "radial",
  gamma = 0.1, cost = 10
)

# Predict class labels for the validation set. predict() on an svm object
# returns the predicted classes by default and has no `type` argument, so the
# ignored `type = "class"` has been dropped.
predsmvval2 <- predict(svm2, valiset_eli)

# confusionMatrix() is provided by caret, which this script never attaches;
# qualify the call so it does not depend on the session's search path.
caret::confusionMatrix(predsmvval2, valiset_eli$Comportamento)

The two training datasets are different, as the class counts show:

> table(trainset_eli$Comportamento)

    1     2     4     5     6     7 
14155  6872 41733  1040  5003  1997

> table(uptrain_eli$Comportamento)

    1     2     4     5     6     7 
41733 41733 41733 41733 41733 41733 

Reproducible example trainset_eli (first 10 lines)

structure(list(air.temp = c(18.42, 32.63, 34.54, 26.42, 32.63, 
34.44, 18.42, 35.45, 20.58, 18.17), relat.u = c(70, 30.45, 22.19, 
50.69, 30.83, 25.67, 70, 21.44, 63.5, 69.97), wind.sp = c(1.136, 
2.809, 1.512, 3.326, 2.171, 2.04, 1.136, 1.52, 0.756, 0.696), 
    wind.dir = c(79.1, 341.6, 350.1, 56.22, 294.9, 16.57, 79.1, 
    274.4, 84.4, 82.5), solar.rad = c(39.62, 741, 433.9, 621.1, 
    274.6, 847, 39.62, 266.4, 169.4, 24.11), max.raj = c(1.647, 
    5.247, 2.847, 6.047, 4.447, 4.447, 1.647, 2.847, 1.247, 2.047
    ), sensorid = c(67L, 65L, 66L, 70L, 70L, 70L, 69L, 68L, 69L, 
    65L), Comportamento = structure(c(6L, 3L, 3L, 5L, 2L, 2L, 
    1L, 1L, 2L, 1L), .Label = c("1", "2", "4", "5", "6", "7"), class = "factor"), 
    xg = c(-0.875, -0.765625, 0.234375, 0.546875, -0.0625, 0.421875, 
    -0.625, 0.515625, -0.453125, -0.734375), yg = c(-0.171875, 
    0, -0.0625, 0.375, 0.15625, 0.53125, -0.671875, 0.0625, -0.0625, 
    0.078125), zg = c(-0.421875, -0.578125, -0.875, -0.3125, 
    -0.25, -0.6875, -0.796875, -0.765625, -0.828125, -0.640625
    ), SMA = c(1.46875, 1.34375, 1.171875, 1.234375, 0.46875, 
    1.640625, 2.09375, 1.34375, 1.34375, 1.453125), SVM = c(0.986480882354037, 
    0.959380089563047, 0.907999389110477, 0.733044006608744, 
    0.30136408628103, 0.965847466282849, 1.21533978016438, 0.925179458942966, 
    0.946054718951288, 0.977655638185041), mov.var = c(0.0625, 
    0.109375, 0.046875, 1.015625, 1, 0.078125, 0.625, 0.8125, 
    0.203125, 0.015625), energy = c(0.947010278701782, 0.847154855728149, 
    0.679739058017731, 0.288748800754547, 0.00824832916259766, 
    0.870230257511139, 2.18167901039124, 0.732662439346313, 0.80105996131897, 
    0.913573801517487), entropy = c(0.252618304422212, 0.121902803377891, 
    0.0354050216019417, 0.817915633557388, 0.0171719387098626, 
    0.109209155417093, 2.0138239315557, 0.0228099706040058, 0.121902803377891, 
    0.0869466929526053), pitch = c(62.4975813343597, 52.9434718105904, 
    -14.9586823290351, -48.247900416119, 11.9694631246073, -25.8994130495892, 
    30.9479702502551, -33.8709569083463, 28.6176815553425, 48.6908889802574
    ), roll = c(-157.833654177918, 180, -175.914383220025, 129.805571092265, 
    147.994616791916, 142.305759533311, -139.864514437761, 175.333141628561, 
    -175.683972480134, 173.047042531826), inclination = c(-64.6810700998259, 
    -52.9434718105904, -15.4942996397858, -64.7667344528855, 
    -33.9462950277539, -44.6176169165428, -49.028705466841, -34.1529208122834, 
    -28.91407571407, -49.0601386023418), year = c(2019, 2019, 
    2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019), month = c(10, 
    9, 10, 10, 9, 10, 10, 10, 10, 10), day = c(1L, 24L, 1L, 1L, 
    24L, 1L, 1L, 1L, 1L, 1L), dayofweek = c(3, 3, 3, 3, 3, 3, 
    3, 3, 3, 3), hour = c(6L, 14L, 14L, 9L, 16L, 13L, 6L, 16L, 
    7L, 6L), minute = c(43L, 10L, 48L, 21L, 38L, 35L, 43L, 48L, 
    20L, 36L), second = c(48, 45, 50, 41, 45, 16, 36, 13, 43, 
    57)), row.names = c(NA, -10L), class = c("data.table", 
"data.frame"))

Reproducible example uptrain_eli (first 10 lines)

structure(list(air.temp = c(18.42, 35.45, 18.17, 32.03, 33.83, 
26.04, 35.12, 18.78, 17.81, 34.71), relat.u = c(70, 21.44, 69.97, 
31.8, 29.69, 45.58, 22.07, 69.41, 68.5, 24.7), wind.sp = c(1.136, 
1.52, 0.696, 1.177, 3.014, 3.902, 2.604, 0.68, 2.312, 3.222), 
    wind.dir = c(79.1, 274.4, 82.5, 279.8, 353.5, 36.06, 9.76, 
    72.9, 76.5, 10.84), solar.rad = c(39.62, 266.4, 24.11, 129.8, 
    877, 775, 706, 64.33, 235.5, 810), max.raj = c(1.647, 2.847, 
    2.047, 2.047, 5.647, 6.847, 5.647, 1.647, 4.047, 5.247), 
    sensorid = c(69L, 68L, 65L, 65L, 71L, 68L, 70L, 69L, 63L, 
    71L), xg = c(-0.625, 0.515625, -0.734375, -0.390625, -0.359375, 
    -0.4375, 0.03125, -0.546875, -0.8125, -0.34375), yg = c(-0.671875, 
    0.0625, 0.078125, -0.984375, -0.5, -0.53125, 1.265625, -0.15625, 
    -0.578125, -0.484375), zg = c(-0.796875, -0.765625, -0.640625, 
    -0.5625, -0.859375, -0.421875, -0.1875, -0.8125, -0.28125, 
    -0.671875), SMA = c(2.09375, 1.34375, 1.453125, 1.9375, 1.71875, 
    1.390625, 1.484375, 1.515625, 1.671875, 1.5), SVM = c(1.21533978016438, 
    0.925179458942966, 0.977655638185041, 1.19916149089687, 1.05720186400233, 
    0.807224459568093, 1.27982008623283, 0.991787567034897, 1.03609185313128, 
    0.896771553267609), mov.var = c(0.625, 0.8125, 0.015625, 
    0.640625, 0.328125, 1.1875, 1.28125, 0.203125, 0.125, 0.65625
    ), energy = c(2.18167901039124, 0.732662439346313, 0.913573801517487, 
    2.06781029701233, 1.2491991519928, 0.424597322940826, 2.68284565210342, 
    0.967552721500397, 1.1523728966713, 0.64673638343811), entropy = c(2.0138239315557, 
    0.0228099706040058, 0.0869466929526053, 1.3701855309152, 
    0.710199759881741, 0.164232090178216, 3.56989847819576, 0.317194938809455, 
    0.602066250006157, 0.294663749085412), pitch = c(30.9479702502551, 
    -33.8709569083463, 48.6908889802574, 19.0109259177363, 19.8726306940985, 
    32.8185691761192, -1.39915844134687, 33.4633429104771, 51.6464605002851, 
    22.5394507013279), roll = c(-139.864514437761, 175.333141628561, 
    173.047042531826, -119.744881296942, -149.808377039042, -128.453709216706, 
    98.4269690214807, -169.114472945341, -115.942295489872, -144.211026540817
    ), inclination = c(-49.028705466841, -34.1529208122834, -49.0601386023418, 
    -62.0255506627307, -35.6220253049926, -58.4915602831318, 
    -81.5755617584986, -34.9924265073308, -74.2492667720546, 
    -41.4775465392269), year = c(2019, 2019, 2019, 2019, 2019, 
    2019, 2019, 2019, 2019, 2019), month = c(10, 10, 10, 9, 10, 
    9, 10, 10, 9, 10), day = c(1L, 1L, 1L, 24L, 1L, 24L, 1L, 
    1L, 24L, 1L), dayofweek = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3), 
    hour = c(6L, 16L, 6L, 17L, 13L, 10L, 14L, 6L, 7L, 13L), minute = c(43L, 
    48L, 36L, 23L, 9L, 32L, 30L, 55L, 45L, 53L), second = c(36, 
    13, 57, 19, 36, 44, 49, 58, 16, 7), Comportamento = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", "2", 
    "4", "5", "6", "7"), class = "factor")), row.names = c(NA, 
10L), class = "data.frame")
  • It's easier to help you if you include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input that can be used to test and verify possible solutions. – MrFlick Jul 15 '21 at 19:45
  • Done, but I believe it won't help much as the amount of observations in the models makes a difference. – Rafael Nakamura Jul 15 '21 at 21:15
  • It's quite expected that they could give the same result, especially if your data is unbalanced. – anymous.asker Jul 16 '21 at 02:53
  • Really? Can you explain in more detail why? – Rafael Nakamura Jul 16 '21 at 13:39
  • Can anyone help? – Rafael Nakamura Aug 04 '21 at 18:12
  • As you say, the number of observations "makes a difference". Specifically the pre-test probability is set by the counts of items in each class. You have a 20:1 imbalance, so it would require a very high post-test score to classify a test case as being in the smallest class. – IRTFM Feb 10 '22 at 20:39

0 Answers0