I'm using C5.0 to make a decision tree, and it's using my class label in the tree. A snippet of my data is below.
trainX
V1 V2 V3 V4 V5 V6
1 39 State-gov 77516 Bachelors 13 Never-married
2 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
3 38 Private 215646 HS-grad 9 Divorced
4 53 Private 234721 11th 7 Married-civ-spouse
5 28 Private 338409 Bachelors 13 Married-civ-spouse
V7 V8 V9 V10 V11 V12 V13 V14
1 Adm-clerical Not-in-family White Male 2174 0 40 United-States
2 Exec-managerial Husband White Male 0 0 13 United-States
3 Handlers-cleaners Not-in-family White Male 0 0 40 United-States
4 Handlers-cleaners Husband Black Male 0 0 40 United-States
5 Prof-specialty Wife Black Female 0 0 40 Cuba
trainY
[1] <=50K <=50K <=50K <=50K <=50K
There are cases of >50K in my data as well; this 5-row snippet just happens not to contain any.
When I make my tree, this is the code I use:
# Load the C5.0 decision-tree implementation.
library(C50)

# Predictors: rows 1-100 for training, rows 101-150 held out for testing.
trainX <- X[1:100, ]
testX <- X[101:150, ]

# Class labels for the same row ranges.
trainY <- Y[1:100]
testY <- Y[101:150]

# Fit the tree on the training predictors/labels and print its summary.
model <- C5.0(trainX, trainY)
summary(model)
And the output I get is...
Decision tree:
<=50K (100/25)
Evaluation on training data (100 cases):
Decision Tree
----------------
Size Errors
1 25(25.0%) <<
(a) (b) <-classified as
---- ----
75 (a): class <=50K
25 (b): class >50K
What am I doing wrong that it's using the classification as part of the tree?
EDIT - the dput() output of head() is below. It still gives me the same issue, where it's making a decision tree using the split as <=50K or >50K, which is my "Y" output and thus shouldn't be part of the decision-making process.
trainX
structure(list(V1 = c(39L, 50L, 38L, 53L, 28L, 37L), V2 = structure(c(8L,
7L, 5L, 5L, 5L, 5L), .Label = c(" ?", " Federal-gov", " Local-gov",
" Never-worked", " Private", " Self-emp-inc", " Self-emp-not-inc",
" State-gov", " Without-pay"), class = "factor"), V3 = c(77516L,
83311L, 215646L, 234721L, 338409L, 284582L), V4 = structure(c(10L,
10L, 12L, 2L, 10L, 13L), .Label = c(" 10th", " 11th", " 12th",
" 1st-4th", " 5th-6th", " 7th-8th", " 9th", " Assoc-acdm", " Assoc-voc",
" Bachelors", " Doctorate", " HS-grad", " Masters", " Preschool",
" Prof-school", " Some-college"), class = "factor"), V5 = c(13L,
13L, 9L, 7L, 13L, 14L), V6 = structure(c(5L, 3L, 1L, 3L, 3L,
3L), .Label = c(" Divorced", " Married-AF-spouse", " Married-civ-spouse",
" Married-spouse-absent", " Never-married", " Separated", " Widowed"
), class = "factor"), V7 = structure(c(2L, 5L, 7L, 7L, 11L, 5L
), .Label = c(" ?", " Adm-clerical", " Armed-Forces", " Craft-repair",
" Exec-managerial", " Farming-fishing", " Handlers-cleaners",
" Machine-op-inspct", " Other-service", " Priv-house-serv", " Prof-specialty",
" Protective-serv", " Sales", " Tech-support", " Transport-moving"
), class = "factor"), V8 = structure(c(2L, 1L, 2L, 1L, 6L, 6L
), .Label = c(" Husband", " Not-in-family", " Other-relative",
" Own-child", " Unmarried", " Wife"), class = "factor"), V9 = structure(c(5L,
5L, 5L, 3L, 3L, 5L), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), V10 = structure(c(2L,
2L, 2L, 2L, 1L, 1L), .Label = c(" Female", " Male"), class = "factor"),
V11 = c(2174L, 0L, 0L, 0L, 0L, 0L), V12 = c(0L, 0L, 0L, 0L,
0L, 0L), V13 = c(40L, 13L, 40L, 40L, 40L, 40L), V14 = structure(c(40L,
40L, 40L, 40L, 6L, 40L), .Label = c(" ?", " Cambodia", " Canada",
" China", " Columbia", " Cuba", " Dominican-Republic", " Ecuador",
" El-Salvador", " England", " France", " Germany", " Greece",
" Guatemala", " Haiti", " Holand-Netherlands", " Honduras",
" Hong", " Hungary", " India", " Iran", " Ireland", " Italy",
" Jamaica", " Japan", " Laos", " Mexico", " Nicaragua", " Outlying-US(Guam-USVI-etc)",
" Peru", " Philippines", " Poland", " Portugal", " Puerto-Rico",
" Scotland", " South", " Taiwan", " Thailand", " Trinadad&Tobago",
" United-States", " Vietnam", " Yugoslavia"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4", "V5", "V6", "V7", "V8", "V9", "V10", "V11",
"V12", "V13", "V14"), row.names = c(NA, 6L), class = "data.frame")
trainY
structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = c(" <=50K", " >50K"
), class = "factor")
After reading in trainX and trainY, the easiest way to reproduce this problem would be:
# Minimal reproduction: fit C5.0 on the predictors and labels given above.
library(C50)
test <- C5.0(x = trainX, y = trainY)
My actual trainY:
structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 2L, 1L, 1L), .Label = c(" <=50K", " >50K"), class = "factor")
My actual trainX
structure(list(age = c(39L, 50L, 38L, 53L, 28L, 37L, 49L, 52L,
31L, 42L, 37L, 30L, 23L, 32L, 40L, 34L, 25L, 32L, 38L, 43L, 40L,
54L, 35L, 43L, 59L, 56L, 19L, 54L, 39L, 49L, 23L, 20L, 45L, 30L,
22L, 48L, 21L, 19L, 31L, 48L, 31L, 53L, 24L, 49L, 25L, 57L, 53L,
44L, 41L, 29L, 25L, 18L, 47L, 50L, 47L, 43L, 46L, 35L, 41L, 30L,
30L, 32L, 48L, 42L, 29L, 36L, 28L, 53L, 49L, 25L, 19L, 31L, 29L,
23L, 79L, 27L, 40L, 67L, 18L, 31L, 18L, 52L, 46L, 59L, 44L, 53L,
49L, 33L, 30L, 43L, 57L, 37L, 28L, 30L, 34L, 29L, 48L, 37L, 48L,
32L), workClass = structure(c(8L, 7L, 5L, 5L, 5L, 5L, 5L, 7L,
5L, 5L, 5L, 8L, 5L, 5L, 5L, 5L, 7L, 5L, 5L, 7L, 5L, 5L, 2L, 5L,
5L, 3L, 5L, 1L, 5L, 5L, 3L, 5L, 5L, 2L, 8L, 5L, 5L, 5L, 5L, 7L,
5L, 7L, 5L, 5L, 5L, 2L, 5L, 5L, 8L, 5L, 5L, 5L, 5L, 2L, 6L, 5L,
5L, 5L, 5L, 5L, 5L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 6L, 1L, 5L, 5L,
7L, 5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 5L,
5L, 2L, 5L, 5L, 5L, 5L, 3L, 3L, 7L, 5L, 5L, 2L), .Label = c(" ?",
" Federal-gov", " Local-gov", " Never-worked", " Private", " Self-emp-inc",
" Self-emp-not-inc", " State-gov", " Without-pay"), class = "factor"),
fnlwgt = c(77516L, 83311L, 215646L, 234721L, 338409L, 284582L,
160187L, 209642L, 45781L, 159449L, 280464L, 141297L, 122272L,
205019L, 121772L, 245487L, 176756L, 186824L, 28887L, 292175L,
193524L, 302146L, 76845L, 117037L, 109015L, 216851L, 168294L,
180211L, 367260L, 193366L, 190709L, 266015L, 386940L, 59951L,
311512L, 242406L, 197200L, 544091L, 84154L, 265477L, 507875L,
88506L, 172987L, 94638L, 289980L, 337895L, 144361L, 128354L,
101603L, 271466L, 32275L, 226956L, 51835L, 251585L, 109832L,
237993L, 216666L, 56352L, 147372L, 188146L, 59496L, 293936L,
149640L, 116632L, 105598L, 155537L, 183175L, 169846L, 191681L,
200681L, 101509L, 309974L, 162298L, 211678L, 124744L, 213921L,
32214L, 212759L, 309634L, 125927L, 446839L, 276515L, 51618L,
159937L, 343591L, 346253L, 268234L, 202051L, 54334L, 410867L,
249977L, 286730L, 212563L, 117747L, 226296L, 115585L, 191277L,
202683L, 171095L, 249409L), education = structure(c(10L,
10L, 12L, 2L, 10L, 13L, 7L, 12L, 13L, 10L, 16L, 10L, 10L,
8L, 9L, 6L, 12L, 12L, 2L, 13L, 11L, 12L, 7L, 2L, 12L, 10L,
12L, 16L, 12L, 12L, 8L, 16L, 10L, 16L, 16L, 2L, 16L, 12L,
16L, 8L, 7L, 10L, 10L, 12L, 12L, 10L, 12L, 13L, 9L, 9L, 16L,
12L, 15L, 10L, 12L, 16L, 5L, 9L, 12L, 12L, 10L, 6L, 12L,
11L, 16L, 12L, 16L, 12L, 16L, 16L, 16L, 10L, 10L, 16L, 16L,
12L, 8L, 1L, 2L, 6L, 12L, 10L, 12L, 12L, 12L, 12L, 12L, 13L,
7L, 11L, 9L, 16L, 16L, 12L, 10L, 16L, 11L, 16L, 8L, 12L), .Label = c(" 10th",
" 11th", " 12th", " 1st-4th", " 5th-6th", " 7th-8th", " 9th",
" Assoc-acdm", " Assoc-voc", " Bachelors", " Doctorate",
" HS-grad", " Masters", " Preschool", " Prof-school", " Some-college"
), class = "factor"), educationNum = c(13L, 13L, 9L, 7L,
13L, 14L, 5L, 9L, 14L, 13L, 10L, 13L, 13L, 12L, 11L, 4L,
9L, 9L, 7L, 14L, 16L, 9L, 5L, 7L, 9L, 13L, 9L, 10L, 9L, 9L,
12L, 10L, 13L, 10L, 10L, 7L, 10L, 9L, 10L, 12L, 5L, 13L,
13L, 9L, 9L, 13L, 9L, 14L, 11L, 11L, 10L, 9L, 15L, 13L, 9L,
10L, 3L, 11L, 9L, 9L, 13L, 4L, 9L, 16L, 10L, 9L, 10L, 9L,
10L, 10L, 10L, 13L, 13L, 10L, 10L, 9L, 12L, 6L, 7L, 4L, 9L,
13L, 9L, 9L, 9L, 9L, 9L, 14L, 5L, 16L, 11L, 10L, 10L, 9L,
13L, 10L, 16L, 10L, 12L, 9L), marital = structure(c(5L, 3L,
1L, 3L, 3L, 3L, 4L, 3L, 5L, 3L, 3L, 3L, 5L, 5L, 3L, 3L, 5L,
5L, 3L, 1L, 3L, 6L, 3L, 3L, 1L, 3L, 5L, 3L, 1L, 3L, 5L, 5L,
1L, 3L, 3L, 5L, 5L, 2L, 3L, 3L, 3L, 3L, 3L, 6L, 5L, 3L, 3L,
1L, 3L, 5L, 3L, 5L, 3L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 4L,
3L, 3L, 1L, 3L, 1L, 3L, 3L, 5L, 5L, 6L, 3L, 5L, 3L, 5L, 3L,
3L, 5L, 3L, 5L, 3L, 3L, 3L, 1L, 1L, 3L, 3L, 5L, 5L, 3L, 1L,
1L, 3L, 3L, 5L, 3L, 3L, 1L, 5L), .Label = c(" Divorced",
" Married-AF-spouse", " Married-civ-spouse", " Married-spouse-absent",
" Never-married", " Separated", " Widowed"), class = "factor"),
occ = structure(c(2L, 5L, 7L, 7L, 11L, 5L, 9L, 5L, 11L, 5L,
5L, 11L, 2L, 13L, 4L, 15L, 6L, 8L, 13L, 5L, 11L, 9L, 6L,
15L, 14L, 14L, 4L, 1L, 5L, 4L, 12L, 13L, 5L, 2L, 9L, 8L,
8L, 2L, 13L, 11L, 8L, 11L, 14L, 2L, 7L, 11L, 8L, 5L, 4L,
11L, 5L, 9L, 11L, 5L, 5L, 14L, 8L, 9L, 2L, 8L, 13L, 1L, 15L,
11L, 14L, 4L, 2L, 2L, 5L, 1L, 11L, 13L, 13L, 8L, 11L, 9L,
2L, 1L, 9L, 6L, 13L, 9L, 9L, 13L, 4L, 13L, 12L, 11L, 13L,
11L, 11L, 4L, 8L, 13L, 12L, 7L, 11L, 13L, 5L, 9L), .Label = c(" ?",
" Adm-clerical", " Armed-Forces", " Craft-repair", " Exec-managerial",
" Farming-fishing", " Handlers-cleaners", " Machine-op-inspct",
" Other-service", " Priv-house-serv", " Prof-specialty",
" Protective-serv", " Sales", " Tech-support", " Transport-moving"
), class = "factor"), relationship = structure(c(2L, 1L,
2L, 1L, 6L, 6L, 2L, 1L, 2L, 1L, 1L, 1L, 4L, 2L, 1L, 1L, 4L,
5L, 1L, 5L, 1L, 5L, 1L, 1L, 5L, 1L, 4L, 1L, 2L, 1L, 2L, 4L,
4L, 4L, 1L, 5L, 4L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 2L, 1L, 1L,
5L, 1L, 2L, 6L, 4L, 6L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 2L, 1L, 2L, 6L, 1L, 4L, 4L, 4L, 1L, 2L, 3L, 4L, 1L,
1L, 4L, 1L, 2L, 1L, 6L, 1L, 2L, 4L, 1L, 1L, 2L, 2L, 1L, 5L,
5L, 6L, 1L, 2L, 1L, 1L, 5L, 4L), .Label = c(" Husband", " Not-in-family",
" Other-relative", " Own-child", " Unmarried", " Wife"), class = "factor"),
race = structure(c(5L, 5L, 5L, 3L, 3L, 5L, 3L, 5L, 5L, 5L,
3L, 2L, 5L, 3L, 2L, 1L, 5L, 5L, 5L, 5L, 5L, 3L, 3L, 5L, 5L,
5L, 5L, 2L, 5L, 5L, 5L, 3L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 3L, 5L, 5L, 5L, 5L, 4L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 3L, 2L, 5L, 5L, 5L, 5L, 5L, 3L
), .Label = c(" Amer-Indian-Eskimo", " Asian-Pac-Islander",
" Black", " Other", " White"), class = "factor"), sex = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L), .Label = c(" Female",
" Male"), class = "factor"), capGain = c(2174L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 14084L, 5178L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5013L, 2407L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 14344L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), capLoss = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 2042L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1408L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1902L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1573L, 0L, 0L, 1902L, 0L, 0L, 0L), hours = c(40L,
13L, 40L, 40L, 40L, 40L, 16L, 45L, 50L, 40L, 80L, 40L, 30L,
50L, 40L, 45L, 35L, 40L, 50L, 45L, 60L, 20L, 40L, 40L, 40L,
40L, 40L, 60L, 80L, 40L, 52L, 44L, 40L, 40L, 15L, 40L, 40L,
25L, 38L, 40L, 43L, 40L, 50L, 40L, 35L, 40L, 38L, 40L, 40L,
43L, 40L, 30L, 60L, 55L, 60L, 40L, 40L, 40L, 48L, 40L, 40L,
40L, 40L, 45L, 58L, 40L, 40L, 40L, 50L, 40L, 32L, 40L, 70L,
40L, 20L, 40L, 40L, 2L, 22L, 40L, 30L, 40L, 40L, 48L, 40L,
35L, 40L, 50L, 40L, 50L, 40L, 40L, 25L, 35L, 40L, 50L, 60L,
48L, 40L, 40L), country = structure(c(40L, 40L, 40L, 40L,
6L, 40L, 24L, 40L, 40L, 40L, 40L, 20L, 40L, 40L, 1L, 27L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 36L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 34L, 40L, 40L, 1L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 1L,
17L, 40L, 40L, 40L, 27L, 34L, 40L, 40L, 40L, 1L, 40L, 40L,
40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 40L, 27L,
40L, 40L, 40L, 40L, 40L, 6L, 40L, 40L, 40L, 40L, 40L, 40L,
40L, 40L, 40L, 40L, 40L, 1L, 40L, 40L, 40L, 40L, 10L, 40L
), .Label = c(" ?", " Cambodia", " Canada", " China", " Columbia",
" Cuba", " Dominican-Republic", " Ecuador", " El-Salvador",
" England", " France", " Germany", " Greece", " Guatemala",
" Haiti", " Holand-Netherlands", " Honduras", " Hong", " Hungary",
" India", " Iran", " Ireland", " Italy", " Jamaica", " Japan",
" Laos", " Mexico", " Nicaragua", " Outlying-US(Guam-USVI-etc)",
" Peru", " Philippines", " Poland", " Portugal", " Puerto-Rico",
" Scotland", " South", " Taiwan", " Thailand", " Trinadad&Tobago",
" United-States", " Vietnam", " Yugoslavia"), class = "factor")), .Names = c("age",
"workClass", "fnlwgt", "education", "educationNum", "marital",
"occ", "relationship", "race", "sex", "capGain", "capLoss", "hours",
"country"), row.names = c(NA, 100L), class = "data.frame")