UPDATE: Following comments (thanks!) I have now successfully merge these datasets by converting the ratios from 'double' to 'character'. I am still interested to know why R has trouble merging rounded 'double' variables though (the problem occurs also when using merge
and sqldf
and is not restricted to dplyr).
I have been having trouble with merging dataframes using dplyr’s left_join and ‘double’ variables.. I’m attempting to join small tables of data (only 20 rows) on the basis of a grouping of the ratio of target to predictor to identify what proportion of these ratios fall within 5% bandings for training, validation and test set results (from models produced by h2o).
To produce the groups of ratio I’ve simply rounded them to the nearest one decimal place (so the ratio variable remains a ‘double’ type like the original calculated value). However, when I attempt to merge on these rounded values, random rows of NAs appear in the merged dataframe.
Here’s an example of the code and the tables produced - the final table has a row of missing values for the ratios rounded to 0.7, but all are present in the original tables.
Is this a possible bug?
(I know that I could do the calculations more efficiently in dplyr, but as it’s not merging at the moment there seems little point in tidying things up).
trainR <- mutate(trainR, ratio_GBM = round((ZZ_TOP_PRICE_R/pred_GBM),digits=1))
> train_pcs_GBM <- count(trainR, ratio_GBM)
> train_pcs_GBM <- rename(train_pcs_GBM, train_count_GBM = n)
> train_pcs_GBM <- mutate(train_pcs_GBM, train_pc_GBM = round((train_count_GBM/length(trainR$ratio_GBM)*100),1))
> train_pcs_GBM
# A tibble: 32 x 3
ratio_GBM train_count_GBM train_pc_GBM
<dbl> <int> <dbl>
1 0.0 1 0.0
2 0.1 15 0.0
3 0.2 302 0.1
4 0.3 440 0.1
5 0.4 1575 0.3
6 0.5 23304 4.6
7 0.6 62381 12.4
8 0.7 71098 14.1
9 0.8 65792 13.0
10 0.9 66991 13.3
# ... with 22 more rows
> #View(train_pcs_GBM)
>
> validR <- mutate(validR, ratio_GBM = round((ZZ_TOP_PRICE_R/pred_GBM),digits=1))
> valid_pcs_GBM <- count(validR, ratio_GBM)
> valid_pcs_GBM <- rename(valid_pcs_GBM, valid_count_GBM = n)
> valid_pcs_GBM <- mutate(valid_pcs_GBM, valid_pc_GBM = round((valid_count_GBM/length(validR$ratio_GBM)*100),1))
> valid_pcs_GBM
# A tibble: 34 x 3
ratio_GBM valid_count_GBM valid_pc_GBM
<dbl> <int> <dbl>
1 0.1 9 0.0
2 0.2 148 0.1
3 0.3 168 0.1
4 0.4 688 0.4
5 0.5 8058 4.8
6 0.6 20980 12.5
7 0.7 23446 14.0
8 0.8 21932 13.1
9 0.9 22030 13.1
10 1.0 19342 11.5
# ... with 24 more rows
> View(valid_pcs_GBM)
>
> testR <- mutate(testR, ratio_GBM = round((ZZ_TOP_PRICE_R/pred_GBM),digits=1))
> test_pcs_GBM <- count(testR, ratio_GBM)
> test_pcs_GBM <- rename(test_pcs_GBM, test_count_GBM = n)
> test_pcs_GBM <- mutate(test_pcs_GBM, test_pc_GBM = round((test_count_GBM/length(testR$ratio_GBM)*100),1))
> test_pcs_GBM
# A tibble: 31 x 3
ratio_GBM test_count_GBM test_pc_GBM
<dbl> <int> <dbl>
1 0.1 10 0.0
2 0.2 148 0.1
3 0.3 154 0.1
4 0.4 600 0.4
5 0.5 8359 5.0
6 0.6 20739 12.4
7 0.7 23283 13.9
8 0.8 21899 13.1
9 0.9 22216 13.2
10 1.0 19202 11.4
# ... with 21 more rows
>
> # Merge table of percentages ******************************************************************************
> res_GBM <- data.frame(ratio = numeric(20))
> res_GBM$ratio <- seq(0.1,2,0.1)
> res_GBM <- left_join(res_GBM, train_pcs_GBM, by = c("ratio" = "ratio_GBM"))
> res_GBM <- left_join(res_GBM, valid_pcs_GBM, by = c("ratio" = "ratio_GBM"))
> res_GBM <- left_join(res_GBM, test_pcs_GBM, by = c("ratio" = "ratio_GBM"))
> res_GBM
ratio train_count_GBM train_pc_GBM valid_count_GBM valid_pc_GBM test_count_GBM test_pc_GBM
1 0.1 15 0.0 9 0.0 10 0.0
2 0.2 302 0.1 148 0.1 148 0.1
3 0.3 NA NA NA NA NA NA
4 0.4 1575 0.3 688 0.4 600 0.4
5 0.5 23304 4.6 8058 4.8 8359 5.0
6 0.6 62381 12.4 20980 12.5 20739 12.4
7 0.7 NA NA NA NA NA NA
8 0.8 65792 13.0 21932 13.1 21899 13.1
9 0.9 66991 13.3 22030 13.1 22216 13.2
10 1.0 58921 11.7 19342 11.5 19202 11.4
11 1.1 47096 9.3 15071 9.0 15086 9.0
12 1.2 NA NA NA NA NA NA
13 1.3 NA NA NA NA NA NA
14 1.4 NA NA NA NA NA NA
15 1.5 NA NA NA NA NA NA
16 1.6 6761 1.3 2372 1.4 2359 1.4
17 1.7 NA NA NA NA NA NA
18 1.8 NA NA NA NA NA NA
19 1.9 NA NA NA NA NA NA
20 2.0 1095 0.2 482 0.3 435 0.3
>
with(DL_FC_res, lineplot(ratio, test_pc, train_pc, valid_pc, FC_pc))