Please, find my data below. I have encountered two problems.
I am trying to merge yy$n_otte
values into the missing h$n_otte
values. My approach is a dplyr::left_join
matched by study
, os.neck
, n_sygdom
and age
between h
and yy
. I need to match by all these variables as the both h
and yy
comprise two large spreadsheets.
> head(h)
study os.neck age n_sygdom n_otte
1 B 49.00 53 0 N0
2 B 1.00 83 0 N0
3 A 76.44 63 2 <NA>
4 B 11.00 45 0 N0
5 A 9.21 37 15 <NA>
6 B 1.00 60 1 N1
And
> head(yy)
study os.neck n_sygdom age n_otte
1 A 42.12 0 63 N0
2 A 30.72 0 61 N0
3 A 136.20 0 48 N0
4 A 23.40 0 63 N0
5 A 5.16 3 67 N3b
6 A 33.96 0 58 N0
Problem 1: why is as_integer() changing my values?
> str(yy)
'data.frame': 643 obs. of 5 variables:
$ study : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
$ os.neck : num 42.12 30.72 136.2 23.4 5.16 ...
$ n_sygdom: Factor w/ 22 levels "0","1","10","11",..: 1 1 1 1 13 1 11 11 2 1 ...
$ age : num 63 61 48 63 67 58 23 52 53 62 ...
$ n_otte : Factor w/ 6 levels "N0","N1","N2a",..: 1 1 1 1 6 1 6 4 3 1 ...
I am trying to
yy <- yy %>% mutate(n_sygdom = as.integer(n_sygdom))
But yy$n_sygdom
changes.
> head(yy)
study os.neck n_sygdom age n_otte
1 A 42.12 1 63 N0
2 A 30.72 1 61 N0
3 A 136.20 1 48 N0
4 A 23.40 1 63 N0
5 A 5.16 13 67 N3b
6 A 33.96 1 58 N0
Question
Why does yy$n_sygdom
change? I would like to include yy$n_sygdom
as integer but obviously keep the initial integers.
Problem 2: the left_join match does not produce the intended output
Obviously, problem 1 needs to be solved first, as
a <- left_join(h, yy, by=c("study", "os.neck", "age", "n_sygdom"))
Yields
Can't join on 'n_sygdom' x 'n_sygdom' because of incompatible types (factor / integer)
But, the problem I get is also present here (without n_sygdom
):
a <- left_join(h, yy, by=c("study", "os.neck", "age"))
> head(a)
study os.neck age n_sygdom.x n_otte.x n_sygdom.y n_otte.y
1 B 49.00 53 0 N0 <NA> <NA>
2 B 1.00 83 0 N0 <NA> <NA>
3 A 76.44 63 2 <NA> <NA> <NA>
4 B 11.00 45 0 N0 <NA> <NA>
5 A 9.21 37 15 <NA> 15 N3b
6 B 1.00 60 1 N1 <NA> <NA>
Why is .x
and .y
generated, and how can I obtain a$n_otte
, exclusively?
Expected output
> head(a)
study os.neck age n_sygdom n_otte
1 B 49.00 53 0 N0
2 B 1.00 83 0 N0
3 A 76.44 63 2 <NA>
4 B 11.00 45 0 N0
5 A 9.21 37 15 N3b
6 B 1.00 60 1 N1
Primary data
h <- structure(list(study = c("B", "B", "A", "B", "A", "B", "A", "A",
"B", "B", "B", "B", "A", "A", "A", "A", "C", "B", "A", "C", "B",
"B", "B", "B", "A", "B", "A", "B", "A", "A", "C", "B", "B", "A",
"B", "C", "B", "B", "B", "C", "A", "C", "C", "B", "B", "C", "C",
"B", "B", "C", "C", "B", "B", "A", "B", "B", "C", "B", "C", "A",
"A", "C", "C", "A", "B", "B", "C", "B", "C", "C", "C", "B", "C",
"A", "B", "A", "B", "B", "C", "C", "B", "B", "B", "B", "C", "B",
"A", "A", "B", "C", "C", "B", "B", "B", "C", "B", "B", "B", "A",
"B"), os.neck = c(49, 1, 76.44, 11, 9.21, 1, 2.07, 4.08, 17,
11, 41, 38, 84.96, 5.64, 93.86, 11.52, 5.29, 61, 10.95, 3.68,
24, 63, 21, 68, 6.12, 7, 11.52, 48, 11.38, 73.68, 27.53, 12,
19, 17.98, 55, 77.77, 39, 4, 13, 57.56, 24.59, 46.55, 83.02,
14, 42, 49.58, 33.58, 33, 21, 29.96, 10.41, 67, 8, 94.72, 2,
1, 7.03, 1, 46.36, 23.76, 57.48, 14.49, 14.69, 39.62, 68, 5,
35.78, 75, 80.82, 54.24, 49.12, 87, 50.96, 2.4, 10, 7.2, 17,
10, 34.56, 104.08, 5, 28, 13, 17, 104.08, 29, 5.04, 54.96, 49,
4.27, 47.93, 5, 60, 47, 14.49, 3, 32, 23, 13.97, 32), age = c(53,
83, 63, 45, 37, 60, 52, 64, 53, 78, 67, 68, 45, 68, 43, 72, 65,
65, 59, 58, 51, 61, 62, 61, 60, 66, 64, 62, 56, 41, 51, 69, 42,
65, 59, 60, 58, 58, 63, 71, 60, 72, 58, 55, 63, 63, 49, 79, 54,
57, 50, 54, 77, 64, 74, 85, 50, 64, 60, 43, 67, 63, 68, 64, 65,
67, 57, 65, 47, 70, 61, 66, 63, 33, 75, 74, 80, 80, 62, 80, 55,
74, 78, 80, 80, 33, 34, 52, 51, 47, 51, 54, 74, 71, 63, 81, 53,
46, 48, 49), n_sygdom = c(0L, 0L, 2L, 0L, 15L, 1L, 8L, 6L, 1L,
0L, 3L, 0L, 6L, 5L, 2L, 0L, 1L, 0L, 8L, 0L, 0L, 0L, 0L, 0L, 8L,
5L, 20L, 0L, 4L, 5L, 1L, 1L, 0L, 5L, 0L, 0L, 1L, 0L, 9L, 0L,
5L, 0L, 0L, 2L, 1L, 1L, 8L, 0L, 1L, 23L, 0L, 1L, 0L, 2L, 1L,
6L, 2L, 0L, 2L, 3L, 1L, 1L, 0L, 6L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 3L, 0L, 10L, 0L, 0L, 1L, 1L, 1L, 0L, 3L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 4L, 0L, 0L, 1L, 8L, 0L, 0L, 3L, 0L), n_otte = structure(c(1L,
1L, NA, 1L, NA, 2L, NA, NA, 6L, 1L, 5L, 1L, NA, NA, NA, NA, NA,
1L, NA, NA, 1L, 1L, 1L, 1L, NA, 4L, NA, 1L, NA, NA, NA, 2L, 1L,
NA, 1L, NA, 2L, 1L, 5L, NA, NA, NA, NA, 6L, 2L, NA, NA, 1L, 2L,
NA, NA, 6L, 1L, NA, 6L, 5L, NA, 1L, NA, NA, NA, NA, NA, NA, 1L,
1L, NA, 1L, NA, NA, NA, 1L, NA, NA, 1L, NA, 6L, 1L, NA, NA, 2L,
1L, 6L, 6L, NA, 1L, NA, NA, 1L, NA, NA, 6L, 1L, 1L, NA, 6L, 1L,
1L, NA, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c", "N3b"
), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
Data to extract
yy <- structure(list(study = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "A", class = "factor"),
os.neck = c(24.84, 24.84, 9.76, 98.28, 19.08, 111.48, 41.52,
47.28, 35.24, 6.38, 57.48, 39.78, 35.52, 70.08, 12.49, 19.33,
3.02, 40.77, 32.71, 40.08, 59.4, 52.18, 48.33, 1.38, 26.89,
35.52, 59.18, 5.04, 6.24, 80.65, 5.13, 49.84, 9.48, 3.25,
46.42, 25.15, 10.8, 23.76, 17.1, 27.6, 4.68, 12.3, 52.96,
49.97, 10.98, 44.64, 39.78, 10.8, 9.5, 20.19, 11.97, 22.88,
60.59, 85.15, 55.04, 7.2, 28.2, 33.96, 2.76, 4.77, 9.96,
1.38, 33.4, 27.29, 37.2, 36.36, 90.28, 53.65, 32.09, 17.98,
68.28, 7.63, 36.36, 22.32, 43.2, 9.36, 5.88, 14.79, 48.1,
45.24, 9.21, 110.01, 42.12, 0.3, 0.56, 11.88, 46.26, 59.15,
87.22, 11.93, 88.8, 36.36, 29.19, 14.07, 11.21, 16.08, 20.58,
3.48, 73.74, 45.72), n_sygdom = structure(c(2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 16L, 1L, 1L, 1L, 4L, 11L, 1L, 2L, 1L, 1L,
2L, 11L, 1L, 1L, 13L, 2L, 1L, 18L, 1L, 1L, 1L, 11L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 13L, 1L, 1L, 1L, 11L, 1L,
16L, 13L, 1L, 11L, 2L, 1L, 16L, 1L, 13L, 1L, 2L, 2L, 11L,
1L, 2L, 17L, 2L, 1L, 1L, 8L, 1L, 1L, 1L, 11L, 1L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 16L, 1L, 11L, 1L, 1L, 2L, 1L), .Label = c("0",
"1", "10", "11", "12", "13", "14", "15", "17", "18", "2",
"20", "3", "35", "39", "4", "5", "6", "7", "8", "9", "number"
), class = "factor"), age = c(44, 44, 45, 51, 50, 59, 30,
49, 35, 53, 67, 79, 64, 68, 75, 55, 67, 42, 65, 37, 77, 32,
60, 80, 64, 64, 45, 72, 54, 37, 45, 47, 70, 54, 56, 45, 71,
43, 53, 38, 58, 76, 26, 50, 65, 67, 79, 71, 67, 56, 79, 47,
54, 35, 41, 62, 64, 58, 41, 65, 73, 80, 68, 47, 32, 49, 32,
45, 56, 33, 61, 33, 49, 52, 61, 75, 60, 55, 28, 82, 37, 32,
63, 86, 63, 60, 63, 54, 63, 84, 27, 55, 67, 59, 72, 63, 47,
62, 53, 34), n_otte = structure(c(3L, 3L, 1L, 1L, 1L, 1L,
1L, 3L, 1L, 1L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 6L, 1L, 1L, 1L, 6L, 4L, 1L, 3L, 1L, 1L, 3L, 5L, 1L,
1L, 4L, 3L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 6L, 1L, 1L, 1L, 6L, 1L, 6L, 6L, 1L, 4L, 2L,
1L, 6L, 1L, 3L, 1L, 3L, 2L, 6L, 1L, 2L, 6L, 2L, 1L, 1L, 6L,
1L, 1L, 1L, 4L, 1L, 3L, 3L, 2L, 1L, 1L, 1L, 1L, 6L, 1L, 6L,
1L, 1L, 2L, 1L), .Label = c("N0", "N1", "N2a", "N2b", "N2c",
"N3b"), class = "factor")), row.names = c(NA, -100L), class = "data.frame")