To simplify my data analysis I need to process different statistical tests (in that example a shapiro test) on different different variables and on different groups of data. The purpose is to not write 150 times the same code. To do that I created, from my dataset, a tibble where each list correspond to a group of data.
The dataset:
dput(head(PdataLaKo,25))
structure(list(ctd_file = c("2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T162038 SBE0251090", "2018-08-02T162038 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-02T175518 SBE0251090", "2018-08-02T175518 SBE0251090",
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090", "2018-08-03T090018 SBE0251090",
"2018-08-03T090018 SBE0251090"), station = c("1", "1", "1", "1",
"1", "1", "1", "1", "1", "1", "2", "2", "2", "2", "2", "2", "2",
"2", "2", "2", "3", "3", "3", "3", "3"), month = c(8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L), day = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L), year = c(2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L
), time = c(16.38, 16.38, 16.38, 16.38, 16.38, 16.38, 16.38,
16.38, 16.38, 16.38, 17.57, 17.57, 17.57, 17.57, 17.57, 17.57,
17.57, 17.57, 17.57, 17.57, 9.05, 9.05, 9.05, 9.05, 9.05), LAT = c("69.27166667",
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.27166667",
"69.27166667", "69.27166667", "69.27166667", "69.27166667", "69.25386667",
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.25386667",
"69.25386667", "69.25386667", "69.25386667", "69.25386667", "69.23883333",
"69.23883333", "69.23883333", "69.23883333", "69.23883333"),
LONG = c("-25.15166667", "-25.15166667", "-25.15166667",
"-25.15166667", "-25.15166667", "-25.15166667", "-25.15166667",
"-25.15166667", "-25.15166667", "-25.15166667", "-25.0637",
"-25.0637", "-25.0637", "-25.0637", "-25.0637", "-25.0637",
"-25.0637", "-25.0637", "-25.0637", "-25.0637", "-24.9915",
"-24.9915", "-24.9915", "-24.9915", "-24.9915"), bottom_depth = c("226",
"226", "226", "226", "226", "226", "226", "226", "226", "226",
"229", "229", "229", "229", "229", "229", "229", "229", "229",
"229", "255", "255", "255", "255", "255"), timeS = c(223.5,
227.236, 228.558, 228.916, 229.336, 229.749, 230.119, 230.49,
228.871, 231.587, 242.497, 241.334, 241.591, 241.869, 242.205,
242.601, 249.68, 249.758, 250.013, 250.3, 266.6, 267.148,
267.57, 267.899, 268.245), pressure = c(0.607, 0.707, 0.808,
0.909, 1.01, 1.112, 1.212, 1.313, 1.516, 1.616, 0.503, 0.101,
0.202, 0.303, 0.404, 0.505, 0.606, 0.707, 0.808, 0.909, 0.303,
0.404, 0.505, 0.606, 0.707), depth = c(0.6, 0.7, 0.8, 0.9,
1, 1.1, 1.2, 1.3, 1.5, 1.6, 0.5, 0.1, 0.2, 0.3, 0.4, 0.5,
0.6, 0.7, 0.8, 0.9, 0.3, 0.4, 0.5, 0.6, 0.7), salinity = c(30.195,
30.1989, 30.199, 30.1737, 30.1894, 30.1824, 30.1546, 30.2275,
29.9942, 30.2885, 28.2285, 28.1332, 28.2182, 28.3279, 28.3815,
28.4242, 29.0786, 29.1525, 29.2647, 29.3672, 32.484, 32.3028,
31.885, 31.3775, 30.978), temperature = c(2.6459, 2.6477,
2.6263, 2.5913, 2.6188, 2.608, 2.6106, 2.6746, 2.6973, 2.6837,
1.9945, 2.0299, 1.9315, 1.7799, 1.734, 1.7082, 1.9263, 1.928,
1.9184, 1.8562, 1.1441, 1.073, 1.052, 1.0518, 1.0603), oxygen = c(352.891,
352.339, 352.12, 352.03, 351.884, 351.702, 351.505, 351.296,
347.288, 351.001, 345.827, 345.879, 345.866, 345.841, 345.782,
345.65, 346.849, 346.78, 346.718, 346.609, 340.888, 340.026,
339.348, 338.819, 338.245), oxygen2 = c(102.182, 102.228,
102.257, 102.281, 102.313, 102.347, 102.383, 102.427, 101.491,
102.644, 97.258, 97.311, 97.301, 97.29, 97.275, 97.253, 98.285,
98.293, 98.325, 98.357, 97.857, 97.839, 97.829, 97.812, 97.788
), fluorescence = c(0.17892, 0.17232, 0.17139, 0.17278, 0.17479,
0.17701, 0.1792, 0.18158, 0.18525, 0.18909, 0.10588, 0.10647,
0.10697, 0.10818, 0.10978, 0.11157, 0.1479, 0.14969, 0.15256,
0.15607, 0.30833, 0.31088, 0.31321, 0.31532, 0.31758), turbidity = c(0.164,
0.164, 0.164, 0.164, 0.163, 0.163, 0.163, 0.163, 0.161, 0.163,
0.165, 0.165, 0.165, 0.165, 0.165, 0.164, 0.159, 0.16, 0.16,
0.16, 4.742, 4.747, 4.751, 4.753, 4.755), conductivity = c(2.742475,
2.743057, 2.74087, 2.73591, 2.740138, 2.738197, 2.736985,
2.747572, 2.72913, 2.752912, 2.524555, 2.525942, 2.524426,
2.523692, 2.525012, 2.526692, 2.596153, 2.602249, 2.609861,
2.613963, 2.806078, 2.786396, 2.752134, 2.712379, 2.682113
), pH = c(8.094, 8.098, 8.099, 8.099, 8.099, 8.099, 8.099,
8.099, 8.014, 8.099, 8.16, 8.16, 8.16, 8.16, 8.159, 8.159,
8.149, 8.148, 8.147, 8.146, 8.121, 8.12, 8.12, 8.119, 8.118
), par = c(2627.7, 2627.7, 2627.7, 2627.7, 2627.7, 2627.7,
2627.7, 2627.7, 2600, 2627.7, 2627.8, 2627.8, 2627.8, 2627.8,
2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.8, 2627.6, NA,
2627.6, 2627.6, 2627.6), soundSpeed = c(1454.65, 1454.65,
1454.57, 1454.38, 1454.53, 1454.47, 1454.45, 1454.82, 1439.8,
1454.96, 1449.08, 1449.22, 1448.88, 1448.36, 1448.23, 1448.18,
1449.98, 1450.1, 1450.19, 1450.08, 1450.99, 1450.48, 1449.83,
1449.19, 1448.69), svDM = c(1454.55, 1454.55, 1454.48, 1454.29,
1454.43, 1454.37, 1454.35, 1454.72, 1439.71, 1454.86, 1448.98,
1449.12, 1448.78, 1448.26, 1448.14, 1448.08, 1449.89, 1450.01,
1450.1, 1449.98, 1450.91, 1450.4, 1449.76, 1449.12, 1448.61
), svWM = c(1454.57, 1454.57, 1454.49, 1454.3, 1454.45, 1454.39,
1454.37, 1454.74, 1439.73, 1454.89, 1448.89, 1449.03, 1448.69,
1448.18, 1448.05, 1448, 1449.84, 1449.96, 1450.05, 1449.94,
1451.01, 1450.49, 1449.83, 1449.16, 1448.62)), row.names = c(NA,
25L), class = "data.frame")
#to correspond to the number of observations, the rows are duplicated
PdataLaKo <- as.data.frame(lapply(PdataLaKo, rep, 250))
To do that I used, the function group_split
from the package dplyr
, which created a tibble of different lists corresponding to each group (ctd_file). after I tried to apply shapiro.test
from the package rstatix
to each selected variable of a group with sapply
and to each groups with lapply
.
PdataLaKo <- PdataLaKo %>%
group_split(ctd_file)
lnorm <- PdataLaKo[, c(11:24)] %>%
map(~ lapply(., sapply(., shapiro.test)))
If I extract one group with filter and do not create a tibble, the sapply
function applied to each variable works. Consequently, the problem come from tibble and not the application to selected column.
#non-grouped data
PdataLaKot <- PdataLaKo %>% filter(ctd_file == "2018-08-05T092503 SBE0251090")
lnorm <- PdataLaKot[, c(11:24)] %>% sapply(shapiro.test)