I've got a dataframe with the following structure:
# A tibble: 95 x 7
# Groups: WallReg_2p5 [19]
CellID_2p5 Y_Coord_2p5Weighting WallReg_2p5 piC_1 piC_2 piC_3 piC_4
<int> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 6561 0.915 African 6.55 6.63 5.84 0.766
2 6278 0.947 African 15.1 5.59 2.15 2.01
3 4394 0.971 African 11.4 3.92 0.774 1.47
4 4840 0.994 African 4.70 0.962 6.21 3.54
5 4105 0.947 African 6.35 2.10 2.25 3.24
6 5228 1.000 Amazonian 8.49 5.00 1.92 2.42
7 5089 1.000 Amazonian 15.6 6.48 2.53 2.89
8 4939 0.998 Amazonian 5.56 2.94 0.389 2.44
9 5088 1.000 Amazonian 12.9 5.16 1.99 3.13
10 4947 0.998 Amazonian 8.05 11.2 2.54 4.61
# ... with 85 more rows
Here is the dput()
of a subset of the dataframe. My real dataset consists of 10,368 rows and 255,611 columns
structure(list(CellID_2p5 = c(6561L, 6278L, 4394L, 4840L, 4105L,
5228L, 5089L, 4939L, 5088L, 4947L, 1710L, 2569L, 1438L, 1175L,
1840L, 6888L, 7185L, 6031L, 7045L, 7044L, 3432L, 3288L, 3143L,
3574L, 3577L, 3260L, 1959L, 2568L, 2986L, 2386L, 5551L, 5407L,
5556L, 4979L, 5694L, 5303L, 4442L, 5587L, 5157L, 4865L, 3294L,
3009L, 2865L, 2722L, 3151L, 6427L, 6571L, 5996L, 6570L, 6139L,
3631L, 3920L, 3342L, 3341L, 4064L, 2617L, 2049L, 3346L, 1599L,
3205L, 7487L, 6612L, 6613L, 7630L, 7916L, 3854L, 3561L, 4290L,
4138L, 3704L, 4211L, 4068L, 4069L, 4357L, 4648L, 5601L, 5600L,
5455L, 5456L, 5458L, 3978L, 3822L, 3532L, 3832L, 3834L, 7105L,
6817L, 6104L, 7963L, 6098L, 3418L, 3424L, 3281L, 3566L, 3273L
), Y_Coord_2p5Weighting = c(0.915311479119447, 0.946930129495106,
0.971342069813261, 0.99405633822232, 0.946930129495106, 0.999762027079909,
0.999762027079909, 0.997858923238603, 0.999762027079909, 0.997858923238603,
0.480988768919388, 0.691513055782269, 0.402746689858737, 0.362438038283702,
0.518773258160522, 0.876726755707508, 0.831469612302545, 0.971342069813261,
0.854911870672947, 0.854911870672947, 0.854911870672947, 0.831469612302545,
0.806444604267483, 0.876726755707508, 0.876726755707508, 0.831469612302545,
0.555570233019602, 0.691513055782269, 0.779884483092882, 0.659345815100069,
0.99405633822232, 0.997858923238603, 0.99405633822232, 0.997858923238603,
0.988361510467761, 0.999762027079909, 0.971342069813261, 0.99405633822232,
0.999762027079909, 0.99405633822232, 0.831469612302545, 0.779884483092882,
0.751839807478977, 0.722363962059756, 0.806444604267483, 0.932007869282799,
0.915311479119447, 0.971342069813261, 0.915311479119447, 0.960049854385929,
0.896872741532688, 0.932007869282799, 0.854911870672947, 0.854911870672947,
0.946930129495106, 0.722363962059756, 0.591309648363582, 0.854911870672947,
0.480988768919388, 0.831469612302545, 0.779884483092882, 0.915311479119447,
0.915311479119447, 0.751839807478977, 0.691513055782269, 0.915311479119447,
0.876726755707508, 0.960049854385929, 0.946930129495106, 0.896872741532688,
0.960049854385929, 0.946930129495106, 0.946930129495106, 0.971342069813261,
0.988361510467761, 0.99405633822232, 0.99405633822232, 0.997858923238603,
0.997858923238603, 0.997858923238603, 0.932007869282799, 0.915311479119447,
0.876726755707508, 0.915311479119447, 0.915311479119447, 0.831469612302545,
0.876726755707508, 0.960049854385929, 0.659345815100069, 0.960049854385929,
0.854911870672947, 0.854911870672947, 0.831469612302545, 0.876726755707508,
0.831469612302545), WallReg_2p5 = c("African", "African", "African",
"African", "African", "Amazonian", "Amazonian", "Amazonian",
"Amazonian", "Amazonian", "Arctico-Siberian", "Arctico-Siberian",
"Arctico-Siberian", "Arctico-Siberian", "Arctico-Siberian", "Australian",
"Australian", "Australian", "Australian", "Australian", "Chinese",
"Chinese", "Chinese", "Chinese", "Chinese", "Eurasian", "Eurasian",
"Eurasian", "Eurasian", "Eurasian", "Guineo-Congolian", "Guineo-Congolian",
"Guineo-Congolian", "Guineo-Congolian", "Guineo-Congolian", "Indo-Malayan",
"Indo-Malayan", "Indo-Malayan", "Indo-Malayan", "Indo-Malayan",
"Japanese", "Japanese", "Japanese", "Japanese", "Japanese", "Madagascan",
"Madagascan", "Madagascan", "Madagascan", "Madagascan", "Mexican",
"Mexican", "Mexican", "Mexican", "Mexican", "North American",
"North American", "North American", "North American", "North American",
"Novozelandic", "Novozelandic", "Novozelandic", "Novozelandic",
"Novozelandic", "Oriental", "Oriental", "Oriental", "Oriental",
"Oriental", "Panamanian", "Panamanian", "Panamanian", "Panamanian",
"Panamanian", "Papua-Melanesian", "Papua-Melanesian", "Papua-Melanesian",
"Papua-Melanesian", "Papua-Melanesian", "Saharo-Arabian", "Saharo-Arabian",
"Saharo-Arabian", "Saharo-Arabian", "Saharo-Arabian", "South American",
"South American", "South American", "South American", "South American",
"Tibetan", "Tibetan", "Tibetan", "Tibetan", "Tibetan"), piC_1 = c(6.54637718200684,
15.1273813247681, 11.4171981811523, 4.70245027542114, 6.35227298736572,
8.48885822296143, 15.5538415908813, 5.56155681610107, 12.9046697616577,
8.04517650604248, 2.95071268081665, 21.6441345214844, 11.2329692840576,
16.1649322509766, 17.2905006408691, 3.43583130836487, 10.0594062805176,
12.3438568115234, 7.94222640991211, 6.89916276931763, 7.45456171035767,
8.77329444885254, 14.3378238677979, 3.86588025093079, 12.4889860153198,
7.18962049484253, 19.2145137786865, 22.0060653686523, 1.86285281181335,
2.09195709228516, 9.87592029571533, 12.2629871368408, 7.31402492523193,
0.601671099662781, 6.9998254776001, 20.6269207000732, 6.21515369415283,
22.039529800415, 8.35955047607422, 9.50113105773926, 7.06818675994873,
4.63532447814941, 5.81412315368652, 0.996474027633667, 8.32744407653809,
5.03945255279541, 0.893457889556885, 2.42736291885376, 10.3842725753784,
3.32475543022156, 8.1105375289917, 6.61336517333984, 4.06754541397095,
3.31069254875183, 8.05746650695801, 1.24714422225952, 6.44647121429443,
2.97141313552856, 13.3264999389648, 4.86157178878784, 6.71903085708618,
20.3318004608154, 20.8287792205811, 10.0042209625244, 12.7859420776367,
13.6358938217163, 15.9491415023804, 11.4823551177979, 18.6053276062012,
16.6047229766846, 16.1496143341064, 2.9492039680481, 13.8130388259888,
18.6300754547119, 14.464674949646, 4.92032289505005, 0.511945068836212,
3.16324853897095, 13.3062620162964, 9.84803581237793, 1.74625515937805,
2.54861640930176, 9.97869968414307, 11.2339553833008, 0.865878522396088,
14.7632684707642, 21.8330593109131, 6.42118740081787, 9.51691722869873,
13.2857227325439, 4.01672554016113, 10.9487056732178, 13.6308097839355,
4.69979858398438, 1.83490359783173), piC_2 = c(6.62732124328613,
5.59194660186768, 3.92186212539673, 0.962285339832306, 2.1002824306488,
4.99801731109619, 6.4822793006897, 2.94481801986694, 5.16082000732422,
11.2070302963257, 0.585842967033386, 4.83236265182495, 1.637331366539,
7.65087461471558, 2.28347945213318, 7.16115474700928, 3.54162955284119,
5.23653078079224, 2.28897953033447, 2.29887819290161, 0.752622723579407,
0.653791189193726, 1.5378258228302, 2.15203213691711, 1.64702248573303,
6.0682373046875, 0.22119003534317, 4.76900386810303, 0.366481363773346,
6.11435651779175, 10.8921070098877, 7.97591733932495, 6.05282688140869,
3.74584698677063, 5.75792741775513, 0.471727430820465, 2.75132250785828,
1.21862363815308, 0.138835281133652, 2.98711204528809, 0.627980709075928,
0.108154557645321, 0.995486855506897, 2.4163064956665, 0.0193456951528788,
5.70003795623779, 5.56746625900269, 2.9861011505127, 0.344279021024704,
0.640789806842804, 9.4457426071167, 7.05727958679199, 3.89853048324585,
0.340702921152115, 1.17963445186615, 8.93050575256348, 14.796028137207,
4.88054323196411, 9.28642845153809, 7.68382120132446, 2.27267980575562,
0.916118919849396, 0.689630210399628, 0.549197673797607, 1.68408465385437,
1.76007652282715, 3.2269868850708, 0.980833470821381, 5.00142002105713,
3.41616177558899, 6.74930334091187, 12.0952653884888, 15.2918863296509,
0.105648428201675, 4.59846162796021, 1.48986113071442, 5.02905178070068,
5.07208204269409, 4.98251914978027, 4.70810985565186, 2.37468719482422,
6.78730487823486, 6.18559217453003, 11.6090707778931, 2.91017484664917,
3.51590204238892, 3.35987615585327, 8.74919319152832, 2.23059439659119,
0.292922139167786, 5.41262531280518, 8.86936473846436, 8.20160961151123,
7.33296489715576, 8.42716407775879), piC_3 = c(5.84101867675781,
2.14856338500977, 0.774434208869934, 6.21446466445923, 2.25056719779968,
1.9200998544693, 2.52935075759888, 0.38894659280777, 1.98762917518616,
2.53701376914978, 6.93642854690552, 0.608367025852203, 4.7472562789917,
1.25435817241669, 4.09390258789062, 5.41882562637329, 0.221905186772346,
3.72868466377258, 0.763698220252991, 0.783569753170013, 8.32380294799805,
4.482017993927, 2.38237118721008, 10.7143220901489, 10.1253957748413,
4.51582384109497, 5.18871164321899, 1.76670265197754, 7.50785446166992,
6.2304630279541, 8.79040622711182, 7.47595691680908, 1.57976567745209,
1.46996772289276, 0.894773840904236, 1.30858862400055, 7.34649181365967,
1.41060519218445, 2.03947067260742, 4.6038031578064, 4.44245910644531,
0.236538723111153, 0.194929093122482, 0.684483885765076, 0.530747056007385,
1.89696133136749, 1.94861626625061, 3.36041831970215, 0.0835498198866844,
2.04665040969849, 7.02379274368286, 2.93551588058472, 5.33355855941772,
1.59516668319702, 2.19099020957947, 2.88170146942139, 7.42911052703857,
4.64155960083008, 2.24829292297363, 3.64715957641602, 0.363596022129059,
1.41882479190826, 0.474381387233734, 2.24125337600708, 4.11492681503296,
3.44695138931274, 3.08158445358276, 0.218709617853165, 2.44625425338745,
1.71628797054291, 1.75634157657623, 4.76044988632202, 0.387977868318558,
1.70636379718781, 1.70855867862701, 3.67641615867615, 0.744896650314331,
1.09648311138153, 1.37377882003784, 0.200171306729317, 1.4753475189209,
6.56762170791626, 7.72892284393311, 2.18395304679871, 0.481256455183029,
0.37385630607605, 4.25140476226807, 6.76727914810181, 4.81376981735229,
3.8882269859314, 2.90145373344421, 7.48540449142456, 9.90997123718262,
4.46362543106079, 5.19004011154175), piC_4 = c(0.765519082546234,
2.01459360122681, 1.4724348783493, 3.53503012657166, 3.23746180534363,
2.42439723014832, 2.89345812797546, 2.43676805496216, 3.13469624519348,
4.61154937744141, 4.51843070983887, 0.767921149730682, 5.01102733612061,
2.94891023635864, 5.20972728729248, 1.1311411857605, 2.22004199028015,
3.79573369026184, 0.551535904407501, 0.574182093143463, 5.87988710403442,
5.06349992752075, 3.72144675254822, 8.49415874481201, 4.27884483337402,
2.48057842254639, 4.45665884017944, 0.667030334472656, 6.93020153045654,
2.26927351951599, 1.5674192905426, 3.63813829421997, 2.73822736740112,
0.674351632595062, 1.89532685279846, 4.79139471054077, 1.34277474880219,
0.564522683620453, 3.33897042274475, 1.42253696918488, 2.7286331653595,
0.960368096828461, 2.00121903419495, 4.58775472640991, 2.11190366744995,
0.29313051700592, 0.0706640183925629, 2.87113666534424, 1.36242246627808,
3.57689785957336, 2.05132532119751, 0.340487778186798, 1.3506361246109,
0.400035679340363, 1.65728294849396, 5.17583227157593, 6.23331356048584,
1.60608506202698, 6.12336874008179, 0.46411395072937, 0.205161795020103,
1.93029391765594, 2.6833176612854, 0.199026927351952, 0.0609574876725674,
1.12770354747772, 1.49503016471863, 0.299944281578064, 0.302427768707275,
0.745285212993622, 2.91650176048279, 4.18865776062012, 2.71514081954956,
1.93356776237488, 1.67894613742828, 1.67655885219574, 3.09425163269043,
2.87126135826111, 2.42724895477295, 5.48751878738403, 3.4703311920166,
3.71456289291382, 4.29666662216187, 3.37810254096985, 3.07785415649414,
1.90873026847839, 3.57397627830505, 0.902793109416962, 3.96058869361877,
0.35958793759346, 2.9896719455719, 1.81924939155579, 4.22445392608643,
2.22684979438782, 4.53710412979126)), row.names = c(NA, -95L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), .Names = c("CellID_2p5", "Y_Coord_2p5Weighting",
"WallReg_2p5", "piC_1", "piC_2", "piC_3", "piC_4"), vars = "WallReg_2p5", drop = TRUE, indices = list(
0:4, 5:9, 10:14, 15:19, 20:24, 25:29, 30:34, 35:39, 40:44,
45:49, 50:54, 55:59, 60:64, 65:69, 70:74, 75:79, 80:84, 85:89,
90:94), group_sizes = c(5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), biggest_group_size = 5L, labels = structure(list(
WallReg_2p5 = c("African", "Amazonian", "Arctico-Siberian",
"Australian", "Chinese", "Eurasian", "Guineo-Congolian",
"Indo-Malayan", "Japanese", "Madagascan", "Mexican", "North American",
"Novozelandic", "Oriental", "Panamanian", "Papua-Melanesian",
"Saharo-Arabian", "South American", "Tibetan")), row.names = c(NA,
-19L), class = "data.frame", vars = "WallReg_2p5", drop = TRUE, .Names = "WallReg_2p5"))
What I am trying to do is to generate weighted values of all the piC_
columns for each region. The process for each column (x
) involves 3 steps:
- multiply each row in the
piC_x
column, by the values inY_Coord_2p5Weighting
- Sum the weighted
piC_x
values within each of theWallReg_2p5
groups - Divide the summed
piC_x
value, by the sum of the values inY_Coord_2p5Weighting
for each of theWallReg_2p5
groups
After some reading it appears that data.table
is faster on large datasets than dplyr
, but I am open to using either package, or even base r
.
I have attempted to do both, but am getting incorrect results when using data.table
, and I'm worried about the speed of dplyr
when I apply this to my full dataframe. Here is what I've tried so far
dplyr
df <- df %>% tbl_df() %>%
group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
weighted <- df %>%
mutate_at(.funs = funs(.*Y_Coord_2p5Weighting), .vars = vars(starts_with("piC_"))) %>% ## multiply by lat weight
mutate_at(.funs = funs(sum), .vars = vars(starts_with("piC_"))) %>% ## sum the weighted values
mutate_at(.funs = funs(./sum(Y_Coord_2p5Weighting)), .vars = vars(starts_with("piC_"))) ## divide weighted values by sum of weights
weighted %>% tbl_df %>% group_by(WallReg_2p5) %>% summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.82 8.82 8.82
2 Amazonian 10.1 10.1 10.1
3 Arctico-Siberian 14.5 14.5 14.5
4 Australian 8.21 8.21 8.21
5 Chinese 9.32 9.32 9.32
6 Eurasian 9.86 9.86 9.86
7 Guineo-Congolian 7.41 7.41 7.41
8 Indo-Malayan 13.4 13.4 13.4
9 Japanese 5.47 5.47 5.47
10 Madagascan 4.38 4.38 4.38
11 Mexican 6.10 6.10 6.10
12 North American 5.09 5.09 5.09
13 Novozelandic 14.6 14.6 14.6
14 Oriental 15.2 15.2 15.2
15 Panamanian 13.2 13.2 13.2
16 Papua-Melanesian 6.36 6.36 6.36
17 Saharo-Arabian 5.22 5.22 5.22
18 South American 13.2 13.2 13.2
19 Tibetan 7.01 7.01 7.01
Using dplyr
I get the correct values. However, when I use data.table
I get incorrect values. I have based my code on the question here, but clearly I'm doing something wrong.
data.table
df <- df %>% group_by(WallReg_2p5) %>%
as.data.table(.) %>% setkey(., WallReg_2p5)
is.data.table(df); haskey(df)
[1] TRUE
[1] TRUE
## same as above
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 8.83 4.70 15.1
2 Amazonian 10.1 5.56 15.6
3 Arctico-Siberian 13.9 2.95 21.6
4 Australian 8.14 3.44 12.3
5 Chinese 9.38 3.87 14.3
6 Eurasian 10.5 1.86 22.0
7 Guineo-Congolian 7.41 0.602 12.3
8 Indo-Malayan 13.3 6.22 22.0
9 Japanese 5.37 0.996 8.33
10 Madagascan 4.41 0.893 10.4
11 Mexican 6.03 3.31 8.11
12 North American 5.77 1.25 13.3
13 Novozelandic 14.1 6.72 20.8
14 Oriental 15.3 11.5 18.6
15 Panamanian 13.2 2.95 18.6
16 Papua-Melanesian 6.35 0.512 13.3
17 Saharo-Arabian 5.27 0.866 11.2
18 South American 13.2 6.42 21.8
19 Tibetan 7.03 1.83 13.6
# https://stackoverflow.com/q/28123098/1710632
indx <- grep("piC_", colnames(df))
for (j in indx) {
set(df, i = NULL, j = j, value = df[[j]]*df[["Y_Coord_2p5Weighting"]]) ## multiply by weights
set(df, i = NULL, j = j, value = sum(df[[j]])) ## sum the weighted values
set(df, i = NULL, j = j, value = df[[j]]/sum(df[["Y_Coord_2p5Weighting"]])) ## divide by sum of weights
}
## wrong values
df %>% tbl_df %>% group_by(WallReg_2p5) %>%
summarise(meanS = mean(piC_1), minS = min(piC_1), maxS = max(piC_1))
# A tibble: 19 x 4
WallReg_2p5 meanS minS maxS
<chr> <dbl> <dbl> <dbl>
1 African 9.27 9.27 9.27
2 Amazonian 9.27 9.27 9.27
3 Arctico-Siberian 9.27 9.27 9.27
4 Australian 9.27 9.27 9.27
5 Chinese 9.27 9.27 9.27
6 Eurasian 9.27 9.27 9.27
7 Guineo-Congolian 9.27 9.27 9.27
8 Indo-Malayan 9.27 9.27 9.27
9 Japanese 9.27 9.27 9.27
10 Madagascan 9.27 9.27 9.27
11 Mexican 9.27 9.27 9.27
12 North American 9.27 9.27 9.27
13 Novozelandic 9.27 9.27 9.27
14 Oriental 9.27 9.27 9.27
15 Panamanian 9.27 9.27 9.27
16 Papua-Melanesian 9.27 9.27 9.27
17 Saharo-Arabian 9.27 9.27 9.27
18 South American 9.27 9.27 9.27
19 Tibetan 9.27 9.27 9.27
Reading ?set()
, states that it cannot perform grouping operations, but I thought that as I had already defined my groups that this process would work. I've never used data.table
before, so any guidance would be much appreciated.