0

I really thought I had a solution to this problem when I learned about the additional parameter 'sort' that was defaulted to TRUE in merge(). However, setting this to false did not help. Below is a demo of my code, with the results I am getting and the results I want:

df2 = structure(list(player = c("Marvin Williams", "Spencer Hawes", 
"Jeremy Lin", "Kemba Walker", "P.J. Hairston", "Rudy Gay", "Rajon Rondo", 
"DeMarcus Cousins", "Ben McLemore", "Willie Cauley-Stein"), global.player.id = c(263884L, 
329824L, 340730L, 462980L, 609567L, 266358L, 262882L, 509450L, 
604898L, 699950L), team.name = c("Hornets", "Hornets", "Hornets", 
"Hornets", "Grizzlies", "Kings", "Kings", "Kings", "Kings", "Kings"
)), .Names = c("player", "global.player.id", "team.name"), class = "data.frame", row.names = c(47L, 
48L, 52L, 53L, 225L, 389L, 390L, 395L, 398L, 401L))

df1 = structure(list(global.player.id = c(-1L, 262882L, 266358L, 509450L, 
604898L, 699950L, 263884L, 329824L, 340730L, 462980L, 609567L, 
-1L, 262882L, 266358L, 509450L, 604898L, 699950L, 263884L, 329824L, 
340730L, 462980L, 609567L, -1L, 262882L, 266358L), x_loc = c(47.17753, 
13.57165, 46.45843, 26.68803, 52.16717, 47.20201, 60.097, 47.20201, 
52.16717, 65.1302, 46.45843, 47.19141, 13.61702, 46.5355, 26.71856, 
52.25433, 47.27324, 60.08215, 47.27324, 52.25433, 65.11267, 46.5355, 
46.82163, 13.66478, 46.57545), y_loc = c(26.44326, 25.18298, 
18.46573, 25.48557, 33.09177, 31.09372, 22.79717, 31.09372, 33.09177, 
26.39671, 18.46573, 26.5187, 25.17431, 18.42014, 25.53807, 33.11185, 
31.01197, 22.76307, 31.01197, 33.11185, 26.40227, 18.42014, 26.72834, 
25.17784, 18.35961), order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)), .Names = c("global.player.id", 
"x_loc", "y_loc", "order"), row.names = c("1", "2", "3", "4", 
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", 
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25"), class = "data.frame")

The above are the dataframes I am working with. I want to keep the order of df1 when i merge df2 onto it. I am dealing with time series data here, so the order of the dataframe is important. The order column in df1 is just to test whether df1 is being shuffled or not (I do not want to use extra code to sort on order after the merge).

Here is what I've tried:

merge(df1, df2, by = 'global.player.id', all.x = TRUE)

   global.player.id    x_loc    y_loc order              player team.name
1                -1 47.17753 26.44326     1                <NA>      <NA>
2                -1 46.82163 26.72834    23                <NA>      <NA>
3                -1 47.19141 26.51870    12                <NA>      <NA>
4            262882 13.57165 25.18298     2         Rajon Rondo     Kings
5            262882 13.61702 25.17431    13         Rajon Rondo     Kings
6            262882 13.66478 25.17784    24         Rajon Rondo     Kings
7            263884 60.08215 22.76307    18     Marvin Williams   Hornets
8            263884 60.09700 22.79717     7     Marvin Williams   Hornets
9            266358 46.53550 18.42014    14            Rudy Gay     Kings
10           266358 46.45843 18.46573     3            Rudy Gay     Kings
11           266358 46.57545 18.35961    25            Rudy Gay     Kings
12           329824 47.27324 31.01197    19       Spencer Hawes   Hornets
13           329824 47.20201 31.09372     8       Spencer Hawes   Hornets
14           340730 52.16717 33.09177     9          Jeremy Lin   Hornets
15           340730 52.25433 33.11185    20          Jeremy Lin   Hornets
16           462980 65.13020 26.39671    10        Kemba Walker   Hornets
17           462980 65.11267 26.40227    21        Kemba Walker   Hornets
18           509450 26.71856 25.53807    15    DeMarcus Cousins     Kings
19           509450 26.68803 25.48557     4    DeMarcus Cousins     Kings
20           604898 52.16717 33.09177     5        Ben McLemore     Kings
21           604898 52.25433 33.11185    16        Ben McLemore     Kings
22           609567 46.53550 18.42014    22       P.J. Hairston Grizzlies
23           609567 46.45843 18.46573    11       P.J. Hairston Grizzlies
24           699950 47.20201 31.09372     6 Willie Cauley-Stein     Kings
25           699950 47.27324 31.01197    17 Willie Cauley-Stein     Kings

Originally in df1, order was sorted 1-25, and now it is all out of order. Clearly df1 was shuffled in a way I didn't want it to be. Here's the output when I pass sort = FALSE to the merge function:

merge(df1, df2, by = 'global.player.id', all.x = TRUE, sort = FALSE)

global.player.id x_loc y_loc order player team.name

1            262882 13.57165 25.18298     2         Rajon Rondo     Kings
2            262882 13.61702 25.17431    13         Rajon Rondo     Kings
3            262882 13.66478 25.17784    24         Rajon Rondo     Kings
4            266358 46.53550 18.42014    14            Rudy Gay     Kings
5            266358 46.45843 18.46573     3            Rudy Gay     Kings
6            266358 46.57545 18.35961    25            Rudy Gay     Kings
7            509450 26.71856 25.53807    15    DeMarcus Cousins     Kings
8            509450 26.68803 25.48557     4    DeMarcus Cousins     Kings
9            604898 52.16717 33.09177     5        Ben McLemore     Kings
10           604898 52.25433 33.11185    16        Ben McLemore     Kings
11           699950 47.20201 31.09372     6 Willie Cauley-Stein     Kings
12           699950 47.27324 31.01197    17 Willie Cauley-Stein     Kings
13           263884 60.08215 22.76307    18     Marvin Williams   Hornets
14           263884 60.09700 22.79717     7     Marvin Williams   Hornets
15           329824 47.27324 31.01197    19       Spencer Hawes   Hornets
16           329824 47.20201 31.09372     8       Spencer Hawes   Hornets
17           340730 52.16717 33.09177     9          Jeremy Lin   Hornets
18           340730 52.25433 33.11185    20          Jeremy Lin   Hornets
19           462980 65.13020 26.39671    10        Kemba Walker   Hornets
20           462980 65.11267 26.40227    21        Kemba Walker   Hornets
21           609567 46.53550 18.42014    22       P.J. Hairston Grizzlies
22           609567 46.45843 18.46573    11       P.J. Hairston Grizzlies
23               -1 47.17753 26.44326     1                <NA>      <NA>
24               -1 46.82163 26.72834    23                <NA>      <NA>
25               -1 47.19141 26.51870    12                <NA>      <NA>

Also not what I wanted, as order is all out of order again.

Is there anyway to call the merge function without completely shuffling the first dataframe parameter passed, or am I completely out of luck. This seems like a major downfall of the merge() function if so. Thanks!

Canovice
  • 9,012
  • 22
  • 93
  • 211

1 Answers1

2

You could use join from plyr

library(plyr)
plyr::join(df1,df2, by='global.player.id')

The result is not sorted.

discipulus
  • 2,665
  • 3
  • 34
  • 51