47

I have a data frame (all_data) in which I have a list of sites (1... to n) and their scores e.g.

  site  score
     1    10
     1    11  
     1    12
     4    10 
     4    11
     4    11
     8    9
     8    8
     8    7

I want create a column that numbers each level of site in numerical order, like a counter. In the example, the sites (1, 4, and 8) would be have a corresponding counter from 1 to 3 in the 'number' column:

site  score number
     1    10    1
     1    11    1 
     1    12    1 
     4    10    2
     4    11    2
     4    11    2
     8    9     3
     8    8     3 
     8    7     3

I am sure this must be easily solved, but I have not found a way yet.

Henrik
  • 65,555
  • 14
  • 143
  • 159
ThallyHo
  • 2,667
  • 5
  • 22
  • 19

13 Answers13

32

In the new dplyr 1.0.0 we can use cur_group_id() which gives a unique numeric identifier to a group.

library(dplyr)
df %>% group_by(site) %>% mutate(number = cur_group_id())

#  site score number
#  <int> <int>  <int>
#1     1    10      1
#2     1    11      1
#3     1    12      1
#4     4    10      2
#5     4    11      2
#6     4    11      2
#7     8     9      3
#8     8     8      3
#9     8     7      3

data

df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 8L, 8L, 8L), 
score = c(10L, 11L, 12L, 10L, 11L, 11L, 9L, 8L, 7L)), 
class = "data.frame", row.names = c(NA, -9L))
Ronak Shah
  • 377,200
  • 20
  • 156
  • 213
31

Try Data$number <- as.numeric(as.factor(Data$site))

On a sidenote : the difference between the solution of me and @Chase on one hand, and the one of @DWin on the other, is the ordering of the numbers. Both as.factor and factor will automatically sort the levels, whereas that doesn't happen in the solution of @DWin :

Dat <- data.frame(site = rep(c(1,8,4), each = 3), score = runif(9))

Dat$number <- as.numeric(factor(Dat$site))
Dat$sitenum <- match(Dat$site, unique(Dat$site) ) 

Gives

> Dat
  site     score number sitenum
1    1 0.7377561      1       1
2    1 0.3131139      1       1
3    1 0.7862290      1       1
4    8 0.4480387      3       2
5    8 0.3873210      3       2
6    8 0.8778102      3       2
7    4 0.6916340      2       3
8    4 0.3033787      2       3
9    4 0.6552808      2       3
Joris Meys
  • 106,551
  • 31
  • 221
  • 263
25

Two other options:

1) Using the .GRP function from the data.table package:

library(data.table)
setDT(dat)[, num := .GRP, by = site]

with the example dataset from below this results in:

> dat
    site      score num
 1:    1 0.14945795   1
 2:    1 0.60035697   1
 3:    1 0.94643075   1
 4:    8 0.68835336   2
 5:    8 0.50553372   2
 6:    8 0.37293624   2
 7:    4 0.33580504   3
 8:    4 0.04825135   3
 9:    4 0.61894754   3
10:    8 0.96144729   2
11:    8 0.65496051   2
12:    8 0.51029199   2

2) Using the group_indices function from dplyr:

dat$num <- group_indices(dat, site)

or when you want to work around non-standard evaluation:

library(dplyr)
dat %>% 
  mutate(num = group_indices_(dat, .dots = c('site')))

which results in:

   site      score num
1     1 0.42480366   1
2     1 0.98736177   1
3     1 0.35766187   1
4     8 0.06243182   3
5     8 0.55617002   3
6     8 0.20304632   3
7     4 0.90855921   2
8     4 0.25215078   2
9     4 0.44981251   2
10    8 0.60288270   3
11    8 0.46946587   3
12    8 0.44941782   3

As can be seen, dplyr gives a different order of the group numbers.


If you want another number every time the group changes, there are several other options:

1) with base R:

# option 1:
dat$num <- cumsum(c(TRUE, head(dat$site, -1) != tail(dat$site, -1)))

# option 2:
x <- rle(dat$site)$lengths
dat$num <- rep(seq_along(x), times=x)

2) with the data.table package:

library(data.table)
setDT(dat)[, num := rleid(site)]

which all result in:

> dat
   site      score num
1     1 0.80817855   1
2     1 0.07881334   1
3     1 0.60092828   1
4     8 0.71477988   2
5     8 0.51384565   2
6     8 0.72011650   2
7     4 0.74994627   3
8     4 0.09564052   3
9     4 0.39782587   3
10    8 0.29446540   4
11    8 0.61725367   4
12    8 0.97427413   4

Used data:

dat <- data.frame(site = rep(c(1,8,4,8), each = 3), score = runif(12))
Jaap
  • 81,064
  • 34
  • 182
  • 193
17

This should be fairly efficient and understandable:

Dat$sitenum <- match(Dat$site, unique(Dat$site))  
zx8754
  • 52,746
  • 12
  • 114
  • 209
IRTFM
  • 258,963
  • 21
  • 364
  • 487
12

Using the data from @Jaap, a different dplyr possibility using dense_rank() could be:

dat %>%
 mutate(ID = dense_rank(site))

   site     score ID
1     1 0.1884490  1
2     1 0.1087422  1
3     1 0.7438149  1
4     8 0.1150771  3
5     8 0.9978203  3
6     8 0.7781222  3
7     4 0.4081830  2
8     4 0.2782333  2
9     4 0.9566959  2
10    8 0.2545320  3
11    8 0.1201062  3
12    8 0.5449901  3

Or a rleid()-like dplyr approach, with the data arranged first:

dat %>%
 arrange(site) %>%
 mutate(ID = with(rle(site), rep(seq_along(lengths), lengths)))

   site     score ID
1     1 0.1884490  1
2     1 0.1087422  1
3     1 0.7438149  1
4     4 0.4081830  2
5     4 0.2782333  2
6     4 0.9566959  2
7     8 0.1150771  3
8     8 0.9978203  3
9     8 0.7781222  3
10    8 0.2545320  3
11    8 0.1201062  3
12    8 0.5449901  3

Or using duplicated() and cumsum():

df %>%
 mutate(ID = cumsum(!duplicated(site)))

The same with base R:

df$ID <- with(rle(df$site), rep(seq_along(lengths), lengths))

Or:

df$ID <- cumsum(!duplicated(df$site))
tmfmnk
  • 38,881
  • 4
  • 47
  • 67
3

You can turn site into a factor and then return the numeric or integer values of that factor:

dat <- data.frame(site = rep(c(1,4,8), each = 3), score = runif(9))
dat$number <- as.integer(factor(dat$site))
dat

  site     score number
1    1 0.5305773      1
2    1 0.9367732      1
3    1 0.1831554      1
4    4 0.4068128      2
5    4 0.3438962      2
6    4 0.8123883      2
7    8 0.9122846      3
8    8 0.2949260      3
9    8 0.6771526      3
Chase
  • 67,710
  • 18
  • 144
  • 161
2

Another solution using the data.table package.

Example with the more complete datset provided by Jaap:

setDT(dat)[, number := frank(site, ties.method = "dense")]
dat
    site     score number
 1:    1 0.3107920      1
 2:    1 0.3640102      1
 3:    1 0.1715318      1
 4:    8 0.7247535      3
 5:    8 0.1263025      3
 6:    8 0.4657868      3
 7:    4 0.6915818      2
 8:    4 0.3558270      2
 9:    4 0.3376173      2
10:    8 0.7934963      3
11:    8 0.9641918      3
12:    8 0.9832120      3
s_baldur
  • 29,441
  • 4
  • 36
  • 69
1

Another way to do it. That I think is easy to get even when you know little about R:

library(dplyr)
df <- data.frame('site' = c(1, 1, 1, 4, 4, 4, 8, 8, 8))
df <- mutate(df, 'number' = cumsum(site != lag(site, default=-1)))
godot
  • 1,550
  • 16
  • 33
1

I too recently needed a solution to this. Didn't find this thread, started mine and was re-directed here (thank you). Good to see many solutions but to me (and I feel is good practice), a scalable solution is important. Hence, benchmarked several solutions below.

df <- data.table(country = rep(c('a', 'b', 'b', 'c', 'c', 'c'), 1e7)
                 )

a <-
microbenchmark(factor = {df[, group_id := as.integer(factor(country))]}
               , unique_match = df[, group_id := match(country, unique(country))]
               , rle = df[ , group_id := with(rle(country), rep(seq_along(lengths), lengths))]
               , dup_cumsum = df[,  group_id := cumsum(!duplicated(country))]
               , frank = df[, group_id := frank(country, ties.method = "dense")]
               , GRP = df[, group_id := .GRP, country]
               , rleid = df[, group_id := rleid(country)]
               , cumsum_head_tail = df[, group_id := cumsum(c(TRUE, head(country, -1) != tail(country, -1)))]
               , times = 50
               )
autoplot(a)

Benchmark 50 times

It would appear the podium is held by data.table. Still, was great to learn of alternatives e.g. cumsum(!duplicated(country)). What a brainteaser!

Sweepy Dodo
  • 1,761
  • 9
  • 15
  • 1
    Please note that the result may differ between the "run-length" based solutions and the other ones. E.g. `x = c(2, 2, 4, 4, 2)`; `data.table::rleid(x)`; `as.numeric(as.factor(x))` – Henrik Oct 18 '21 at 21:52
  • @Henrik you are right. I looked into `rleid`'s help. Didn't seem to be a parameter for cases where the vector isn't ordered. Am I right in saying `rleid` requires vector to be sorted beforehand? – Sweepy Dodo Oct 19 '21 at 10:31
  • 2
    Well, the point with run length functions is to keep track of _consecutive_ runs. E.g. if you want to distinguish between the two runs of 2 in my tiny example, clearly you shouldn't sort the vector first. The choice of function depends on your objective. – Henrik Oct 19 '21 at 13:44
1

Using collapse::group, Fast Hash-Based Grouping:

library(collapse) 
d = data.frame(site = rep(c(1,8,4), each = 3))

settransform(d, number = group(site)) # settransform updates data by reference. See also ftransform.
d
#   site number
# 1    1      1
# 2    1      1
# 3    1      1
# 4    8      2
# 5    8      2
# 6    8      2
# 7    4      3
# 8    4      3
# 9    4      3

The collapse functions are considerably faster on larger data. Here I compare with two common base idioms (factor / as.integer; match / unique), and two data.table methods (.GRP; frank), using 1e5 groups with 1e3 rows each.

library(data.table)
library(microbenchmark)

nr = 1e3
ng = 1e5
set.seed(1)
d1 = data.table(g = sample(1:ng, nr*ng, replace = TRUE))
d2 = copy(d1)
d3 = copy(d1)
d4 = copy(d1)
d5 = copy(d1)

microbenchmark(
  factor = {d1[ , gi := as.integer(factor(g))]},
  unique_match = {d2[, gi := match(g, unique(g))]},
  frank = {d3[, gi := frank(g, ties.method = "dense")]},
  GRP = {d4[, gi := .GRP, by = g]},
  collap = {settransform(d5, gi = group(g))},
  times = 20L)

# Unit: milliseconds
#         expr        min         lq      mean    median         uq max
#       factor 46648.6099 48493.0146 49918.956 49336.208 51547.0789 53585.472
# unique_match 12662.0978 13057.3210 13534.391 13530.457 13998.0141 14407.036
#        frank  2628.4923  2695.7064  3240.522  2833.950  3797.5579 5547.227
#          GRP  2754.2153  3283.2444  3796.109  3717.239  4184.5174 5117.918
#       collap   640.1993   668.2301   729.351   698.307   753.2932 1086.592

# Check equality with data.table .GRP. Use as.vector to remove attributes
all.equal(d4$gi, as.vector(d5$gi))
# [1] TRUE

Note: group, .GRP and match / unique all create the group number according to order of appearance of the different values in the original data (also discussed in previous posts).

Henrik
  • 65,555
  • 14
  • 143
  • 159
0

If you want to keep your existing columns and assign back to the same data frame...

my_df <- my_df %>%
    select(everything()) %>% 
    group_by(geo) %>% 
    mutate(geo_id = cur_group_id())

And you can do multiple columns like so...

my_df <- my_df %>%
    select(everything()) %>% 
    group_by(geo) %>% 
    mutate(geo_id = cur_group_id()) %>% 
    group_by(state) %>% 
    mutate(state_id = cur_group_id()) %>% 
    group_by(name) %>% 
    mutate(name_id = cur_group_id())
Harley
  • 1,305
  • 1
  • 13
  • 28
0

If the numbers of the site column were unordered, we could use as_factor() in combination with fct_inorder() from the forcats package:

library(tibble)
library(dplyr)
library(forcats)
all_data_unordered <- tibble(site  = c(1,1,1,8,8,8,4,4,4),
                             score = c(10,11,12,10,11,11,9,8,7))

all_data_unordered |> 
  mutate(number = as_factor(site) |> fct_inorder() |> as.integer())
#> # A tibble: 9 × 3
#>    site score number
#>   <dbl> <dbl>  <int>
#> 1     1    10      1
#> 2     1    11      1
#> 3     1    12      1
#> 4     8    10      2
#> 5     8    11      2
#> 6     8    11      2
#> 7     4     9      3
#> 8     4     8      3
#> 9     4     7      3

Created on 2021-11-05 by the reprex package (v2.0.1)

jwarz
  • 481
  • 2
  • 9
0

Since dplyr 1.1.0, another option is consecutive_id:

library(dplyr)
df %>% 
  mutate(id = consecutive_id(site))

#   site score id
# 1    1    10  1
# 2    1    11  1
# 3    1    12  1
# 4    4    10  2
# 5    4    11  2
# 6    4    11  2
# 7    8     9  3
# 8    8     8  3
# 9    8     7  3

Note that consecutive_id, like data.table::rleid but unlike cur_group_id or as.numeric(factor(.)) will return an ID for consecutive values, meaning that if the same value appears not consecutively, it'll create a new id.

df <- structure(list(site = c(1L, 1L, 1L, 4L, 4L, 4L, 1L, 1L, 1L)), 
                class = "data.frame", row.names = c(NA, -9L))

df %>% 
  mutate(cons_id = consecutive_id(site)) %>% 
  group_by(site) %>% 
  mutate(cur_group_id = cur_group_id())

#   site cons_id cur_group_id
# 1    1       1            1
# 2    1       1            1
# 3    1       1            1
# 4    4       2            2
# 5    4       2            2
# 6    4       2            2
# 7    1       3            1
# 8    1       3            1
# 9    1       3            1
Maël
  • 45,206
  • 3
  • 29
  • 67