0

I'm trying to find the R procedure that is equivalent to Google Sheets' Pivot Group Rule. That is, I want to summarize that data by discretizing a numerical column with a fixed interval size that I decide on.

I am almost getting the desired output, but am having a trouble with the "(a,b]" interval notation.

Example

df <- 
  data.frame(
     num_col = c(1400,9000,15000,17350,20000,22000,
                 25000,40000,42000,45000,50000,60000,65000,70000,75000,
                 1e+05,120000,125000,150000,168000,180000,2e+05,225000,
                 250000,270000,290000,3e+05,350000,4e+05,427000,450000,5e+05,
                 550000,560000,6e+05,625000,650000,7e+05,750000,8e+05,
                 850000,9e+05,913000,930000,950000,990000,1e+06,1066167,
                 1100000,1200000,1250000,1300000,1400000,1420000,1500000,
                 1700000,1750000,1800000,1900000,1950000,2e+06,2100000,
                 2300000,2400000,2450000,2500000,3e+06,3150000,3200000,
                 3300000,3400000,3440000,3500000,3660000,3800000,3850000,
                 4e+06,4400000,4500000,4600000,4700000,4800000,4900000,5e+06,
                 5500000,6e+06,6400000,6500000,6600000,6800000,6900000,
                 7e+06,7200000,7217600,7400000,7500000,7700000,8e+06,
                 8200000,8495000,8500000,8700000,8900000,9e+06,9200000,9500000,
                 9600000,1e+07,10500000,10818775,1.1e+07,11500000,
                 1.2e+07,12500000,12620000,1.3e+07,13200000,13400000,13500000,
                 1.4e+07,14500000,14800000,1.5e+07,1.6e+07,1.7e+07,17500000,
                 1.8e+07,18026148,18500000,1.9e+07,19500000,19800000,
                 19900000,2e+07,2.1e+07,2.2e+07,22500000,2.3e+07,2.4e+07,
                 2.5e+07,25500000,2.6e+07,2.7e+07,27220000,2.8e+07,2.9e+07,
                 3e+07,30300000,3.1e+07,31500000,3.2e+07,32500000,3.3e+07,
                 3.4e+07,3.5e+07,3.6e+07,3.7e+07,3.8e+07,38600000,3.9e+07,
                 39200000,4e+07,4.1e+07,4.2e+07,4.3e+07,4.4e+07,44500000,
                 4.5e+07,4.6e+07,4.7e+07,4.8e+07,4.9e+07,49900000,5e+07,
                 50100000,50200000,5.2e+07,5.3e+07,5.5e+07,5.6e+07,5.7e+07,
                 5.8e+07,58800000,6e+07,6.1e+07,6.3e+07,6.5e+07,6.6e+07,
                 6.8e+07,68005000,6.9e+07,7e+07,7.3e+07,7.4e+07,7.5e+07,
                 7.6e+07,7.8e+07,7.9e+07,8e+07,81200000,8.2e+07,8.4e+07,
                 8.5e+07,8.8e+07,9e+07,9.2e+07,9.3e+07,9.4e+07,9.5e+07,
                 9.9e+07,1e+08,1.02e+08,1.03e+08,1.05e+08,1.08e+08,1.1e+08,
                 1.12e+08,1.15e+08,1.17e+08,1.2e+08,1.25e+08,1.27e+08,
                 1.3e+08,1.32e+08,1.35e+08,1.4e+08,1.44e+08,1.45e+08,1.5e+08,
                 1.55e+08,1.6e+08,1.65e+08,1.7e+08,1.75e+08,1.76e+08,
                 1.78e+08,1.8e+08,1.85e+08,1.9e+08,1.95e+08,2e+08,2.09e+08,
                 2.1e+08,2.15e+08,2.2e+08,2.25e+08,2.3e+08,2.45e+08,2.5e+08,
                 2.6e+08,263700000,6e+08),
         val = c(1,1,1,1,2,1,1,1,1,1,4,3,1,2,2,
                 8,1,4,4,1,1,7,1,11,1,1,6,2,2,1,3,21,1,1,3,
                 1,3,1,3,1,1,3,1,1,2,1,24,1,6,8,1,3,2,1,13,
                 1,1,4,1,1,22,3,1,1,1,13,27,1,2,3,2,1,12,1,1,
                 1,20,2,3,1,2,1,1,44,2,12,1,4,1,1,1,21,1,1,1,
                 3,1,15,1,1,5,1,1,8,1,2,1,43,1,1,11,1,24,2,
                 1,15,1,1,2,8,1,1,34,9,16,1,15,1,1,6,1,1,1,55,
                 3,11,1,4,5,40,1,9,3,1,14,3,38,1,3,1,7,1,2,
                 3,34,5,6,6,1,1,1,38,1,6,1,3,1,8,1,1,1,1,1,
                 25,1,1,3,1,11,1,1,5,1,18,4,1,12,2,4,1,2,11,1,
                 2,9,1,2,2,14,1,1,1,5,1,9,2,1,1,5,1,16,1,1,
                 3,1,8,1,2,1,8,7,1,8,1,8,4,1,6,14,2,4,6,8,4,
                 1,2,3,2,5,2,12,1,1,2,1,3,1,2,6,1,1,1)
)

look at the data

tibble::as_tibble(df)
#> # A tibble: 252 x 2
#>    num_col   val
#>      <dbl> <dbl>
#>  1    1400     1
#>  2    9000     1
#>  3   15000     1
#>  4   17350     1
#>  5   20000     2
#>  6   22000     1
#>  7   25000     1
#>  8   40000     1
#>  9   42000     1
#> 10   45000     1
#> # ... with 242 more rows

desired output

desired_output <-
  tibble::tribble(
        ~num_col_interval,  ~val_sum,
           "0 - 49999999", 962L,
    "50000000 - 99999999", 164L,
  "100000000 - 149999999",  78L,
  "150000000 - 199999999",  53L,
  "200000000 - 249999999",  23L,
  "250000000 - 299999999",   8L,
  "600000000 - 649999999",   1L
  )

My attempt

library(dplyr)
library(ggplot2)

df |> 
  group_by(num_col_interval = ggplot2::cut_interval(num_col, length = 50000000 - 1, dig.lab = 10)) |> 
  summarise(across(val, sum))
#> # A tibble: 7 x 2
#>   num_col_interval        val
#>   <fct>                 <dbl>
#> 1 [0,49999999]            962
#> 2 (49999999,99999998]     164
#> 3 (99999998,149999997]     78
#> 4 (149999997,199999996]    53
#> 5 (199999996,249999995]    23
#> 6 (249999995,299999994]     8
#> 7 (599999988,649999987]     1

You can see that the interval boundaries overlap. In the first row, it ranges 0 to 49999999, and in the second row, it ranges 49999999 to 99999998. I do understand the difference between ] and ( in the breaks notation. Nevertheless, I wish the ranges in the num_col_interval column to be as in desired_output.

How can I programatically format the num_col_interval values to be as in desired_output?

I'm mostly looking for a straightforward dplyr solution.


Here's how I would do it with Google Sheets, getting the desired output: gsheets


Several SO posts are relevant, but none of them answered my question:

Emman
  • 3,695
  • 2
  • 20
  • 44

1 Answers1

1

Try this:

library(tidyverse)

df <- 
  data.frame(
    num_col = c(1400,9000,15000,17350,20000,22000,
                25000,40000,42000,45000,50000,60000,65000,70000,75000,
                1e+05,120000,125000,150000,168000,180000,2e+05,225000,
                250000,270000,290000,3e+05,350000,4e+05,427000,450000,5e+05,
                550000,560000,6e+05,625000,650000,7e+05,750000,8e+05,
                850000,9e+05,913000,930000,950000,990000,1e+06,1066167,
                1100000,1200000,1250000,1300000,1400000,1420000,1500000,
                1700000,1750000,1800000,1900000,1950000,2e+06,2100000,
                2300000,2400000,2450000,2500000,3e+06,3150000,3200000,
                3300000,3400000,3440000,3500000,3660000,3800000,3850000,
                4e+06,4400000,4500000,4600000,4700000,4800000,4900000,5e+06,
                5500000,6e+06,6400000,6500000,6600000,6800000,6900000,
                7e+06,7200000,7217600,7400000,7500000,7700000,8e+06,
                8200000,8495000,8500000,8700000,8900000,9e+06,9200000,9500000,
                9600000,1e+07,10500000,10818775,1.1e+07,11500000,
                1.2e+07,12500000,12620000,1.3e+07,13200000,13400000,13500000,
                1.4e+07,14500000,14800000,1.5e+07,1.6e+07,1.7e+07,17500000,
                1.8e+07,18026148,18500000,1.9e+07,19500000,19800000,
                19900000,2e+07,2.1e+07,2.2e+07,22500000,2.3e+07,2.4e+07,
                2.5e+07,25500000,2.6e+07,2.7e+07,27220000,2.8e+07,2.9e+07,
                3e+07,30300000,3.1e+07,31500000,3.2e+07,32500000,3.3e+07,
                3.4e+07,3.5e+07,3.6e+07,3.7e+07,3.8e+07,38600000,3.9e+07,
                39200000,4e+07,4.1e+07,4.2e+07,4.3e+07,4.4e+07,44500000,
                4.5e+07,4.6e+07,4.7e+07,4.8e+07,4.9e+07,49900000,5e+07,
                50100000,50200000,5.2e+07,5.3e+07,5.5e+07,5.6e+07,5.7e+07,
                5.8e+07,58800000,6e+07,6.1e+07,6.3e+07,6.5e+07,6.6e+07,
                6.8e+07,68005000,6.9e+07,7e+07,7.3e+07,7.4e+07,7.5e+07,
                7.6e+07,7.8e+07,7.9e+07,8e+07,81200000,8.2e+07,8.4e+07,
                8.5e+07,8.8e+07,9e+07,9.2e+07,9.3e+07,9.4e+07,9.5e+07,
                9.9e+07,1e+08,1.02e+08,1.03e+08,1.05e+08,1.08e+08,1.1e+08,
                1.12e+08,1.15e+08,1.17e+08,1.2e+08,1.25e+08,1.27e+08,
                1.3e+08,1.32e+08,1.35e+08,1.4e+08,1.44e+08,1.45e+08,1.5e+08,
                1.55e+08,1.6e+08,1.65e+08,1.7e+08,1.75e+08,1.76e+08,
                1.78e+08,1.8e+08,1.85e+08,1.9e+08,1.95e+08,2e+08,2.09e+08,
                2.1e+08,2.15e+08,2.2e+08,2.25e+08,2.3e+08,2.45e+08,2.5e+08,
                2.6e+08,263700000,6e+08),
    val = c(1,1,1,1,2,1,1,1,1,1,4,3,1,2,2,
            8,1,4,4,1,1,7,1,11,1,1,6,2,2,1,3,21,1,1,3,
            1,3,1,3,1,1,3,1,1,2,1,24,1,6,8,1,3,2,1,13,
            1,1,4,1,1,22,3,1,1,1,13,27,1,2,3,2,1,12,1,1,
            1,20,2,3,1,2,1,1,44,2,12,1,4,1,1,1,21,1,1,1,
            3,1,15,1,1,5,1,1,8,1,2,1,43,1,1,11,1,24,2,
            1,15,1,1,2,8,1,1,34,9,16,1,15,1,1,6,1,1,1,55,
            3,11,1,4,5,40,1,9,3,1,14,3,38,1,3,1,7,1,2,
            3,34,5,6,6,1,1,1,38,1,6,1,3,1,8,1,1,1,1,1,
            25,1,1,3,1,11,1,1,5,1,18,4,1,12,2,4,1,2,11,1,
            2,9,1,2,2,14,1,1,1,5,1,9,2,1,1,5,1,16,1,1,
            3,1,8,1,2,1,8,7,1,8,1,8,4,1,6,14,2,4,6,8,4,
            1,2,3,2,5,2,12,1,1,2,1,3,1,2,6,1,1,1)
  )

df |> 
  group_by(num_col_interval = cut_width(num_col, width = 50000000, 
                                        dig.lab = 10, closed = "left", boundary = 0)) |> 
  summarise(across(val, sum)) |> 
  separate(num_col_interval, into = c("left", "right"), sep = ",") |> 
  mutate(across(-val, parse_number),
         right = if_else(right < max(right), right - 1L, right),
         across(-val, ~ format(., scientific = FALSE)),
         val = as.integer(val)) |> 
  unite(num_col_interval, left:right, sep = " - ")
#> # A tibble: 7 × 2
#>   num_col_interval          val
#>   <chr>                   <int>
#> 1 "        0 -  49999999"   962
#> 2 " 50000000 -  99999999"   164
#> 3 "100000000 - 149999999"    78
#> 4 "150000000 - 199999999"    53
#> 5 "200000000 - 249999999"    23
#> 6 "250000000 - 299999999"     8
#> 7 "550000000 - 600000000"     1

Created on 2022-12-18 with reprex v2.0.2

Carl
  • 4,232
  • 2
  • 12
  • 24