3

I have a table structured as below (apologies for quite long example of data, but couldn't reproduce the issue with smaller no. of rows):

CREATE TABLE example_tbl (
  id CHAR(1),
  pid INT,
  sid INT, 
  tid VARCHAR(10),
  other_columns INT
)

INSERT INTO example_tbl (id, pid, sid, tid, other_columns)
VALUES 
('c', 157, 85, 'A32', 1),
 ('c', 157, 85, 'A32', 2),
 ('c', 157, 85, 'A32', 3),
 ('c', 157, 85, 'A32', 4),
 ('c', 157, 85, 'A32', 5),
 ('c', 157, 85, 'A32', 6),
 ('c', 157, 85, 'A32', 7),
 ('c', 157, 85, 'A32', 8),
 ('c', 157, 85, 'A32', 9),
 ('c', 157, 85, 'A32', 10),
 ('c', 157, 85, 'A32', 11),
 ('Q', 157, 81, 'A62', 1),
 ('Q', 157, 81, 'A62', 2),
 ('Q', 157, 81, 'A62', 3),
 ('Q', 157, 81, 'A62', 4),
 ('Q', 157, 81, 'A62', 5),
 ('Q', 157, 81, 'A62', 6),
 ('Q', 157, 81, 'A62', 7),
 ('Q', 157, 81, 'A62', 8),
 ('Q', 157, 81, 'A62', 9),
 ('Q', 157, 81, 'A62', 10),
 ('Q', 157, 81, 'A62', 11),
 ('f', 598, 51, 'A62', NULL),
 ('w', 598, 49, 'A32', 9),
 ('Z', 598, 44, 'A62', NULL),
 ('r', 598, 16, 'A32', 10),
 ('O', 118, 93, 'A32', 1),
 ('G', 118, 38, 'A32', 4),
 ('U', 118, 90, 'A32', 1),
 ('U', 118, 90, 'A32', 2),
 ('U', 118, 90, 'A32', 3),
 ('U', 118, 90, 'A32', 4),
 ('U', 118, 90, 'A32', 5),
 ('U', 118, 90, 'A32', 6),
 ('U', 118, 90, 'A32', 7),
 ('U', 118, 90, 'A32', 8),
 ('U', 118, 90, 'A32', 9),
 ('U', 118, 90, 'A32', 10),
 ('U', 118, 90, 'A32', 11),
 ('m', 118, 37, 'A62', 1),
 ('J', 118, 54, 'A32', 20),
 ('a', 118, 59, 'A32', 11),
 ('s', 118, 18, 'A62', 8),
 ('y', 118, 33, 'A62', NULL),
 ('N', 118, 79, 'A62', NULL),
 ('l', 118, 35, 'A32', 9),
 ('n', 118, 63, 'A32', 5),
 ('R', 118, 86, 'A62', 1),
 ('R', 118, 86, 'A62', 2),
 ('R', 118, 86, 'A62', 3),
 ('R', 118, 86, 'A62', 4),
 ('R', 118, 86, 'A62', 5),
 ('R', 118, 86, 'A62', 6),
 ('R', 118, 86, 'A62', 7),
 ('R', 118, 86, 'A62', 8),
 ('R', 118, 86, 'A62', 9),
 ('R', 118, 86, 'A62', 10),
 ('R', 118, 86, 'A62', 11),
 ('H', 118, 23, 'A32', 1),
 ('H', 118, 23, 'A32', 2),
 ('H', 118, 23, 'A32', 3),
 ('H', 118, 23, 'A32', 4),
 ('H', 118, 23, 'A32', 5),
 ('H', 118, 23, 'A32', 6),
 ('H', 118, 23, 'A32', 7),
 ('H', 118, 23, 'A32', 8),
 ('H', 118, 23, 'A32', 9),
 ('H', 118, 23, 'A32', 10),
 ('H', 118, 23, 'A32', 11),
 ('B', 118, 43, 'A62', 39),
 ('h', 118, 60, 'A62', NULL),
 ('p', 118, 72, 'A32', 1),
 ('v', 118, 22, 'A32', 5),
 ('I', 118, 89, 'A62', 9),
 ('T', 118, 17, 'A62', 1),
 ('F', 118, 41, 'A32', 10),
 ('z', 118, 55, 'A32', 6),
 ('Y', 118, 75, 'A32', NULL),
 ('u', 118, 48, 'A62', 9),
 ('x', 783, 27, 'A32', 10),
 ('V', 783, 11, 'A62', 8),
 ('i', 783, 61, 'A62', 1),
 ('i', 783, 61, 'A62', 2),
 ('i', 783, 61, 'A62', 3),
 ('i', 783, 61, 'A62', 4),
 ('i', 783, 61, 'A62', 5),
 ('i', 783, 61, 'A62', 6),
 ('i', 783, 61, 'A62', 7),
 ('i', 783, 61, 'A62', 8),
 ('i', 783, 61, 'A62', 9),
 ('i', 783, 61, 'A62', 10),
 ('i', 783, 61, 'A62', 11);

The id column defines the event. I'd like add a column with id of previous event within a group defined by tid. The data should be ordered according to pid (custom order where 157 < 598 < 118 < 783) and sid. The tricky part is that some events have 11 rows and some have only 1 row. The other_columns represents all the other columns that are there and should be preserved.

I tried to use a LAG() window function with precalculated offset as below:

WITH example_tbl_with_offset AS (
    SELECT * FROM example_tbl
    LEFT JOIN (SELECT id, COUNT(id)::int AS lag_offset
               FROM example_tbl
               GROUP BY id) AS offset_tbl
    USING (id)
)
SELECT
    *, 
    LAG(id, lag_offset) OVER (PARTITION BY tid
                              ORDER BY (CASE 
                                WHEN pid = 157 THEN 1
                                WHEN pid = 598 THEN 2
                                WHEN pid = 118 THEN 3
                                WHEN pid = 783 THEN 4
                              END, sid
    )) AS prev_id 
FROM example_tbl_with_offset;

However, although the prev_id column seems valid for the first couple of events, it eventually loses track.

Expected output is attached below:

CREATE TABLE expected_output (
  id CHAR(1),
  pid INT,
  sid INT, 
  tid VARCHAR(10),
  prev_id CHAR(1)
)

INSERT INTO expected_output (id, pid, sid, tid, prev_id)
VALUES 
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('Q', 157, 81, 'A62', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('c', 157, 85, 'A32', NULL),
 ('r', 598, 16, 'A32', 'c'),
 ('Z', 598, 44, 'A62', 'Q'),
 ('w', 598, 49, 'A32', 'r'),
 ('f', 598, 51, 'A62', 'Z'),
 ('T', 118, 17, 'A62', 'f'),
 ('s', 118, 18, 'A62', 'T'),
 ('v', 118, 22, 'A32', 'w'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('H', 118, 23, 'A32', 'v'),
 ('y', 118, 33, 'A62', 's'),
 ('l', 118, 35, 'A32', 'H'),
 ('m', 118, 37, 'A62', 'y'),
 ('G', 118, 38, 'A32', 'I'),
 ('F', 118, 41, 'A32', 'G'),
 ('B', 118, 43, 'A62', 'm'),
 ('u', 118, 48, 'A62', 'B'),
 ('J', 118, 54, 'A32', 'F'),
 ('z', 118, 55, 'A32', 'J'),
 ('a', 118, 59, 'A32', 'z'),
 ('h', 118, 60, 'A62', 'u'),
 ('n', 118, 63, 'A32', 'a'),
 ('p', 118, 72, 'A32', 'n'),
 ('Y', 118, 75, 'A32', 'p'),
 ('N', 118, 79, 'A62', 'h'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('R', 118, 86, 'A62', 'N'),
 ('I', 118, 89, 'A62', 'R'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('U', 118, 90, 'A32', 'Y'),
 ('O', 118, 93, 'A32', 'U'),
 ('V', 783, 11, 'A62', 'I'),
 ('x', 783, 27, 'A32', 'O'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V'),
 ('i', 783, 61, 'A62', 'V');
 

The actual logic is probably easier to grasp with the following ordering:

SELECT * FROM expected_output
ORDER BY tid, CASE 
            WHEN pid = 157 THEN 1
            WHEN pid = 598 THEN 2
            WHEN pid = 118 THEN 3
            WHEN pid = 783 THEN 4
          END, sid;

@Edit: My PostgreSQL version is 9.5.

Erwin Brandstetter
  • 605,456
  • 145
  • 1,078
  • 1,228
jakes
  • 1,964
  • 3
  • 18
  • 50

3 Answers3

3

One of the major new features of Postgres 11 was advertised like this in the release notes:

  • Window functions now support all framing options shown in the SQL:2011 standard, including RANGE distance PRECEDING/FOLLOWING, GROUPS mode, and frame exclusion options

Using this, there is a solution with a single window function in a single SELECT:

SELECT *, first_value(id) OVER (PARTITION BY tid
                                ORDER BY CASE pid
                                            WHEN 157 THEN 1
                                            WHEN 598 THEN 2
                                            WHEN 118 THEN 3
                                            WHEN 783 THEN 4
                                         END, sid
                                GROUPS BETWEEN 1 PRECEDING AND 1 PRECEDING
                                ) AS prev_id
FROM   example_tbl;

db<>fiddle here

Should perform nicely, too.
Works for your extended test setup as well, but not for Postgres older than version 11.

The manual:

In GROUPS mode, the offset again must yield a non-null, non-negative integer, and the option means that the frame starts or ends the specified number of peer groups before or after the current row's peer group, where a peer group is a set of rows that are equivalent in the ORDER BY ordering.

The chosen frame definition GROUPS BETWEEN 1 PRECEDING AND 1 PRECEDING does exactly what you need.

Your description and test data seem to imply that sid and id change in lockstep. So the window frame only contains a single id value. min() or max() would work, too, but I picked first_value() as the cheapest. And because it works with any data type, even where min() or max() would fail (like json).

Also, use the shorter (and slightly cheaper) "simple" or "switched" CASE. See:

For Postgres 9.5

This works in your outdated Postgres version 9.5:

SELECT e.*, p.prev_id
FROM   example_tbl e
JOIN  (
   SELECT *
        , lag(id) OVER (PARTITION BY tid
                        ORDER BY CASE pid
                                    WHEN 157 THEN 1
                                    WHEN 598 THEN 2
                                    WHEN 118 THEN 3
                                    WHEN 783 THEN 4
                                 END, sid) AS prev_id
   FROM  (
      SELECT DISTINCT ON (tid, pid, sid)
             tid, pid, sid, id
      FROM   example_tbl
      ) dist
   ) p USING (tid, pid, sid);

db<>fiddle here

This is assuming that tid, pid, sid are defined NOT NULL. Else, you need to do more.

  1. Derive a table with distinct combinations of (tid, pid, sid). Again, id is just a payload column. About DISTINCT ON:

  2. Now, lag() with a default window frame does the job to get the id from the "previous" row.

  3. Join to the full table.

Erwin Brandstetter
  • 605,456
  • 145
  • 1,078
  • 1,228
  • Wow, that's super neat and useful! One question though: the command runs on my local, playground database, but on the other, external database it returns `ERROR: syntax error at or near "GROUPS"` - does it mean that this external database is not compatible with an appropriate version of PostgreSQL (>=11) and therefore GROUPS syntax is not recognized there? How can I check which version of PostgreSQL the external database I'm connecting to is running? I'm wondering if that's the source of the issue or I have some syntax error indeed. – jakes Jan 13 '21 at 07:29
  • You can run `SELECT version();` and get it – Tarun Lalwani Jan 13 '21 at 08:58
  • Thanks @TarunLalwani. Unfortunately, my Postgres version is 9.5.10. I'll update the topic accordingly. – jakes Jan 13 '21 at 09:10
  • @jakes: I added a solution for Postgres 9.5. Please remember to disclose major software versions with all your questions. Two notes about Postgres 9.5.10 - if you have any influence on that: 1.: Always upgrade to the latest point release, which is 9.5.24 at the time of writing. 2.: But consider upgrading to a current version instead, as [Postgres 9.5 reaches EOL in Feb 2021](https://www.postgresql.org/support/versioning/). – Erwin Brandstetter Jan 13 '21 at 13:13
  • Many thanks! It's probably too much to ask (and I accepted your answer already anyway), but if you have some time - could you see and possibly explain why my initial query goes wrong? I can't work it out – jakes Jan 14 '21 at 16:22
  • @jakes: you count IDs per group in your CTE `example_tbl_with_offset` for later use as offset in `LAG(id, lag_offset)`. But that can only work for the latest row in each group of peers. You would have to decrement that offset for every earlier row accordingly. (More complex and probably more expensive overall.) Also, there is no assertion that the column `id` is unique like that. The count might sum multiple groups of peers. You would have to count per `(tid, pid, sid)` if so ... – Erwin Brandstetter Jan 14 '21 at 16:38
1

This is working, hope somebody will find a better solution

select m.* from (
select *, lag(id,1) over ( PARTITION BY tid
order by (CASE 
                                WHEN pid = 157 THEN 1
                                WHEN pid = 598 THEN 2
                                WHEN pid = 118 THEN 3
                                WHEN pid = 783 THEN 4
                              END, sid
    )
) from ( select distinct * from example_tbl et )a )
m,
example_tbl n
where m.id = n.id and m.pid = n.pid and m.sid = n.sid and m.tid = n.tid
order by (CASE 
                                WHEN m.pid = 157 THEN 1
                                WHEN m.pid = 598 THEN 2
                                WHEN m.pid = 118 THEN 3
                                WHEN m.pid = 783 THEN 4
                              END, m.sid
    )
detzu
  • 701
  • 6
  • 12
  • Thanks for that. Unfortunately, I can't really use distinct + lag(id, 1) as my original table has more columns and select distinct doesn't return unique row per each pid x sid x tid combination. – jakes Jan 13 '21 at 06:55
  • No problem, use group by on the 4 columns instead of `select distinct * from example_tbl` use `select id, pid, sid, tid from example_tbl group by id, pid, sid, tid ` – detzu Jan 13 '21 at 08:32
  • Worth noting that this would have worked for the original question. – Erwin Brandstetter Jan 13 '21 at 13:29
1

Similar solution:

with 
  a as (
 select distinct *, 
 case
    WHEN pid = 157 THEN 1
    WHEN pid = 598 THEN 2
    WHEN pid = 118 THEN 3
    WHEN pid = 783 THEN 4
  end as cat
 from example_tbl
),
 b as (
select id, cat,
  lag(id, 1) over(partition by tid order by cat, sid) prev_id
from a
)
select c.*, b.prev_id
from example_tbl c
left join b using (id)
order by b.cat,c.sid
timothyzhang
  • 730
  • 9
  • 12
  • Thanks for that. Unfortunately, I can't really use distinct + lag(id, 1) as my original table has more columns and select distinct doesn't return unique row per each pid x sid x tid combination. – jakes Jan 13 '21 at 06:55
  • Sure, I think Erwin's solution is perfect as well. – timothyzhang Jan 13 '21 at 08:39
  • `left join b using (id)` might have failed, as `id` is not defined unique in the question. (The column name seems misleading.) – Erwin Brandstetter Jan 13 '21 at 13:32