
Problem:

I'm having trouble getting just one record per category in Databricks. For example, I would like to get a single row for each of the entry_id values in the sample data below.

Sample data: (screenshot omitted; the same rows are created by the SQL below)

Almost what I want (result screenshot omitted):

How do I get just the last row for each category? There could be multiple identical records and/or multiple records with different values; this noise in the data needs to be eliminated by selecting exactly one record per entry_id.

The complete SQL is below:

-- * * *
--
-- INIT THE SESSION
--
-- * * *

create database if not exists test;
use test;
set spark.sql.legacy.timeParserPolicy = LEGACY;
set spark.sql.legacy.parquet.datetimeRebaseModeInWrite = LEGACY;
select concat('using: ', 'test') as message;


drop table if exists test;
create table test as (
select 'a' a, 'b' b, 'c' c, 'd' d, 'e' e, 'f' f, 0 entry_id
union all
select 'this','is','NOT','foo','bar','bat', 1
union all
select 'this','is','NOT','john','paul','george', 1 
union all
select 'this','is','the','data','i','want', 1
union all
select null, 'wrong', null, null, null, null, 2
union all
select null, null, 'wrong', null, null, null, 3
union all
select null, null, null, 'wrong', null, null, 4
union all
select null, null, null, null, 'wrong', null, 5
union all
select null, null, null, null, null, 'wrong', 6
union all
select null, null, null, null, null, null, 7
union all
select 'b', 'wrong', null, null, null, null, 8
union all
select 'c', 'wrong', null, null, null, null, 9
);

select * from test order by entry_id;


drop table if exists max_a;

create table max_a as (
  select
    entry_id,
    max(a) a
  from 
    test
  group by 1
  order by 1
);

select * from max_a;



select
  test.*
from
  max_a join test on max_a.a = test.a
order by test.entry_id
;

select
  test.*
from
  max_a join test on max_a.a = test.a
where test.entry_id = 1
;
asked by John

2 Answers


Databricks Runtime 10.0 and above supports QUALIFY.

I ordered the ROW_NUMBER() window by NULL, which keeps an arbitrary row per entry_id; you could change the ORDER BY to whatever you like (see the deterministic variant after the result below).

select  * 
from    test
qualify row_number() over (partition by entry_id order by null) = 1
a     b      c      d      e      f       entry_id
a     b      c      d      e      f       0
this  is     NOT    foo    bar    bat     1
null  wrong  null   null   null   null    2
null  null   wrong  null   null   null    3
null  null   null   wrong  null   null    4
null  null   null   null   wrong  null    5
null  null   null   null   null   wrong   6
null  null   null   null   null   null    7
b     wrong  null   null   null   null    8
c     wrong  null   null   null   null    9
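
If you need the pick to be deterministic rather than arbitrary, order the window by a real column. A minimal sketch, assuming c stands in for whatever column reflects your actual load order (the sample data has no explicit ordering column):

-- deterministic variant (assumption: c encodes the ordering you care about)
select  * 
from    test
qualify row_number() over (partition by entry_id order by c desc nulls last) = 1

With this particular sample, ordering by c desc happens to keep the 'this is the data i want' row for entry_id = 1; in real data, use whatever column encodes "last".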

P.S.

There are much nicer ways in Spark to create an ad hoc dataset:

STACK

with t(a,b,c,d,e,f,entry_id)
as
(
    select  stack
            (
                 12 -- number of rows
                 
                ,'a'    ,'b'     ,'c'     ,'d'      ,'e'     ,'f'       ,0
                ,'this' ,'is'    ,'NOT'   ,'foo'    ,'bar'   ,'bat'     ,1
                ,'this' ,'is'    ,'NOT'   ,'john'   ,'paul'  ,'george'  ,1 
                ,'this' ,'is'    ,'the'   ,'data'   ,'i'     ,'want'    ,1
                ,null   ,'wrong' , null   ,null     ,null    ,null      ,2
                ,null   ,null    ,'wrong' ,null     ,null    ,null      ,3
                ,null   ,null    ,null    ,'wrong'  ,null    ,null      ,4
                ,null   ,null    ,null    ,null     ,'wrong' ,null      ,5
                ,null   ,null    ,null    ,null     ,null    ,'wrong'   ,6
                ,null   ,null    ,null    ,null     ,null    ,null      ,7
                ,'b'    ,'wrong' ,null    ,null     ,null    ,null      ,8
                ,'c'    ,'wrong' ,null    ,null     ,null    ,null      ,9
            )
)
select  * 
from    t
qualify row_number() over (partition by entry_id order by null) = 1
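
For context, stack() emits generic column names (col0, col1, ...), which is why the CTE's column list t(a,b,c,d,e,f,entry_id) is there to rename them. A minimal two-column sketch with made-up values:

-- stack(2, ...) folds the literal list into 2 rows; columns come out as col0, col1
select  stack(2, 'x', 1, 'y', 2)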

VALUES

with t(a,b,c,d,e,f,entry_id)
as
(
    select  * 
    from   values   ('a'    ,'b'     ,'c'     ,'d'      ,'e'     ,'f'       ,0)
                   ,('this' ,'is'    ,'NOT'   ,'foo'    ,'bar'   ,'bat'     ,1)
                   ,('this' ,'is'    ,'NOT'   ,'john'   ,'paul'  ,'george'  ,1) 
                   ,('this' ,'is'    ,'the'   ,'data'   ,'i'     ,'want'    ,1)
                   ,(null   ,'wrong' , null   ,null     ,null    ,null      ,2)
                   ,(null   ,null    ,'wrong' ,null     ,null    ,null      ,3)
                   ,(null   ,null    ,null    ,'wrong'  ,null    ,null      ,4)
                   ,(null   ,null    ,null    ,null     ,'wrong' ,null      ,5)
                   ,(null   ,null    ,null    ,null     ,null    ,'wrong'   ,6)
                   ,(null   ,null    ,null    ,null     ,null    ,null      ,7)
                   ,('b'    ,'wrong' ,null    ,null     ,null    ,null      ,8)
                   ,('c'    ,'wrong' ,null    ,null     ,null    ,null      ,9)
           
)
select  * 
from    t
qualify row_number() over (partition by entry_id order by null) = 1
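
VALUES can also be aliased inline in the FROM clause, with no CTE at all. A minimal sketch with made-up rows:

-- the inline alias t(a, entry_id) names the columns directly
select  * 
from    values ('x', 1), ('y', 2) as t(a, entry_id)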
answered by David דודו Markovitz

I ended up doing something like the following:

select distinct
  test.*
from (
  select
    test.*,
    row_number() over (partition by entry_id, a order by c) as row_number
  from
    test
  order by entry_id, row_number
) test
join max_row mx on 1=1
  and mx.entry_id = test.entry_id 
  and mx.max_row_number = test.row_number
;

(result screenshot omitted)
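
The max_row table isn't defined anywhere in the post; judging by the join, it presumably holds the largest row_number per entry_id. An assumed sketch of what it might look like (not the poster's actual definition):

-- assumed helper table: highest row_number per entry_id,
-- used above to join back and keep only the "last" row
create table max_row as (
  select
    entry_id,
    max(rn) as max_row_number
  from (
    select
      entry_id,
      row_number() over (partition by entry_id, a order by c) as rn
    from
      test
  ) ranked
  group by entry_id
);

The QUALIFY approach in the other answer reaches the same goal in a single statement.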

answered by John