Well, I work it around using two temp tables:
drop table if exists administrator_tmp1;
drop table if exists administrator_tmp2;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
--review_administrator
CREATE TABLE if not exists review_administrator(
admin_id bigint ,
admin_name string,
create_time string,
email string ,
password string,
status_description string,
token string ,
expire_time string ,
granter_user_id bigint ,
admin_time string ,
effect_start_date string ,
effect_end_date string
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;
--tmp1 is used for saving origin data
CREATE TABLE if not exists administrator_tmp1(
admin_id bigint ,
admin_name string,
create_time string,
email string ,
password string ,
status_description string ,
token string ,
expire_time string ,
granter_user_id bigint ,
admin_time string ,
effect_start_date string ,
effect_end_date string
)
partitioned by (current_row_indicator string comment 'current, expired:')
stored as parquet;
--tmp2 saving the scd data
CREATE TABLE if not exists administrator_tmp2(
admin_id bigint ,
admin_name string,
create_time string,
email string ,
password string ,
status_description string ,
token string ,
expire_time string ,
granter_user_id bigint ,
admin_time string ,
effect_start_date string ,
effect_end_date string
)
partitioned by (current_row_indicator string comment 'current, expired')
stored as parquet;
--insert origin data into tmp1
INSERT OVERWRITE TABLE administrator_tmp1 PARTITION(current_row_indicator)
SELECT
user_id as admin_id,
name as admin_name,
time as create_time,
email as email,
password as password,
status as status_description,
token as token,
expire_time as expire_time,
admin_id as granter_user_id,
admin_time as admin_time,
'{{ ds }}' as effect_start_date,
'9999-12-31' as effect_end_date,
'current' as current_row_indicator
FROM
ks_db_origin.gifshow_administrator_origin
;
--insert scd data into tmp2
--for the data unchanged
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
t2.admin_id,
t2.admin_name,
t2.create_time,
t2.email,
t2.password,
t2.status_description,
t2.token,
t2.expire_time,
t2.granter_user_id,
t2.admin_time,
t2.effect_start_date,
t2.effect_end_date as effect_end_date,
t2.current_row_indicator
FROM
administrator_tmp1 t1
INNER JOIN
(
SELECT * FROM review_administrator
WHERE current_row_indicator = 'current'
) t2
ON
t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
;
--for the data changed , update the effect_end_date
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
t2.admin_id,
t2.admin_name,
t2.create_time,
t2.email,
t2.password,
t2.status_description,
t2.token,
t2.expire_time,
t2.granter_user_id,
t2.admin_time,
t2.effect_start_date as effect_start_date,
'{{ yesterday_ds }}' as effect_end_date,
'expired' as current_row_indicator
FROM
administrator_tmp1 t1
INNER JOIN
(
SELECT * FROM review_administrator
WHERE current_row_indicator = 'current'
) t2
ON
t1.admin_id = t2.admin_id
WHERE NOT
(
t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
)
;
--for the changed data and the new data
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
t1.admin_id,
t1.admin_name,
t1.create_time,
t1.email,
t1.password,
t1.status_description,
t1.token,
t1.expire_time,
t1.granter_user_id,
t1.admin_time,
t1.effect_start_date,
t1.effect_end_date,
t1.current_row_indicator
FROM
administrator_tmp1 t1
LEFT OUTER JOIN
(
SELECT * FROM review_administrator
WHERE current_row_indicator = 'current'
) t2
ON
t1.admin_id = t2.admin_id
AND t1.admin_name = t2.admin_name
AND t1.create_time = t2.create_time
AND t1.email = t2.email
AND t1.password = t2.password
AND t1.status_description = t2.status_description
AND t1.token = t2.token
AND t1.expire_time = t2.expire_time
AND t1.granter_user_id = t2.granter_user_id
AND t1.admin_time = t2.admin_time
WHERE t2.admin_id IS NULL
;
--for the data already marked by 'expired'
INSERT INTO TABLE administrator_tmp2 PARTITION(current_row_indicator)
SELECT
t1.admin_id,
t1.admin_name,
t1.create_time,
t1.email,
t1.password,
t1.status_description,
t1.token,
t1.expire_time,
t1.granter_user_id,
t1.admin_time,
t1.effect_start_date,
t1.effect_end_date,
t1.current_row_indicator
FROM
review_administrator t1
WHERE t1.current_row_indicator = 'expired'
;
--populate the dim table
INSERT OVERWRITE TABLE review_administrator PARTITION(current_row_indicator)
SELECT
t1.admin_id,
t1.admin_name,
t1.create_time,
t1.email,
t1.password,
t1.status_description,
t1.token,
t1.expire_time,
t1.granter_user_id,
t1.admin_time,
t1.effect_start_date,
t1.effect_end_date,
t1.current_row_indicator
FROM
administrator_tmp2 t1
;
--drop the two temp table
drop table administrator_tmp1;
drop table administrator_tmp2;
-- --example data
-- --2017-01-01
-- insert into table review_administrator PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-01','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --2017-01-02
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a01@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-02','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --2017-01-03
-- --id 1 is changed
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '1','a','2016-12-31','a03@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --id 2 is not changed at all
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '2','b','2016-12-31','a@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --id 3 is a new record
-- insert into table administrator_tmp1 PARTITION(current_row_indicator)
-- SELECT '3','c','2016-12-31','c@ks.com','password','open','token1','2017-12-31',
-- 0,'2017-12-31','2017-01-03','9999-12-31','current'
-- FROM default.sample_07 limit 1;
-- --now dim table will show you the right SCD.