-- 创建表
drop table if exists table1;
CREATE TABLE table1 (
user_id INT,
login_date STRING,
source STRING
)
;
-- 插入数据
INSERT INTO table1 VALUES
(1,'2024/7/1','fb'),
(1,'2024/7/2','fb'),
(1,'2024/7/5','ig'),
(1,'2024/7/10','fb'),
(1,'2024/7/15','fb'),
(1,'2024/7/15','fb'),
(2,'2024/7/13','ig'),
(2,'2024/7/13','ig'),
(2,'2024/7/15','ytb'),
(2,'2024/7/15','ytb'),
(2,'2024/7/16','ytb');
/* 首先,使用窗口函数按照用户 ID 和登录日期排序,为每个登录记录分配一个行号 */
DROP TABLE IF EXISTS TEMP_1;
CREATE TABLE TEMP_1 AS
SELECT USER_ID,SOURCE,to_date(from_unixtime(unix_timestamp(login_date, 'yyyy/MM/dd'))) login_DATE
,row_number() OVER (PARTITION BY user_id ORDER BY to_date(from_unixtime(unix_timestamp(login_date, 'yyyy/MM/dd'))) asc ) AS login_order
from table1
;
/* 确定分组*/
DROP TABLE IF EXISTS TEMP_2;
CREATE TABLE TEMP_2 AS
select user_id,
source,
login_date,
-- 根据相邻行的 source 是否相同来分组
sum(grp_flag) OVER (PARTITION BY user_id ORDER BY login_date) AS grp
from (
SELECT user_id,
source,
login_date,
-- 如果当前行的 source 和上一行不同,则标记为 1,否则为 0
case when lag(source) over (PARTITION BY user_id ORDER BY login_DATE) is null or source!= lag(source) over (PARTITION BY user_id ORDER BY login_date) then 1 else 0 end AS grp_flag
FROM temp_1
) t
;
/* 3.最后,根据分组计数连续登录次数和登录顺序*/
DROP TABLE IF EXISTS TEMP_3;
CREATE TABLE TEMP_3 AS
SELECT user_id,
source,
grp,
min_date,
-- 连续访问次数
count(*) OVER (PARTITION BY user_id,source,grp) AS cnt
-- 连续访问顺序
-- row_number() OVER (PARTITION BY user_id ORDER BY min_date ASC) AS rk
FROM (
select user_id
, source
, grp
, min(login_date) over (partition by user_id,source,grp order by login_date asc) as min_date
from temp_2
) t
;
/* 输出结果*/
drop table if exists temp_4;
create table temp_4 as
select user_id,source,cnt,min_date
-- 连续访问顺序
,row_number() OVER (PARTITION BY user_id ORDER BY min_date ASC) AS rk
from temp_3
group by user_id,source,grp,min_date,cnt
order by user_id,min_date;
select * from temp_4;