Hive SQL視窗函式實現頁面統計(以騰雲天下頁面訪問為例)
阿新 • • 發佈:2019-01-06
埋點資料欄位為:
userid,at,sid,pid分別表示使用者id,訪問時間,sessionId(用於區分一次啟動),頁面id
表名為beacon(注意:下文的SQL均從名為tmp的表查詢,頁面id欄位在SQL中寫作page)
所有資料均為模擬資料
(以下每行依序為:訪問時間 使用者id sessionId 頁面id)
2018-07-04 11:46:37 2856 efda26adec1c3eb8 h_01
2018-07-04 11:46:47 2856 efda26adec1c3eb8 h_03
2018-07-04 11:46:54 2856 efda26adec1c3eb8 h_02
2018-07-04 11:47:04 2856 efda26adec1c3eb8 h_02
2018-07-04 11:47:39 2856 efda26adec1c3eb8 h_04
2018-07-04 11:47:39 2856 efda26adec1c3eb8 h_09
2018-07-04 11:47:39 2856 efda26adec1c3eb8 h_01
2018-07-04 11:47:39 2856 efda26adec1c3eb8 h_03
2018-07-04 11:48:40 2856 efda26adec1c3eb8 h_07
2018-07-04 12:48:13 2856 b975601de0e1c2fc h_01
2018-07-04 12:48:40 2856 b975601de0e1c2fc h_03
2018-07-04 12:49:07 2856 b975601de0e1c2fc h_02
2018-07-04 12:49:52 2856 b975601de0e1c2fc h_07
2018-07-04 12:50:02 2856 5f52c96c52c98367 h_01
2018-07-04 12:50:47 2823 5f52c96c52c98367 h_03
2018-07-04 12:51:09 2823 5f52c96c52c98367 h_02
由於埋點方式的限制,無法統計到每次啟動(session)中最後一個頁面的停留時間
最終視覺化效果為如下圖所示
頁面停留時間:
需要按sid分組後,訪問時間從小到大排序,後一條時間減去前一條時間為上一條資料裡頁面的停留時間,故需要用到lead函式
1.求頁面受訪人數,頁面受訪(次數|比率)
-- Daily page traffic: one row per (visit date, page).
--   pv = number of tracking events, uv = number of distinct users.
-- `date` is a reserved keyword in Hive 1.2+, so the alias is back-quoted;
-- the output column is still named date.
select
    to_date(at)            as `date`,
    page                   as p,
    count(1)               as pv,
    count(distinct userid) as uv
from tmp
group by
    to_date(at),
    page
結果如下
比率:需要每個頁面的pv/總的pv,這裡用視窗函式sum() over()
-- Daily page traffic plus each page's share of the page views of the same day.
-- The denominator window is partitioned by date: the rows are per (date, page),
-- so an unpartitioned sum() over() would divide a single day's pv by the total
-- of all days as soon as the table holds more than one day.
-- `date` is a reserved keyword in Hive 1.2+, hence the back-quotes.
select
    t.`date`,
    t.p,
    t.uv,
    t.pv,
    round(t.pv / sum(t.pv) over (partition by t.`date`), 3) as pv_rate
from (
    select
        to_date(at)            as `date`,
        page                   as p,
        count(1)               as pv,
        count(distinct userid) as uv
    from tmp
    group by
        to_date(at),
        page
) t
結果如下:
2.求受訪總時長佔比,平均停留時間(使用lead函式)
-- Pair every tracking event with the page and the timestamp of the event that
-- follows it in the same session (sid), ordered by visit time.
-- The last event of a session has no successor and receives the sentinel
-- values 'endpage' / 'endat'.
-- NOTE(review): events of one session that share the same timestamp have no
-- defined relative order here, so their nextpage may vary between runs.
-- `date` is a reserved keyword in Hive 1.2+, hence the back-quotes.
select
    to_date(at) as `date`,
    page        as p,
    lead(page, 1, 'endpage') over (partition by sid order by unix_timestamp(at)) as nextpage,
    at,
    lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
from tmp;
結果如下:
接下來求所有頁面的停留時長,並過濾掉兩類資料:每次啟動的最後一個頁面(下個頁面為endpage),以及當前頁面與下個頁面相同的資料
受訪總時長佔比為:每個頁面總的訪問時長/所有頁面總的訪問時間
-- Dwell time per page, computed with window functions.
--   avglen = total seconds spent on the page that day / number of measured visits
--   rate   = total seconds spent on the page that day / total seconds of all pages that day
-- A visit is measured as (time of next event - time of this event); the last
-- event of a session and events followed by the same page are filtered out.
-- The page-level windows are partitioned by (date, page), not by page alone,
-- so that figures of different days are not mixed into one another.
-- Window functions do not collapse rows: the result still has one row per
-- measured visit, each carrying the same values for its (date, page).
-- `date` is a reserved keyword in Hive 1.2+, hence the back-quotes.
select
    p.`date` as `date`,
    p.p      as page,
    round(
        sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`, p.p)
        / count(1) over (partition by p.`date`, p.p),
        3
    ) as avglen,
    round(
        sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`, p.p)
        / sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`),
        3
    ) as rate
from (
    select
        to_date(at) as `date`,
        page        as p,
        lead(page, 1, 'endpage') over (partition by sid order by unix_timestamp(at)) as nextpage,
        at,
        lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
    from tmp
) p
where p.p != p.nextpage
  and p.nextpage != 'endpage'
結果如下:
因為視窗函式(over)不會合併資料列,同一頁面的每一列都帶有相同的計算結果,故再用group by去重一下
-- Dwell time per day and page, one row per (date, page).
--   avg  = average seconds per measured visit of the page on that day
--   rate = the page's share of the total measured seconds of that day
-- The inner query computes the figures with window functions, which repeats
-- them on every visit row; the outer group by collapses those duplicates.
-- The page-level windows are partitioned by (date, page), not by page alone,
-- so that figures of different days are not mixed into one another.
-- `date` is a reserved keyword in Hive 1.2+, hence the back-quotes.
select
    n.`date` as `date`,
    n.page   as p,
    n.avglen as avg,
    n.rate   as rate
from (
    select
        p.`date` as `date`,
        p.p      as page,
        round(
            sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`, p.p)
            / count(1) over (partition by p.`date`, p.p),
            3
        ) as avglen,
        round(
            sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`, p.p)
            / sum(unix_timestamp(p.nextat) - unix_timestamp(p.at)) over (partition by p.`date`),
            3
        ) as rate
    from (
        select
            to_date(at) as `date`,
            page        as p,
            lead(page, 1, 'endpage') over (partition by sid order by unix_timestamp(at)) as nextpage,
            at,
            lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
        from tmp
    ) p
    -- Keep only visits whose duration can be measured: not the last event of
    -- a session, and not followed by the same page.
    where p.p != p.nextpage
      and p.nextpage != 'endpage'
) n
group by
    n.`date`,
    n.page,
    n.avglen,
    n.rate
結果如下:
3.求離開應用的比率(離開率:某頁面是該次啟動最後一個頁面的次數/該頁面的受訪次數)
-- Leave rate per day and page: among the views of a page that are not
-- immediately followed by the same page, the share that were the last event
-- of their session (their next page is the 'end' sentinel).
-- `date` is a reserved keyword in Hive 1.2+ and `time` is reserved in newer
-- Hive releases, so the former is back-quoted and the inner alias is named
-- event_time instead of time. Output columns are unchanged: date, p, lrate.
select
    to_date(browsepath.event_time) as `date`,
    browsepath.p                   as p,
    round(
        sum(case when browsepath.nextpage = 'end' then 1 else 0 end) / count(1),
        3
    ) as lrate
from (
    select
        at   as event_time,
        page as p,
        lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage
    from tmp
) browsepath
where browsepath.p != browsepath.nextpage
group by
    to_date(browsepath.event_time),
    browsepath.p
結果如下:
4.求頁面走向(每個頁面的下一個頁面及其佔比)
-- Next-page distribution per day and page.
-- Output column l is a list of '<next page>_<share>' strings, where share is
-- the fraction of the page's onward transitions of that day going to that
-- next page. Session-ending views and views followed by the same page are
-- not counted as transitions.
-- The share window is partitioned by (date, page), not by page alone, so the
-- shares of one day sum to 1 even when the table holds several days.
-- `date` is a reserved keyword in Hive 1.2+ and `time` is reserved in newer
-- Hive releases, so the former is back-quoted and the inner alias is named
-- event_time instead of time.
select
    j.`date` as `date`,
    j.p      as p,
    collect_list(concat_ws('_', j.nextpage, j.rate)) as l
from (
    select
        b.`date`   as `date`,
        b.p        as p,
        b.nextpage as nextpage,
        cast(b.c / sum(b.c) over (partition by b.`date`, b.p) as string) as rate
    from (
        -- Number of transitions for every (date, page, next page).
        select
            to_date(browsepath.event_time) as `date`,
            browsepath.p                   as p,
            browsepath.nextpage            as nextpage,
            count(1)                       as c
        from (
            select
                at   as event_time,
                page as p,
                lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage
            from tmp
        ) browsepath
        where browsepath.p != browsepath.nextpage
          and browsepath.nextpage != 'end'
        group by
            to_date(browsepath.event_time),
            browsepath.p,
            browsepath.nextpage
    ) b
) j
group by
    j.`date`,
    j.p
結果如下:
接下來就是把sql join一下:
-- Final report, one row per (date, page):
--   uv    distinct users
--   pv    '<page views>_<share of the day's page views>'
--   rate  the page's share of the day's measured dwell time
--   avg   average dwell time in seconds per measured visit
--   lrate leave rate
--   path  list of '<next page>_<share>' strings
--
-- Changes compared with joining the step queries as they are:
--   * The traffic set (pu) drives the report and the other sets are LEFT
--     joined. With inner joins, a page whose dwell time can never be measured
--     (for example a page that is only ever the last page of a session) was
--     dropped from the report although its uv / pv / lrate are known; its
--     rate / avg / path are now NULL instead.
--   * Every per-page window is partitioned by (date, page) and the pv share
--     by date, so figures of different days are not mixed.
--   * The session ordering of tmp is written once (page_flow) instead of
--     being repeated in every branch.
--   * `date` is a reserved keyword in Hive 1.2+, hence the back-quotes.
with
-- Every tracking event paired with the page / time of the next event of the
-- same session; 'end' / 'endat' mark the last event of a session.
page_flow as (
    select
        to_date(at) as `date`,
        page        as p,
        at,
        lead(page, 1, 'end') over (partition by sid order by unix_timestamp(at)) as nextpage,
        lead(at, 1, 'endat') over (partition by sid order by unix_timestamp(at)) as nextat
    from tmp
),
-- Step 1: traffic per day and page.
pu as (
    select
        t.`date` as `date`,
        t.p      as p,
        concat_ws(
            '_',
            cast(t.pv as string),
            cast(round(t.pv / sum(t.pv) over (partition by t.`date`), 3) as string)
        )        as pv,
        t.uv     as uv
    from (
        select
            to_date(at)            as `date`,
            page                   as p,
            count(1)               as pv,
            count(distinct userid) as uv
        from tmp
        group by
            to_date(at),
            page
    ) t
),
-- Step 2: dwell time per day and page. Only visits followed by a different
-- page of the same session can be measured.
len as (
    select
        d.`date` as `date`,
        d.p      as p,
        round(d.total_len / d.visits, 3) as avg,
        round(d.total_len / sum(d.total_len) over (partition by d.`date`), 3) as rate
    from (
        select
            f.`date` as `date`,
            f.p      as p,
            sum(unix_timestamp(f.nextat) - unix_timestamp(f.at)) as total_len,
            count(1) as visits
        from page_flow f
        where f.p != f.nextpage
          and f.nextpage != 'end'
        group by
            f.`date`,
            f.p
    ) d
),
-- Step 3: leave rate per day and page.
leave_rate as (
    select
        f.`date` as `date`,
        f.p      as p,
        round(sum(case when f.nextpage = 'end' then 1 else 0 end) / count(1), 3) as lrate
    from page_flow f
    where f.p != f.nextpage
    group by
        f.`date`,
        f.p
),
-- Step 4: next-page distribution per day and page.
browse as (
    select
        j.`date` as `date`,
        j.p      as p,
        collect_list(concat_ws('_', j.nextpage, j.rate)) as l
    from (
        select
            b.`date`   as `date`,
            b.p        as p,
            b.nextpage as nextpage,
            cast(b.c / sum(b.c) over (partition by b.`date`, b.p) as string) as rate
        from (
            select
                f.`date`   as `date`,
                f.p        as p,
                f.nextpage as nextpage,
                count(1)   as c
            from page_flow f
            where f.p != f.nextpage
              and f.nextpage != 'end'
            group by
                f.`date`,
                f.p,
                f.nextpage
        ) b
    ) j
    group by
        j.`date`,
        j.p
),
-- Leave rate combined with the next-page list. Every (date, page) of browse
-- also passes the weaker filter of leave_rate, so a left join from
-- leave_rate loses nothing compared with a full join and never yields NULL keys.
lr as (
    select
        leave_rate.`date` as `date`,
        leave_rate.p      as p,
        leave_rate.lrate  as lrate,
        browse.l          as path
    from leave_rate
    left join browse
        on leave_rate.`date` = browse.`date`
       and leave_rate.p = browse.p
)
select
    pu.`date`,
    pu.p,
    pu.uv,
    pu.pv,
    len.rate,
    len.avg,
    lr.lrate,
    lr.path
from pu
left join lr
    on pu.`date` = lr.`date`
   and pu.p = lr.p
left join len
    on pu.`date` = len.`date`
   and pu.p = len.p;
這就ok啦,有不足的地方歡迎大家評論!