1. 程式人生 > >hive影評練習

hive影評練習

現有如此三份資料:
1、users.dat 資料格式為: 2::M::56::16::70072,
共有6040條資料
對應欄位為:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
對應欄位中文解釋:使用者id,性別,年齡,職業,郵政編碼
2、movies.dat 資料格式為: 2::Jumanji (1995)::Adventure|Children's|Fantasy,
共有3883條資料
對應欄位為:MovieID BigInt, Title String, Genres String
對應欄位中文解釋:電影ID,電影名字,電影型別
3、ratings.dat 資料格式為: 1::1193::5::978300760,
共有1000209條資料
對應欄位為:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
對應欄位中文解釋:使用者ID,電影ID,評分,評分時間戳
題目要求
  資料要求:
    (1)寫shell指令碼清洗資料。(hive不支援解析多位元組的分隔符,也就是說hive只能解析':', 不支援解析'::',所以用普通方式建表來使用是行不通的,要求對資料做一次簡單清洗)
    (2)使用Hive能解析的方式進行
  Hive要求:
    (1)正確建表,匯入資料(三張表,三份資料),並驗證是否正確
    (2)求被評分次數最多的10部電影,並給出評分次數(電影名,評分次數)
    (3)分別求男性,女性當中評分最高的10部電影(性別,電影名,影評分)
    (4)求movieid = 2116這部電影各年齡段(因為年齡就只有7個,就按這個7個分就好了)的平均影評(年齡段,影評分)
    (5)求最喜歡看電影(影評次數最多)的那位女性評最高分的10部電影的平均影評分(觀影者,電影名,影評分)
    (6)求好片(評分>=4.0)最多的那個年份的最好看的10部電影
    (7)求1997年上映的電影中,評分最高的10部Comedy類電影
    (8)該影評庫中各種型別電影中評價最高的5部電影(型別,電影名,平均影評分)
    (9)各年評分最高的電影型別(年份,型別,影評分)
    (10)每個地區最高評分的電影名,把結果存入HDFS(地區,電影名,影評分)
之前已經使用MapReduce程式將3張表格進行合併,所以只需要將合併之後的表格匯入對應的表中進行查詢即可
原始資料是以::進行切分的,所以需要使用能解析多位元組分隔符的Serde即可
使用RegexSerde
需要兩個引數:
input.regex = "(.*)::(.*)::(.*)"
output.format.string = "%1$s %2$s %3$s"

-- Users table over the raw users.dat file, whose lines look like
-- 2::M::56::16::70072 (UserID::Gender::Age::Occupation::Zipcode).
-- RegexSerDe is used because the '::' separator is multi-character.
-- Each capture group maps, in order, to one column. The groups must be '(.*)':
-- a bare '(.)' matches exactly one character, so multi-character fields
-- would never match and the row would be read as all NULL columns.
CREATE TABLE t_user (
    userid     BIGINT,
    sex        STRING,
    age        INT,
    occupation STRING,
    zipcode    STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    'input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)',
    'output.format.string'='%1$s %2$s %3$s %4$s %5$s'
)
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH "/root/users.dat" INTO TABLE t_user;
-- Movies table over the raw movies.dat file, whose lines look like
-- 2::Jumanji (1995)::Adventure|Children's|Fantasy (MovieID::Title::Genres).
-- The regex must stay on a single line and use '(.*)' groups: the broken
-- form '(.' + newline + ')::(.)::(.)' cannot match any data line, which
-- would leave every column NULL.
CREATE TABLE t_movie (
    movieid   BIGINT,
    moviename STRING,
    movietype STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    'input.regex'='(.*)::(.*)::(.*)',
    'output.format.string'='%1$s %2$s %3$s'
)
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH "/root/movies.dat" INTO TABLE t_movie;
-- Ratings table over the raw ratings.dat file, whose lines look like
-- 1::1193::5::978300760 (UserID::MovieID::Rating::Timestamp).
-- The regex must stay on a single line and use '(.*)' groups. Single-character
-- '(.)' groups cannot match multi-digit ids or timestamps.
CREATE TABLE t_rating (
    userid  BIGINT,
    movieid BIGINT,
    rate    DOUBLE,
    times   STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.RegexSerDe'
WITH SERDEPROPERTIES (
    'input.regex'='(.*)::(.*)::(.*)::(.*)',
    'output.format.string'='%1$s %2$s %3$s %4$s'
)
STORED AS TEXTFILE;
LOAD DATA LOCAL INPATH "/root/ratings.dat" INTO TABLE t_rating;
(2)求被評分次數最多的10部電影,並給出評分次數(電影名,評分次數)
-- (2) View holding the 10 movie ids with the most ratings and their counts.
CREATE VIEW v_movie_rate
AS
SELECT
    movieid,
    rate_count
FROM (
    SELECT
        movieid,
        COUNT(1) AS rate_count
    FROM t_rating
    GROUP BY movieid
) tmp
ORDER BY rate_count DESC
LIMIT 10;

-- Resolve the movie names for the top 10.
-- The join gives no ordering guarantee, so the result is sorted again here
-- to keep the most-rated movie first.
SELECT
    m.moviename,
    mr.rate_count
FROM v_movie_rate mr
JOIN t_movie m
    ON m.movieid = mr.movieid
ORDER BY rate_count DESC;

第二種方式
-- Alternative for (2): join movies to ratings directly, count the ratings
-- per movie name and keep the 10 largest counts.
SELECT
    mv.moviename        AS moviename,
    COUNT(mv.moviename) AS total
FROM t_movie mv
JOIN t_rating rt
    ON mv.movieid = rt.movieid
GROUP BY mv.moviename
ORDER BY total DESC
LIMIT 10;
(3)分別求男性,女性當中評分最高的10部電影(性別,電影名,影評分)

-- (3) Top 10 movies by average rating among male users.
-- Only movies with at least 50 ratings from male users are considered.
CREATE VIEW v_muser_rate_top
AS
SELECT
    'M' AS sex,
    mv.moviename,
    AVG(rt.rate) AS rate_avg,
    COUNT(rt.movieid) movie_count
FROM t_user usr
JOIN t_rating rt
    ON usr.userid = rt.userid
JOIN t_movie mv
    ON mv.movieid = rt.movieid
WHERE usr.sex = 'M'
GROUP BY mv.moviename
HAVING movie_count >= 50
ORDER BY rate_avg DESC
LIMIT 10;

-- (3) Top 10 movies by average rating among female users.
-- Only movies with at least 50 ratings from female users are considered.
CREATE VIEW v_fuser_rate_top
AS
SELECT
    'F' AS sex,
    mv.moviename,
    AVG(rt.rate) AS rate_avg,
    COUNT(rt.movieid) movie_count
FROM t_user usr
JOIN t_rating rt
    ON usr.userid = rt.userid
JOIN t_movie mv
    ON mv.movieid = rt.movieid
WHERE usr.sex = 'F'
GROUP BY mv.moviename
HAVING movie_count >= 50
ORDER BY rate_avg DESC
LIMIT 10;

-- (3) Combine the male and female top-10 views into one result.
-- The select lists were missing, which made the statement invalid.
-- The columns are now listed explicitly. UNION ALL is used because the two
-- inputs can never share a row (sex is 'M' in one view and 'F' in the other),
-- so the de-duplication done by plain UNION is unnecessary.
SELECT
    sex,
    moviename,
    rate_avg,
    movie_count
FROM v_muser_rate_top
UNION ALL
SELECT
    sex,
    moviename,
    rate_avg,
    movie_count
FROM v_fuser_rate_top;
(4)求movieid = 2116這部電影各年齡段(因為年齡就只有7個,就按這個7個分就好了)的平均影評(年齡段,影評分)

-- (4) Average rating of movie 2116 for each age value found in t_user.
SELECT
    usr.age,
    AVG(rt.rate) rate_avg
FROM t_rating rt
JOIN t_user usr
    ON usr.userid = rt.userid
WHERE rt.movieid = 2116
GROUP BY usr.age;

1 3.2941176470588234
18 3.3580246913580245
25 3.436548223350254
35 3.2278481012658227
45 2.8275862068965516
50 3.32
56 3.5
(5)求最喜歡看電影(影評次數最多)的那位女性評最高分的10部電影的平均影評分(電影名,影評分)
找出最牛逼的那位女性(userid=1150)
-- (5) Step 1: store the female user who submitted the most ratings,
-- together with her rating count (a single row).
CREATE TABLE rate_max_count_famale
AS
SELECT
    rt.userid,
    COUNT(rt.userid) AS rate_count
FROM t_rating rt
JOIN t_user usr
    ON usr.userid = rt.userid
WHERE usr.sex = 'F'
GROUP BY rt.userid
ORDER BY rate_count DESC
LIMIT 1;

找出那個女性評分最高的前10部電影
-- (5) Step 2: store 10 of the highest ratings given by the user kept in
-- rate_max_count_famale (movie id plus the rating she gave).
CREATE TABLE t_famale_top10
AS
SELECT
    rt.movieid,
    rt.rate
FROM t_rating rt
JOIN rate_max_count_famale top_user
    ON top_user.userid = rt.userid
ORDER BY rt.rate DESC
LIMIT 10;
算出這10部電影的平均影評分
-- (5) Step 3: for each movie in t_famale_top10, compute its average rating
-- over all ratings in t_rating and attach the movie name.
SELECT
    rt.movieid,
    mv.moviename,
    AVG(rt.rate) rate_avg
FROM t_famale_top10 fav
JOIN t_rating rt
    ON rt.movieid = fav.movieid
JOIN t_movie mv
    ON mv.movieid = rt.movieid
GROUP BY
    rt.movieid,
    mv.moviename;
(6)求好片(評分>=4.0)最多的那個年份的最好看的10部電影
先求評分大於4分的所有電影,並將電影上映的年份截取出來
-- (6) Step 1: store every movie whose average rating is at least 4.
-- tyear is the 4 characters starting 5 from the end of the title,
-- e.g. 1995 for a title shaped like Jumanji (1995).
CREATE TABLE tmp_movie_rateavg_4
AS
SELECT
    mv.movieid,
    mv.moviename,
    substr(mv.moviename,-5,4) tyear,
    AVG(rt.rate) rate_avg
FROM t_movie mv
JOIN t_rating rt
    ON rt.movieid = mv.movieid
GROUP BY
    mv.movieid,
    mv.moviename
HAVING rate_avg >= 4;
按年分組求出每年最多好片的那一年(1998)
-- (6) Step 2: find the year with the largest number of rows in
-- tmp_movie_rateavg_4, along with that count.
SELECT
    tyear,
    COUNT(tyear) total
FROM tmp_movie_rateavg_4
GROUP BY tyear
ORDER BY total DESC
LIMIT 1;
求出那個年份最好看的10部電影
-- (6) Step 3: the 10 best-rated movies of the year that has the most rows
-- in tmp_movie_rateavg_4.
-- The year is derived by the best_year subquery instead of being hard-coded
-- as '1998' from a manually copied query result, so the statement stays
-- correct if the underlying data changes.
SELECT
    g.movieid,
    g.moviename,
    g.rate_avg
FROM tmp_movie_rateavg_4 g
JOIN (
    SELECT
        tyear,
        COUNT(tyear) AS total
    FROM tmp_movie_rateavg_4
    GROUP BY tyear
    ORDER BY total DESC
    LIMIT 1
) best_year
    ON best_year.tyear = g.tyear
ORDER BY rate_avg DESC
LIMIT 10;

(7)求1997年上映的電影中,評分最高的10部Comedy類電影
-- (7) Write the 10 best-rated Comedy movies released in 1997 to a local
-- directory as tab-separated text.
-- The release year is taken from the 4 characters before the closing
-- parenthesis at the end of the title, the same extraction used for tyear
-- elsewhere in this file. The previous LIKE '%1997%' test also matched
-- titles that merely contain the text 1997 somewhere.
INSERT OVERWRITE LOCAL DIRECTORY '/root/00movie_rate_top10' ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
SELECT
    m.movieid,
    m.moviename,
    AVG(r.rate) rate_avg
FROM t_movie m
JOIN t_rating r
    ON r.movieid = m.movieid
WHERE substr(m.moviename,-5,4) = '1997'
  AND m.movietype LIKE '%Comedy%'
GROUP BY
    m.movieid,
    m.moviename
ORDER BY rate_avg DESC
LIMIT 10;
(8)該影評庫中各種型別電影中評價最高的5部電影(型別,電影名,平均影評分)
-- (8) Exploration: number of movies per raw (unsplit) genre string.
SELECT
    movietype,
    COUNT(1) total
FROM t_movie
GROUP BY movietype;

-- (8) Exploration: one row per rating and per genre of the rated movie.
-- split() takes a regular expression, and Hive removes one level of
-- backslash escaping from string literals. The pattern is therefore written
-- as '\\|' so that the regex engine receives an escaped pipe. With a single
-- backslash the regex is a bare pipe, which splits the genre string into
-- single characters instead of genre names.
SELECT
    m.movieid,
    m.moviename,
    r.rate,
    tv.type
FROM t_movie m
JOIN t_rating r
    ON r.movieid = m.movieid
LATERAL VIEW explode(split(m.movietype,'\\|')) tv AS type;

求出每部電影的平均影評分
-- (8) Step 1: store the average rating of every rated movie, keeping the
-- raw genre string for later splitting.
CREATE TABLE tmp_movie_rateavg_1
AS
SELECT
    mv.movieid,
    mv.moviename,
    mv.movietype,
    AVG(rt.rate) rate_avg
FROM t_movie mv
JOIN t_rating rt
    ON rt.movieid = mv.movieid
GROUP BY
    mv.movieid,
    mv.moviename,
    mv.movietype;
把型別列裂變成多行資料
-- (8) Step 2: expand the pipe-separated genre string into one row per
-- (movie, genre).
-- split() takes a regular expression, and Hive removes one level of
-- backslash escaping from string literals. The pattern is therefore written
-- as '\\|' so that the regex engine receives an escaped pipe. With a single
-- backslash the regex is a bare pipe, which splits the genre string into
-- single characters instead of genre names.
CREATE TABLE tmp_movie_rateavg_1_1
AS
SELECT
    movieid,
    moviename,
    rate_avg,
    tv.type
FROM tmp_movie_rateavg_1
LATERAL VIEW explode(split(movietype,'\\|')) tv AS type;

-- (8) Step 3: for each genre keep the 5 movies with the highest average
-- rating, numbering rows inside each genre from best to worst.
SELECT
    type,
    moviename,
    rate_avg
FROM (
    SELECT
        type,
        moviename,
        rate_avg,
        row_number() OVER (PARTITION BY type ORDER BY rate_avg DESC) rn
    FROM tmp_movie_rateavg_1_1
) ranked
WHERE ranked.rn <= 5;
(9)各年評分最高的電影型別(年份,型別,影評分)
在tmp_movie_rateavg_1基礎上將型別和年份變出來
-- (9) Step 1: starting from tmp_movie_rateavg_1, derive the release year
-- from the title and expand the genre string into one row per
-- (movie, genre).
-- split() takes a regular expression, and Hive removes one level of
-- backslash escaping from string literals. The pattern is therefore written
-- as '\\|' so that the regex engine receives an escaped pipe. With a single
-- backslash the regex is a bare pipe, which splits the genre string into
-- single characters instead of genre names.
CREATE TABLE tmp_movie_rateavg_1_2
AS
SELECT
    movieid,
    substr(moviename,-5,4) tyear,
    moviename,
    rate_avg,
    tv.type
FROM tmp_movie_rateavg_1
LATERAL VIEW explode(split(movietype,'\\|')) tv AS type;

-- (9) Step 2: average the per-movie averages for every (year, genre) pair,
-- then rank the genres inside each year from best to worst.
-- The window is partitioned by tyear only. The inner query already returns
-- exactly one row per (tyear, type), so partitioning by both columns would
-- give every row rn = 1 and the later rn = 1 filter would select nothing
-- in particular.
CREATE TABLE tmp_movie_type_year_top
AS
SELECT
    tyear,
    type,
    rate_avg_movietype,
    row_number() OVER (PARTITION BY tyear ORDER BY rate_avg_movietype DESC) rn
FROM (
    SELECT
        tyear,
        type,
        AVG(rate_avg) rate_avg_movietype
    FROM tmp_movie_rateavg_1_2
    GROUP BY
        tyear,
        type
) tmp
;

-- (9) Step 3: return the rows ranked first in tmp_movie_type_year_top.
SELECT
    tyear,
    type,
    rate_avg_movietype,
    rn
FROM tmp_movie_type_year_top
WHERE rn = 1;

(10)每個地區最高評分的電影名,把結果存入HDFS(地區,電影名,影評分)