Hive結合shell指令碼實現自動化業務
阿新 • • 發佈:2018-12-22
【案例】hive指令碼載入資料到hive分割槽表
access_logs/20170610/2017061000.log
2017061001.log
2017061002.log
......
2017061023.log
二級分割槽:天/小時
crontab+shell 實現自動排程。
建庫:
-- Database holding the access-log staging table; IF NOT EXISTS makes the
-- script safe to re-run from cron.
CREATE DATABASE IF NOT EXISTS load_hive;
建表:
-- Raw click-stream access-log table: one STRING column per tab-separated
-- field of the hourly log files. Two-level partitioning (date / hour)
-- mirrors the access_logs/<YYYYMMDD>/<YYYYMMDDHH>.log directory layout so
-- each hourly file loads into its own partition.
-- NOTE(review): partition column `date` is a reserved word in newer Hive
-- versions (needs backticks there); kept as-is because the load scripts
-- below reference partition(date=...,hour=...).
-- Table is qualified as load_hive.load_h to match the scripts and avoid
-- depending on the session's current database.
CREATE TABLE IF NOT EXISTS load_hive.load_h (
    id              STRING,
    url             STRING,
    referer         STRING,
    keyword         STRING,
    type            STRING,
    guid            STRING,
    pageId          STRING,
    moduleId        STRING,
    linkId          STRING,
    attachedInfo    STRING,
    sessionId       STRING,
    trackerU        STRING,
    trackerType     STRING,
    ip              STRING,
    trackerSrc      STRING,
    cookie          STRING,
    orderCode       STRING,
    trackTime       STRING,
    endUserId       STRING,
    firstLink       STRING,
    sessionViewNo   STRING,
    productId       STRING,
    curMerchantId   STRING,
    provinceId      STRING,
    cityId          STRING,
    fee             STRING,
    edmActivity     STRING,
    edmEmail        STRING,
    edmJobId        STRING,
    ieVersion       STRING,
    platform        STRING,
    internalKeyword STRING,
    resultSum       STRING,
    currentPage     STRING,
    linkPosition    STRING,
    buttonPosition  STRING
)
PARTITIONED BY (date STRING, hour STRING)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
--hiveconf 指定引數
key=value
access_logs/20170610/2017061000.log
2017061001.log
2017061002.log
......
2017061023.log
二級分割槽:天/小時
crontab+shell 實現自動排程。
建庫:
-- NOTE(review): everything from here down to "stored as textfile;" is a
-- byte-for-byte duplicate of the DDL earlier in this page (scrape/paste
-- artifact). Re-running it as-is will fail because neither statement uses
-- IF NOT EXISTS.
create database load_hive;
建表:
-- Raw click-stream access-log table: one string column per tab-separated
-- field of the hourly log files; partitioned by (date, hour) to match the
-- access_logs/<YYYYMMDD>/<YYYYMMDDHH>.log layout described above.
create table load_h(
id string,
url string,
referer string,
keyword string,
type string,
guid string,
pageId string,
moduleId string,
linkId string,
attachedInfo string,
sessionId string,
trackerU string,
trackerType string,
ip string,
trackerSrc string,
cookie string,
orderCode string,
trackTime string,
endUserId string,
firstLink string,
sessionViewNo string,
productId string,
curMerchantId string,
provinceId string,
cityId string,
fee string,
edmActivity string,
edmEmail string,
edmJobId string,
ieVersion string,
platform string,
internalKeyword string,
resultSum string,
currentPage string,
linkPosition string,
buttonPosition string
)
-- Two-level partitioning: day / hour.
partitioned by (date string,hour string)
row format delimited fields terminated by '\t'
stored as textfile;
--hiveconf 指定引數
key=value
show partitions load_hive.load_h; --查看錶分割槽情況。
通過Shell指令碼執行:load_to_hive.sh
通過Shell指令碼及可執行SQL檔案執行:load_to_hive_f.sh
#!/bin/bash
# load_to_hive.sh -- load yesterday's hourly access-log files into the
# partitioned table load_hive.load_h, one `hive -e` invocation per file.
# Intended to be run daily from cron.

# Yesterday's date (e.g. 20170610); names the log directory to load.
YESTERDAY=$(date -d '-1 days' +%Y%m%d)

# Directory holding the hourly files <YYYYMMDDHH>.log for that day.
ACCESS_LOGS_DIR=/opt/datas/access_logs/$YESTERDAY

HIVE_HOME=/opt/cdh5/hive-0.13.1-cdh5.3.6

# Iterate with a glob instead of parsing `ls` output, and quote every
# expansion -- the original `for FILE in \`ls $DIR\`` breaks on unusual
# file names and iterates over a literal error string on failure.
for FILE_PATH in "$ACCESS_LOGS_DIR"/*; do
    # An empty directory leaves the glob unexpanded; skip that case.
    [ -e "$FILE_PATH" ] || continue
    FILE=$(basename "$FILE_PATH")
    Day=${FILE:0:8}    # first 8 chars of the file name: YYYYMMDD
    Hour=${FILE:8:2}   # next 2 chars: HH
    echo "${Day}+${Hour}"
    "$HIVE_HOME/bin/hive" -e "load data local inpath '$ACCESS_LOGS_DIR/$FILE' into table load_hive.load_h partition(date='${Day}',hour='${Hour}')"
done

# Report the partitions now present so the cron log shows what was loaded.
"$HIVE_HOME/bin/hive" -e "show partitions load_hive.load_h"
SQL檔案:load.sql
#!/bin/bash
# load_to_hive_f.sh -- same daily load as load_to_hive.sh, but the LOAD DATA
# statement lives in an external SQL file (/opt/datas/hive_script/load.sql)
# and receives its inputs through `--hiveconf key=value` parameters.

# Yesterday's date (e.g. 20170610); names the log directory to load.
YESTERDAY=$(date -d '-1 days' +%Y%m%d)

# Directory holding the hourly files <YYYYMMDDHH>.log for that day.
ACCESS_LOGS_DIR=/opt/datas/access_logs/$YESTERDAY

HIVE_HOME=/opt/cdh5/hive-0.13.1-cdh5.3.6

# Iterate with a glob instead of parsing `ls` output, and quote every
# expansion -- the original `for FILE in \`ls $DIR\`` breaks on unusual
# file names and iterates over a literal error string on failure.
for FILE_PATH in "$ACCESS_LOGS_DIR"/*; do
    # An empty directory leaves the glob unexpanded; skip that case.
    [ -e "$FILE_PATH" ] || continue
    FILE=$(basename "$FILE_PATH")
    Day=${FILE:0:8}    # first 8 chars of the file name: YYYYMMDD
    Hour=${FILE:8:2}   # next 2 chars: HH
    echo "${Day}+${Hour}"
    # Pass file location and target partition into load.sql, which reads
    # them back as ${hiveconf:log_dir} / ${hiveconf:file_path} / DAY / HOUR.
    "$HIVE_HOME/bin/hive" \
        --hiveconf log_dir="$ACCESS_LOGS_DIR" \
        --hiveconf file_path="$FILE" \
        --hiveconf DAY="$Day" \
        --hiveconf HOUR="$Hour" \
        -f /opt/datas/hive_script/load.sql
done

# Report the partitions now present so the cron log shows what was loaded.
"$HIVE_HOME/bin/hive" -e "show partitions load_hive.load_h"
-- load.sql: parameterized LOAD DATA statement executed by load_to_hive_f.sh
-- via `hive -f`. log_dir/file_path locate one hourly log file; DAY/HOUR
-- select the target (date, hour) partition. All four values arrive through
-- `--hiveconf key=value` on the hive command line.
load data local inpath '${hiveconf:log_dir}/${hiveconf:file_path}' into table load_hive.load_h partition(date='${hiveconf:DAY}',hour='${hiveconf:HOUR}');