Hive parameter configuration and task optimization
阿新 • Published: 2019-04-22
I. Common Hive parameters
0. Common parameter template
--@Name: --@Description: --@Type: full load --@Author:--- --@CreateDate: --@Target: --@SourceTable: --@ModifyBy: --@ModifyDate: --@ModifyDesc: --@Copyright
-- Set the job name
set mapred.job.name=hive_xxx(${statisdate});
-- Map input split/merge sizes
set mapreduce.input.fileinputformat.split.maxsize=300000000;
set mapreduce.input.fileinputformat.split.minsize=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=100000000;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
-- Number of reducers
set hive.exec.reducers.bytes.per.reducer=300000000;
set hive.exec.reducers.max=300;
-- Output merging
set hive.merge.mapfiles=true;
set hive.merge.mapredfiles=true;
set hive.merge.size.per.task=128000000;
set hive.merge.smallfiles.avgsize=16000000;
-- Whether to use map join
set hive.auto.convert.join=false;
-- Use the target database
use xxx_db;
1. Job name
set mapreduce.job.name=xxxx(${statis_date});  -- makes it easy to locate the specific job in the cluster UI
2. Input merge parameters
set mapreduce.input.fileinputformat.split.maxsize=300000000;
set mapreduce.input.fileinputformat.split.minsize=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.node=100000000;
set mapreduce.input.fileinputformat.split.minsize.per.rack=100000000;
set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;  -- combine small input files
-- Alternative: do not combine small files at all
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
3. Output merge parameters
set hive.merge.mapfiles=true;   -- merge small files at the end of a map-only job
set hive.merge.mapredfiles=true;  -- merge small files at the end of a map-reduce job
set hive.merge.size.per.task=256000000;  -- target size of merged files (~256 MB; Hive does not evaluate expressions like 256*1000*1000)
set hive.merge.smallfiles.avgsize=16000000;  -- when the average output file is smaller than this, launch an extra map-reduce job to merge
4. Reducer settings
set hive.exec.reducers.bytes.per.reducer=300000000;
set hive.exec.reducers.max=300;
set mapred.reduce.tasks=10;  -- force a fixed number of reducers
5. Map join parameters
set hive.auto.convert.join=false;
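With automatic conversion switched off as above, a map join can still be requested per query. A minimal sketch using the tmp_tt table from the join examples later in this post; the auto-convert settings mirror section II.5:
-- Force a map join explicitly with a hint:
select /*+ mapjoin(t2) */ t1.vendor_cd, t2.vendor_cd
from tmp_tt t1
join (select vendor_cd from tmp_tt limit 10) t2
  on t1.vendor_cd = t2.vendor_cd;
-- Or let Hive convert automatically when the small side is under the threshold:
set hive.auto.convert.join=true;
set hive.mapjoin.smalltable.filesize=25000000;  -- tables under ~25 MB become the in-memory side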
6. Map-side aggregation
set hive.map.aggr = true;
7. MapReduce physical and virtual memory
set mapreduce.map.memory.mb=4096;
set mapreduce.reduce.memory.mb=4096;
set mapreduce.map.java.opts=-Xmx3278m;
set mapreduce.reduce.java.opts=-Xmx3278m;
---------------------------------------------------
-- set mapreduce.map.memory.mb=4096;
-- set mapreduce.reduce.memory.mb=4096;
-- These values must stay within the range YARN allows:
-- yarn.scheduler.maximum-allocation-mb=8192;
-- yarn.scheduler.minimum-allocation-mb=1024;
---------------------------------------------------
-- The JVM heap must be smaller than the container's physical memory; ~80% is typical:
-- set mapreduce.map.java.opts=-Xmx1638m;
-- set mapreduce.reduce.java.opts=-Xmx3278m;
---------------------------------------------------
-- Application application_1409135750325_48141 failed 2 times due to AM Container for
-- appattempt_1409135750325_48141_000002 exited with exitCode: 143 due to: Container
-- [pid=4733,containerID=container_1409135750325_48141_02_000001] is running beyond physical memory limits.
-- Current usage: 2.0 GB of 2 GB physical memory used; 6.0 GB of 4.2 GB virtual memory used. Killing container.
-- Virtual-memory check enabled: yarn.nodemanager.vmem-check-enabled=true
--   maximum allowed virtual memory = maximum usable physical memory * yarn.nodemanager.vmem-pmem-ratio (2.1)
-- Physical-memory check enabled: yarn.nodemanager.pmem-check-enabled=true
-- If either limit is exceeded, the container is killed.
---------------------------------------------------
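As a quick sanity check on the 80% rule (arithmetic added here as an illustration, not from the original post): 4096 MB * 0.8 ≈ 3277 MB, which matches the -Xmx3278m used with the 4096 MB containers above, and 2048 MB * 0.8 ≈ 1638 MB, which matches the commented -Xmx1638m for a 2048 MB map container.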
8. Dynamic partitions
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;  -- nonstrict: every partition column may be dynamic
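A minimal sketch of a dynamic-partition insert, assuming a hypothetical target table sales_part partitioned by dt and a staging table sales_stg:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
-- the value of dt in each row decides which partition the row lands in
insert overwrite table sales_part partition (dt)
select order_id,
       amount,
       order_date as dt  -- dynamic partition columns must come last in the select list
from sales_stg;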
9. Shuffle-side OOM (BoundedByteArrayOutputStream)
set mapreduce.reduce.shuffle.memory.limit.percent=0.10;  -- cap a single map output's share of the shuffle buffer; larger segments spill to disk instead of overflowing memory
10. Predicate pushdown
set hive.optimize.ppd=true;
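A small sketch of what pushdown buys, assuming a hypothetical orders table: with the optimizer on, the outer filter is applied at the table scan rather than on the subquery's full output.
set hive.optimize.ppd=true;
-- with PPD on, the dt filter moves into the scan of orders;
-- with PPD off, the subquery would materialize all rows first and filter afterwards
select t.order_id
from (select order_id, dt from orders) t
where t.dt = '2019-04-22';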
11. Parallel execution
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=16;  -- degree of parallelism
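Parallel execution only helps when a query has independent stages. A sketch assuming tmp_tt has a dt column (hypothetical here): the two sides of the UNION ALL do not depend on each other, so they can run as concurrent jobs.
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=16;
select * from (
  select vendor_cd, count(1) as cnt from tmp_tt where dt = '2019-04-21' group by vendor_cd
  union all
  select vendor_cd, count(1) as cnt from tmp_tt where dt = '2019-04-22' group by vendor_cd
) u;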
II. Hive task optimization
1. Partition pruning
1. When a query touches a partitioned table, restrict the range of partitions it reads.
2. Use to_unix_timestamp instead of unix_timestamp(): the no-argument unix_timestamp() is non-deterministic, so the planner cannot evaluate the predicate when pruning partitions and falls back to a full table scan; see the sketch below.
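A minimal sketch of both points, assuming a hypothetical table ods_log partitioned by a string column dt in yyyy-MM-dd format:
-- Prunes: a constant range on the partition column is resolved at planning time
select count(1) from ods_log where dt >= '2019-04-15' and dt <= '2019-04-22';
-- Also prunes: to_unix_timestamp is deterministic, so the pruner can evaluate it per partition
select count(1) from ods_log
where to_unix_timestamp(dt, 'yyyy-MM-dd') >= to_unix_timestamp('2019-04-15', 'yyyy-MM-dd');
-- Scans everything: unix_timestamp() with no argument is non-deterministic,
-- so the predicate cannot be evaluated while partitions are being pruned
select count(1) from ods_log
where unix_timestamp(dt, 'yyyy-MM-dd') >= unix_timestamp() - 7 * 86400;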
2. Column pruning
Read only the columns the query actually needs and ignore the rest:
select * from table_test;
select field_1, field_2, … from table_test;
Is select * really the same as listing every field explicitly? (Think about network IO and what the storage format can skip.)
3. Set sensible map and reduce counts
Number of maps:
  splitSize = Math.max(minSize, Math.min(maxSize, blockSize))
Number of reduces:
  reducers = Math.min(maxReducers, totalInputFileSize / bytesPerReducer)
Tune how much data each map and reduce task handles according to how efficiently the job actually runs.
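A quick worked example with the values used earlier in this post (arithmetic added here as an illustration): with minSize = 100 MB, maxSize = 300 MB, and blockSize = 128 MB, splitSize = max(100 MB, min(300 MB, 128 MB)) = 128 MB, so a 12.8 GB input yields about 100 maps; with bytesPerReducer = 300 MB and maxReducers = 300, the same input yields min(300, 12800 MB / 300 MB) ≈ 43 reducers.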
4. GROUP BY optimization
set hive.map.aggr=true;  -- partial aggregation on the map side
select id, count(1) from test group by id;
set hive.groupby.skewindata=true;
• First, rows are distributed randomly rather than by the GROUP BY key, and a first round of aggregation runs.
• An extra job is then launched that redistributes the pre-aggregated data by the GROUP BY key and computes the final result.
5. Join optimization
When a large table is joined with a small one, enable map join so the small table is loaded into memory:
set hive.mapjoin.smalltable.filesize=25000000;  -- "small table" threshold, ~25 MB
set hive.auto.convert.join=true;
Note: do not decide to use a map join on file size alone; that easily leads to OOM. What matters is the number of columns and the row count after filtering and deduplication, not just the raw file size.
-- Map-side join: read the small table into memory
set hive.exec.parallel=true;
select /*+mapjoin(t2)*/
       t1.vendor_cd,
       t2.vendor_cd
from (select vendor_cd from tmp_tt) t1
left outer join
     (select vendor_cd from tmp_tt limit 10) t2
  on t1.vendor_cd = t2.vendor_cd
limit 100;
-- Control the number of maps, and use map join to implement a Cartesian product
set mapred.reduce.tasks=10;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;  -- do not combine small files
set hive.exec.parallel=true;
select /*+mapjoin(t2)*/
       t1.vendor_cd,
       t2.vendor_cd
from (select vendor_cd from tmp_tt distribute by vendor_cd) t1
left outer join
     (select vendor_cd from tmp_tt distribute by vendor_cd) t2
limit 100;
6. Data skew
-- Handling a special skewed value (e.g. when there are many NULLs):
-- scatter the NULL keys with rand() so they no longer all land on one reducer
set hive.exec.parallel=true;
select t1.vendor_cd,
       t2.vendor_cd
from (select vendor_cd from tmp_tt) t1
left outer join
     (select vendor_cd from tmp_tt) t2
  on nvl(t1.vendor_cd, concat('hive_', rand())) = t2.vendor_cd
limit 100;
-- Watch the types of the join keys: a type mismatch can cause data skew or silently wrong results
set hive.exec.parallel=true;
select t1.vendor_cd,
       t2.vendor_cd
from (select vendor_cd  -- int type
      from tmp_tt) t1
left outer join
     (select vendor_cd  -- string type
      from tmp_tt) t2
  on cast(t1.vendor_cd as string) = t2.vendor_cd
limit 100;