記錄一次hive大表脫敏和改造成parquet儲存動態分割槽的操作
阿新 • • 發佈:2018-11-04
#!/bin/bash
#
# Copy data_market_tuomin.frontlog into data_market_tuomin.frontlog_hxx one
# source partition (pt_date=YYYY-MM-DD) at a time, using impala-shell.
#
# For every day in [startDate, endDate] the script:
#   1. checks with `hadoop fs -ls` that the source partition directory exists;
#   2. runs an INSERT ... PARTITION(part_dt) SELECT for that day, passing the
#      `sd` column through the params_remove UDF for a fixed list of `fn`
#      values when ev='3';
#   3. refreshes the target table metadata.
# Progress is appended to result_dy.log in the current directory.
#
# Usage: <script> [START_DATE [END_DATE]]     (YYYY-MM-DD, both inclusive)
# Without arguments the original fixed range 2018-01-01 .. 2018-08-30 is used.

tablename=frontlog
startDate=${1:-2018-01-01}
# To process up to today, pass "$(date +%Y-%m-%d)" as END_DATE.
endDate=${2:-2018-08-30}

LOG_FILE=result_dy.log

# Append "<YYYY-MM-DD> <HH:MM:SS> <message>" to the log file. The timestamp is
# taken when the line is written, so a run that crosses midnight logs the
# correct date.
log_event() {
  printf '%s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >>"$LOG_FILE"
}

# Succeed when $1 looks like YYYY-MM-DD.
is_iso_date() {
  [[ $1 =~ ^[0-9]{4}-[0-9]{2}-[0-9]{2}$ ]]
}

# Succeed when date $1 is on or before date $2 (both YYYY-MM-DD).
# `[[ a<=b ]]` is not a "less or equal" test in bash: it is parsed as
# `a < "=b"`, a locale-dependent string comparison. The dates are therefore
# compared as base-10 integers (YYYYMMDD) instead.
date_not_after() {
  (( 10#${1//-/} <= 10#${2//-/} ))
}

# Print the INSERT statement that copies source partition pt_date=$1 into the
# target table. The same text is both logged and executed.
# NOTE(review): substring(createdate,1,10) is cast to bigint and fed to
# from_unixtime, and hours_add(...,8) is applied before taking the first 7
# characters as part_dt — presumably an epoch value shifted to UTC+8 and
# truncated to yyyy-MM; confirm against the table definition.
build_insert_sql() {
  local pt_date=$1
  local masked_fns="'userRegister','userLogin','findPwdByMobStepTwo','checkLogin','changeMobile','updatePassWord','updateTradeCode','findTradeCodeStepOne','findTradeCodeStepTwo','checkLoginPassWord','checkTradePassWord','submitTradeCodeCash','setTradeCode','setTradePwdSendSms','setTradeCodeStepOne','cashSendSms'"
  local sql

  sql="insert into table data_market_tuomin.frontlog_hxx PARTITION(part_dt) select "
  sql+="dt,vl,nt,tm,amc,re,nw,de,id,dd,fm,dip,_id,mb,bm,ve,createdate,offsetx,sy,fn,createdatestr,dc,sid,ev,dpl,dmb,imei,"
  sql+="case when fn in(${masked_fns}) and ev='3' then params_remove(sd) else sd end as sd,"
  sql+="asx,did,loc,pid,cu,ip,wbn,rs,rt,pr,sp,spid,se,seid,"
  sql+="substring(cast(hours_add(from_unixtime(cast(substring(createdate,1,10) as bigint),'yyyy-MM-dd HH:mm:ss'),8) as string),1,7) as part_dt "
  sql+="from data_market_tuomin.${tablename} where pt_date='${pt_date}';"

  printf '%s' "$sql"
}

# Walk the date range and migrate every existing source partition.
# Returns 2 on malformed dates, 127 when a required tool is missing, 1 when
# the next day cannot be computed, 0 otherwise.
main() {
  local day=$startDate
  local tool sql

  if ! is_iso_date "$startDate" || ! is_iso_date "$endDate"; then
    printf '%s: dates must be YYYY-MM-DD (got "%s" and "%s")\n' \
      "${0##*/}" "$startDate" "$endDate" >&2
    return 2
  fi

  # Without these tools every partition would be logged as missing.
  for tool in hadoop impala-shell; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      printf '%s: required command not found: %s\n' "${0##*/}" "$tool" >&2
      return 127
    fi
  done

  # Register the UDF once up front; "if not exists" makes this repeatable.
  # A failure is only logged: the per-partition inserts report their own status.
  if ! impala-shell --quiet -B -q "create function if not exists params_remove(string) returns string location 'hdfs://dev1.hadoop.feidai.com:8020/user/hive/udf/bigdata.parsejson.jar' symbol='ParseUDF2';"; then
    log_event "create function params_remove failure!!"
  fi

  # Add partitions to the new table, one source day at a time.
  while date_not_after "$day" "$endDate"; do
    echo "startDate:$day" >>"$LOG_FILE"

    if ! hadoop fs -ls "/user/hive/warehouse/data_market_tuomin.db/${tablename}/pt_date=${day}"; then
      log_event "the partition pt_date=${day} doesn't exists!!"
    else
      sql=$(build_insert_sql "$day")
      printf 'impala-shell --quiet -B -q "%s"\n' "$sql" >>"$LOG_FILE"
      # The line break inside this message is kept from the original script.
      log_event $'benging... \n!!'

      if impala-shell --quiet -B -q "$sql"; then
        log_event "$day insert into success!!"
      else
        log_event "$day insert into failure!!"
      fi
    fi

    echo "" >>"$LOG_FILE"
    impala-shell -q "refresh data_market_tuomin.frontlog_hxx;"

    # Stop instead of looping on an empty value if the date step fails.
    if ! day=$(date -d "+1 day $day" +%Y-%m-%d); then
      log_event "cannot compute the day after the last processed date, aborting!!"
      return 1
    fi
  done
}

main
現有一個12TB資料量的表,儲存格式為textfile,因為之前的設計問題,分割槽的區間資料裡面不止有這個分割槽段的資料。
使用impala查詢的時候已經查不動了。