hive 数据迁移SQL
hive 数据迁移SQL
fly_xiang 发表于1年前
hive 数据迁移SQL
  • 发表于 1年前
  • 阅读 153
  • 收藏 2
  • 点赞 0
  • 评论 0

腾讯云 新注册用户 域名抢购1元起>>>   

摘要: 从一张表导入到另外一张表,控制mapper,reducer数量,设置LZO压缩,文件副本数
#!/bin/sh
# Re-export qyer.user_tracks into qyer.clean_track one day at a time:
# for each day, the cleaned rows are written (LZO-compressed) to an explicit
# HDFS directory, and that directory is then attached as the day's partition.
#
# Usage:
#   script            -> process yesterday only
#   script DAY        -> process a single day (any `date -d`-parsable value)
#   script FROM TO    -> process an inclusive day range
. /etc/profile

# Run from the script's own directory so relative paths resolve predictably.
workdir=$(dirname "$0")
cd "$workdir" || exit 1

if [[ $# -eq 1 ]]; then
    startday=$(date +%Y%m%d -d "$1")
    endday=$startday
elif [[ $# -eq 2 ]]; then
    startday=$(date +%Y%m%d -d "$1")
    endday=$(date +%Y%m%d -d "$2")
else
    # No arguments: default to yesterday, the last fully collected log day.
    startday=$(date +%Y%m%d --date '-1 day')
    endday=$startday
fi

# A YYYYMMDD stamp orders correctly as a plain integer, so an arithmetic
# comparison is safe for the day loop.
while (( startday <= endday )); do
    echo "==============================${startday}======================"

    # Drop any stale partition, rewrite the day's rows to an explicit HDFS
    # directory (mapper count, split size and LZO compression are pinned via
    # the `set` statements), then re-attach that directory as the partition.
    # Writing to a DIRECTORY instead of INSERT OVERWRITE ... PARTITION is what
    # lets the output location (and later its replication factor) be managed.
    hive -e  "
set mapreduce.job.maps=3;
set mapreduce.input.fileinputformat.split.minsize=330572800;
set mapreduce.input.fileinputformat.split.maxsize=330572800;
set mapreduce.map.output.compress=true;
set mapreduce.map.output.compress.codec=com.hadoop.compression.lzo.LzopCodec;
set mapreduce.output.fileoutputformat.compress=true;
set mapreduce.output.fileoutputformat.compress.codec=com.hadoop.compression.lzo.LzopCodec;
set hive.exec.compress.output=true;
set hive.exec.compress.intermediate=true;
use qyer;
alter table clean_track drop IF EXISTS partition (logdate='$startday');
insert overwrite  DIRECTORY '/flume/clean/track/$startday/' 
select  user_id,unique_id,server_timestamp,session_timestamp,local_time,tz,ip,lon,lat,path,referer,type,category,ra_arg,os_browser,channel,ugc_type,platform,ugc_content,map(),refer_ugc_type,refer_ugc_content,'-',loghour,link_url,client_id,extra,map() from qyer.user_tracks  where logdate='$startday' ;
ALTER TABLE clean_track add partition(logdate='$startday') location '/flume/clean/track/$startday/';
"

    # Abort the whole run on the first failed day so a gap in the target table
    # is never silently skipped; $? is the exit status of the hive call above.
    if (($? != 0 )); then
        echo "error occurred while processing day ${startday}"
        exit 1
    fi

    echo "==============================finished ${startday}======================"
    startday=$(date +%Y%m%d -d "$startday +1 days" )
done

hive使用map join:

# Map-side join example: the MAPJOIN hint broadcasts the small subquery `a`
# (one day's xid -> dora_id mapping) into every mapper's memory, avoiding a
# shuffle/reduce phase for the join against the large clean_track scan.
# Both settings are required for the hint to actually take effect:
#   hive.auto.convert.join=false  -> disable automatic mapjoin conversion
#   hive.ignore.mapjoin.hint=false -> make Hive honor the /*+ MAPJOIN */ hint
# Rows where user_id is a placeholder ('0' or '-') fall back to unique_id as
# the join key. Result is captured locally into 9.txt.
hive -e "set hive.auto.convert.join=false;
set hive.ignore.mapjoin.hint=false;
select /*+ MAPJOIN(a) */ a.xid,b.id,b.platform,a.dora_id from (select xid,dora_id from qyer.xid_doraid where stat_date='20160530') a join (select if(user_id in ('0','-'),unique_id,user_id) as id,platform from qyer.clean_track where logdate='20160530' and loghour=9) b on a.xid=b.id" > 9.txt
查看文件大小
# Summarized, human-readable size of each table subtree's partition for
# 2015-10-08 under the warehouse path.
hadoop fs -du -s -h /hive/warehouse/qyer.db/user_tracks/*/logdate=20151008
# List one exported day's directory with human-readable file sizes.
# ("hadoop dfs" is deprecated; "hadoop fs" is the supported form.)
hadoop fs -ls -h /flume/clean/track/20130531/
# Recursively lower the replication factor to 2 for an already-written day,
# reclaiming HDFS space on cold data.
hadoop fs -setrep -R 2 /flume/clean/track/20151001

-- Sanity check after a migration: per-day row counts for January 2013 in the
-- target table and in the source table should match day for day.
SELECT logdate, COUNT(*)
FROM clean_track
WHERE logdate >= '20130101'
  AND logdate <= '20130131'
GROUP BY logdate;

SELECT logdate, COUNT(*)
FROM user_tracks
WHERE logdate >= '20130101'
  AND logdate <= '20130131'
GROUP BY logdate;

hive 外部表 lzo压缩
-- External Hive table over LZO-compressed text files.
-- INPUTFORMAT DeprecatedLzoTextInputFormat reads .lzo files (honoring .index
-- files for splittability); OUTPUTFORMAT is the standard text writer.
-- EXTERNAL + LOCATION means dropping the table leaves the HDFS data intact.
-- IF NOT EXISTS makes the DDL safe to re-run (idempotent migration).
CREATE EXTERNAL TABLE IF NOT EXISTS lzo_test (name STRING)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\001'
  COLLECTION ITEMS TERMINATED BY '\002'
  MAP KEYS TERMINATED BY '\003'
  LINES TERMINATED BY '\n'
STORED AS
  INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'
  OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION '/flume/clean/lzo_test';

 

共有 人打赏支持
粉丝 3
博文 44
码字总数 76735
×
fly_xiang
如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!
* 金额(元)
¥1 ¥5 ¥10 ¥20 其他金额
打赏人
留言
* 支付类型
微信扫码支付
打赏金额:
已支付成功
打赏金额: