The previous post covered submitting and running MapReduce jobs from the command line with a jar, as well as modifying the YarnRunner source code so that MR jobs can be submitted to the cluster from a Windows development environment. This post shows how the common SQL join operation can be implemented in MapReduce.
1. Requirements
Order table t_order:
id | date | pid | amount
1001 | 20150710 | P0001 | 2
1002 | 20150710 | P0001 | 3
1002 | 20150710 | P0002 | 3
Product table t_product:
id | pname | category_id | price
P0001 | 小米5 | 1000 | 2
P0002 | 锤子T1 | 1000 | 3
Assume the data volume is huge and the two tables are stored as files on HDFS. We need a MapReduce program to implement the following SQL query:
select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id
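For the two sample tables above, this query would be expected to return something like:

a.id | a.date | b.pname | b.category_id | b.price
1001 | 20150710 | 小米5 | 1000 | 2
1002 | 20150710 | 小米5 | 1000 | 2
1002 | 20150710 | 锤子T1 | 1000 | 3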
Implementation approach: use the join field (pid) as the map output key, tag each emitted record with the table/file it came from, and let the shuffle deliver all records sharing the same pid to the same reduce task, where the actual join is performed.
2. Implementation code
The bean class used for the joined output:
package com.empire.hadoop.mr.rjoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
* InfoBean.java: entity bean that carries the fields of both the order and product tables
*
* @author arron 2018-12-10 23:51:27
*/
public class InfoBean implements Writable {
private int order_id;
private String dateString;
private String p_id;
private int amount;
private String pname;
private int category_id;
private float price;
// flag = "0" means this object holds an order record
// flag = "1" means this object holds a product record
private String flag;
public InfoBean() {
}
public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id,
float price, String flag) {
this.order_id = order_id;
this.dateString = dateString;
this.p_id = p_id;
this.amount = amount;
this.pname = pname;
this.category_id = category_id;
this.price = price;
this.flag = flag;
}
public int getOrder_id() {
return order_id;
}
public void setOrder_id(int order_id) {
this.order_id = order_id;
}
public String getDateString() {
return dateString;
}
public void setDateString(String dateString) {
this.dateString = dateString;
}
public String getP_id() {
return p_id;
}
public void setP_id(String p_id) {
this.p_id = p_id;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public int getCategory_id() {
return category_id;
}
public void setCategory_id(int category_id) {
this.category_id = category_id;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
/**
 * Note: the field order written in write() must exactly match the order read back in readFields().
 */
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(order_id);
out.writeUTF(dateString);
out.writeUTF(p_id);
out.writeInt(amount);
out.writeUTF(pname);
out.writeInt(category_id);
out.writeFloat(price);
out.writeUTF(flag);
}
@Override
public void readFields(DataInput in) throws IOException {
this.order_id = in.readInt();
this.dateString = in.readUTF();
this.p_id = in.readUTF();
this.amount = in.readInt();
this.pname = in.readUTF();
this.category_id = in.readInt();
this.price = in.readFloat();
this.flag = in.readUTF();
}
@Override
public String toString() {
return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount
+ ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
}
}
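As a quick sanity check of the Writable implementation, the following standalone snippet (hypothetical, not part of the original code) serializes an InfoBean with write() and reads it back with readFields(), which is what Hadoop does during the shuffle; it only works because both methods use the same field order.

package com.empire.hadoop.mr.rjoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

/**
 * Hypothetical sanity check: round-trip an InfoBean through its own
 * write()/readFields() methods, the same way Hadoop (de)serializes it.
 */
public class InfoBeanRoundTrip {
    public static void main(String[] args) throws Exception {
        InfoBean original = new InfoBean();
        original.set(1001, "20150710", "P0001", 2, "", 0, 0, "0");

        // Serialize to an in-memory buffer
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        original.write(new DataOutputStream(buffer));

        // Deserialize into a fresh bean and print it; it should match the original
        InfoBean restored = new InfoBean();
        restored.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
        System.out.println(restored);
    }
}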
The main MapReduce program (mapper, reducer, and driver):
package com.empire.hadoop.mr.rjoin;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Joins the order table with the product table.
 *
 * order.txt (order id, date, product id, amount):
 * 1001 20150710 P0001 2
 * 1002 20150710 P0001 3
 * 1002 20150710 P0002 3
 * 1003 20150710 P0003 3
 *
 * product.txt (product id, product name, category id, price):
 * P0001 小米5 1001 2
 * P0002 锤子T1 1000 3
 * P0003 锤子 1002 4
 */
public class RJoin {
static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
InfoBean bean = new InfoBean();
Text k = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
FileSplit inputSplit = (FileSplit) context.getInputSplit();
String name = inputSplit.getPath().getName();
// Use the file name to tell which table this record came from
String pid = "";
if (name.startsWith("order")) {
String[] fields = line.split("\t");
// id date pid amount
pid = fields[2];
bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]), "", 0, 0, "0");
} else {
String[] fields = line.split("\t");
// id pname category_id price
pid = fields[0];
bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]), "1");
}
k.set(pid);
context.write(k, bean);
}
}
static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
@Override
protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
InfoBean pdBean = new InfoBean();
ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
for (InfoBean bean : beans) {
if ("1".equals(bean.getFlag())) { //产品的
try {
BeanUtils.copyProperties(pdBean, bean);
} catch (Exception e) {
e.printStackTrace();
}
} else {
InfoBean odbean = new InfoBean();
try {
BeanUtils.copyProperties(odbean, bean);
orderBeans.add(odbean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
// Join the two kinds of records to form the final result
for (InfoBean bean : orderBeans) {
bean.setPname(pdBean.getPname());
bean.setCategory_id(pdBean.getCategory_id());
bean.setPrice(pdBean.getPrice());
context.write(bean, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("mapred.textoutputformat.separator", "\t");
Job job = Job.getInstance(conf);
// Specify the jar that contains this job's classes (the commented-out setJar call points at a local jar path instead)
// job.setJarByClass(RJoin.class);
// job.setJar("D:/join.jar");
job.setJarByClass(RJoin.class);
// Specify the mapper and reducer classes for this job
job.setMapperClass(RJoinMapper.class);
job.setReducerClass(RJoinReducer.class);
// Specify the key/value types of the mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
// Specify the key/value types of the final output
job.setOutputKeyClass(InfoBean.class);
job.setOutputValueClass(NullWritable.class);
// Specify the directory holding the job's input files
FileInputFormat.setInputPaths(job, new Path(args[0]));
// Specify the directory for the job's output
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// Submit the job configuration, together with the jar containing its classes, to YARN and wait for completion
/* job.submit(); */
boolean res = job.waitForCompletion(true);
System.exit(res ? 0 : 1);
}
}
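One detail worth noting about RJoinReducer: Hadoop reuses a single value object while iterating over the reducer's Iterable, so simply adding the incoming bean to the list would leave the list full of references to the same, last-seen record. That is why each order record is copied with BeanUtils.copyProperties before being buffered. The minimal standalone sketch below (class name and sample rows are made up for illustration) shows this copy-before-buffer pattern in isolation.

package com.empire.hadoop.mr.rjoin;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;

/**
 * Illustrative demo of why the reducer copies beans before buffering them:
 * a single reused instance stands in for the value object Hadoop recycles.
 */
public class CopyBeforeBufferDemo {
    public static void main(String[] args) throws Exception {
        InfoBean reused = new InfoBean(); // plays the role of the reused value object
        List<InfoBean> buffered = new ArrayList<InfoBean>();

        String[][] rows = { { "1001", "P0001" }, { "1002", "P0001" } };
        for (String[] row : rows) {
            // Hadoop would overwrite the same instance for every value in the Iterable
            reused.set(Integer.parseInt(row[0]), "20150710", row[1], 1, "", 0, 0, "0");

            // Copy before buffering; adding 'reused' directly would make both list
            // entries show the fields of the last record only.
            InfoBean copy = new InfoBean();
            BeanUtils.copyProperties(copy, reused); // (destination, source)
            buffered.add(copy);
        }

        for (InfoBean bean : buffered) {
            System.out.println(bean); // prints two distinct order records
        }
    }
}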
3. Running the program
# Upload the jar
Alt+p
lcd d:/
put rjoin.jar
# Prepare the input data files on HDFS
cd /home/hadoop/apps/hadoop-2.9.1
hadoop fs -mkdir -p /rjoin/input
hdfs dfs -put order.txt product.txt /rjoin/input
# Run the rjoin job
hadoop jar rjoin.jar com.empire.hadoop.mr.rjoin.RJoin /rjoin/input /rjoin/outputs
4. Run log
[main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getJobReport took 5ms
[main] DEBUG org.apache.hadoop.security.UserGroupInformation - PrivilegedAction as:hadoop (auth:SIMPLE) from:org.apache.hadoop.mapreduce.Job.updateStatus(Job.java:328)
[IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop sending #116 org.apache.hadoop.mapreduce.v2.api.MRClientProtocolPB.getJobReport
[IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop got value #116
[main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getJobReport took 7ms
[main] INFO org.apache.hadoop.mapreduce.Job - Job job_1544487152077_0003 completed successfully
[main] DEBUG org.apache.hadoop.security.UserGroupInformation - PrivilegedAction as:hadoop (auth:SIMPLE) from:org.apache.hadoop.mapreduce.Job.getCounters(Job.java:817)
[IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop sending #117 org.apache.hadoop.mapreduce.v2.api.MRClientProtocolPB.getCounters
[IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop got value #117
[main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getCounters took 111ms
[main] INFO org.apache.hadoop.mapreduce.Job - Counters: 49
File System Counters
FILE: Number of bytes read=339
FILE: Number of bytes written=569177
FILE: Number of read operations=0
FILE: Number of large read operations=0
FILE: Number of write operations=0
HDFS: Number of bytes read=378
HDFS: Number of bytes written=452
HDFS: Number of read operations=9
HDFS: Number of large read operations=0
HDFS: Number of write operations=2
Job Counters
Launched map tasks=2
Launched reduce tasks=1
Data-local map tasks=2
Total time spent by all maps in occupied slots (ms)=17791
Total time spent by all reduces in occupied slots (ms)=3709
Total time spent by all map tasks (ms)=17791
Total time spent by all reduce tasks (ms)=3709
Total vcore-milliseconds taken by all map tasks=17791
Total vcore-milliseconds taken by all reduce tasks=3709
Total megabyte-milliseconds taken by all map tasks=18217984
Total megabyte-milliseconds taken by all reduce tasks=3798016
Map-Reduce Framework
Map input records=7
Map output records=7
Map output bytes=319
Map output materialized bytes=345
Input split bytes=230
Combine input records=0
Combine output records=0
Reduce input groups=3
Reduce shuffle bytes=345
Reduce input records=7
Reduce output records=4
Spilled Records=14
Shuffled Maps =2
Failed Shuffles=0
Merged Map outputs=2
GC time elapsed (ms)=552
CPU time spent (ms)=3590
Physical memory (bytes) snapshot=554237952
Virtual memory (bytes) snapshot=2538106880
Total committed heap usage (bytes)=259047424
Shuffle Errors
BAD_ID=0
CONNECTION=0
IO_ERROR=0
WRONG_LENGTH=0
WRONG_MAP=0
WRONG_REDUCE=0
File Input Format Counters
Bytes Read=148
File Output Format Counters
Bytes Written=452
[main] DEBUG org.apache.hadoop.security.UserGroupInformation - PrivilegedAction as:hadoop (auth:SIMPLE) from:org.apache.hadoop.mapreduce.Job.updateStatus(Job.java:328)
[IPC Parameter Sending Thread #0] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop sending #118 org.apache.hadoop.mapreduce.v2.api.MRClientProtocolPB.getJobReport
[IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop] DEBUG org.apache.hadoop.ipc.Client - IPC Client (1318427113) connection to centos-aaron-h3/192.168.29.146:34672 from hadoop got value #118
[main] DEBUG org.apache.hadoop.ipc.ProtobufRpcEngine - Call: getJobReport took 2ms
[pool-4-thread-1] DEBUG org.apache.hadoop.ipc.Client - stopping client from cache: org.apache.hadoop.ipc.Client@4b5fd811
[Thread-3] DEBUG org.apache.hadoop.util.ShutdownHookManager - ShutdownHookManger complete shutdown.
5. Results
[hadoop@centos-aaron-h1 ~]$ hdfs dfs -ls /rjoin/outputs
Found 2 items
-rw-r--r-- 2 hadoop supergroup 0 2018-12-11 08:44 /rjoin/outputs/_SUCCESS
-rw-r--r-- 2 hadoop supergroup 452 2018-12-11 08:44 /rjoin/outputs/part-r-00000
[hadoop@centos-aaron-h1 ~]$ hdfs dfs -cat /rjoin/outputs/part-r-00000
order_id=1002, dateString=20150710, p_id=P0001, amount=3, pname=小米5, category_id=1001, price=2.0, flag=0
order_id=1001, dateString=20150710, p_id=P0001, amount=2, pname=小米5, category_id=1001, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0002, amount=3, pname=锤子T1, category_id=1000, price=3.0, flag=0
order_id=1003, dateString=20150710, p_id=P0003, amount=3, pname=锤子, category_id=1002, price=4.0, flag=0
[hadoop@centos-aaron-h1 ~]$
6. Additional notes
The logs written by a MapReduce task typically go to a path such as:
/home/hadoop/apps/hadoop-2.9.1/logs/userlogs/application_1544487152077_0004/container_1544487152077_0004_01_000003
where /home/hadoop/apps/hadoop-2.9.1/logs/ is the logs directory under the Hadoop installation.
Finally, that is all for this post. If you found it useful, please give it a like; if you are interested in the blogger's other server and big data articles, or in the blogger himself, please follow the blog and feel free to reach out and exchange ideas.