作业:
singlejoin.txt(各字段之间以制表符 \t 分隔,首行为表头):
child parent
Tom Lucy
Tom Jack
Jone Lucy
Jone Jack
Lucy Marry
Lucy Ben
Jack Alice
Jack Jesse
Terry Alice
Terry Jesse
Philip Terry
Philip Alma
Mark Terry
Mark Alma
// Development default for the Hadoop installation directory.
// Applied only when "hadoop.home.dir" has not been set already, so a value
// supplied on the command line (-Dhadoop.home.dir=...) is no longer overwritten.
static {
    if (System.getProperty("hadoop.home.dir") == null) {
        System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2");
    }
}
//map
/**
 * Map side of the self-join: every tab-separated "child parent" record is
 * emitted twice, once keyed by each person, with a tag describing how the
 * value relates to the key.
 *
 * <ul>
 *   <li>{@code (child,  "1:" + parent)} - the value is a parent of the key</li>
 *   <li>{@code (parent, "0:" + child)}  - the value is a child of the key</li>
 * </ul>
 */
public static class MyMapper extends Mapper<LongWritable,Text,Text,Text>{
    /**
     * @param key     byte offset of the line within the input
     * @param value   one input line, expected as {@code child<TAB>parent}
     * @param context sink for the two tagged records
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The line at offset 0 is the "child parent" header, not data.
        if (key.get() == 0) {
            return;
        }
        String[] split = value.toString().split("\t");
        // Blank or malformed lines (e.g. a trailing empty line) have fewer than two
        // fields; skip them instead of failing the task with an
        // ArrayIndexOutOfBoundsException.
        if (split.length < 2 || split[0].isEmpty() || split[1].isEmpty()) {
            return;
        }
        String child = split[0];
        String parent = split[1];
        // Tag "1": value is a parent of the key. Tag "0": value is a child of the key.
        context.write(new Text(child), new Text("1:" + parent));
        context.write(new Text(parent), new Text("0:" + child));
    }
}
//reduce
/**
 * Reduce side of the self-join. For one person (the key) it receives that
 * person's parents (tag "1") and children (tag "0"), and writes one record
 * for every parent/child combination, i.e. every grandparent/grandchild pair
 * that is linked through the key.
 */
public static class MyReduce extends Reducer<Text,Text,Text,Text>{
    /**
     * @param key     the person linking the two generations
     * @param values  tagged names in the form {@code "1:<parent>"} or {@code "0:<child>"}
     * @param context sink for {@code (key, "<parent> : <child>")} records
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        ArrayList<String> parentList = new ArrayList<>();
        ArrayList<String> childList = new ArrayList<>();
        for (Text value : values) {
            // Limit 2: split only at the first ':' so a name that itself contains ':'
            // is kept whole, and an empty name still yields a second element
            // instead of an out-of-bounds access.
            String[] split = value.toString().split(":", 2);
            if (split.length < 2) {
                continue; // no tag separator at all - not a record produced by the mapper
            }
            if ("1".equals(split[0])) {
                parentList.add(split[1]);
            } else {
                childList.add(split[1]);
            }
        }
        // Cross product: each parent of the key paired with each child of the key.
        for (String parent : parentList) {
            for (String child : childList) {
                context.write(key, new Text(parent + " : " + child));
            }
        }
    }
}
/**
 * Configures and runs the "single_join" job.
 *
 * @param args {@code args[0]}: input path(s), passed to
 *             {@code FileInputFormat.addInputPaths}; {@code args[1]}: output
 *             directory, which is deleted recursively first if it already exists
 */
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    if (args.length < 2) {
        System.err.println("Usage: SingleJoin <input path(s)> <output path>");
        System.exit(2);
    }

    // 0. Create the job
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "single_join");
    job.setJarByClass(SingleJoin.class);

    // 1. Input
    FileInputFormat.addInputPaths(job, args[0]);

    // 2. Map phase
    job.setMapperClass(MyMapper.class);

    // 3. Shuffle is handled by the framework

    // 4. Reduce phase
    job.setReducerClass(MyReduce.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // 5. Output
    Path outputPath = new Path(args[1]);
    FileOutputFormat.setOutputPath(job, outputPath);
    // Resolve the file system from the path itself so that an output URI on a
    // non-default file system is checked and cleaned on the right one.
    FileSystem fs = outputPath.getFileSystem(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }

    // 6. Submit and wait; prints 1 on success, 0 on failure
    boolean result = job.waitForCompletion(true);
    System.out.println(result ? 1 : 0);
}
最终结果:

本文详细介绍了一种在Hadoop环境下实现单表JOIN操作的方法,通过自定义Mapper和Reducer类,实现了对数据集的高效处理。文章提供了具体的代码示例,展示了如何设置Job参数,配置输入输出路径,以及如何进行数据的映射和规约操作。
3828

被折叠的 条评论
为什么被折叠?



