Preface:
MapReduce is one of Hadoop's three core components, alongside HDFS and YARN,
so it is well worth understanding how it works and writing some code against it.
The imports below are the ones used by the code later in this post (feel free to skip ahead).
package com.xkh.mr;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* Contents of hello.txt in HDFS:
* hello you
* hello me
* hello word and you
* Expected word counts in the output:
* hello 3
* you 2
* me 1
* word 1
* and 1
*/
Let's look at the complete code first:
public class WordCount {
    // <k1,v1> -> <k2,v2>: k1 is the byte offset of the line, v1 is the line text
    public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // split each line on spaces
            String[] words = v1.toString().split(" ");
            for (String word : words) {
                // turn each word into <k2,v2>, e.g. <hello,1>
                Text k2 = new Text(word);
                LongWritable v2 = new LongWritable(1L);
                // emit the pair
                context.write(k2, v2);
            }
        }
    }
    public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // for each <k2,{v2...}> group, add up the values and emit <k3,v3>
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s, Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            // sum holds the total of all v2 values for this key
            long sum = 0L;
            for (LongWritable v2 : v2s) {
                sum += v2.get();
            }
            // k3 is the same as k2; assemble <k3,v3> and write it once per key (after the loop)
            Text k3 = k2;
            LongWritable v3 = new LongWritable(sum);
            context.write(k3, v3);
        }
    }
    /**
     * Wire up map + reduce and submit the job.
     */
    public static void main(String[] args) {
        try {
            if (args.length != 2) {
                // not enough arguments were passed, exit
                System.exit(100);
            }
            Configuration conf = new Configuration();
            // create a job
            Job job = Job.getInstance(conf);
            // input path
            FileInputFormat.setInputPaths(job, new Path(args[0]));
            // output path (a directory that does not yet exist in HDFS)
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            // required, otherwise the cluster cannot locate the WordCount class when the job runs
            job.setJarByClass(WordCount.class);
            // map code
            job.setMapperClass(WordCount.MyMapper.class);
            // k2 type
            job.setMapOutputKeyClass(Text.class);
            // v2 type
            job.setMapOutputValueClass(LongWritable.class);
            // reduce code
            job.setReducerClass(WordCount.MyReducer.class);
            // k3 type
            job.setOutputKeyClass(Text.class);
            // v3 type
            job.setOutputValueClass(LongWritable.class);
            // submit the job and wait for it to finish
            job.waitForCompletion(true);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Map phase
Note: k1 is the byte offset at which each line starts in the file, so for the sample file the first two input records are:
<0,hello you>
<10,hello me>
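That offset can be verified with a few lines of plain Java (a quick sketch, assuming ASCII text and Unix-style '\n' line endings):
public class OffsetCheck {
    public static void main(String[] args) {
        // "hello you" plus its trailing newline occupies 10 bytes,
        // so the next line ("hello me") starts at byte offset 10.
        String firstLine = "hello you\n";
        System.out.println(firstLine.getBytes().length); // prints 10
    }
}
The mapper that consumes these records: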
public static class MyMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable k1, Text v1, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
            throws IOException, InterruptedException {
        // split each line on spaces
        String[] words = v1.toString().split(" ");
        for (String word : words) {
            // turn each word into <k2,v2>, e.g. <hello,1>
            Text k2 = new Text(word);
            LongWritable v2 = new LongWritable(1L);
            // emit the pair
            context.write(k2, v2);
        }
    }
}
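To see exactly what this mapper emits for the sample file, here is a small plain-Java simulation of the split-and-emit loop (a sketch only; it prints the <k2,v2> pairs instead of calling context.write):
public class MapSimulation {
    public static void main(String[] args) {
        String[] lines = {"hello you", "hello me", "hello word and you"};
        for (String line : lines) {
            for (String word : line.split(" ")) {
                // the real mapper would call context.write(new Text(word), new LongWritable(1L))
                System.out.println("<" + word + ",1>");
            }
        }
    }
}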
Reduce phase
Before reduce runs, the framework groups the map output by key, so for the sample input reduce is called once per key, e.g. <hello,{1,1,1}>.
public static class MyReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    // for each <k2,{v2...}> group, add up the values and emit <k3,v3>
    @Override
    protected void reduce(Text k2, Iterable<LongWritable> v2s, Reducer<Text, LongWritable, Text, LongWritable>.Context context)
            throws IOException, InterruptedException {
        // sum holds the total of all v2 values for this key
        long sum = 0L;
        for (LongWritable v2 : v2s) {
            sum += v2.get();
        }
        // k3 is the same as k2; write <k3,v3> once per key, after the loop,
        // otherwise a partial sum would be emitted for every value
        Text k3 = k2;
        LongWritable v3 = new LongWritable(sum);
        context.write(k3, v3);
    }
}
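The grouping and summing can also be simulated in plain Java (a sketch only, using a TreeMap in place of the shuffle):
import java.util.Map;
import java.util.TreeMap;

public class ReduceSimulation {
    public static void main(String[] args) {
        // the <k2,v2> pairs the mapper emitted for the sample file
        String[] mapOutput = {"hello", "you", "hello", "me", "hello", "word", "and", "you"};
        Map<String, Long> counts = new TreeMap<>();
        for (String word : mapOutput) {
            // group by key and accumulate 1 per occurrence,
            // just like the reducer summing its v2s iterable
            counts.merge(word, 1L, Long::sum);
        }
        counts.forEach((k3, v3) -> System.out.println(k3 + "\t" + v3));
    }
}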
The driver (assembly) code is essentially boilerplate: declare the Mapper and Reducer classes, their output key/value types, and the job submission.
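Hadoop also ships a Tool/ToolRunner helper that many drivers use to package this same boilerplate; a minimal sketch of that variant (not used in the code above, shown only for reference):
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
// (plus the same imports as the WordCount example above)

public class WordCountDriver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setJarByClass(WordCountDriver.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(WordCount.MyMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setReducerClass(WordCount.MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options (-D, -files, ...) before calling run()
        System.exit(ToolRunner.run(new Configuration(), new WordCountDriver(), args));
    }
}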
Package the project with Maven (the *-jar-with-dependencies.jar is typically produced by the maven-assembly-plugin):
mvn clean package -DskipTests
Copy the resulting jar to a node in the cluster where the hadoop command is available, then submit it:
hadoop jar db_hadoop-1.0-SNAPSHOT-jar-with-dependencies.jar com.xkh.mr.WordCount /hello.txt /out
The fully qualified class name of the driver comes first; /hello.txt is the input path and /out is the output location, which must be a directory that does not yet exist in HDFS.
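If the job is rerun often, the output directory can also be removed programmatically before submission. A sketch (not part of the original code) that could go in main() right after the Configuration is created, using org.apache.hadoop.fs.FileSystem:
// delete the output directory if it already exists, so reruns do not fail
Path outputPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outputPath)) {
    fs.delete(outputPath, true); // true = delete recursively
}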
Check the output in HDFS (e.g. hdfs dfs -cat /out/part-r-00000; each reducer writes one part-r-NNNNN file under /out).
Success!