MapReduce data-processing flow
Throughout a MapReduce program, all data flows between stages as key-value pairs:
Input -> Map -> Shuffle -> Reduce -> Output
(1) For Input and Output, in the normal case no code needs to be written;
you only need to specify the corresponding input and output directories.
(2) The core work is in map and reduce.

pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.huadian.bigdata</groupId>
    <artifactId>hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
                <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.7.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.20.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
MapReduce execution process (worked example: word count)
Input stage:
Input: read data from HDFS
Output: key value (key = byte offset of the line, value = line content)
0   hadoop java spring springMvc
29  java spring java
Mapper stage:
class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
<input key, input value, output key, output value>
<line offset, line content, word, count>
protected void map(KEYIN key, VALUEIN value, Context context)
What map does:
split each line on spaces and take out the individual words
Output: key value
hadoop    1
java      1
spring    1
springMvc 1
java      1
…
Shuffle stage:
Functions:
Partition: decides which reduce task each key is sent to (by default a hash of the key modulo the number of reduce tasks); a custom partitioner sketch follows this list
Group: collects all values that share the same key into one collection
Sort: sorts the keys in dictionary (lexicographic) order
Output: key value
hadoop    {1}
java      {1,1,1}
spring    {1,1}
springMvc {1}
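The word-count job in these notes does not customize partitioning (the default hash partitioner is used). Purely as an illustration of the partition step, here is a minimal sketch of a custom partitioner; the class name WordFirstLetterPartitioner and the two-way split are assumptions made for the sketch, not part of the original example:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch: send words starting with a-m to reduce task 0 and all
// other words to reduce task 1, assuming the job runs with two reduce tasks.
public class WordFirstLetterPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (numPartitions <= 1 || key.getLength() == 0) {
            return 0; // single reducer or empty key: everything goes to partition 0
        }
        char first = Character.toLowerCase(key.toString().charAt(0));
        return (first <= 'm') ? 0 : 1;
    }
}

It would be registered on the job with job.setPartitionerClass(WordFirstLetterPartitioner.class) together with job.setNumReduceTasks(2).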
Reduce stage:
class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>
<word, 1, word, frequency>
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context)
Processing: take the values out of each key's collection and add them up
Output: key value
word      frequency (count)
hadoop    1
java      3
spring    2
springMvc 1
Output stage:
Input: key value
word      frequency (count)
hadoop    1
java      3
…
Output: write the results to a file on HDFS (with the default output format, each line is key<TAB>value)
Code example:
package com.huadian.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @author 飞
 * @create 2019-07-02 19:43
 */
public class WordCountMapReduce {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Read the configuration
        Configuration configuration = new Configuration();

        // 2. Create the job
        // Job.getInstance(Configuration conf, String jobName)
        Job job = Job.getInstance(configuration, "WordCount");
        // Set the class the job's jar is located by
        job.setJarByClass(WordCountMapReduce.class);

        // 3. Configure the job
        // 3.1 input
        Path inputPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inputPath);

        // 3.2 map
        job.setMapperClass(WordCountMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 3.3 shuffle (the defaults are used here)

        // 3.4 reduce
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 3.5 output
        Path outputPath = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputPath);

        // 4. Submit the job and wait for it to finish
        boolean isSuccess = job.waitForCompletion(true);
        System.exit(isSuccess ? 0 : 1);
    }

    /**
     * map
     * keyin:    line offset, represented as a long
     * valuein:  line content, represented as a string
     * keyout:   word
     * valueout: count (always 1)
     */
    private static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text mapOutKey = new Text();
        private static final IntWritable mapOutValue = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Split the line content into individual words
            String row = value.toString();
            String[] words = row.split(" ");
            for (String str : words) {
                mapOutKey.set(str);
                // Emit the map result through the context
                context.write(mapOutKey, mapOutValue);
            }
        }
    }

    private static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            // Add up all the 1s collected for this word
            int sum = 0;
            for (IntWritable frequency : values) {
                sum += frequency.get();
            }
            outputValue.set(sum);
            context.write(key, outputValue);
        }
    }
}
Hadoop data types
Hadoop provides its own set of basic types that are optimized for network serialization and transfer.
Type             Meaning
BooleanWritable  standard boolean value
ByteWritable     single-byte value
DoubleWritable   double-precision floating-point value
FloatWritable    single-precision floating-point value
IntWritable      integer value
LongWritable     long integer value
Text             text stored in UTF-8 format
NullWritable     used when the key or value is empty
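As a minimal, self-contained sketch (not part of the original notes), the snippet below shows how these Writable types serialize to and from a byte stream; the in-memory streams are used only so the example runs without a cluster:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class WritableDemo {
    public static void main(String[] args) throws IOException {
        Text word = new Text("hadoop");         // text stored as UTF-8
        IntWritable count = new IntWritable(1); // boxed int

        // Serialize both values into an in-memory byte stream
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        word.write(out);
        count.write(out);

        // Deserialize them back into fresh Writable instances
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        Text wordCopy = new Text();
        IntWritable countCopy = new IntWritable();
        wordCopy.readFields(in);
        countCopy.readFields(in);

        System.out.println(wordCopy + "\t" + countCopy); // prints: hadoop	1
    }
}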
After this, the finished program can be packaged into a jar, uploaded to the Linux system, and run there for testing.
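For example (the jar name, main class, and HDFS paths below are illustrative and depend on your own build and cluster layout):

mvn clean package
hadoop jar hadoop-1.0-SNAPSHOT.jar com.huadian.mapreduce.WordCountMapReduce /input/wordcount /output/wordcount

Note that the output directory passed as the second argument must not already exist on HDFS, otherwise the job will fail.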