Implementing a Partitioned Inverted Index with MapReduce
Development environment: Windows 10, IntelliJ IDEA 2020.1.1, Maven 3.8.1, JDK 1.8
Runtime environment: CentOS 7.3, Hadoop 2.7.7, JDK 1.8
I. Create a Maven Project
File->New->Project->Maven
Click Next.
Fill in the required information and click Finish.
II. Write the Program
1. Configure Maven
Go to File->Settings and open the Maven configuration.
Here, repo is a local repository directory you create and name yourself.
Click Apply, then OK.
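For reference, the same repository location can also be set directly in Maven's settings.xml; a minimal sketch, assuming the repository directory is D:\repo (adjust the path to your own):

<?xml version="1.0" encoding="UTF-8"?>
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0">
    <!-- Point Maven's local repository at a directory of your choice -->
    <localRepository>D:\repo</localRepository>
</settings>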
2. Edit pom.xml to pull in the required JARs
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.joe.project</groupId>
    <artifactId>invertedindexWithPartitioner</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-examples</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>
</project>
Click the Load Maven Changes icon in the upper-right corner of the editor to import the dependencies.
3. Set up the project directory structure
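All five classes written in the next step live in the package org.joe.project, i.e.:

src/main/java/org/joe/project/
    InvertedIndexMapper.java
    InvertedIndexCombiner.java
    InvertedIndexPartitioner.java
    InvertedIndexReducer.java
    InvertedIndexMain.java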
4. Write the code
InvertedIndexMapper
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * @ClassName InvertedIndexMapper
 * @Description Emits a ("word->fileName", "1") pair for every word in the input
 * @Author Joe
 * @Date 2021/4/20 14:02
 * @Version 1.0
 **/
public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text keyInfo = new Text();
    private final Text valueInfo = new Text("1");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\\s+");                      // split the line into words
        FileSplit fileSplit = (FileSplit) context.getInputSplit(); // the file split this line came from
        String fileName = fileSplit.getPath().getName();           // file name derived from the split
        for (String field : fields) {
            if (field.isEmpty()) {
                continue; // skip empty tokens produced by leading whitespace
            }
            // The key combines word and file name, e.g. "MapReduce->a.txt"
            keyInfo.set(field + "->" + fileName);
            context.write(keyInfo, valueInfo);
        }
    }
}
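For example, given the line "MapReduce is simple" in a.txt, this mapper emits ("MapReduce->a.txt", "1"), ("is->a.txt", "1"), and ("simple->a.txt", "1").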
InvertedIndexCombiner
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * @ClassName InvertedIndexCombiner
 * @Description Sums each word's count per file and restructures the key/value
 * @Author Joe
 * @Date 2021/4/20 14:05
 * @Version 1.0
 **/
public class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    private final Text info = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int sum = 0; // word frequency within a single file
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }
        String compositeKey = key.toString();
        int splitIndex = compositeKey.indexOf("->");
        // Reset the value to "fileName->frequency" (skip the two-character "->" delimiter)
        info.set(compositeKey.substring(splitIndex + 2) + "->" + sum);
        // Reset the key to the word alone
        key.set(compositeKey.substring(0, splitIndex));
        context.write(key, info);
    }
}
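The combiner runs on each map task's output, so it only ever sees counts for a single file. For example, the grouped input ("MapReduce->c.txt", ["1", "1"]) becomes the output pair ("MapReduce", "c.txt->2").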
InvertedIndexPartitioner
package org.joe.project;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @ClassName InvertedIndexPartitioner
 * @Description Routes each word to a reducer based on its first letter
 * @Author Joe
 * @Date 2021/4/20 14:02
 * @Version 1.0
 **/
public class InvertedIndexPartitioner extends Partitioner<Text, Text> {

    @Override
    public int getPartition(Text text, Text text2, int numPartitions) {
        // Partitioning rule: return a partition number in [0, numPartitions).
        // Try to spread the data evenly so no partition is overloaded or left empty.
        String word = text.toString().trim();
        if (word.isEmpty()) {
            return 0;
        }
        char firstChar = Character.toUpperCase(word.charAt(0));
        if (firstChar >= 'A' && firstChar <= 'M') {
            return 1; // words starting with A-M
        } else if (firstChar >= 'N' && firstChar <= 'Z') {
            return 2; // words starting with N-Z
        } else {
            return 0; // digits, punctuation, and everything else
        }
    }
}
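To sanity-check the routing, a minimal JUnit 4 sketch (the test class is my own addition; it relies on the junit dependency already declared in pom.xml):

package org.joe.project;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class InvertedIndexPartitionerTest {

    private final InvertedIndexPartitioner partitioner = new InvertedIndexPartitioner();

    @Test
    public void routesWordsByFirstLetter() {
        Text anyValue = new Text("a.txt->1");
        Assert.assertEquals(1, partitioner.getPartition(new Text("Hello"), anyValue, 3));  // 'H' falls in A-M
        Assert.assertEquals(2, partitioner.getPartition(new Text("simple"), anyValue, 3)); // 'S' falls in N-Z
        Assert.assertEquals(0, partitioner.getPartition(new Text("42"), anyValue, 3));     // non-letter first char
    }
}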
InvertedIndexReducer
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * @ClassName InvertedIndexReducer
 * @Description Concatenates the per-file postings for each word into one list
 * @Author Joe
 * @Date 2021/4/20 14:03
 * @Version 1.0
 **/
public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    private final Text result = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Build the posting list, e.g. "a.txt->2;b.txt->1;"
        StringBuilder fileList = new StringBuilder();
        for (Text value : values) {
            fileList.append(value.toString()).append(";");
        }
        result.set(fileList.toString());
        context.write(key, result);
    }
}
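At this point all postings for a word arrive at a single reducer. If MapReduce appears twice in c.txt and once each in a.txt and b.txt, the output line is the word, a tab (the TextOutputFormat default separator), and a list such as a.txt->1;b.txt->1;c.txt->2; (the order of the entries is not guaranteed).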
InvertedIndexMain
package org.joe.project;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @ClassName InvertedIndexMain
 * @Description Driver: wires up the mapper, combiner, partitioner, and reducer
 * @Author Joe
 * @Date 2021/4/20 14:01
 * @Version 1.0
 **/
public class InvertedIndexMain {
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: invertedindexWithPartitioner <in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "invertedindexWithPartitioner");
        // job
        job.setJarByClass(InvertedIndexMain.class);
        // map
        job.setMapperClass(InvertedIndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // combine
        job.setCombinerClass(InvertedIndexCombiner.class);
        // partition
        job.setPartitionerClass(InvertedIndexPartitioner.class);
        job.setNumReduceTasks(3);
        // reduce
        job.setReducerClass(InvertedIndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // input & output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean flag = job.waitForCompletion(true);
        if (!flag) {
            System.out.println("InvertedIndex failed!");
        }
    }
}
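Note that setNumReduceTasks(3) matches the three partition numbers (0, 1, 2) the partitioner can return, so the job writes three output files: part-r-00000 (words starting with a non-letter), part-r-00001 (A-M), and part-r-00002 (N-Z). If the number of reduce tasks were smaller than the largest partition number plus one, the map tasks would fail with an illegal-partition error.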
III. Package the Project
Right-click the project name and select the item shown in the figure:
Select the following:
Fill in the Main Class.
Below that are two options. The first, extract to the target JAR, packages the project and all of its dependencies into a single JAR (which runs rather slowly). The second, copy to the output directory and link via manifest, keeps the dependencies as separate JARs, so the result is multiple JAR files. Since the Hadoop execution environment already provides the required dependencies, choose the second option and click OK:
Check Include in project build. The Output directory is the final output location, and the Output Layout below it lists the JARs to be produced. Click OK:
Build->Build Artifacts
Inspect the result as shown below, locate invertedindexWithPartitioner.jar, and upload it to the Hadoop machine.
IV. Run the Program on Hadoop
1. Start Hadoop
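On the CentOS machine, assuming Hadoop's sbin directory is on the PATH, something like:

start-dfs.sh
start-yarn.sh
jps    # check that NameNode, DataNode, ResourceManager, and NodeManager are up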
2. Upload invertedindexWithPartitioner.jar to the target directory
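Any file-transfer tool will do; a sketch using scp from the Windows side (the host name and target directory are placeholders):

scp invertedindexWithPartitioner.jar root@centos-host:/opt/jobs/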
3. Create three files a.txt, b.txt, and c.txt and upload them to the VM; the contents are as follows:
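The exact contents were shown in a screenshot; any space-separated English text works, for example:

a.txt: MapReduce is simple
b.txt: MapReduce is powerful is simple
c.txt: Hello MapReduce bye MapReduce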
4. Upload the txt files to HDFS and run the JAR
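A sketch of the commands, assuming the HDFS paths /invertedindex/input and /invertedindex/output (the output directory must not exist before the job runs):

hadoop fs -mkdir -p /invertedindex/input
hadoop fs -put a.txt b.txt c.txt /invertedindex/input
hadoop jar invertedindexWithPartitioner.jar org.joe.project.InvertedIndexMain /invertedindex/input /invertedindex/output

Because the Main Class was written into the JAR's manifest during packaging, the class-name argument can also be omitted.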
5. View the results
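With the paths assumed above, list and print the three partition files:

hadoop fs -ls /invertedindex/output
hadoop fs -cat /invertedindex/output/part-r-00000
hadoop fs -cat /invertedindex/output/part-r-00001
hadoop fs -cat /invertedindex/output/part-r-00002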
The results are correct.
References:
https://blog.youkuaiyun.com/Imflash/article/details/100619056
https://blog.youkuaiyun.com/weixin_42370346/article/details/88688693
Corrections are welcome if anything here is wrong.