Implementing a Partitioned Inverted Index with MapReduce
Development environment: Windows 10, IntelliJ IDEA 2020.1.1, Maven 3.8.1, JDK 1.8
Runtime environment: CentOS 7.3, Hadoop 2.7.7, JDK 1.8
I. Create a Maven Project
File->New->Project->Maven
Click Next.
Fill in the required information and click Finish.
II. Write the Program
1. Configure Maven
Go to File->Settings and open the Maven configuration.
Here, repo is a local repository directory you create and name yourself.
Click Apply, then OK.
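For reference, the same repository location can also be set directly in Maven's settings.xml; a minimal sketch, assuming the repository directory is D:\repo (adjust the path to your own):

<?xml version="1.0" encoding="UTF-8"?>
<settings xmlns="http://maven.apache.org/SETTINGS/1.0.0">
    <!-- Point Maven's local repository at a directory of your choice -->
    <localRepository>D:\repo</localRepository>
</settings>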
2. Edit pom.xml to pull in the required JARs
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.joe.project</groupId>
    <artifactId>invertedindexWithPartitioner</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>commons-cli</groupId>
            <artifactId>commons-cli</artifactId>
            <version>1.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-mapreduce-examples</artifactId>
            <version>2.7.7</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
    </dependencies>
</project>
Click the Load Maven Changes icon in the upper-right corner of the editor to import the dependencies.
3. Set up the project directory structure
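All five classes written in the next step live in the package org.joe.project, i.e.:

src/main/java/org/joe/project/
    InvertedIndexMapper.java
    InvertedIndexCombiner.java
    InvertedIndexPartitioner.java
    InvertedIndexReducer.java
    InvertedIndexMain.java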
4. Write the code
InvertedIndexMapper
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * @ClassName InvertedIndexMapper
 * @Description Emits a ("word->fileName", "1") pair for every word in the input
 * @Author Joe
 * @Date 2021/4/20 14:02
 * @Version 1.0
 **/
public class InvertedIndexMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text keyInfo = new Text();
    private final Text valueInfo = new Text("1");

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\\s+");                      // split the line into words
        FileSplit fileSplit = (FileSplit) context.getInputSplit(); // the file split this line came from
        String fileName = fileSplit.getPath().getName();           // file name derived from the split
        for (String field : fields) {
            if (field.isEmpty()) {
                continue; // skip empty tokens produced by leading whitespace
            }
            // The key combines word and file name, e.g. "MapReduce->a.txt"
            keyInfo.set(field + "->" + fileName);
            context.write(keyInfo, valueInfo);
        }
    }
}
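For example, given the line "MapReduce is simple" in a.txt, this mapper emits ("MapReduce->a.txt", "1"), ("is->a.txt", "1"), and ("simple->a.txt", "1").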
InvertedIndexCombiner
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * @ClassName InvertedIndexCombiner
 * @Description Sums each word's count per file and restructures the key/value
 * @Author Joe
 * @Date 2021/4/20 14:05
 * @Version 1.0
 **/
public class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {

    private final Text info = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        int sum = 0; // word frequency within a single file
        for (Text value : values) {
            sum += Integer.parseInt(value.toString());
        }
        String compositeKey = key.toString();
        int splitIndex = compositeKey.indexOf("->");
        // Reset the value to "fileName->frequency" (skip the two-character "->" delimiter)
        info.set(compositeKey.substring(splitIndex + 2) + "->" + sum);
        // Reset the key to the word alone
        key.set(compositeKey.substring(0, splitIndex));
        context.write(key, info);
    }
}
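The combiner runs on each map task's output, so it only ever sees counts for a single file. For example, the grouped input ("MapReduce->c.txt", ["1", "1"]) becomes the output pair ("MapReduce", "c.txt->2").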
InvertedIndexPartitioner
package org.joe.project;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

/**
 * @ClassName InvertedIndexPartitioner
 * @Description Routes each word to a reducer based on its first letter
 * @Author Joe
 * @Date 2021/4/20 14:02
 * @Version 1.0
 **/
public class InvertedIndexPartitioner extends Partitioner<Text, Text> {

    @Override
    public int getPartition(Text text, Text text2, int numPartitions) {
        // Partitioning rule: return a partition number in [0, numPartitions).
        // Try to spread the data evenly so no partition is overloaded or left empty.
        String word = text.toString().trim();
        if (word.isEmpty()) {
            return 0;
        }
        char firstChar = Character.toUpperCase(word.charAt(0));
        if (firstChar >= 'A' && firstChar <= 'M') {
            return 1; // words starting with A-M
        } else if (firstChar >= 'N' && firstChar <= 'Z') {
            return 2; // words starting with N-Z
        } else {
            return 0; // digits, punctuation, and everything else
        }
    }
}
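To sanity-check the routing, a minimal JUnit 4 sketch (the test class is my own addition; it relies on the junit dependency already declared in pom.xml):

package org.joe.project;

import org.apache.hadoop.io.Text;
import org.junit.Assert;
import org.junit.Test;

public class InvertedIndexPartitionerTest {

    private final InvertedIndexPartitioner partitioner = new InvertedIndexPartitioner();

    @Test
    public void routesWordsByFirstLetter() {
        Text anyValue = new Text("a.txt->1");
        Assert.assertEquals(1, partitioner.getPartition(new Text("Hello"), anyValue, 3));  // 'H' falls in A-M
        Assert.assertEquals(2, partitioner.getPartition(new Text("simple"), anyValue, 3)); // 'S' falls in N-Z
        Assert.assertEquals(0, partitioner.getPartition(new Text("42"), anyValue, 3));     // non-letter first char
    }
}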
InvertedIndexReducer
package org.joe.project;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * @ClassName InvertedIndexReducer
 * @Description Concatenates the per-file postings for each word into one list
 * @Author Joe
 * @Date 2021/4/20 14:03
 * @Version 1.0
 **/
public class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

    private final Text result = new Text();

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // Build the posting list, e.g. "a.txt->2;b.txt->1;"
        StringBuilder fileList = new StringBuilder();
        for (Text value : values) {
            fileList.append(value.toString()).append(";");
        }
        result.set(fileList.toString());
        context.write(key, result);
    }
}
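At this point all postings for a word arrive at a single reducer. If MapReduce appears twice in c.txt and once each in a.txt and b.txt, the output line is the word, a tab (the TextOutputFormat default separator), and a list such as a.txt->1;b.txt->1;c.txt->2; (the order of the entries is not guaranteed).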
InvertedIndexMain
package org.joe.project;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @ClassName InvertedIndexMain
 * @Description Driver: wires up the mapper, combiner, partitioner, and reducer
 * @Author Joe
 * @Date 2021/4/20 14:01
 * @Version 1.0
 **/
public class InvertedIndexMain {
    public static void main(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: invertedindexWithPartitioner <in> <out>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "invertedindexWithPartitioner");
        // job
        job.setJarByClass(InvertedIndexMain.class);
        // map
        job.setMapperClass(InvertedIndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // combine
        job.setCombinerClass(InvertedIndexCombiner.class);
        // partition
        job.setPartitionerClass(InvertedIndexPartitioner.class);
        job.setNumReduceTasks(3);
        // reduce
        job.setReducerClass(InvertedIndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // input & output
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        boolean flag = job.waitForCompletion(true);
        if (!flag) {
            System.out.println("InvertedIndex failed!");
        }
    }
}
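Note that setNumReduceTasks(3) matches the three partition numbers (0, 1, 2) the partitioner can return, so the job writes three output files: part-r-00000 (words starting with a non-letter), part-r-00001 (A-M), and part-r-00002 (N-Z). If the number of reduce tasks were smaller than the largest partition number plus one, the map tasks would fail with an illegal-partition error.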
III. Package the Project
Right-click the project name and select the item shown in the figure:
Select the following:
Fill in the Main Class.
Below that are two options. The first, extract to the target JAR, packages the project and all of its dependencies into a single JAR (which runs rather slowly). The second, copy to the output directory and link via manifest, keeps the dependencies as separate JARs, so the result is multiple JAR files. Since the Hadoop execution environment already provides the required dependencies, choose the second option and click OK:
Check Include in project build. The Output directory is the final output location, and the Output Layout below it lists the JARs to be produced. Click OK:
Build->Build Artifacts
Inspect the result as shown below, locate invertedindexWithPartitioner.jar, and upload it to the Hadoop machine.
IV. Run the Program on Hadoop
1. Start Hadoop
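On the CentOS machine, assuming Hadoop's sbin directory is on the PATH, something like:

start-dfs.sh
start-yarn.sh
jps    # check that NameNode, DataNode, ResourceManager, and NodeManager are up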
2. Upload invertedindexWithPartitioner.jar to the target directory
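Any file-transfer tool will do; a sketch using scp from the Windows side (the host name and target directory are placeholders):

scp invertedindexWithPartitioner.jar root@centos-host:/opt/jobs/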
3. Create three files a.txt, b.txt, and c.txt and upload them to the VM; the contents are as follows:
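The exact contents were shown in a screenshot; any space-separated English text works, for example:

a.txt: MapReduce is simple
b.txt: MapReduce is powerful is simple
c.txt: Hello MapReduce bye MapReduce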
4. Upload the txt files to HDFS and run the JAR
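A sketch of the commands, assuming the HDFS paths /invertedindex/input and /invertedindex/output (the output directory must not exist before the job runs):

hadoop fs -mkdir -p /invertedindex/input
hadoop fs -put a.txt b.txt c.txt /invertedindex/input
hadoop jar invertedindexWithPartitioner.jar org.joe.project.InvertedIndexMain /invertedindex/input /invertedindex/output

Because the Main Class was written into the JAR's manifest during packaging, the class-name argument can also be omitted.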
5. View the results
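With the paths assumed above, list and print the three partition files:

hadoop fs -ls /invertedindex/output
hadoop fs -cat /invertedindex/output/part-r-00000
hadoop fs -cat /invertedindex/output/part-r-00001
hadoop fs -cat /invertedindex/output/part-r-00002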
The results are correct.
References:
https://blog.youkuaiyun.com/Imflash/article/details/100619056
https://blog.youkuaiyun.com/weixin_42370346/article/details/88688693
Corrections are welcome if anything here is wrong.