//1 CustomerPartitioner.java
package com.example.mapreduce;

// Custom partitioner
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// 1. Extend the Partitioner class
// 2. Override the getPartition method
public class CustomerPartitioner extends Partitioner<Text, LongWritable> {
    @Override
    public int getPartition(Text key, LongWritable value, int numPartitions) {
        // The return value is a partition number: 0, 1, 2, ...
        // If two keys get the same partition number, they go to the same
        // partition and end up in the same output file.
        // Words starting with a~m go to the first partition (0);
        // everything else goes to the second partition (1).

        // 1. Get the first character of the word
        char firstChar = key.toString().charAt(0);
        // 2. Check the first character
        if (firstChar >= 'a' && firstChar <= 'm') {
            return 0;
        } else {
            return 1;
        }
    }
}
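A quick way to sanity-check the partition logic without running a full job is to call getPartition directly. The sketch below is a hypothetical helper (the CustomerPartitionerCheck class and the sample words are not part of the project); note that words starting with an uppercase letter or a digit fall through to partition 1, because only lowercase 'a' to 'm' is matched.

// Minimal sketch for inspecting the partitioner in isolation (hypothetical helper,
// not part of the project). Assumes CustomerPartitioner above is on the classpath.
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class CustomerPartitionerCheck {
    public static void main(String[] args) {
        CustomerPartitioner partitioner = new CustomerPartitioner();
        String[] samples = {"apple", "mouse", "night", "zebra", "Apple"};
        for (String word : samples) {
            int partition = partitioner.getPartition(new Text(word), new LongWritable(1), 2);
            // "apple" and "mouse" -> 0; "night", "zebra" and "Apple" -> 1
            System.out.println(word + " -> partition " + partition);
        }
    }
}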
//2 WordCountDriver.java
package com.example.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {
    // The MapReduce driver: configures and submits the job
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Set the user name to root when operating against the cluster
        System.setProperty("HADOOP_USER_NAME", "root");

        // 1. Get the configuration and create the Job object
        Configuration conf = new Configuration();
        // Set the cluster address
        //conf.set("fs.defaultFS", "hdfs://douyin:8020");
        Job job = Job.getInstance(conf);

        // 2. Set the jar by the driver class
        job.setJarByClass(WordCountDriver.class);

        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the Mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);

        // 5. Set the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        // 6. Set the input and output paths. Make sure the input directory exists
        //    and contains the text files whose words should be counted.
        //    The output directory is created automatically; if it already exists, the job fails.
        // FileInputFormat.setInputPaths(job, new Path("E://vm//wordcount"));
        // FileOutputFormat.setOutputPath(job, new Path("E://vm//wordcount1"));
        // Change these to cluster addresses when running against the cluster
        FileInputFormat.setInputPaths(job, new Path("D:\\vm\\wcinput"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\vm\\wcoutput1"));

        // Set the number of reduce tasks (matches the two partitions produced above)
        job.setNumReduceTasks(2);
        // Set the custom partitioner
        job.setPartitionerClass(CustomerPartitioner.class);

        // 7. Submit the job and wait for completion
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
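For reference, below is a rough sketch of what the driver could look like when submitting against HDFS instead of the local filesystem. The hdfs://douyin:8020 address comes from the commented-out line above; the class name WordCountClusterDriver and the /wcinput and /wcoutput1 HDFS paths are assumptions for illustration only.

// Hypothetical cluster variant of the driver (hostname taken from the commented-out
// line above; class name and HDFS paths are assumed).
package com.example.mapreduce;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCountClusterDriver {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "root");
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://douyin:8020");      // NameNode address
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountClusterDriver.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job, new Path("/wcinput"));    // assumed HDFS input dir
        FileOutputFormat.setOutputPath(job, new Path("/wcoutput1")); // must not exist yet
        job.setNumReduceTasks(2);
        job.setPartitionerClass(CustomerPartitioner.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Packaged into a jar, such a driver would typically be submitted with something like `hadoop jar wordcount.jar com.example.mapreduce.WordCountClusterDriver` (jar name assumed).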
//3 WordCountMapper.java
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// 1. Extend Mapper
// 2. Override the map method
// LongWritable, Text: the input key/value types. LongWritable is the key type
//   (the byte offset of the line) and Text is the value type (the line itself).
// Text, LongWritable: the output key/value types of the map function. Text is
//   the key type (the word) and LongWritable is the value type (the count).
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Take one line of input and split it into individual words
        String[] words = value.toString().split(" ");
        // 2. For each word, emit a <word, 1> key/value pair
        for (String word : words) {
            context.write(new Text(word), new LongWritable(1));
        }
    }
}
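To make the map step concrete: for a line such as "hello world hello", split(" ") yields three tokens and the mapper emits three pairs. The stand-alone trace below is purely illustrative (the MapSplitTrace class and the sample line are not part of the project); note that split(" ") would also produce empty tokens if the input contained consecutive spaces.

// Hypothetical trace of the splitting done in map() (sample line assumed):
public class MapSplitTrace {
    public static void main(String[] args) {
        String line = "hello world hello";          // stands in for one Text value
        for (String word : line.split(" ")) {
            // map() emits <hello,1>, <world,1>, <hello,1> for this line
            System.out.println("<" + word + ", 1>");
        }
    }
}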
//4 WordCountReducer.java
package com.example.mapreduce;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

// 1. Extend Reducer
// 2. Override the reduce method
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        // 1. Aggregate the values for this key: sum the counts
        long sum = 0;
        for (LongWritable value : values) {
            sum += value.get();
        }
        // 2. Emit <word, total count>
        context.write(key, new LongWritable(sum));
    }
}
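The shuffle phase groups all <word,1> pairs by word, so reduce() sees each key once together with an iterable of its counts. The stand-alone sketch below illustrates only the summing logic (the ReduceSumTrace class and the sample counts are assumed, not part of the project).

// Hypothetical illustration of the reduce-side summation (sample counts assumed):
import java.util.Arrays;
import java.util.List;

public class ReduceSumTrace {
    public static void main(String[] args) {
        List<Long> values = Arrays.asList(1L, 1L, 1L);  // what the key "hello" might receive
        long sum = 0;
        for (long v : values) {
            sum += v;
        }
        // With the custom partitioner and two reduce tasks, "hello" ('h' <= 'm')
        // would land in part-r-00000 as the line "hello<TAB>3".
        System.out.println("<hello, " + sum + ">");
    }
}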