MapReduce Secondary Sort

This article describes an approach to secondary sorting in Hadoop MapReduce. The concrete case sorts log records first by cookieId and then by time, so that the log can be split into sessions, and walks through defining a custom key type to express this composite sort order.

package com.uplooking.bigdata.mr.test;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * Secondary sort over the input data.
 *
 Suppose the requirement is to sort first by cookieId and then by time, so that the log can be split into sessions:
     cookieId    time    url
     2    12:12:34    2_hao123
     3    09:10:34    3_baidu
     1    15:02:41    1_google
     3    22:11:34    3_sougou
     1    19:10:34    1_baidu
     2    15:02:41    2_google
     1    12:12:34    1_hao123
     3    23:10:34    3_soso
     2    05:02:41    2_google
 Expected result:
     ---------------------------------
     1      12:12:34        1_hao123
     1      15:02:41        1_google
     1      19:10:34        1_baidu
     ---------------------------------
     2      05:02:41        2_google
     2      12:12:34        2_hao123
     2      15:02:41        2_google
     ---------------------------------
     3      09:10:34        3_baidu
     3      22:11:34        3_sougou
     3      23:10:34        3_soso

    Analysis:
        This case is clearly more involved than the previous one: there we sorted on a single column, while here we
        need to sort on two columns. Sorting on a composite of two fields like this is called a secondary sort.
        Following the idea we used with Java collections (in a TreeSet or TreeMap, sorting Person objects first by
        age and then by name is a secondary sort), we can apply the same thinking here. In MapReduce only the key is
        sorted, and since the key is a generic type parameter of the framework, the key type must satisfy two rules:
        1. the key must be serializable (implements Writable);
        2. the key passes through the shuffle, which sorts, so it must also be comparable (implements Comparable).
        Taken together, the key type has to implement the WritableComparable interface.

        So to perform the secondary sort we define a custom key type (SecondSortWritable below) that holds all the
        fields we want to sort on.

 */
public class DataSecondSortApp {
    public static void main(String[] args) throws Exception {
        if(args == null || args.length < 2) {
            System.err.println("Parameter Error! Usage: <inputPath outputPath>");
            System.exit(-1);
        }
        String inputPath = args[0];
        Path outputPath = new Path(args[1]);

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, DataSecondSortApp.class.getSimpleName());
        job.setJarByClass(DataSecondSortApp.class);
        // input settings
        FileInputFormat.setInputPaths(job, inputPath);
        job.setInputFormatClass(TextInputFormat.class);
        // mapper settings
        job.setMapperClass(DataSecondSortMapper.class);
        job.setMapOutputKeyClass(SecondSortWritable.class);
        job.setMapOutputValueClass(NullWritable.class);
        // output settings: remove the output path first if it already exists
        outputPath.getFileSystem(conf).delete(outputPath, true);
        FileOutputFormat.setOutputPath(job, outputPath);
        job.setOutputFormatClass(TextOutputFormat.class);

        // reducer settings
        job.setReducerClass(DataSecondSortReducer.class);
        job.setOutputKeyClass(SecondSortWritable.class);
        job.setOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(1);

        job.waitForCompletion(true);
    }

    static class DataSecondSortMapper extends Mapper<LongWritable, Text, SecondSortWritable, NullWritable> {
        @Override
        protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
            String line = v1.toString();
            String[] splits = line.split("\t");
            // sample input line (tab-separated): 2    12:12:34    2_hao123
            Integer cookieid = Integer.valueOf(splits[0].trim());
            String time = splits[1].trim();
            String url = splits[2].trim();
            SecondSortWritable ssw = new SecondSortWritable(cookieid, time, url);
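            // all payload rides in the key; the value is empty, so only the key's ordering matters in the shuffle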
            context.write(ssw, NullWritable.get());
        }
    }
    static class DataSecondSortReducer extends Reducer<SecondSortWritable, NullWritable, SecondSortWritable, NullWritable> {
        @Override
        protected void reduce(SecondSortWritable k2, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
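            // keys arrive already sorted by (cookieid, time); simply emit each one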
            context.write(k2, NullWritable.get());
        }
    }
}

/**
 * Custom key type: carries all the fields used for the secondary sort.
 */
class SecondSortWritable implements WritableComparable<SecondSortWritable> {
    private int cookieid;
    private String time;
    private String url;

    public SecondSortWritable(int cookieid, String time, String url) {
        this.cookieid = cookieid;
        this.time = time;
        this.url = url;
    }

    public SecondSortWritable() {
    }

    public int getCookieid() {
        return cookieid;
    }

    public void setCookieid(int cookieid) {
        this.cookieid = cookieid;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    // serialization: the field order written here must match readFields below
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.cookieid);
        out.writeUTF(this.time);
        out.writeUTF(this.url);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.cookieid = in.readInt();
        this.time = in.readUTF();
        this.url = in.readUTF();
    }

    /*
        The core comparison:
            first compare by cookieid in ascending order,
            then, for equal cookieids, compare by time.
    */
    @Override
    public int compareTo(SecondSortWritable sw) {
        // Integer.compare avoids the overflow risk of subtracting two ints
        int ret = Integer.compare(this.cookieid, sw.cookieid);
        if(ret == 0) {
            ret = this.time.compareTo(sw.time);
        }
        return ret;
    }

    @Override
    public String toString() {
        return this.cookieid + "\t" + this.time + "\t" + this.url;
    }
}
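
To sanity-check the ordering that compareTo defines without running a full MapReduce job, a minimal local sketch like the following can be used. It is only an illustration: the class name SecondSortWritableCheck and the three sample rows are made up, and it relies solely on Collections.sort calling the same compareTo that the shuffle uses.

package com.uplooking.bigdata.mr.test;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SecondSortWritableCheck {
    public static void main(String[] args) {
        // a few rows taken from the sample data above (hypothetical local check, not part of the job)
        List<SecondSortWritable> keys = new ArrayList<SecondSortWritable>();
        keys.add(new SecondSortWritable(2, "12:12:34", "2_hao123"));
        keys.add(new SecondSortWritable(1, "15:02:41", "1_google"));
        keys.add(new SecondSortWritable(1, "12:12:34", "1_hao123"));
        // Collections.sort uses SecondSortWritable.compareTo: ascending cookieid, then ascending time
        Collections.sort(keys);
        for (SecondSortWritable key : keys) {
            System.out.println(key);   // toString prints cookieid \t time \t url
        }
        // expected order:
        // 1    12:12:34    1_hao123
        // 1    15:02:41    1_google
        // 2    12:12:34    2_hao123
    }
}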