Hadoop Custom Grouping (Step 1.4)

This post walks through a concrete Hadoop MapReduce example, focusing on how a custom grouping class can be used to control how records are grouped before the reduce phase. It shows how to write a custom Mapper, Reducer, the SortWritable key class, and the grouping comparator.


Given the numbers below, group them by the first column and find the minimum value in each group.

Input:

3	2
2	2
3	1
1	1
3	3
3	1
2	1

Expected output (minimum value per group):

1	1
2	1
3	1

Why define a custom grouping class:
1. In the earlier MapReduce write-ups, step 1.4 (grouping) was never specified explicitly, so you can think of Hadoop as applying a default grouping rule. In that case the reducer receives data in the form <key1, {value1, value2}>.
2. When key2 is a custom Hadoop type, the default grouping rule no longer does what we want: the reducer instead receives the data as separate <key1, value1>, <key1, value1> pairs, one group per distinct key. If we still want the reducer to receive data in the form <key1, {value1, value2}>, we have to define a custom grouping class, as illustrated below.
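For example, with the sample input above the mapper emits the keys (3,2), (3,1), (3,3) and (3,1) for the rows whose first column is 3. Under the default grouping, keys are grouped by the full compareTo() result, so (3,1), (3,2) and (3,3) land in different groups and trigger separate reduce() calls; with the custom grouping comparator, every key whose first field is 3 belongs to a single group, and one reduce() call receives all of the values {2, 1, 3, 1}.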



Mapper

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMapper extends Mapper<LongWritable, Text, SortWritable, LongWritable> {

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		// Each input line is "first<TAB>second"
		String[] splits = value.toString().split("\t");
		long first = Long.parseLong(splits[0]);
		long second = Long.parseLong(splits[1]);
		// The composite (first, second) key drives sorting; the value is the number we take the minimum of
		context.write(new SortWritable(first, second), new LongWritable(second));
	}
}
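As a quick illustration, for the first sample line "3	2" this mapper emits the key SortWritable(3, 2) with the value 2; the first field of that key is what the job later groups on.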

Reducer

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Reducer;

public class MyReduce extends Reducer<SortWritable, LongWritable, LongWritable, LongWritable> {

	@Override
	protected void reduce(SortWritable key2, Iterable<LongWritable> values2, Context context)
			throws IOException, InterruptedException {
		// Find the minimum value within this group
		long min = Long.MAX_VALUE;
		for (LongWritable v2 : values2) {
			if (v2.get() < min) {
				min = v2.get();
			}
		}
		// Output: the group key (the first field) and its minimum value
		context.write(new LongWritable(key2.first), new LongWritable(min));
	}
}

Custom Hadoop key type (key2)

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class SortWritable implements WritableComparable<SortWritable> {
	// first: the field we group on; second: the field we take the minimum of
	long first = 1L;
	long second = 1L;

	public SortWritable() {
	}

	public SortWritable(long first, long second) {
		this.first = first;
		this.second = second;
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.first = in.readLong();
		this.second = in.readLong();
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeLong(first);
		out.writeLong(second);
	}

	@Override
	public int compareTo(SortWritable o) {
		// Sort by first field, then by second field (both ascending);
		// Long.compare avoids the overflow that casting a difference to int can cause
		if (this.first == o.first) {
			return Long.compare(this.second, o.second);
		}
		return Long.compare(this.first, o.first);
	}

	@Override
	public int hashCode() {
		return (int) (31 * first + second);
	}

	@Override
	public boolean equals(Object obj) {
		if (obj instanceof SortWritable) {
			SortWritable sw = (SortWritable) obj;
			return this.first == sw.first && this.second == sw.second;
		}
		return false;
	}

}
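One side effect worth noting: because compareTo() orders keys by first and then by second, the values within a group reach the reducer in ascending order (a secondary sort), so the minimum is in fact the first value the reducer sees; the explicit loop in MyReduce simply makes the intent obvious. A quick local sanity check of the ordering (a minimal sketch, not part of the original code):

	SortWritable a = new SortWritable(3L, 1L);
	SortWritable b = new SortWritable(3L, 2L);
	// Same first field, so the comparison falls back to the second field: a sorts before b
	System.out.println(a.compareTo(b) < 0);   // prints true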



Custom grouping class: implement RawComparator, using the custom Hadoop key class (SortWritable) as the type parameter.

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class MyGroupingComparator implements RawComparator<SortWritable> {

	@Override
	public int compare(SortWritable o1, SortWritable o2) {
		// Group only by the first field; the second field is ignored
		return Long.compare(o1.first, o2.first);
	}

	/**
	 * @param b1 the byte array holding the first serialized key
	 * @param s1 the start offset of the first key within b1
	 * @param l1 the length in bytes of the first serialized key
	 *
	 * @param b2 the byte array holding the second serialized key
	 * @param s2 the start offset of the second key within b2
	 * @param l2 the length in bytes of the second serialized key
	 *
	 * A long takes 8 bytes, so comparing only the first 8 bytes of each key
	 * compares just the "first" field, which is exactly the grouping rule.
	 */
	@Override
	public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
		return WritableComparator.compareBytes(b1, s1, 8, b2, s2, 8);
	}

}
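As an alternative (a minimal sketch, not part of the original article), the same grouping rule can be expressed by extending WritableComparator, which deserializes the raw bytes into SortWritable objects before comparing. It is less efficient than the byte-level comparison above but harder to get wrong:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Hypothetical alternative to MyGroupingComparator, shown for comparison only
public class MyGroupingComparator2 extends WritableComparator {

	protected MyGroupingComparator2() {
		// true: ask the superclass to create SortWritable instances for deserialization
		super(SortWritable.class, true);
	}

	@Override
	@SuppressWarnings("rawtypes")
	public int compare(WritableComparable a, WritableComparable b) {
		SortWritable s1 = (SortWritable) a;
		SortWritable s2 = (SortWritable) b;
		// Group only by the first field, same rule as MyGroupingComparator
		return Long.compare(s1.first, s2.first);
	}
}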

Driver class

import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;


public class SortTest {
	private static final String INPUT_PATH = "hdfs://xxc:9000/input";
	private static final String OUT_PATH = "hdfs://xxc:9000/out";

	public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
		Configuration conf = new Configuration();

		// Delete the output directory if it already exists, otherwise the job fails
		FileSystem fileSystem = FileSystem.get(new URI(INPUT_PATH), conf);
		Path outPath = new Path(OUT_PATH);
		if (fileSystem.exists(outPath)) {
			fileSystem.delete(outPath, true);
		}

		// On Hadoop 2.x+ prefer Job.getInstance(conf, name); new Job(...) still works but is deprecated
		Job job = new Job(conf, SortTest.class.getSimpleName());
		job.setJarByClass(SortTest.class);

		// Input
		FileInputFormat.setInputPaths(job, INPUT_PATH);
		job.setInputFormatClass(TextInputFormat.class);

		// Map
		job.setMapperClass(MyMapper.class);
		job.setMapOutputKeyClass(SortWritable.class);
		job.setMapOutputValueClass(LongWritable.class);

		// Step 1.3: partitioning
		job.setPartitionerClass(HashPartitioner.class);
		job.setNumReduceTasks(1);

		// Step 1.4: grouping - plug in the custom grouping comparator
		job.setGroupingComparatorClass(MyGroupingComparator.class);

		// Reduce
		job.setReducerClass(MyReduce.class);
		job.setOutputKeyClass(LongWritable.class);
		job.setOutputValueClass(LongWritable.class);

		// Output
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
		job.setOutputFormatClass(TextOutputFormat.class);

		job.waitForCompletion(true);
	}
}
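Once the job finishes, the result can be read back from the output directory (for example with hadoop fs -cat /out/part-r-00000, assuming the default output file name) and should match the expected output at the top of the post: one line per group, containing the group key and its minimum value.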


