Hadoop tertiary sort

This article walks through implementing a tertiary sort (a sort on three fields) in Hadoop: preparing the input data, defining a custom composite-key structure, and combining the MapReduce classes with a driver to produce fully ordered output.


Data

Each record carries three comma-separated fields, which the job reads as clientIP, serverIP, and time:

10.0.0.1,192.168.0.1,10000
10.0.0.1,192.168.0.1,10001
10.0.0.1,192.168.0.1,10002
10.0.0.1,192.168.0.2,10000
10.0.0.1,192.168.0.2,10001
10.0.0.1,192.168.0.3,09999
10.0.0.1,192.168.0.3,10000
10.0.0.2,192.168.0.1,10001
10.0.0.2,192.168.0.1,10002
10.0.0.2,192.168.0.2,10000
10.0.0.2,192.168.0.2,10003
10.0.0.3,192.168.0.1,10000
10.0.0.3,192.168.0.1,10001
10.0.0.3,192.168.0.1,10001
10.0.0.3,192.168.0.1,10002
10.0.0.3,192.168.0.1,10004
10.0.0.3,192.168.0.3,10005
10.0.0.3,192.168.0.3,10006
10.0.0.3,192.168.0.3,10007
10.0.0.3,192.168.0.3,10008


Shuffled input

The same records, shuffled, serve as the job's actual input:

10.0.0.3,192.168.0.3,10005
10.0.0.1,192.168.0.1,10000
10.0.0.3,192.168.0.1,10001
10.0.0.2,192.168.0.2,10000
10.0.0.3,192.168.0.1,10004
10.0.0.3,192.168.0.1,10002
10.0.0.2,192.168.0.1,10001
10.0.0.1,192.168.0.1,10002
10.0.0.3,192.168.0.1,10000
10.0.0.3,192.168.0.3,10006
10.0.0.1,192.168.0.3,09999
10.0.0.3,192.168.0.3,10008
10.0.0.1,192.168.0.3,10000
10.0.0.1,192.168.0.1,10001
10.0.0.3,192.168.0.3,10007
10.0.0.1,192.168.0.2,10000
10.0.0.2,192.168.0.1,10002
10.0.0.1,192.168.0.2,10001
10.0.0.3,192.168.0.1,10001
10.0.0.2,192.168.0.2,10003
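
The shuffling itself can be done offline in any number of ways; purely for completeness, a throwaway Java sketch is shown below (the file names records.txt and records_shuffled.txt are assumptions, not from the original post).

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.List;

public class ShuffleLines
{
	public static void main(String[] args) throws IOException
	{
		// Read all records, shuffle them in memory, and write them back out.
		List<String> lines = Files.readAllLines(Paths.get("records.txt"));
		Collections.shuffle(lines);
		Files.write(Paths.get("records_shuffled.txt"), lines);
	}
}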


Custom key structure

TriTuple packs all three fields into a single WritableComparable map-output key. Its compareTo establishes the full three-field ordering, and the nested FirstComparator later tells the framework which keys belong to the same reduce group:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Composite key holding (serverIP, clientIP, time). compareTo defines the
// full three-field sort used during the shuffle; the nested FirstComparator
// defines the coarser reduce-side grouping.
public class TriTuple implements WritableComparable<TriTuple>
{
	private Text serverIP;
	private Text clientIP;
	private Text time;

	public TriTuple()
	{
		serverIP = new Text();
		clientIP = new Text();
		time = new Text();
	}

	public Text getServerIP()
	{
		return serverIP;
	}

	public void setServerIP(Text serverIP)
	{
		this.serverIP = serverIP;
	}

	public Text getClientIP()
	{
		return clientIP;
	}

	public void setClientIP(Text clientIP)
	{
		this.clientIP = clientIP;
	}

	public Text getTime()
	{
		return time;
	}

	public void setTime(Text time)
	{
		this.time = time;
	}

	public void readFields(DataInput in) throws IOException
	{
		serverIP.readFields(in);
		clientIP.readFields(in);
		time.readFields(in);
	}

	public void write(DataOutput out) throws IOException
	{
		serverIP.write(out);
		clientIP.write(out);
		time.write(out);
	}

	// Full sort order: serverIP first, then clientIP, then time.
	public int compareTo(TriTuple tt)
	{
		int cmp = serverIP.compareTo(tt.serverIP);
		if(0 != cmp)
		{
			return cmp;
		}
		cmp = clientIP.compareTo(tt.clientIP);
		if(0 != cmp)
		{
			return cmp;
		}
		return time.compareTo(tt.time);
	}

	// Hash on serverIP alone so the default HashPartitioner keeps every
	// record of a (serverIP, clientIP) group on one reducer.
	public int hashCode()
	{
		return serverIP.hashCode() * 31;
	}

	public boolean equals(Object o)
	{
		if(o instanceof TriTuple)
		{
			TriTuple tt = (TriTuple) o;
			return serverIP.equals(tt.serverIP) && clientIP.equals(tt.clientIP)
					&& time.equals(tt.time);
		}
		return false;
	}

	public String toString()
	{
		return serverIP + "," + clientIP + "," + time;
	}

	// Grouping comparator: all records sharing serverIP and clientIP form one
	// reduce group, regardless of time.
	public static class FirstComparator extends WritableComparator
	{
		protected FirstComparator()
		{
			super(TriTuple.class, true);
		}

		public int compare(WritableComparable w1, WritableComparable w2)
		{
			TriTuple t1 = (TriTuple) w1;
			TriTuple t2 = (TriTuple) w2;
			int cmp = t1.getServerIP().compareTo(t2.getServerIP());
			if(0 != cmp)
			{
				return cmp;
			}
			return t1.getClientIP().compareTo(t2.getClientIP());
		}
	}
}
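
As a quick sanity check (not part of the original post), the short standalone sketch below sorts a few TriTuple keys with compareTo and then compares two of them with FirstComparator, illustrating that the full comparator orders on all three fields while the grouping comparator ignores time. The demo class and its sample values are illustrative assumptions; it sits in the same (default) package as TriTuple so the protected comparator constructor is visible.

import java.util.Arrays;

import org.apache.hadoop.io.Text;

public class TriTupleDemo
{
	public static void main(String[] args)
	{
		TriTuple a = make("192.168.0.1", "10.0.0.1", "10001");
		TriTuple b = make("192.168.0.1", "10.0.0.1", "10000");
		TriTuple c = make("192.168.0.1", "10.0.0.2", "10000");

		// compareTo orders by serverIP, then clientIP, then time.
		TriTuple[] keys = { a, b, c };
		Arrays.sort(keys);
		for(TriTuple t : keys)
		{
			System.out.println(t); // prints b, a, c
		}

		// FirstComparator ignores time: a and b belong to one reduce group.
		TriTuple.FirstComparator grouper = new TriTuple.FirstComparator();
		System.out.println(grouper.compare(a, b)); // 0   -> same group
		System.out.println(grouper.compare(a, c)); // < 0 -> different groups
	}

	// Hypothetical helper to build a key from plain strings.
	private static TriTuple make(String server, String client, String time)
	{
		TriTuple t = new TriTuple();
		t.setServerIP(new Text(server));
		t.setClientIP(new Text(client));
		t.setTime(new Text(time));
		return t;
	}
}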


MapReduce + driver

The mapper parses each line into a TriTuple key with a NullWritable value; the reducer numbers the records inside each (serverIP, clientIP) group; the driver wires the classes, comparator, and formats together:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MyJob
{
	public static class MapClass extends Mapper<LongWritable, Text, TriTuple, NullWritable>
	{
		TriTuple outKey = new TriTuple();
		
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
		{
			String inValue = value.toString();
			String[] field = inValue.split(",");
			if(3 == field.length)
			{
				// Input columns arrive as clientIP,serverIP,time; the first two
				// are swapped here so the composite key sorts by serverIP first.
				outKey.setClientIP(new Text(field[0]));
				outKey.setServerIP(new Text(field[1]));
				outKey.setTime(new Text(field[2]));
				
				context.write(outKey, NullWritable.get());
			}
		}
	}

	public static class Reduce extends Reducer<TriTuple, NullWritable, Text, NullWritable>
	{
		Text outKey = new Text();
		int counter = 0;
		public void reduce(TriTuple key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException
		{
			counter = 0; // restart numbering for each (serverIP, clientIP) group
			for(NullWritable value : values)
			{
				// As the iterator advances, Hadoop refills this same key
				// instance, so key.toString() reflects the current record's
				// time even though the whole group shares one reduce() call.
				++counter;
				outKey.set(key.toString() + ", " + counter);
				context.write(outKey, NullWritable.get());
			}
		}
	}

	public static void main(String[] args) throws Exception
	{
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "MyJob");

		job.setNumReduceTasks(10);
		job.setJarByClass(MyJob.class);
		
		job.setMapperClass(MapClass.class);
		job.setReducerClass(Reduce.class);	
		// Group reduce input by (serverIP, clientIP); within a group, records
		// still arrive sorted by time thanks to the key's full compareTo.
		job.setGroupingComparatorClass(TriTuple.FirstComparator.class);
		//job.setPartitionerClass(PhNumPartitioner.class);
		
		job.setMapOutputKeyClass(TriTuple.class);
		job.setMapOutputValueClass(NullWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
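
The driver leaves a custom partitioner (PhNumPartitioner) commented out, so the default HashPartitioner is used; it calls TriTuple.hashCode(), which hashes serverIP alone, and therefore already sends every record of a (serverIP, clientIP) group to the same reducer. For reference, a minimal sketch of an explicit partitioner with the same effect might look like the following. The class name ServerIPPartitioner is an assumption; the original PhNumPartitioner's code does not appear in the post.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical partitioner: route records by serverIP only, so that all
// records of one grouping-comparator group land on the same reducer.
public class ServerIPPartitioner extends Partitioner<TriTuple, NullWritable>
{
	@Override
	public int getPartition(TriTuple key, NullWritable value, int numPartitions)
	{
		// Mask the sign bit so the partition index stays non-negative.
		return (key.getServerIP().hashCode() & Integer.MAX_VALUE) % numPartitions;
	}
}

With such a class on the classpath, the commented-out line in the driver would become job.setPartitionerClass(ServerIPPartitioner.class).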


Result

The trailing counter restarts at 1 whenever the (serverIP, clientIP) pair changes, because the grouping comparator funnels all records of one pair into a single reduce() call. With ten reduce tasks the output is spread over several part files; the listing below shows the records merged in key order:

192.168.0.1,10.0.0.1,10000, 1
192.168.0.1,10.0.0.1,10001, 2
192.168.0.1,10.0.0.1,10002, 3
192.168.0.1,10.0.0.2,10001, 1
192.168.0.1,10.0.0.2,10002, 2
192.168.0.1,10.0.0.3,10000, 1
192.168.0.1,10.0.0.3,10001, 2
192.168.0.1,10.0.0.3,10001, 3
192.168.0.1,10.0.0.3,10002, 4
192.168.0.1,10.0.0.3,10004, 5
192.168.0.2,10.0.0.1,10000, 1
192.168.0.2,10.0.0.1,10001, 2
192.168.0.2,10.0.0.2,10000, 1
192.168.0.2,10.0.0.2,10003, 2
192.168.0.3,10.0.0.1,09999, 1
192.168.0.3,10.0.0.1,10000, 2
192.168.0.3,10.0.0.3,10005, 1
192.168.0.3,10.0.0.3,10006, 2
192.168.0.3,10.0.0.3,10007, 3
192.168.0.3,10.0.0.3,10008, 4


