2.x MapReduce Test Classes

This article walks through data processing with a custom output format class, a data-preparation Mapper, and the MapReduce framework. The custom output format gives fine-grained control over where records are written, while the Mapper splits and prepares the input. The example shows a map-only job whose output goes straight to the output format, and how custom output logic can route records to different files based on a condition.


1 wordcount

2 Inverted index

3 Custom partitioning (different rules write to different files; see the Partitioner sketch after this list)

4 Custom file output

5 File stream statistics
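
Item 3 is not covered by the code below, so as a point of reference here is a minimal sketch of a custom Partitioner that sends keys to different reduce tasks (and therefore different part files) by rule. The class name and the first-letter rule are illustrative assumptions, not code from the original post.

package com.wzt.mapreduce.custom;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Illustrative sketch only: words starting with a-m go to partition 0,
// everything else to partition 1, so each rule lands in its own part file.
public class MyPartitioner extends Partitioner<Text, LongWritable> {

	@Override
	public int getPartition(Text key, LongWritable value, int numPartitions) {
		String word = key.toString();
		if (numPartitions < 2 || word.isEmpty()) {
			return 0; // single reducer or empty key: only one place to go
		}
		char first = Character.toLowerCase(word.charAt(0));
		return (first >= 'a' && first <= 'm') ? 0 : 1;
	}
}

It would be wired into a job with job.setPartitionerClass(MyPartitioner.class) and job.setNumReduceTasks(2), since the number of reduce tasks determines the number of partitions.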

 

 

1 Custom output format class

  

package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Output format that writes each record to one of two HDFS files.
 * The key is the word (Text); the value is the byte offset of the
 * line the word came from (LongWritable).
 *
 * @author root
 */
public class MyCustomOutputFormat extends FileOutputFormat<Text, LongWritable> {
	
	
	@Override
	public RecordWriter<Text, LongWritable> getRecordWriter(TaskAttemptContext job)
			throws IOException, InterruptedException {

		Configuration conf = job.getConfiguration();
		FileSystem hdfs = FileSystem.get(conf);
		// Open both target files up front; write() picks one per record.
		FSDataOutputStream os1 = hdfs.create(new Path("/wc/output1/file1.log"));
		FSDataOutputStream os2 = hdfs.create(new Path("/wc/output2/file2.log"));

		return new MyRecordWriter(os1, os2);
	}

	public static class MyRecordWriter extends RecordWriter<Text, LongWritable> {
		FSDataOutputStream os1 = null;
		FSDataOutputStream os2 = null;

		public MyRecordWriter(FSDataOutputStream os1, FSDataOutputStream os2) {
			this.os1 = os1;
			this.os2 = os2;
		}

		@Override
		public void write(Text key, LongWritable value) throws IOException,
				InterruptedException {

			// The value is the byte offset of the line the word came from;
			// its parity decides which file the word is written to.
			long offset = value.get();

			if (offset % 2 == 0) {
				os1.writeBytes(key.toString() + "\n");
			} else {
				os2.writeBytes(key.toString() + "\n");
			}
		}

		@Override
		public void close(TaskAttemptContext context) throws IOException,
				InterruptedException {
			if (os1 != null) {
				os1.close();
			}
			if (os2 != null) {
				os2.close();
			}
		}
	}
}
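
To see the routing behavior in isolation, here is a minimal smoke test of MyRecordWriter against the local filesystem rather than HDFS; the /tmp paths, the sample records, and the MyRecordWriterSmokeTest class name are illustrative assumptions, not part of the original post.

package com.wzt.mapreduce.custom;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class MyRecordWriterSmokeTest {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Local filesystem instead of HDFS, so the test runs without a cluster.
		FileSystem fs = FileSystem.getLocal(conf);
		FSDataOutputStream os1 = fs.create(new Path("/tmp/file1.log"));
		FSDataOutputStream os2 = fs.create(new Path("/tmp/file2.log"));

		MyCustomOutputFormat.MyRecordWriter writer =
				new MyCustomOutputFormat.MyRecordWriter(os1, os2);

		writer.write(new Text("hello"), new LongWritable(0)); // even offset -> file1.log
		writer.write(new Text("world"), new LongWritable(7)); // odd offset  -> file2.log

		writer.close(null); // close() ignores the context, so null is fine here
	}
}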

2 Mapper data-preparation class

 

package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class CIOMapper extends Mapper<LongWritable, Text, Text, LongWritable> {

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {

		// key is the byte offset at which this line starts, not a line number.
		String line = value.toString();
		String[] words = StringUtils.split(line, " ");
		for (String word : words) {
			// Emit (word, line offset); the offset's parity later picks the output file.
			context.write(new Text(word), key);
		}
	}
}
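
As a concrete (hypothetical) example: if the line "hello world" starts at byte offset 12, the mapper emits the pairs ("hello", 12) and ("world", 12). Every word of a line carries that line's starting offset, so MyCustomOutputFormat sends words from even-offset lines to file1.log and the rest to file2.log.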

3 Driver class (map output is written directly to the output format, so no Reducer is used)

 

 

package com.wzt.mapreduce.custom;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
 
public class CIORunner {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(CIORunner.class);

		job.setMapperClass(CIOMapper.class);
		//job.setReducerClass(CIOReducer.class); // no reducer, so this is not needed
		// Make the job map-only so map output goes straight to the custom output format.
		job.setNumReduceTasks(0);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(LongWritable.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(LongWritable.class);

		job.setOutputFormatClass(MyCustomOutputFormat.class);

		FileInputFormat.setInputPaths(job, "/wc/input/xiyou.txt");
		// Still required: FileOutputFormat verifies this path does not already exist,
		// even though MyCustomOutputFormat writes to its own files.
		FileOutputFormat.setOutputPath(job, new Path("/wc/outputcount"));
//		FileInputFormat.setInputPaths(job, "D:\\wordcount\\wordcount.txt");
//		FileOutputFormat.setOutputPath(job, new Path("D:\\wordcount\\output"));
		job.waitForCompletion(true);
	}
}
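
Assuming the classes are packaged into a jar (the jar name below is an illustrative assumption), the job is submitted in the usual way. The /wc/outputcount directory must not exist before the run, and the routed words end up in /wc/output1/file1.log and /wc/output2/file2.log rather than in the job's output directory.

hadoop fs -rm -r /wc/outputcount
hadoop jar mr-tests.jar com.wzt.mapreduce.custom.CIORunner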

  

 
