For custom data types, http://book.douban.com/annotation/17067489/ gives a fairly clear explanation. In Hadoop, a custom type used as a key must implement WritableComparable, since keys are both serialized and sorted during the shuffle (a type used only as a value needs just Writable). We take word count as the example.
Define the custom data type: the Http class
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * A custom Hadoop data type. Keys must implement WritableComparable
 * so the framework can serialize them and sort them during the shuffle.
 */
public class Http implements WritableComparable<Http>
{
    private String value;

    // A no-argument constructor is required: Hadoop instantiates
    // Writables via reflection and then calls readFields().
    public Http()
    {
    }

    public Http(String value)
    {
        setValue(value);
    }

    public String getValue()
    {
        return value;
    }

    public void setValue(String value)
    {
        this.value = value;
    }

    // Deserialize the fields from the stream, in the same order
    // they were written.
    @Override
    public void readFields(DataInput in) throws IOException
    {
        value = in.readUTF();
    }

    // Serialize the fields to the stream.
    @Override
    public void write(DataOutput out) throws IOException
    {
        out.writeUTF(value);
    }

    // Defines the sort order of keys during the shuffle.
    @Override
    public int compareTo(Http http)
    {
        return value.compareTo(http.value);
    }

    @Override
    public int hashCode()
    {
        final int prime = 31;
        int result = 1;
        result = prime * result + ((value == null) ? 0 : value.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj)
    {
        if (!(obj instanceof Http))
            return false;
        Http other = (Http) obj;
        // Null-safe comparison, consistent with hashCode().
        return value == null ? other.value == null : value.equals(other.value);
    }

    @Override
    public String toString()
    {
        return value;
    }
}
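The write/readFields pair is the serialization contract Hadoop relies on when it moves Http instances between tasks. A quick way to sanity-check that contract in isolation is to round-trip an instance through a byte stream, as in the minimal sketch below (the HttpRoundTrip class name and sample value are illustrative, not from the original post):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical helper: round-trips an Http instance through its
// own Writable serialization.
public class HttpRoundTrip
{
    public static void main(String[] args) throws IOException
    {
        Http original = new Http("01a55");

        // Serialize via write(), exactly as Hadoop would.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance via readFields().
        Http copy = new Http();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));

        // equals() compares the wrapped value, so this prints "true".
        System.out.println(original.equals(copy));
    }
}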
Write the word count program
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCountEntry
{
    // Note: Http as the map input value type works under MRUnit;
    // see the note after this listing for running the job with the
    // default TextInputFormat.
    public static class TokenizerMapper extends
            Mapper<LongWritable, Http, Http, IntWritable>
    {
        private final static IntWritable one = new IntWritable(1);
        // Reused across map() calls to avoid creating an object per record.
        private Http word = new Http();

        @Override
        public void map(LongWritable key, Http value, Context context)
                throws IOException, InterruptedException
        {
            // Split the line into tokens and emit (token, 1) for each.
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens())
            {
                word.setValue(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class IntSumReducer extends
            Reducer<Http, IntWritable, Http, IntWritable>
    {
        private IntWritable result = new IntWritable();

        @Override
        public void reduce(Http key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException
        {
            // Sum the counts emitted for this key.
            int sum = 0;
            for (IntWritable val : values)
            {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException
    {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args)
                .getRemainingArgs();
        if (otherArgs.length != 2)
        {
            System.err.println("Usage: wordcount <in> <out>");
            System.exit(2);
        }
        // Use the arguments remaining after GenericOptionsParser has
        // consumed any generic Hadoop options; reading args directly
        // breaks when such options are passed.
        Path input = new Path(otherArgs[0]);
        Path output = new Path(otherArgs[1]);
        Job job = new Job(conf, "word count");
        job.setJarByClass(WordCountEntry.class);
        job.setMapperClass(TokenizerMapper.class);
        // The reducer doubles as a combiner, which is safe because
        // summing counts is associative and commutative.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Http.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, input);
        FileOutputFormat.setOutputPath(job, output);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
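Note that TokenizerMapper declares the custom Http type as its map input value. MRUnit can feed Http objects to the mapper directly, but the job above never sets an input format, so it falls back to TextInputFormat, which supplies LongWritable/Text pairs; a real run would fail with a ClassCastException unless a matching InputFormat is provided. One simple option (my sketch, not part of the original post) is a variant mapper that accepts Text and uses Http only for output:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// TextInputFormat-compatible variant: Text in, Http out.
public class TextTokenizerMapper extends
        Mapper<LongWritable, Text, Http, IntWritable>
{
    private final static IntWritable one = new IntWritable(1);
    private Http word = new Http();

    @Override
    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException
    {
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens())
        {
            word.setValue(itr.nextToken());
            context.write(word, one);
        }
    }
}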
Write MRUnit test cases to test the MapReduce program
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Before;
import org.junit.Test;

import com.geo.dmp.WordCountEntry.IntSumReducer;
import com.geo.dmp.WordCountEntry.TokenizerMapper;

public class WordCountEntryTest
{
    private MapDriver<LongWritable, Http, Http, IntWritable> mapDriver;
    private ReduceDriver<Http, IntWritable, Http, IntWritable> reduceDriver;

    // Runs before each test: wrap the mapper and reducer in MRUnit
    // drivers so they can be exercised without a running cluster.
    @Before
    public void setUp() throws Exception
    {
        TokenizerMapper tm = new TokenizerMapper();
        mapDriver = MapDriver.newMapDriver(tm);
        IntSumReducer isr = new IntSumReducer();
        reduceDriver = ReduceDriver.newReduceDriver(isr);
    }

    @Test
    public void TokenizerMapperTest() throws IOException
    {
        // One input line with two tokens should produce two
        // (token, 1) pairs, in emission order.
        mapDriver.withInput(new LongWritable(), new Http("01a55\tablsd"));
        mapDriver.withOutput(new Http("01a55"), new IntWritable(1));
        mapDriver.withOutput(new Http("ablsd"), new IntWritable(1));
        mapDriver.runTest();
    }

    @Test
    public void IntSumReducerTest() throws IOException
    {
        // Two counts of 1 for the same key should sum to 2.
        List<IntWritable> values = new ArrayList<IntWritable>();
        values.add(new IntWritable(1));
        values.add(new IntWritable(1));
        reduceDriver.withInput(new Http("01a55"), values);
        reduceDriver.withOutput(new Http("01a55"), new IntWritable(2));
        reduceDriver.runTest();
    }
}
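MRUnit can also exercise the mapper and reducer together through a MapReduceDriver, which sorts and groups the map output by key before handing it to the reducer, mimicking the shuffle. Below is a sketch in the same style as the tests above (the WordCountPipelineTest class is my addition, assuming MRUnit 1.x):

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mrunit.mapreduce.MapReduceDriver;
import org.junit.Before;
import org.junit.Test;

import com.geo.dmp.WordCountEntry.IntSumReducer;
import com.geo.dmp.WordCountEntry.TokenizerMapper;

public class WordCountPipelineTest
{
    private MapReduceDriver<LongWritable, Http, Http, IntWritable, Http, IntWritable> mapReduceDriver;

    @Before
    public void setUp() throws Exception
    {
        mapReduceDriver = MapReduceDriver.newMapReduceDriver(
                new TokenizerMapper(), new IntSumReducer());
    }

    @Test
    public void fullPipelineTest() throws IOException
    {
        // "01a55" appears in both input lines, so after the simulated
        // shuffle the reducer should count it twice. Expected outputs
        // are listed in key-sorted order ("01a55" sorts before "ablsd").
        mapReduceDriver.withInput(new LongWritable(0), new Http("01a55\tablsd"));
        mapReduceDriver.withInput(new LongWritable(12), new Http("01a55"));
        mapReduceDriver.withOutput(new Http("01a55"), new IntWritable(2));
        mapReduceDriver.withOutput(new Http("ablsd"), new IntWritable(1));
        mapReduceDriver.runTest();
    }
}

By default runTest() checks both the records and their order, which is why the expected outputs above follow the key sort order produced by Http.compareTo.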