【Avro三】Hadoop MapReduce读写Avro文件

使用Avro与Hadoop MapReduce进行数据处理

最新推荐文章于 2020-02-11 11:36:58 发布

原创最新推荐文章于 2020-02-11 11:36:58 发布 · 498 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#大数据 #java

Avro 专栏收录该内容

4 篇文章

订阅专栏

本文介绍了如何使用Avro作为Hadoop MapReduce的数据序列化和反序列化工具，通过一个简单的示例展示了如何从Avro格式的源文件中读取数据进行计数统计，并将结果以Avro格式写入目标文件。文章涵盖了Maven依赖配置、MapReduce代码实现及主要类注释，旨在帮助理解Hadoop MapReduce操作Avro的基本流程和Avro提供的API。

Avro是由Doug Cutting（此人绝对是神一般的存在）牵头开发的。开发之初，它就是围绕完善Hadoop生态系统的数据处理而展开的（即在Hadoop MapReduce需要对数据进行序列化和反序列化的场景中使用Avro），因此Hadoop MapReduce集成Avro也就是自然而然的事情。

这个例子是一个简单的Hadoop MapReduce作业：读取Avro格式的源文件进行计数统计，然后将计算结果以Avro格式写到目标文件中。主要目的是体会一下Hadoop MapReduce操作Avro的基本流程和Avro提供的API。

1. Maven依赖

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build for the Avro + Hadoop MapReduce example project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates. -->
    <groupId>learn</groupId>
    <artifactId>learn.avro</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- All Avro artifacts below are pinned to the same version (1.7.7). -->
    <dependencies>
        <!-- Avro core library. -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.7.7</version>
        </dependency>

        <!-- Avro RPC support. -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-ipc</artifactId>
            <version>1.7.7</version>
        </dependency>

        <!-- Avro utilities that let Hadoop MapReduce jobs process Avro files. -->
        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro-mapred</artifactId>
            <version>1.7.7</version>
        </dependency>

        <!-- Hadoop core library. -->
        <!-- NOTE(review): presumably the source of the org.apache.hadoop.* classes used by the job; confirm. -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>


    </dependencies>
    <build>
        <plugins>
            <!-- Avro code generation: bound to the generate-sources phase and runs the
                 schema, protocol and idl-protocol goals. -->
            <plugin>
                <groupId>org.apache.avro</groupId>
                <artifactId>avro-maven-plugin</artifactId>
                <version>1.7.7</version>
                <executions>
                    <execution>
                        <phase>generate-sources</phase>
                        <goals>
                            <goal>schema</goal>
                            <goal>protocol</goal>
                            <goal>idl-protocol</goal>
                        </goals>
                        <configuration>
                            <!-- Avro definition files are read from src/main/avro. -->
                            <sourceDirectory>${project.basedir}/src/main/avro/</sourceDirectory>
                            <!-- Generated Java is written into src/main/java. -->
                            <!-- NOTE(review): this places generated code next to hand-written sources; confirm that is intended. -->
                            <outputDirectory>${project.basedir}/src/main/java/</outputDirectory>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <!-- Compile with Java 1.7 source and target levels. -->
            <!-- NOTE(review): no plugin version is declared here, so it is resolved by Maven defaults; consider pinning. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

2. MapReduce代码：

package examples.avro.mapreduce;

import examples.avro.simple.User;
import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.io.IOException;

/**
 * MapReduce job that counts how many {@code User} records name each favorite color.
 *
 * <p>Input is read with {@code AvroKeyInputFormat}; the per-color counts are written
 * with {@code AvroKeyValueOutputFormat}. The job takes exactly two arguments: the
 * input path and the output path.</p>
 */
public class MapReduceColorCount extends Configured implements Tool {

    /**
     * Mapper that emits one {@code (color, 1)} pair per input record.
     *
     * <p>Input key type is {@code AvroKey<User>} and input value type is
     * {@code NullWritable}; output key type is {@code Text} and output value type is
     * {@code IntWritable}.</p>
     */
    public static class ColorCountMapper extends
            Mapper<AvroKey<User>, NullWritable, Text, IntWritable> {

        @Override
        public void map(AvroKey<User> key, NullWritable value, Context context)
                throws IOException, InterruptedException {

            final CharSequence favorite = key.datum().getFavoriteColor();
            // Records without a favorite color are counted under "none".
            final String colorName = (favorite == null) ? "none" : favorite.toString();
            context.write(new Text(colorName), new IntWritable(1));
        }
    }

    /**
     * Reducer that sums the counts emitted for one color.
     *
     * <p>Input key type is {@code Text} and input value type is {@code IntWritable}
     * (the same as the Mapper's output key/value types); output key type is
     * {@code AvroKey<CharSequence>} and output value type is
     * {@code AvroValue<Integer>}.</p>
     */
    public static class ColorCountReducer extends
            Reducer<Text, IntWritable, AvroKey<CharSequence>, AvroValue<Integer>> {

        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {

            int total = 0;
            for (IntWritable count : values) {
                total += count.get();
            }

            final AvroKey<CharSequence> color = new AvroKey<CharSequence>(key.toString());
            final AvroValue<Integer> occurrences = new AvroValue<Integer>(total);
            context.write(color, occurrences);
        }
    }

    /**
     * Configures and runs the job, blocking until it finishes.
     *
     * @param args exactly two elements: the input path and the output path
     * @return 0 if the job succeeded, 1 if it failed, -1 on wrong argument count
     * @throws Exception if configuring or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.println("Usage: MapReduceColorCount <input path> <output path>");
            return -1;
        }

        final Job job = new Job(getConf());
        job.setJarByClass(MapReduceColorCount.class);
        job.setJobName("Color Count");

        configureMapPhase(job, args[0]);
        configureReducePhase(job, args[1]);

        final boolean succeeded = job.waitForCompletion(true);
        return succeeded ? 0 : 1;
    }

    /** Wires the input path, input format, mapper, and map output types. */
    private static void configureMapPhase(Job job, String inputLocation) throws IOException {
        // The input files are in Avro format.
        FileInputFormat.setInputPaths(job, new Path(inputLocation));

        // AvroKeyInputFormat: a MapReduce InputFormat that can handle Avro container files.
        job.setInputFormatClass(AvroKeyInputFormat.class);
        job.setMapperClass(ColorCountMapper.class);
        AvroJob.setInputKeySchema(job, User.getClassSchema());
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
    }

    /** Wires the output path, output format, reducer, and output schemas. */
    private static void configureReducePhase(Job job, String outputLocation) {
        // The output is an Avro file of key/value pairs; see AvroKeyValueOutputFormat.
        FileOutputFormat.setOutputPath(job, new Path(outputLocation));

        // AvroKeyValueOutputFormat: FileOutputFormat for writing Avro container
        // files of key/value pairs.
        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
        job.setReducerClass(ColorCountReducer.class);
        AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
        AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));
    }

    /** Command-line entry point; exits the JVM with the value returned by {@link #run}. */
    public static void main(String[] args) throws Exception {
        final int exitCode = ToolRunner.run(new MapReduceColorCount(), args);
        System.exit(exitCode);
    }
}

3. 主要类注释

3.1 AvroKey

/** The wrapper of keys for jobs configured with {@link AvroJob} . */

3.2 AvroValue

/** The wrapper of values for jobs configured with {@link AvroJob} . */

3.3 AvroJob

/** Setters to configure jobs for Avro data. */

3.4 AvroKeyInputFormat

/**
 * A MapReduce InputFormat that can handle Avro container files.
 *
 * <p>Keys are AvroKey wrapper objects that contain the Avro data.  Since Avro
 * container files store only records (not key/value pairs), the value from
 * this InputFormat is a NullWritable.</p>
 */

3.5 AvroKeyValueOutputFormat

/**
 * FileOutputFormat for writing Avro container files of key/value pairs.
 *
 * <p>Since Avro container files can only contain records (not key/value pairs), this
 * output format puts the key and value into an Avro generic record with two fields, named
 * 'key' and 'value'.</p>
 *
 * <p>The keys and values given to this output format may be Avro objects wrapped in
 * <code>AvroKey</code> or <code>AvroValue</code> objects.  The basic Writable types are
 * also supported (e.g., IntWritable, Text); they will be converted to their corresponding
 * Avro types.</p>
 *
 * @param <K> The type of key. If an Avro type, it must be wrapped in an <code>AvroKey</code>.
 * @param <V> The type of value. If an Avro type, it must be wrapped in an <code>AvroValue</code>.
 */

3.6 AvroJob.setInputKeySchema / setInputValueSchema

  /**
   * Records the schema of the job's input keys in the job configuration.
   *
   * @param job The job to configure.
   * @param schema The input key schema; it is stored in its string form.
   */
  public static void setInputKeySchema(Job job, Schema schema) {
    final String schemaText = schema.toString();
    job.getConfiguration().set(CONF_INPUT_KEY_SCHEMA, schemaText);
  }

  /**
   * Records the schema of the job's input values in the job configuration.
   *
   * @param job The job to configure.
   * @param schema The input value schema; it is stored in its string form.
   */
  public static void setInputValueSchema(Job job, Schema schema) {
    final String schemaText = schema.toString();
    job.getConfiguration().set(CONF_INPUT_VALUE_SCHEMA, schemaText);
  }

3.7 AvroJob.setMapOutputKeySchema / setMapOutputValueSchema

/**
   * Sets the map output key schema.
   *
   * <p>Marks the map output key class as {@code AvroKey}, installs
   * {@code AvroKeyComparator} as both the grouping and the sort comparator, and
   * passes the schema to {@code AvroSerialization} as the key writer schema and
   * the key reader schema.</p>
   *
   * @param job The job to configure.
   * @param schema The map output key schema.
   */
  public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    // The same comparator class is used for grouping and for sorting.
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    // Writer and reader schema are both set to the same schema.
    AvroSerialization.setKeyWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setKeyReaderSchema(job.getConfiguration(), schema);
    // NOTE(review): presumably registers Avro serialization with the job
    // configuration; confirm against AvroSerialization.
    AvroSerialization.addToConfiguration(job.getConfiguration());
  }

  /**
   * Sets the map output value schema.
   *
   * <p>Marks the map output value class as {@code AvroValue} and passes the schema
   * to {@code AvroSerialization} as the value writer schema and the value reader
   * schema. Unlike the key variant, no comparator is configured here.</p>
   *
   * @param job The job to configure.
   * @param schema The map output value schema.
   */
  public static void setMapOutputValueSchema(Job job, Schema schema) {
    job.setMapOutputValueClass(AvroValue.class);
    // Writer and reader schema are both set to the same schema.
    AvroSerialization.setValueWriterSchema(job.getConfiguration(), schema);
    AvroSerialization.setValueReaderSchema(job.getConfiguration(), schema);
    // NOTE(review): presumably registers Avro serialization with the job
    // configuration; confirm against AvroSerialization.
    AvroSerialization.addToConfiguration(job.getConfiguration());
  }