跟我学hadoop学习4

本文介绍使用 Apache Avro 与 Hadoop MapReduce 进行数据处理的几个示例,包括如何找出最大温度记录、进行数据投影及排序Avro文件等。通过具体的Java代码实现展示了如何配置Job并使用Avro进行数据读写。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

// cc AvroGenericMaxTemperature MapReduce program to find the maximum temperature, creating Avro output

import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.AvroUtf8InputFormat;
import org.apache.avro.mapred.Pair;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//vv AvroGenericMaxTemperature
public class AvroGenericMaxTemperature extends Configured implements Tool {
  
  private static final Schema SCHEMA = new Schema.Parser().parse(
      "{" +
      "  \"type\": \"record\"," +
      "  \"name\": \"WeatherRecord\"," +
      "  \"doc\": \"A weather reading.\"," +
      "  \"fields\": [" +
      "    {\"name\": \"year\", \"type\": \"int\"}," +
      "    {\"name\": \"temperature\", \"type\": \"int\"}," +
      "    {\"name\": \"stationId\", \"type\": \"string\"}" +
      "  ]" +
      "}"
  );

  public static class MaxTemperatureMapper
      extends AvroMapper<Utf8, Pair<Integer, GenericRecord>> {
    private NcdcRecordParser parser = new NcdcRecordParser();
    private GenericRecord record = new GenericData.Record(SCHEMA);
    @Override
    public void map(Utf8 line,
        AvroCollector<Pair<Integer, GenericRecord>> collector,
        Reporter reporter) throws IOException {
      parser.parse(line.toString());
      if (parser.isValidTemperature()) {
        record.put("year", parser.getYearInt());
        record.put("temperature", parser.getAirTemperature());
        record.put("stationId", parser.getStationId());
        collector.collect(
            new Pair<Integer, GenericRecord>(parser.getYearInt(), record));
      }
    }
  }
  
  public static class MaxTemperatureReducer
      extends AvroReducer<Integer, GenericRecord, GenericRecord> {

    @Override
    public void reduce(Integer key, Iterable<GenericRecord> values,
        AvroCollector<GenericRecord> collector, Reporter reporter)
        throws IOException {
      GenericRecord max = null;
      for (GenericRecord value : values) {
        if (max == null || 
            (Integer) value.get("temperature") > (Integer) max.get("temperature")) {
          max = newWeatherRecord(value);
        }
      }
      collector.collect(max);
    }
    private GenericRecord newWeatherRecord(GenericRecord value) {
      GenericRecord record = new GenericData.Record(SCHEMA);
      record.put("year", value.get("year"));
      record.put("temperature", value.get("temperature"));
      record.put("stationId", value.get("stationId"));
      return record;
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf("Usage: %s [generic options] <input> <output>\n",
          getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
    
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");
    
    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    
    AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
    AvroJob.setMapOutputSchema(conf,
        Pair.getPairSchema(Schema.create(Schema.Type.INT), SCHEMA));
    AvroJob.setOutputSchema(conf, SCHEMA);
    
    conf.setInputFormat(AvroUtf8InputFormat.class);

    AvroJob.setMapperClass(conf, MaxTemperatureMapper.class);
    AvroJob.setReducerClass(conf, MaxTemperatureReducer.class);

    JobClient.runJob(conf);
    return 0;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new AvroGenericMaxTemperature(), args);
    System.exit(exitCode);
  }
}
// ^^ AvroGenericMaxTemperature



import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.AvroUtf8InputFormat;
import org.apache.avro.mapred.Pair;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import specific.WeatherRecord;

public class AvroSpecificMaxTemperature extends Configured implements Tool {
  
  public static class MaxTemperatureMapper
      extends AvroMapper<Utf8, Pair<Integer, WeatherRecord>> {
    private NcdcRecordParser parser = new NcdcRecordParser();
    private WeatherRecord record = new WeatherRecord();
    @Override
    public void map(Utf8 line,
        AvroCollector<Pair<Integer, WeatherRecord>> collector,
        Reporter reporter) throws IOException {
      parser.parse(line.toString());
      if (parser.isValidTemperature()) {
        record.year = parser.getYearInt();
        record.temperature = parser.getAirTemperature();
        record.stationId = new Utf8(parser.getStationId());
        collector.collect(
            new Pair<Integer, WeatherRecord>(parser.getYearInt(), record));
      }
    }
  }
  
  public static class MaxTemperatureReducer extends
      AvroReducer<Integer, WeatherRecord, WeatherRecord> {

    @Override
    public void reduce(Integer key, Iterable<WeatherRecord> values,
        AvroCollector<WeatherRecord> collector,
        Reporter reporter) throws IOException {
      WeatherRecord max = null;
      for (WeatherRecord value : values) {
        if (max == null || value.temperature > max.temperature) {
          max = newWeatherRecord(value);
        }
      }
      collector.collect(max);
    }
  }

  public static class MaxTemperatureCombiner extends
      AvroReducer<Integer, WeatherRecord, Pair<Integer, WeatherRecord>> {
    
    @Override
    public void reduce(Integer key, Iterable<WeatherRecord> values,
        AvroCollector<Pair<Integer, WeatherRecord>> collector,
        Reporter reporter) throws IOException {
      WeatherRecord max = null;
      for (WeatherRecord value : values) {
        if (max == null || value.temperature > max.temperature) {
          max = newWeatherRecord(value);
        }
      }
      collector.collect(new Pair<Integer, WeatherRecord>(key, max));
    }
  }

  private static WeatherRecord newWeatherRecord(WeatherRecord value) {
    WeatherRecord record = new WeatherRecord();
    record.year = value.year;
    record.temperature = value.temperature;
    record.stationId = value.stationId;
    return record;
  }
  
  @Override
  public int run(String[] args) throws Exception {
    if (args.length != 2) {
      System.err.printf("Usage: %s [generic options] <input> <output>\n",
          getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
    
    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Max temperature");
    
    FileInputFormat.addInputPath(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));
    
    AvroJob.setInputSchema(conf, Schema.create(Schema.Type.STRING));
    AvroJob.setMapOutputSchema(conf, Pair.getPairSchema(
        Schema.create(Schema.Type.INT), WeatherRecord.SCHEMA$));
    AvroJob.setOutputSchema(conf, WeatherRecord.SCHEMA$);
    
    conf.setInputFormat(AvroUtf8InputFormat.class);

    AvroJob.setMapperClass(conf, MaxTemperatureMapper.class);
    AvroJob.setCombinerClass(conf, MaxTemperatureCombiner.class);
    AvroJob.setReducerClass(conf, MaxTemperatureReducer.class);

    JobClient.runJob(conf);
    return 0;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new AvroSpecificMaxTemperature(), args);
    System.exit(exitCode);
  }
}


import java.text.*;
import java.util.Date;

import org.apache.hadoop.io.Text;

public class NcdcRecordParser {
  
  private static final int MISSING_TEMPERATURE = 9999;
  
  private static final DateFormat DATE_FORMAT =
    new SimpleDateFormat("yyyyMMddHHmm");
  
  private String stationId;
  private String observationDateString;
  private String year;
  private String airTemperatureString;
  private int airTemperature;
  private boolean airTemperatureMalformed;
  private String quality;
  
  public void parse(String record) {
    stationId = record.substring(4, 10) + "-" + record.substring(10, 15);
    observationDateString = record.substring(15, 27);
    year = record.substring(15, 19);
    airTemperatureMalformed = false;
    // Remove leading plus sign as parseInt doesn't like them
    if (record.charAt(87) == '+') {
      airTemperatureString = record.substring(88, 92);
      airTemperature = Integer.parseInt(airTemperatureString);
    } else if (record.charAt(87) == '-') {
      airTemperatureString = record.substring(87, 92);
      airTemperature = Integer.parseInt(airTemperatureString);
    } else {
      airTemperatureMalformed = true;
    }
    airTemperature = Integer.parseInt(airTemperatureString);
    quality = record.substring(92, 93);
  }
  
  public void parse(Text record) {
    parse(record.toString());
  }
  
  public boolean isValidTemperature() {
    return !airTemperatureMalformed && airTemperature != MISSING_TEMPERATURE
        && quality.matches("[01459]");
  }
  
  public boolean isMalformedTemperature() {
    return airTemperatureMalformed;
  }
  
  public boolean isMissingTemperature() {
    return airTemperature == MISSING_TEMPERATURE;
  }
  
  public String getStationId() {
    return stationId;
  }
  
  public Date getObservationDate() {
    try {
      System.out.println(observationDateString);
      return DATE_FORMAT.parse(observationDateString);
    } catch (ParseException e) {
      throw new IllegalArgumentException(e);
    }
  }

  public String getYear() {
    return year;
  }

  public int getYearInt() {
    return Integer.parseInt(year);
  }

  public int getAirTemperature() {
    return airTemperature;
  }
  
  public String getAirTemperatureString() {
    return airTemperatureString;
  }

  public String getQuality() {
    return quality;
  }

}



import java.io.File;

import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroJob;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class AvroProjection extends Configured implements Tool {

  @Override
  public int run(String[] args) throws Exception {
    
    if (args.length != 3) {
      System.err.printf("Usage: %s [generic options] <input> <output> <schema-file>\n",
          getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
    
    String input = args[0];
    String output = args[1];
    String schemaFile = args[2];

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Avro projection");
    
    FileInputFormat.addInputPath(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    
    Schema schema = new Schema.Parser().parse(new File(schemaFile));
    AvroJob.setInputSchema(conf, schema);
    AvroJob.setMapOutputSchema(conf, schema);
    AvroJob.setOutputSchema(conf, schema);
    conf.setNumReduceTasks(0);

    JobClient.runJob(conf); 
    return 0;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new AvroProjection(), args);
    System.exit(exitCode);
  }

}


// cc AvroSort A MapReduce program to sort an Avro data file

import java.io.File;
import java.io.IOException;

import org.apache.avro.Schema;
import org.apache.avro.mapred.AvroCollector;
import org.apache.avro.mapred.AvroJob;
import org.apache.avro.mapred.AvroMapper;
import org.apache.avro.mapred.AvroReducer;
import org.apache.avro.mapred.Pair;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

//vv AvroSort
public class AvroSort extends Configured implements Tool {

  static class SortMapper<K> extends AvroMapper<K, Pair<K, K>> {
    public void map(K datum, AvroCollector<Pair<K, K>> collector,
        Reporter reporter) throws IOException {
      collector.collect(new Pair<K, K>(datum, null, datum, null));
    }
  }

  static class SortReducer<K> extends AvroReducer<K, K, K> {
    public void reduce(K key, Iterable<K> values,
        AvroCollector<K> collector,
        Reporter reporter) throws IOException {
      for (K value : values) {
        collector.collect(value);
      }
    }
  }

  @Override
  public int run(String[] args) throws Exception {
    
    if (args.length != 3) {
      System.err.printf(
        "Usage: %s [generic options] <input> <output> <schema-file>\n",
        getClass().getSimpleName());
      ToolRunner.printGenericCommandUsage(System.err);
      return -1;
    }
    
    String input = args[0];
    String output = args[1];
    String schemaFile = args[2];

    JobConf conf = new JobConf(getConf(), getClass());
    conf.setJobName("Avro sort");
    
    FileInputFormat.addInputPath(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, new Path(output));
    
    Schema schema = new Schema.Parser().parse(new File(schemaFile));
    AvroJob.setInputSchema(conf, schema);
    Schema intermediateSchema = Pair.getPairSchema(schema, schema);
    AvroJob.setMapOutputSchema(conf, intermediateSchema);
    AvroJob.setOutputSchema(conf, schema);
    
    AvroJob.setMapperClass(conf, SortMapper.class);
    AvroJob.setReducerClass(conf, SortReducer.class);
  
    JobClient.runJob(conf); 
    return 0;
  }
  
  public static void main(String[] args) throws Exception {
    int exitCode = ToolRunner.run(new AvroSort(), args);
    System.exit(exitCode);
  }
}
// ^^ AvroSort




<think>嗯,用户想学习数据库,我得想想怎么组织回答比较好。首先,用户可能是个刚入门的新手,需要从基础开始讲起。那应该先介绍数据库的基本概念,比如什么是数据库,常见的类型有哪些,比如关系型和非关系型的区别。 接下来,学习路径很重要。可能需要分步骤,比如先学SQL,因为这是基础。然后关系型数据库,比如MySQL或者PostgreSQL,这些比较常用。接着是非关系型数据库,比如MongoDB或者Redis,用户可能需要知道不同场景下的选择。 然后实践部分很关键,用户需要实际操作,安装数据库软件,创建表,写查询语句。这部分最好给出具体的例子,比如如何用SQL创建表,插入数据,查询数据,这样用户能跟着做。 然后是进阶内容,比如索引、事务、锁机制,这些对于优化和理解数据库原理很重要。可能还需要提到数据库设计,比如范式和ER模型,帮助用户设计合理的数据库结构。 接下来是扩展部分,分布式数据库和云数据库现在挺重要的,特别是云服务普及了,用户可能想了解AWS RDS或者阿里云这些平台。大数据相关的Hadoop和Spark也可以提一下,作为扩展知识。 学习资源方面,推荐一些书籍,比如《SQL必知必会》和《数据库系统概念》,还有在线平台,比如Coursera和实验楼,这些资源比较实用。社区和论坛比如Stack Overflow和掘金,用户遇到问题可以去那里求助。 最后,项目实战很重要,用户需要实际做项目来巩固知识,比如设计一个电商系统的数据库,或者优化现有系统。持续学习方面,关注新技术和参与开源项目,保持更新。 要注意用户可能的问题点,比如安装配置数据库时遇到困难,或者理解复杂概念如事务隔离级别,这时候要建议查阅资料和实践。另外,用户可能想快速应用,所以强调实践和项目经验。 整体结构需要清晰,分步骤,每个部分简明扼要,给出具体例子和资源,帮助用户一步步学习。同时避免使用专业术语过多,用易懂的语言解释,确保用户能跟上节奏。</think>学习数据库需要系统化的路径和实践,以下是分阶段的学习建议: --- ### 一、基础概念铺垫 1. **理解数据库核心概念** - 数据库定义:结构化数据集合(如:学生信息表) - 常见类型: - 关系型数据库:MySQL, PostgreSQL(数据以二维表存储) - 非关系型数据库:MongoDB, Redis(键值对/文档存储) 2. **学习数据操作语言** - SQL语法基础: ```sql SELECT name FROM students WHERE age > 18; ``` - 重点掌握:CRUD操作、表连接、聚合函数 --- ### 二、实践技能培养 1. **环境搭建** - 安装MySQL或PostgreSQL - 使用命令行或工具(如DBeaver)连接数据库 2. **动手实验** - 创建表: ```sql CREATE TABLE users ( id INT PRIMARY KEY, name VARCHAR(50), email VARCHAR(100) UNIQUE ); ``` - 数据操作练习:插入/更新/删除记录 3. **设计实践** - 数据规范化(消除冗余) - 绘制ER图:实体关系模型设计 --- ### 三、进阶技术探索 1. **性能优化** - 索引原理:B+树结构加速查询 - 执行计划分析: ```sql EXPLAIN SELECT * FROM orders WHERE total > 1000; ``` 2. **事务管理** - ACID特性: $$ \text{原子性}(Atomicity) + \text{一致性}(Consistency) + \text{隔离性}(Isolation) + \text{持久性}(Durability) $$ - 隔离级别:读未提交 → 串行化 --- ### 四、扩展技术栈 1. **分布式数据库** - 分库分表策略 - 最终一致性 vs 强一致性 2. **云数据库实践** - AWS RDS/Azure SQL部署 - 备份与恢复策略配置 --- ### 五、学习资源推荐 1. **教材与课程** - 《数据库系统概念》(原书第7版) - Coursera《Database Systems》专项课程 2. **实战平台** - LeetCode数据库题库 - Kaggle数据集分析项目 --- ### 六、项目实战建议 1. **初级项目** - 图书馆管理系统(表设计+借阅逻辑) 2. **高级项目** - 电商平台数据库优化(读写分离+缓存整合) --- ### 常见问题解决 - **安装报错**:检查端口占用/权限设置 - **查询性能差**:分析慢查询日志+添加合适索引 - **死锁问题**:调整事务隔离级别/优化事务粒度 通过理论→实践→优化的循环学习,配合持续项目实践,可系统掌握数据库技术。建议每周投入10-15小时,3-6个月可达到中级水平。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值