1. Versions
HBase: 1.3.1
Hadoop: 3.1.3
2. Official examples
1. Check which jars are needed to run HBase's MapReduce jobs
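HBase ships a command that prints this classpath; a quick way to list the jars (run from /opt/module/hbase):
bin/hbase mapredcp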
2. Set the environment variable
To make it take effect permanently:
add the following line at the end of hadoop-env.sh
export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:/opt/module/hbase/lib/*
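Alternatively, for the current shell session only, the output of mapredcp can be exported directly (a sketch, assuming the same install paths as above):
export HADOOP_CLASSPATH=$(/opt/module/hbase/bin/hbase mapredcp)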
3. Run the official MapReduce jobs
(1) Case 1: count the number of rows in the student table (reading data)
Run the following from the /opt/module/hbase directory:
/opt/module/hadoop-3.1.3/bin/yarn jar lib/hbase-server-1.3.1.jar rowcounter student
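When the job finishes, the row count is reported in the job counters printed at the end of the output (look for the ROWS counter).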
(2) Case 2: use MapReduce to import local data into HBase (writing data)
1) Create a tsv file locally: fruit.tsv
1001	Apple	Red
1002	Pear	Yellow
1003	Pineapple	Yellow
(the fields are separated by a single tab; importtsv expects tab-delimited input by default)
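The importtsv job below reads the file from HDFS, so upload it first to the root path referenced in the command:
hadoop fs -put fruit.tsv /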
2) Create the HBase table
hbase(main):001:0> create 'fruit','info'
3) Run the MapReduce job to import the data into the HBase fruit table
/opt/module/hadoop-3.1.3/bin/yarn jar lib/hbase-server-1.3.1.jar importtsv \
-Dimporttsv.columns=HBASE_ROW_KEY,info:name,info:color fruit \
hdfs://hadoop102:8020/fruit.tsv
The trailing \ at the end of each line is a line continuation, used here for readability; you can also drop the \ and put the whole command on one line. The last argument is the input path on HDFS.
4) Use the scan command to check the imported data
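For example, in the HBase shell:
hbase(main):002:0> scan 'fruit'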
3. Custom example 1
Goal: write data from HDFS into an HBase table.
1. Build the fruitMapper class
package com.atguigu.mr1;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

// Pass-through mapper: each input line from HDFS is forwarded unchanged;
// the parsing and writing to HBase happens in the reducer.
public class fruitMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }
}
2. Build the fruitReducer class
package com.atguigu.mr1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;

import java.io.IOException;

public class fruitReducer extends TableReducer<LongWritable, Text, NullWritable> {

//    // The column family could also be passed in via the Configuration instead of being hard-coded:
//    String cf1 = null;
//
//    @Override
//    protected void setup(Context context) throws IOException, InterruptedException {
//        Configuration configuration = context.getConfiguration();
//        cf1 = configuration.get("cf1");
//    }

    @Override
    protected void reduce(LongWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        // 1. Iterate over the values, e.g. "1001\tApple\tRed"
        for (Text value : values) {
            // 2. Split each line on tabs
            String[] fields = value.toString().split("\t");
            // 3. Build a Put object keyed by the first field (the row key)
            Put put = new Put(Bytes.toBytes(fields[0]));
            // 4. Populate the Put object
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(fields[1]));
            put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("color"), Bytes.toBytes(fields[2]));
            // 5. Write out
            context.write(NullWritable.get(), put);
        }
    }
}
3. Build the fruitDriver class
package com.atguigu.mr1;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class fruitDriver implements Tool {

    // Configuration injected by ToolRunner
    private Configuration configuration = null;

    @Override
    public int run(String[] strings) throws Exception {
        // 1. Get the Job object
        Job job = Job.getInstance(configuration);
        // 2. Set the driver class
        job.setJarByClass(fruitDriver.class);
        // 3. Set the Mapper and its output key/value types
        job.setMapperClass(fruitMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        // 4. Set the Reducer
        //    strings[0]: input path on HDFS
        //    strings[1]: target HBase table name
        TableMapReduceUtil.initTableReducerJob(strings[1],
                fruitReducer.class,
                job);
        // 5. Set the input path
        FileInputFormat.setInputPaths(job, new Path(strings[0]));
        // 6. Submit the job
        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) {
        try {
            Configuration configuration = new Configuration();
            int run = ToolRunner.run(configuration, new fruitDriver(), args);
            System.exit(run);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
4. Package the project into a jar
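Assuming a standard Maven project (the SNAPSHOT jar name below suggests one), packaging can be as simple as:
mvn clean package -DskipTests
Then copy the resulting hbase-demo-1.0-SNAPSHOT.jar to the cluster.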
5. Run the job
Create the fruit1 table first (see the shell snippet below), then run the command that follows; the first argument, /fruit.tsv, is the input data and the second, fruit1, is the target table name.
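In the HBase shell, assuming the same info column family as before:
hbase(main):001:0> create 'fruit1','info'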
[atguigu@hadoop102 hbase]$ yarn jar hbase-demo-1.0-SNAPSHOT.jar com.atguigu.mr1.fruitDriver /fruit.tsv fruit1
6. Use the scan command to check the imported data
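For example:
hbase(main):002:0> scan 'fruit1'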
4. Custom example 2
Goal: read data from HBase and write it back to HBase, migrating the data in the fruit1 table into the fruit2 table with MapReduce.
1. Build the Fruit2Mapper class
package com.atguigu.mr2;

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;
import org.apache.hadoop.hbase.util.Bytes;

import java.io.IOException;

public class Fruit2Mapper extends TableMapper<ImmutableBytesWritable, Put> {
    @Override
    protected void map(ImmutableBytesWritable key, Result value, Context context) throws IOException, InterruptedException {
        // Build a Put object keyed by the current row key
        Put put = new Put(key.get());
        // 1. Walk through every cell in the row
        for (Cell cell : value.rawCells()) {
            // 2. Keep only the "name" column
            if ("name".equals(Bytes.toString(CellUtil.cloneQualifier(cell)))) {
                // 3. Copy the cell into the Put (put.addColumn() would also work, see below)
                put.add(cell);
            }
        }
        // 4. Write out
        context.write(key, put);
    }
}
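The put.add(cell) call copies the original cell as-is, timestamp included. If you prefer put.addColumn(), an equivalent sketch (a hypothetical helper, not part of the original code) looks like this; note the write gets a fresh timestamp instead of the original one:

    // Roughly equivalent to put.add(cell), minus the original timestamp
    private static void copyCell(Put put, Cell cell) {
        put.addColumn(CellUtil.cloneFamily(cell),
                CellUtil.cloneQualifier(cell),
                CellUtil.cloneValue(cell));
    }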
2. Build the Fruit2Reducer class
package com.atguigu.mr2;

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.io.NullWritable;

import java.io.IOException;

public class Fruit2Reducer extends TableReducer<ImmutableBytesWritable, Put, NullWritable> {
    @Override
    protected void reduce(ImmutableBytesWritable key, Iterable<Put> values, Context context) throws IOException, InterruptedException {
        // Write out every Put received from the mapper
        for (Put put : values) {
            context.write(NullWritable.get(), put);
        }
    }
}
3. Build the Fruit2Driver class
package com.atguigu.mr2;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Fruit2Driver implements Tool {

    private Configuration configuration = null;

    @Override
    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(configuration);
        job.setJarByClass(Fruit2Driver.class);
        // Read from the source table fruit1 (populated in custom example 1)
        TableMapReduceUtil.initTableMapperJob("fruit1",
                new Scan(),
                Fruit2Mapper.class,
                ImmutableBytesWritable.class,
                Put.class,
                job);
        // Write to the target table fruit2
        TableMapReduceUtil.initTableReducerJob("fruit2",
                Fruit2Reducer.class,
                job);
        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        configuration = conf;
    }

    @Override
    public Configuration getConf() {
        return configuration;
    }

    public static void main(String[] args) {
        try {
            // Use HBaseConfiguration.create() instead of new Configuration()
            // so that the hbase-site.xml on the classpath is picked up
            Configuration configuration = HBaseConfiguration.create();
            ToolRunner.run(configuration, new Fruit2Driver(), args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
4. Write the configuration file
Create a new file under resources and copy into it the hbase-site.xml found at /opt/module/hbase/conf on hadoop102; the file name must stay exactly the same (hbase-site.xml).
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!--
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-->
<configuration>
    <property>
        <name>hbase.rootdir</name>
        <value>hdfs://hadoop102:8020/HBase</value>
    </property>
    <property>
        <name>hbase.cluster.distributed</name>
        <value>true</value>
    </property>
    <!-- New since 0.98: earlier versions had no .port property and used port 60000 by default -->
    <property>
        <name>hbase.master.port</name>
        <value>16000</value>
    </property>
    <property>
        <name>hbase.zookeeper.quorum</name>
        <value>hadoop102,hadoop103,hadoop104</value>
    </property>
    <property>
        <name>hbase.zookeeper.property.dataDir</name>
        <value>/opt/module/zookeeper-3.5.7/zkData</value>
    </property>
</configuration>
5. Run it directly on the local machine (run Fruit2Driver's main method in the IDE; HBaseConfiguration.create() picks up the hbase-site.xml above from the classpath, so the job connects to the cluster)
6. Use the scan command to check the migrated data
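For example:
hbase(main):003:0> scan 'fruit2'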