I grabbed a few images to use as test data: five in total (the pink ones).
First, upload them to HDFS:
hdfs://172.16.11.222:9000/JpgSequence
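For reference, the upload itself is a couple of commands with the standard HDFS CLI (./jpgs here is just a placeholder for whatever local directory holds the images):

hdfs dfs -mkdir -p /JpgSequence
hdfs dfs -put ./jpgs/*.jpg hdfs://172.16.11.222:9000/JpgSequence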
Target location for the generated SequenceFile:
"hdfs://172.16.11.222:9000/Sequence/bb.txt";
The concrete steps and explanations are all annotated as comments in the code itself, so let's go straight to the code.
One note up front: the SequenceFile API differs between Hadoop versions, so the code varies accordingly. This post uses Hadoop 2.x, which is fairly recent.
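To make the version difference concrete, here is a minimal sketch (my own illustration, not from the original post) of the two ways a SequenceFile.Writer can be created. The option-based form is the Hadoop 2.x style; the FileSystem-based overload is the older one, deprecated in 2.x but still working, and it is what main() in the code below uses:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class WriterStyles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("hdfs://172.16.11.222:9000/Sequence/bb.txt");

        // Hadoop 2.x style: option objects, no FileSystem handle needed
        SequenceFile.Writer newStyle = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(path),
                SequenceFile.Writer.keyClass(Text.class),
                SequenceFile.Writer.valueClass(BytesWritable.class));
        newStyle.close();

        // Pre-2.x style: pass the FileSystem explicitly (deprecated in 2.x)
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        SequenceFile.Writer oldStyle = SequenceFile.createWriter(fs, conf, path,
                Text.class, BytesWritable.class);
        oldStyle.close();
    }
}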
Prerequisites: HBase is already running;
a student table with an info column family exists (create one yourself, e.g. with the shell command below).
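If the table does not exist yet, it takes one line in the HBase shell:

hbase(main)> create 'student', 'info'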
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;

import java.net.URI;

/**
 * Created by Administrator on 2017/7/24.
 */
public class SequenceFileTest {

    static String PATH = "hdfs://172.16.11.222:9000/Sequence/bb.txt";
    static SequenceFile.Writer writer = null;

    public static void main(String[] args) throws Exception {
        // Step 1: walk the image directory and pack the files into a SequenceFile.
        // (Already run once, so it is commented out here.)
        // Configuration conf = new Configuration();
        // String path = "hdfs://172.16.11.222:9000/JpgSequence";
        // URI uri = new URI(path);
        // FileSystem fileSystem = FileSystem.get(uri, conf);
        // writer = SequenceFile.createWriter(fileSystem, conf, new Path(PATH),
        //         Text.class, BytesWritable.class);
        // listFileAndWriteToSequenceFile(fileSystem, path);
        // writer.close(); // close the writer, or the SequenceFile may be left incomplete

        // Step 2: read the SequenceFile back and load it into HBase.
        readSequenceFileAndWriteToHBase(new Path(PATH));
    }

    /**
     * Recursively walk a directory and append every file to the SequenceFile:
     * key = full file path (Text), value = raw file bytes (BytesWritable).
     */
    public static void listFileAndWriteToSequenceFile(FileSystem fileSystem, String path) throws Exception {
        final FileStatus[] listStatuses = fileSystem.listStatus(new Path(path));
        for (FileStatus fileStatus : listStatuses) {
            if (fileStatus.isFile()) {
                Text fileText = new Text(fileStatus.getPath().toString());
                System.out.println(fileText.toString());
                FSDataInputStream in = fileSystem.open(new Path(fileText.toString()));
                // IOUtils.toByteArray() already drains the whole stream,
                // so no extra in.read() call is needed afterwards
                byte[] buffer = IOUtils.toByteArray(in);
                in.close();
                BytesWritable value = new BytesWritable(buffer);
                // Append one record to the SequenceFile
                writer.append(fileText, value);
                System.out.println(fileText + " written to the SequenceFile");
            }
            if (fileStatus.isDirectory()) {
                listFileAndWriteToSequenceFile(fileSystem, fileStatus.getPath().toString());
            }
        }
    }

    /**
     * Read the SequenceFile and load every record into HBase.
     * (A tableName parameter really ought to be added here.)
     */
    public static void readSequenceFileAndWriteToHBase(Path path1) throws Exception {
        Configuration conf1 = new Configuration();
        conf1.set("fs.default.name", "hdfs://172.16.11.222:9000");

        // HBase connection settings
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "172.16.11.221,172.16.11.222,172.16.11.223");
        conf.set("hbase.zookeeper.property.clientPort", "2800");
        // Raise this value so HBase does not time out on large cells
        conf.set("dfs.socket.timeout", "180000");
        // Target table
        HTable htable = new HTable(conf, "student");

        // Create the reader -- this is the Hadoop 2.x style, via Reader.Option
        SequenceFile.Reader.Option option1 = SequenceFile.Reader.file(path1);
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf1, option1);
            Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf1);
            BytesWritable value = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf1);
            long position = reader.getPosition();
            while (reader.next(key, value)) {
                String syncSeen = reader.syncSeen() ? "*" : "";
                System.out.printf("[%s%s]\t%s\t%s\n", position, syncSeen, key, value);

                String temp = key.toString();
                // The key is the full HDFS path, e.g.
                // hdfs://172.16.11.222:9000/JpgSequence/化2.jpg
                // and can be split up to build a custom row key:
                // temp = temp.substring(temp.indexOf("hdfs://") + 7);
                // String[] keyCat = temp.split("/");
                // String tempIp = temp.split("/")[0].split(":")[0];   // 172.16.11.222
                // String port   = temp.split("/")[0].split(":")[1];   // 9000
                // String path   = temp.split("/")[1];                 // JpgSequence
                // String data   = temp.split("/")[keyCat.length - 1]; // 化2.jpg
                // The row key design is up to you; combine the pieces however you like.
                String rowKey = temp; // here the key is used unchanged
                System.out.println(rowKey);

                // Value: copy the raw bytes out of the BytesWritable.
                // (value.toString() would give a hex dump, not the image bytes.)
                byte[] imageBytes = value.copyBytes();

                // Build the Put: row key, column family, column qualifier, value
                Put put = new Put(Bytes.toBytes(rowKey));
                put.add("info".getBytes(), temp.getBytes(), imageBytes);
                htable.put(put);
                System.out.println(rowKey + " ... loaded into HBase");

                position = reader.getPosition(); // beginning of next record
            }
        } finally {
            org.apache.hadoop.io.IOUtils.closeStream(reader);
            htable.close();
        }

        // Below is the original version from the web (old-style reader API);
        // I could not get it to work:
        // BytesWritable val = new BytesWritable();
        // Writable key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), conf1);
        // val = (BytesWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf1);
        //
        // while (reader.next(key, val)) {
        //     String temp = key.toString();
        //     temp = temp.substring(temp.indexOf("Image") + 6, temp.indexOf("."));
        //     String[] tmp = temp.split("/");
        //     // row key design
        //     String rowKey = Integer.valueOf(tmp[0]) - 1 + "_"
        //             + Integer.valueOf(tmp[1]) / 2 + "_" + Integer.valueOf(tmp[2]) / 2;
        //     System.out.println(rowKey);
        //     // specify row key, column family, qualifier, value
        //     Put put = new Put(Bytes.toBytes(rowKey));
        //     put.add("picinfo".getBytes(), temp.getBytes(), val.getBytes());
        //     htable.put(put);
        // }
        // org.apache.hadoop.io.IOUtils.closeStream(reader);
    }
}
Here is a screenshot of the result (the output is too long, so only the last few lines are shown):
hbase(main)> scan 'student'