Hadoop生态 -- HBase API 大全详解

本文全面解析 HBase 的核心 Java API,涵盖表管理、数据操作、高级查询等关键功能,结合代码示例展示最佳实践。基于 HBase 2.4+ 版本。

一、环境配置与连接管理

1. Maven 依赖

<dependency>
  <groupId>org.apache.hbase</groupId>
  <artifactId>hbase-client</artifactId>
  <version>2.4.11</version>
</dependency>

2. 连接创建与关闭

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

// 创建配置
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "zk1,zk2,zk3");
config.set("hbase.zookeeper.property.clientPort", "2181");

// 创建连接(线程安全,应复用)
try (Connection connection = ConnectionFactory.createConnection(config)) {
  // 所有操作在此执行
} 

二、表管理 API

1. 表创建与删除

import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.regionserver.BloomType;
import org.apache.hadoop.hbase.util.Bytes;

try (Admin admin = connection.getAdmin()) {
  TableName tableName = TableName.valueOf("users");
  
  // 构建列族描述符
  ColumnFamilyDescriptor cfDesc = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("info"))
      .setMaxVersions(3)
      .setCompressionType(Compression.Algorithm.SNAPPY)
      .build();
  
  // 构建表描述符
  TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
      .setColumnFamily(cfDesc)
      .build();
  
  // 创建表
  if (!admin.tableExists(tableName)) {
    admin.createTable(tableDesc);
    System.out.println("Table created");
  }
  
  // 删除表
  admin.disableTable(tableName);
  admin.deleteTable(tableName);
}

2. 表修改与查询

// 添加新列族
ColumnFamilyDescriptor newCf = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("contact"))
    .setBloomFilterType(BloomType.ROW)
    .build();
admin.addColumnFamily(tableName, newCf);

// 修改列族配置
ColumnFamilyDescriptor modifiedCf = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("info"))
    .setMaxVersions(5)
    .build();
admin.modifyColumnFamily(tableName, modifiedCf);

// 列出所有表
TableName[] tables = admin.listTableNames();
for (TableName name : tables) {
  System.out.println(name.getNameAsString());
}

三、数据操作 API

1. 数据写入 (Put)

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import java.util.ArrayList;
import java.util.List;

try (Table table = connection.getTable(TableName.valueOf("users"))) {
  // 单行写入
  Put put = new Put(Bytes.toBytes("user001"));
  put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Alice"));
  put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(28));
  put.addColumn(Bytes.toBytes("contact"), Bytes.toBytes("email"), 
      System.currentTimeMillis(),  // 自定义时间戳
      Bytes.toBytes("alice@example.com"));
  
  table.put(put);
  
  // 批量写入(行键与数据均为示例值)
  List<Put> puts = new ArrayList<>();
  Put put2 = new Put(Bytes.toBytes("user002"));
  put2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Bob"));
  puts.add(put2);
  Put put3 = new Put(Bytes.toBytes("user003"));
  put3.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Carol"));
  puts.add(put3);
  table.put(puts);
}
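
除了 put(List<Put>),Table 还提供通用的 batch 方法,可以把 Put、Delete、Get 等不同类型的操作混合在一次批量请求中提交。下面是一个示意写法(假设 table 为按上文方式获取的 Table 实例,行键与数据均为示例值,需额外引入 Row、Delete、Get):

// 混合批量操作:results 数组按提交顺序存放每个操作的结果
List<Row> actions = new ArrayList<>();
actions.add(new Put(Bytes.toBytes("user004"))
    .addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Dave")));
actions.add(new Delete(Bytes.toBytes("user005")));
actions.add(new Get(Bytes.toBytes("user001")));

Object[] results = new Object[actions.size()];
try {
  table.batch(actions, results);
} catch (InterruptedException e) {
  Thread.currentThread().interrupt();
}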

2. 数据读取 (Get)

import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;

Get get = new Get(Bytes.toBytes("user001"));
// 指定要获取的列
get.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));
// 获取多个版本
get.readVersions(3);

Result result = table.get(get);
if (!result.isEmpty()) {
  // 获取最新版本
  byte[] name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
  System.out.println("Name: " + Bytes.toString(name));
  
  // 获取所有版本
  List<Cell> cells = result.getColumnCells(Bytes.toBytes("contact"), Bytes.toBytes("email"));
  for (Cell cell : cells) {
    System.out.println("Value: " + Bytes.toString(CellUtil.cloneValue(cell)) + 
        " at " + cell.getTimestamp());
  }
}

3. 数据删除 (Delete)

import org.apache.hadoop.hbase.client.Delete;

// 删除特定单元格(addColumn 只删除最新版本,addColumns 删除该列的所有版本)
Delete delete = new Delete(Bytes.toBytes("user001"));
delete.addColumn(Bytes.toBytes("contact"), Bytes.toBytes("email"));

// 删除特定时间戳对应的版本
delete.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), 1672531200000L);

table.delete(delete);

// 删除整行
Delete fullDelete = new Delete(Bytes.toBytes("user002"));
table.delete(fullDelete);

四、扫描与过滤器 API

1. 基础扫描 (Scan)

import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.ResultScanner;

Scan scan = new Scan();
// 设置扫描范围 [startRow, endRow)
scan.withStartRow(Bytes.toBytes("user100"));
scan.withStopRow(Bytes.toBytes("user200"));
// 设置要获取的列
scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));
scan.setCaching(100);  // 每次RPC获取的行数

try (ResultScanner scanner = table.getScanner(scan)) {
  for (Result result : scanner) {
    // 处理每一行结果
    byte[] row = result.getRow();
    byte[] name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
    System.out.println(Bytes.toString(row) + ": " + Bytes.toString(name));
  }
}

2. 过滤器使用

import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.filter.*;

// 单列值过滤器
SingleColumnValueFilter nameFilter = new SingleColumnValueFilter(
    Bytes.toBytes("info"), 
    Bytes.toBytes("name"),
    CompareOperator.EQUAL, 
    new SubstringComparator("John")
);
nameFilter.setFilterIfMissing(true);  // 如果列不存在则过滤掉

// 行键前缀过滤器
PrefixFilter prefixFilter = new PrefixFilter(Bytes.toBytes("userA"));

// 分页过滤器
PageFilter pageFilter = new PageFilter(10);  // 每页10行

// 组合过滤器
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
filterList.addFilter(nameFilter);
filterList.addFilter(new KeyOnlyFilter());  // 只返回键

scan.setFilter(filterList);
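
需要注意,PageFilter 只在单个 Region 内限制返回行数,跨多个 Region 扫描时客户端仍可能拿到多于一页的数据,因此分页通常还要由客户端记录上一页的最后一行并自行截断。下面是一个分页扫描的参考写法(示意性质,假设沿用上文获取的 table 对象):

// 分页扫描:每页 10 行,逐页向后推进
int pageSize = 10;
byte[] lastRow = null;
boolean more = true;
while (more) {
  Scan pageScan = new Scan();
  pageScan.setFilter(new PageFilter(pageSize));
  if (lastRow != null) {
    // 追加 0x00,使起始行变为"上一页最后一行的下一行"
    pageScan.withStartRow(Bytes.add(lastRow, new byte[] {0}));
  }
  int count = 0;
  try (ResultScanner scanner = table.getScanner(pageScan)) {
    for (Result r : scanner) {
      lastRow = r.getRow();
      count++;
      if (count >= pageSize) {
        break;  // 客户端侧截断到页大小
      }
    }
  }
  more = (count == pageSize);
}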

五、计数器与原子操作

1. 计数器操作

import org.apache.hadoop.hbase.client.Increment;

Increment increment = new Increment(Bytes.toBytes("counter001"));
increment.addColumn(Bytes.toBytes("stats"), Bytes.toBytes("page_views"), 1);
increment.addColumn(Bytes.toBytes("stats"), Bytes.toBytes("clicks"), 5);

Result result = table.increment(increment);
long newViews = Bytes.toLong(
    result.getValue(Bytes.toBytes("stats"), Bytes.toBytes("page_views"))
);
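
如果只需要对单个列自增,Table 还提供更简洁的 incrementColumnValue 方法:

// 等价于只包含一列的 Increment,返回自增后的新值
long views = table.incrementColumnValue(
    Bytes.toBytes("counter001"),
    Bytes.toBytes("stats"),
    Bytes.toBytes("page_views"),
    1L);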

2. 原子 CheckAndPut

Put put = new Put(Bytes.toBytes("user001"));
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("status"), Bytes.toBytes("active"));

// 只有当原值为"inactive"时才更新
boolean success = table.checkAndMutate(
    Bytes.toBytes("user001"),
    Bytes.toBytes("info"))
    .qualifier(Bytes.toBytes("status"))
    .ifEquals(Bytes.toBytes("inactive"))
    .thenPut(put);

六、高级特性 API

1. 协处理器 (Coprocessor)

// 端点协处理器示例(示意写法:假设 CountRequest/CountResponse/CountService 为 protobuf 生成的类,
// HBase 2.x 中端点需继承生成的 Service 并实现 RegionCoprocessor,此处省略部分生命周期细节)
public class CountEndpoint extends CountService implements RegionCoprocessor {
  private RegionCoprocessorEnvironment env;

  @Override
  public Iterable<Service> getServices() {
    return Collections.singleton(this);  // 把本服务注册给协处理器框架
  }

  @Override
  public void start(CoprocessorEnvironment e) {
    this.env = (RegionCoprocessorEnvironment) e;
  }

  @Override
  public void getRowCount(RpcController controller, CountRequest request,
      RpcCallback<CountResponse> done) {
    long count = 0;
    try (RegionScanner scanner = env.getRegion().getScanner(new Scan())) {
      List<Cell> results = new ArrayList<>();
      boolean hasMore;
      do {
        hasMore = scanner.next(results);   // 每次取出一行的 Cell
        count += results.size();           // 这里统计的是 Cell 数,按行计数可改为 count++
        results.clear();
      } while (hasMore);
    } catch (IOException e) {
      CoprocessorRpcUtils.setControllerException(controller, e);
    }
    done.run(CountResponse.newBuilder().setCount(count).build());
  }
}

// 客户端调用
Table table = connection.getTable(tableName);
final CountRequest request = CountRequest.getDefaultInstance();
Map<byte[], Long> results = table.coprocessorService(
    CountService.class,
    null,  // 所有region
    null, 
    new Batch.Call<CountService, Long>() {
      public Long call(CountService counter) throws IOException {
        ServerRpcController controller = new ServerRpcController();
        CoprocessorRpcUtils.BlockingRpcCallback<CountResponse> rpcCallback = new CoprocessorRpcUtils.BlockingRpcCallback<>();
        counter.getRowCount(controller, request, rpcCallback);
        return rpcCallback.get().getCount();
      }
    }
);
long total = 0;
for (Long count : results.values()) {
  total += count;
}

2. 布隆过滤器

// 创建表时启用布隆过滤器
ColumnFamilyDescriptor cfDesc = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("cf"))
    .setBloomFilterType(BloomType.ROWCOL)
    .build();

3. 时间序列数据优化

// 使用反转时间戳避免热点(数值需按定长字节拼接,字符串拼接会导致排序混乱)
long reverseTimestamp = Long.MAX_VALUE - System.currentTimeMillis();
Put put = new Put(Bytes.add(Bytes.toBytes("sensor001"), Bytes.toBytes(reverseTimestamp)));

七、异步 API (HBase 2.0+)

import org.apache.hadoop.hbase.client.AsyncConnection;
import org.apache.hadoop.hbase.client.AsyncTable;
import org.apache.hadoop.hbase.client.Get;

// 创建异步连接
CompletableFuture<AsyncConnection> asyncConn = ConnectionFactory.createAsyncConnection(config);

asyncConn.thenAccept(conn -> {
  AsyncTable<AdvancedScanResultConsumer> table = conn.getTable(TableName.valueOf("users"));
  
  // 异步Get
  CompletableFuture<Result> future = table.get(new Get(Bytes.toBytes("user001")));
  
  future.thenAccept(result -> {
    // 处理结果
    byte[] value = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
    System.out.println("Async result: " + Bytes.toString(value));
  }).exceptionally(ex -> {
    System.err.println("Error: " + ex.getMessage());
    return null;
  });
});
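
AsyncTable 同样支持异步扫描,例如 scanAll 会把整个扫描结果汇总到一个 CompletableFuture 中,适合结果集较小的场景(大结果集建议改用 scan(Scan, consumer) 流式消费)。下面是一个简单示意,假设沿用上例中获取的 AsyncTable 对象 table:

// 异步扫描:scanAll 一次性返回全部结果
Scan scan = new Scan()
    .withStartRow(Bytes.toBytes("user100"))
    .withStopRow(Bytes.toBytes("user200"));
table.scanAll(scan).thenAccept(results -> {
  for (Result r : results) {
    System.out.println(Bytes.toString(r.getRow()));
  }
});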

八、MapReduce 集成

1. HBase 作为数据源

import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

Job job = Job.getInstance(config, "HBase Reader");
Scan scan = new Scan();
scan.setCaching(500);

// 设置Mapper
TableMapReduceUtil.initTableMapperJob(
  "input_table",
  scan,
  HBaseMapper.class,
  Text.class,
  IntWritable.class,
  job
);

job.setReducerClass(HBaseReducer.class);
job.setOutputFormatClass(TextOutputFormat.class);
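
上面注册的 HBaseMapper 原文未给出实现,下面补充一个假设的最小示例(按 info:name 的取值输出计数 1,类名与列名仅为示意):

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableMapper;

public static class HBaseMapper extends TableMapper<Text, IntWritable> {
  private static final IntWritable ONE = new IntWritable(1);

  @Override
  protected void map(ImmutableBytesWritable rowKey, Result value, Context context)
      throws IOException, InterruptedException {
    byte[] name = value.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
    if (name != null) {
      context.write(new Text(Bytes.toString(name)), ONE);  // 以 name 的值为 key 输出计数 1
    }
  }
}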

2. HBase 作为输出

TableMapReduceUtil.initTableReducerJob(
  "output_table",
  HBaseReducer.class,
  job
);
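
对应的 HBaseReducer 需要继承 TableReducer,并以 Put/Delete 等 Mutation 作为输出写回 HBase。下面是一个假设的最小示意(输出列族 cf 与列名 count 仅为示例):

import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableReducer;

public static class HBaseReducer extends TableReducer<Text, IntWritable, ImmutableBytesWritable> {
  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable v : values) {
      sum += v.get();
    }
    // 把每个 key 的累计值写入输出表的 cf:count 列
    Put put = new Put(Bytes.toBytes(key.toString()));
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(sum));
    context.write(new ImmutableBytesWritable(put.getRow()), put);
  }
}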

九、最佳实践与性能优化

1. 连接管理

// Connection 是重量级且线程安全的对象,官方建议整个应用复用同一个实例,
// 客户端并不提供 HConnectionPool 之类的连接池;Table/Admin 则是轻量级、非线程安全的,按需获取、用完即关
Connection sharedConnection = ConnectionFactory.createConnection(config); // 应用启动时创建并长期复用

try (Table table = sharedConnection.getTable(TableName.valueOf("users"))) {
  // 操作表
}

// 应用退出时统一关闭
sharedConnection.close();

2. 批量操作优化

// 使用BufferedMutator提高写入吞吐
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("logs"))
    .writeBufferSize(8 * 1024 * 1024); // 8MB缓冲区

try (BufferedMutator mutator = connection.getBufferedMutator(params)) {
  for (int i = 0; i < 10000; i++) {
    Put put = new Put(Bytes.toBytes("row" + i));
    put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("value" + i)); // 列族/列名为示例值
    mutator.mutate(put);
  }
  mutator.flush(); // 手动刷新缓冲区(close 时也会自动刷新)
}
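
BufferedMutator 的提交是异步批量进行的,写入失败可能延后暴露;可以在 BufferedMutatorParams 上注册异常监听器来捕获这类失败。下面是一个示意写法(表名与处理逻辑均为示例):

// 注册异常监听器,捕获异步刷写阶段的失败
BufferedMutatorParams listenedParams = new BufferedMutatorParams(TableName.valueOf("logs"))
    .writeBufferSize(8 * 1024 * 1024)
    .listener((exception, failedMutator) -> {
      for (int i = 0; i < exception.getNumExceptions(); i++) {
        System.err.println("写入失败的行: " + Bytes.toString(exception.getRow(i).getRow()));
      }
    });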

3. 扫描优化

Scan scan = new Scan();
scan.setCacheBlocks(false); // 对于MapReduce作业禁用缓存
scan.setBatch(100); // 设置列批处理大小
scan.setMaxResultSize(2 * 1024 * 1024); // 设置每批最大字节数

4. 行键设计技巧

// 加盐处理解决热点问题(盐值需能由原始 key 推导,否则后续 Get/Scan 无法定位;这里用 hash 取模而非随机数)
int saltBuckets = 16; // 与预分区数对应,示例值
byte[] salt = new byte[] { (byte) (Math.abs("original_key".hashCode()) % saltBuckets) };
byte[] rowKey = Bytes.add(salt, Bytes.toBytes("original_key"));

// 时间戳反转
long reverseTs = Long.MAX_VALUE - System.currentTimeMillis();
byte[] tsRowKey = Bytes.add(Bytes.toBytes("sensor"), Bytes.toBytes(reverseTs));

十、错误处理与监控

1. 重试策略

// 自定义重试策略
config.set(HConstants.HBASE_CLIENT_RETRIES_NUMBER, "5");
config.set(HConstants.HBASE_CLIENT_PAUSE, "1000"); // 1秒间隔

2. 异常处理

import org.apache.hadoop.hbase.TableNotFoundException;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;

try {
  table.put(put);
} catch (RetriesExhaustedException e) {
  // 重试耗尽异常
  System.err.println("Operation failed after retries: " + e.getMessage());
} catch (TableNotFoundException e) {
  // 表不存在异常
  System.err.println("Table not found: " + e.getMessage());
}

3. 监控集成

// 通过 Admin 获取集群级读写请求数(HBase 2.x 的 ClusterMetrics API)
try (Admin admin = connection.getAdmin()) {
  Collection<ServerMetrics> servers = admin.getClusterMetrics().getLiveServerMetrics().values();
  long readRequests = servers.stream().flatMap(s -> s.getRegionMetrics().values().stream())
      .mapToLong(RegionMetrics::getReadRequestCount).sum();
  long writeRequests = servers.stream().flatMap(s -> s.getRegionMetrics().values().stream())
      .mapToLong(RegionMetrics::getWriteRequestCount).sum();
}

总结:HBase API 使用原则

  1. 连接管理:复用Connection,线程安全
  2. 资源释放:使用try-with-resources确保Table/Admin关闭
  3. 批量操作:优先使用批量Put/Delete
  4. 扫描优化:合理设置Caching/Batch/MaxResultSize
  5. 行键设计:避免热点,合理分片
  6. 异步处理:高并发场景使用Async API
  7. 错误处理:实现健壮的重试机制
典型调用流程:创建Connection → 获取Admin/Table → 执行操作 → 处理结果 →(如有更多操作则重复)→ 关闭资源 → 结束。
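
按照上述流程,一个最小的端到端骨架大致如下(表名、列族与数据均为示例):

// 端到端骨架:创建连接 → 获取表 → 执行读写 → try-with-resources 自动关闭资源
Configuration conf = HBaseConfiguration.create();
try (Connection conn = ConnectionFactory.createConnection(conf);
     Table table = conn.getTable(TableName.valueOf("users"))) {
  Put put = new Put(Bytes.toBytes("user001"));
  put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Alice"));
  table.put(put);

  Result result = table.get(new Get(Bytes.toBytes("user001")));
  System.out.println(Bytes.toString(
      result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"))));
}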

通过掌握这些 API 及其最佳实践,您可以高效地构建基于 HBase 的高性能应用,处理 PB 级数据的同时保证低延迟访问。
