A Complete Guide to the HBase API
This article walks through the core HBase Java APIs, covering table management, data operations, and advanced queries, with code examples that illustrate best practices. It targets HBase 2.4+.
I. Environment Setup and Connection Management
1. Maven Dependency
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>2.4.11</version>
</dependency>
2. Creating and Closing Connections
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
// Create the client configuration
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "zk1,zk2,zk3");
config.set("hbase.zookeeper.property.clientPort", "2181");
// Create a connection (thread-safe; create once and reuse it)
try (Connection connection = ConnectionFactory.createConnection(config)) {
// All table and admin operations go here
}
II. Table Management API
1. Creating and Deleting Tables
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hadoop.hbase.util.Bytes;
try (Admin admin = connection.getAdmin()) {
TableName tableName = TableName.valueOf("users");
// Build the column family descriptor
ColumnFamilyDescriptor cfDesc = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("info"))
.setMaxVersions(3)
.setCompressionType(Compression.Algorithm.SNAPPY)
.build();
// Build the table descriptor
TableDescriptor tableDesc = TableDescriptorBuilder.newBuilder(tableName)
.setColumnFamily(cfDesc)
.build();
// Create the table if it does not already exist
if (!admin.tableExists(tableName)) {
admin.createTable(tableDesc);
System.out.println("Table created");
}
// Deleting a table: disable it first, then delete it
admin.disableTable(tableName);
admin.deleteTable(tableName);
}
2. Modifying and Listing Tables
import org.apache.hadoop.hbase.regionserver.BloomType;
// Add a new column family
ColumnFamilyDescriptor newCf = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("contact"))
.setBloomFilterType(BloomType.ROW)
.build();
admin.addColumnFamily(tableName, newCf);
// Modify an existing column family (a freshly built descriptor resets other settings to defaults)
ColumnFamilyDescriptor modifiedCf = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("info"))
.setMaxVersions(5)
.build();
admin.modifyColumnFamily(tableName, modifiedCf);
// List all tables
TableName[] tables = admin.listTableNames();
for (TableName name : tables) {
System.out.println(name.getNameAsString());
}
III. Data Manipulation API
1. Writing Data (Put)
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
try (Table table = connection.getTable(TableName.valueOf("users"))) {
// Single-row write
Put put = new Put(Bytes.toBytes("user001"));
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Alice"));
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), Bytes.toBytes(28));
put.addColumn(Bytes.toBytes("contact"), Bytes.toBytes("email"),
System.currentTimeMillis(), // explicit timestamp
Bytes.toBytes("alice@example.com"));
table.put(put);
// Batch write: several Puts submitted in a single call
List<Put> puts = new ArrayList<>();
Put put2 = new Put(Bytes.toBytes("user002"));
put2.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Bob"));
puts.add(put2);
Put put3 = new Put(Bytes.toBytes("user003"));
put3.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("Carol"));
puts.add(put3);
table.put(puts);
}
2. Reading Data (Get)
import java.util.List;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
Get get = new Get(Bytes.toBytes("user001"));
// Restrict the Get to specific columns
get.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));
// Request up to three versions
get.readVersions(3);
Result result = table.get(get);
if (!result.isEmpty()) {
// Latest version of a cell
byte[] name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
System.out.println("Name: " + Bytes.toString(name));
// All returned versions of a cell
List<Cell> cells = result.getColumnCells(Bytes.toBytes("contact"), Bytes.toBytes("email"));
for (Cell cell : cells) {
System.out.println("Value: " + Bytes.toString(CellUtil.cloneValue(cell)) +
" at " + cell.getTimestamp());
}
}
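Multiple rows can also be fetched in one call with table.get(List<Get>), which batches the reads per region server; a short sketch (the row keys below are illustrative):
// Batch read: returns one Result per Get, in the same order as the request list
List<Get> gets = new ArrayList<>();
gets.add(new Get(Bytes.toBytes("user001")));
gets.add(new Get(Bytes.toBytes("user002")));
Result[] results = table.get(gets);
for (Result r : results) {
    if (!r.isEmpty()) {
        System.out.println("Row: " + Bytes.toString(r.getRow()));
    }
}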
3. Deleting Data (Delete)
import org.apache.hadoop.hbase.client.Delete;
// Delete the latest version of a single cell (use addColumns to remove all versions)
Delete delete = new Delete(Bytes.toBytes("user001"));
delete.addColumn(Bytes.toBytes("contact"), Bytes.toBytes("email"));
// Delete one specific version by timestamp
delete.addColumn(Bytes.toBytes("info"), Bytes.toBytes("age"), 1672531200000L);
table.delete(delete);
// Delete an entire row: a Delete with no columns added removes the whole row
Delete fullDelete = new Delete(Bytes.toBytes("user002"));
table.delete(fullDelete);
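Delete also supports removing every version of a column or an entire column family, and table.delete(List<Delete>) batches several rows in one call; a short sketch with illustrative row keys:
Delete wide = new Delete(Bytes.toBytes("user003"));
wide.addColumns(Bytes.toBytes("info"), Bytes.toBytes("age")); // all versions of one column
wide.addFamily(Bytes.toBytes("contact"));                     // every cell in a column family
table.delete(wide);
// Batch delete
List<Delete> deletes = new ArrayList<>();
deletes.add(new Delete(Bytes.toBytes("user004")));
deletes.add(new Delete(Bytes.toBytes("user005")));
table.delete(deletes);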
IV. Scan and Filter API
1. Basic Scans (Scan)
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.ResultScanner;
Scan scan = new Scan();
// Scan range [startRow, stopRow)
scan.withStartRow(Bytes.toBytes("user100"));
scan.withStopRow(Bytes.toBytes("user200"));
// Columns to return
scan.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"));
scan.setCaching(100); // number of rows fetched per RPC
try (ResultScanner scanner = table.getScanner(scan)) {
for (Result result : scanner) {
// Process each row
byte[] row = result.getRow();
byte[] name = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
System.out.println(Bytes.toString(row) + ": " + Bytes.toString(name));
}
}
2. Using Filters
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.filter.*;
// Single column value filter
SingleColumnValueFilter nameFilter = new SingleColumnValueFilter(
Bytes.toBytes("info"),
Bytes.toBytes("name"),
CompareOperator.EQUAL,
new SubstringComparator("John")
);
nameFilter.setFilterIfMissing(true); // skip rows that do not contain this column
// Row key prefix filter
PrefixFilter prefixFilter = new PrefixFilter(Bytes.toBytes("userA"));
// Page filter (see the paging sketch after this block)
PageFilter pageFilter = new PageFilter(10); // at most 10 rows per region server
// Combine filters
FilterList filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL);
filterList.addFilter(nameFilter);
filterList.addFilter(new KeyOnlyFilter()); // return row keys only, without cell values
scan.setFilter(filterList);
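Because PageFilter is applied independently on each region server, client-side paging normally also tracks the last row returned and restarts the scan just after it; a hedged sketch of that pattern (page size and processing are illustrative):
int pageSize = 10;
byte[] lastRow = null;
while (true) {
    Scan pageScan = new Scan();
    pageScan.setFilter(new PageFilter(pageSize));
    if (lastRow != null) {
        // Append a zero byte so the next page starts strictly after the last row already seen
        pageScan.withStartRow(Bytes.add(lastRow, new byte[] { 0 }));
    }
    int rowsInPage = 0;
    try (ResultScanner pageScanner = table.getScanner(pageScan)) {
        for (Result r : pageScanner) {
            lastRow = r.getRow();
            rowsInPage++;
            // ... process the row ...
            if (rowsInPage >= pageSize) {
                break; // client-side cap, because PageFilter is only enforced per region server
            }
        }
    }
    if (rowsInPage < pageSize) {
        break; // fewer rows than a full page: end of the table range
    }
}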
V. Counters and Atomic Operations
1. Counter Operations
import org.apache.hadoop.hbase.client.Increment;
Increment increment = new Increment(Bytes.toBytes("counter001"));
increment.addColumn(Bytes.toBytes("stats"), Bytes.toBytes("page_views"), 1);
increment.addColumn(Bytes.toBytes("stats"), Bytes.toBytes("clicks"), 5);
Result result = table.increment(increment);
long newViews = Bytes.toLong(
result.getValue(Bytes.toBytes("stats"), Bytes.toBytes("page_views"))
);
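For a single counter cell, Table.incrementColumnValue is a convenient shortcut that returns the new value directly:
// Atomically add 1 to one counter cell and read the updated value back
long newCount = table.incrementColumnValue(
    Bytes.toBytes("counter001"),
    Bytes.toBytes("stats"),
    Bytes.toBytes("page_views"),
    1L);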
2. Atomic CheckAndMutate (Check-and-Put)
Put put = new Put(Bytes.toBytes("user001"));
put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("status"), Bytes.toBytes("active"));
// Update info:status only if its current value is "inactive"
boolean success = table.checkAndMutate(Bytes.toBytes("user001"), Bytes.toBytes("info"))
    .qualifier(Bytes.toBytes("status"))
    .ifEquals(Bytes.toBytes("inactive"))
    .thenPut(put);
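HBase 2.4 also ships a CheckAndMutate request object that bundles the condition and the mutation; a minimal sketch, assuming the same row, column, and put as above:
import org.apache.hadoop.hbase.client.CheckAndMutate;
import org.apache.hadoop.hbase.client.CheckAndMutateResult;
CheckAndMutate cam = CheckAndMutate.newBuilder(Bytes.toBytes("user001"))
    .ifEquals(Bytes.toBytes("info"), Bytes.toBytes("status"), Bytes.toBytes("inactive"))
    .build(put);
CheckAndMutateResult camResult = table.checkAndMutate(cam);
System.out.println("Succeeded: " + camResult.isSuccess());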
VI. Advanced Features
1. Coprocessors
// Endpoint coprocessor example. CountService, CountRequest, and CountResponse are assumed to be
// generated from a user-defined protobuf service; the coprocessor and RPC helper classes live in
// org.apache.hadoop.hbase.coprocessor and org.apache.hadoop.hbase.ipc.
public class CountEndpoint extends CountService implements RegionCoprocessor {
    private RegionCoprocessorEnvironment env;

    @Override
    public Iterable<Service> getServices() {
        return Collections.singleton(this); // expose this endpoint to client RPCs
    }

    @Override
    public void start(CoprocessorEnvironment env) throws IOException {
        this.env = (RegionCoprocessorEnvironment) env;
    }

    @Override
    public void getRowCount(RpcController controller, CountRequest request,
                            RpcCallback<CountResponse> done) {
        Scan scan = new Scan();
        scan.setFilter(new FirstKeyOnlyFilter()); // one cell per row is enough for counting
        long count = 0;
        try (RegionScanner scanner = env.getRegion().getScanner(scan)) {
            List<Cell> results = new ArrayList<>();
            boolean hasMore;
            do {
                hasMore = scanner.next(results);
                count += results.size();
                results.clear();
            } while (hasMore);
        } catch (IOException e) {
            CoprocessorRpcUtils.setControllerException(controller, e);
        }
        done.run(CountResponse.newBuilder().setCount(count).build());
    }
}
// Client-side invocation (CountService / CountRequest are the protobuf-generated classes;
// Batch is org.apache.hadoop.hbase.client.coprocessor.Batch, ServerRpcController and
// CoprocessorRpcUtils are in org.apache.hadoop.hbase.ipc). Note that coprocessorService
// declares "throws Throwable", so wrap or rethrow as needed.
try (Table table = connection.getTable(tableName)) {
    final CountRequest request = CountRequest.getDefaultInstance();
    Map<byte[], Long> results = table.coprocessorService(
        CountService.class,
        null, null, // null start/stop row: run on every region of the table
        new Batch.Call<CountService, Long>() {
            public Long call(CountService counter) throws IOException {
                ServerRpcController controller = new ServerRpcController();
                CoprocessorRpcUtils.BlockingRpcCallback<CountResponse> rpcCallback =
                    new CoprocessorRpcUtils.BlockingRpcCallback<>();
                counter.getRowCount(controller, request, rpcCallback);
                return rpcCallback.get().getCount();
            }
        });
    long total = 0;
    for (Long count : results.values()) {
        total += count;
    }
    System.out.println("Total rows: " + total);
}
2. Bloom Filters
// Enable a Bloom filter on the column family at table-creation time
ColumnFamilyDescriptor cfDesc = ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("cf"))
.setBloomFilterType(BloomType.ROWCOL)
.build();
3. Time-Series Data Optimization
// A reversed timestamp makes the newest data sort first; combine it with a device or salt
// prefix, since a purely time-based key still concentrates writes on one region
long reverseTimestamp = Long.MAX_VALUE - System.currentTimeMillis();
// A fixed-length 8-byte timestamp keeps the row keys sorting correctly
Put put = new Put(Bytes.add(Bytes.toBytes("sensor001"), Bytes.toBytes(reverseTimestamp)));
VII. Asynchronous API (HBase 2.0+)
import java.util.concurrent.CompletableFuture;
import org.apache.hadoop.hbase.client.AdvancedScanResultConsumer;
import org.apache.hadoop.hbase.client.AsyncConnection;
import org.apache.hadoop.hbase.client.AsyncTable;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
// Create an asynchronous connection
CompletableFuture<AsyncConnection> asyncConn = ConnectionFactory.createAsyncConnection(config);
asyncConn.thenAccept(conn -> {
AsyncTable<AdvancedScanResultConsumer> table = conn.getTable(TableName.valueOf("users"));
// Asynchronous Get
CompletableFuture<Result> future = table.get(new Get(Bytes.toBytes("user001")));
future.thenAccept(result -> {
// Handle the result
byte[] value = result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
System.out.println("Async result: " + Bytes.toString(value));
}).exceptionally(ex -> {
System.err.println("Error: " + ex.getMessage());
return null;
});
});
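The asynchronous table also supports scans; for small result sets, scanAll collects every matching row into a single future. The sketch below would sit inside the same thenAccept callback as above, reusing the table variable (the row range is illustrative):
// Asynchronous scan materialized into one CompletableFuture<List<Result>>
Scan scan = new Scan()
    .withStartRow(Bytes.toBytes("user100"))
    .withStopRow(Bytes.toBytes("user200"));
table.scanAll(scan).thenAccept(results ->
    results.forEach(r -> System.out.println(Bytes.toString(r.getRow()))));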
VIII. MapReduce Integration
1. HBase as a Data Source
import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
Job job = Job.getInstance(config, "HBase Reader");
Scan scan = new Scan();
scan.setCaching(500);
// Configure the mapper
TableMapReduceUtil.initTableMapperJob(
"input_table",
scan,
HBaseMapper.class,
Text.class,
IntWritable.class,
job
);
job.setReducerClass(HBaseReducer.class);
job.setOutputFormatClass(TextOutputFormat.class);
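HBaseMapper above is not defined in this article; a hedged sketch of what such a mapper could look like, extending TableMapper (the emitted key/value types and column names are illustrative):
// Hypothetical mapper: reads rows from the input table and emits (name, 1) pairs.
// TableMapper is in org.apache.hadoop.hbase.mapreduce, ImmutableBytesWritable in org.apache.hadoop.hbase.io.
public static class HBaseMapper extends TableMapper<Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(ImmutableBytesWritable rowKey, Result value, Context context)
            throws IOException, InterruptedException {
        byte[] name = value.getValue(Bytes.toBytes("info"), Bytes.toBytes("name"));
        if (name != null) {
            context.write(new Text(Bytes.toString(name)), ONE);
        }
    }
}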
2. HBase as a Sink
TableMapReduceUtil.initTableReducerJob(
"output_table",
HBaseReducer.class,
job
);
IX. Best Practices and Performance Tuning
1. Connection Management
// The HBase 2.x client has no connection pool class: Connection is heavyweight, thread-safe,
// and pools resources internally, so create one per application and reuse it.
// Table and Admin instances are lightweight: create them per operation and close them promptly.
Connection sharedConnection = ConnectionFactory.createConnection(config);
try (Table table = sharedConnection.getTable(TableName.valueOf("users"))) {
    // table operations here
}
// Close the shared connection only on application shutdown
sharedConnection.close();
2. Batch Write Optimization
// Use BufferedMutator for high write throughput
BufferedMutatorParams params = new BufferedMutatorParams(TableName.valueOf("logs"))
    .writeBufferSize(8 * 1024 * 1024); // 8 MB client-side write buffer
try (BufferedMutator mutator = connection.getBufferedMutator(params)) {
for (int i = 0; i < 10000; i++) {
Put put = new Put(Bytes.toBytes("row" + i));
put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("data"), Bytes.toBytes("value" + i)); // illustrative column
mutator.mutate(put);
}
mutator.flush(); // flush any buffered mutations before closing
}
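Because BufferedMutator flushes asynchronously, write failures surface later; an exception listener attached through BufferedMutatorParams can log or re-queue the failed mutations. A minimal sketch, assuming the same "logs" table:
// Report rows whose asynchronous flush ultimately failed after all retries
BufferedMutator.ExceptionListener listener = (RetriesExhaustedWithDetailsException e,
                                               BufferedMutator failedMutator) -> {
    for (int i = 0; i < e.getNumExceptions(); i++) {
        System.err.println("Failed write for row: " + Bytes.toString(e.getRow(i).getRow()));
    }
};
BufferedMutatorParams listenedParams = new BufferedMutatorParams(TableName.valueOf("logs"))
    .listener(listener);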
3. Scan Optimization
Scan scan = new Scan();
scan.setCacheBlocks(false); // skip the block cache for one-off full scans such as MapReduce jobs
scan.setBatch(100); // max columns returned per Result, useful for very wide rows
scan.setMaxResultSize(2 * 1024 * 1024); // max bytes returned per RPC
4. Row Key Design Tips
// Salting spreads sequential keys across regions and avoids write hotspots.
// Derive the salt deterministically (e.g. a hash of the key modulo a bucket count)
// so the salted row key can be recomputed when reading the row back.
byte[] original = Bytes.toBytes("original_key");
int saltBuckets = 16; // illustrative bucket count
byte[] salt = new byte[] { (byte) ((Bytes.hashCode(original) & 0x7fffffff) % saltBuckets) };
byte[] rowKey = Bytes.add(salt, original);
// Reversed timestamp so the newest entries sort first
long reverseTs = Long.MAX_VALUE - System.currentTimeMillis();
byte[] tsRowKey = Bytes.add(Bytes.toBytes("sensor"), Bytes.toBytes(reverseTs));
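Reading a salted row back just re-applies the same salt function; a short sketch under the same 16-bucket assumption:
// Recompute the salt at read time to locate the row written above
byte[] readKey = Bytes.toBytes("original_key");
byte[] readSalt = new byte[] { (byte) ((Bytes.hashCode(readKey) & 0x7fffffff) % 16) };
Result saltedResult = table.get(new Get(Bytes.add(readSalt, readKey)));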
X. Error Handling and Monitoring
1. Retry Policy
// Customize the client retry policy (HConstants lives in org.apache.hadoop.hbase)
config.set(HConstants.HBASE_CLIENT_RETRIES_NUMBER, "5");
config.set(HConstants.HBASE_CLIENT_PAUSE, "1000"); // base pause between retries, in milliseconds
2. Exception Handling
try {
table.put(put);
} catch (RetriesExhaustedException e) {
// retries against the cluster were exhausted
System.err.println("Operation failed after retries: " + e.getMessage());
} catch (TableNotFoundException e) {
// the target table does not exist
System.err.println("Table not found: " + e.getMessage());
}
3. Monitoring Integration
// Cluster-level metrics are exposed through the Admin API (org.apache.hadoop.hbase.ClusterMetrics)
try (Admin admin = connection.getAdmin()) {
    ClusterMetrics clusterMetrics = admin.getClusterMetrics();
    clusterMetrics.getLiveServerMetrics().forEach((serverName, serverMetrics) ->
        System.out.println(serverName.getServerName()
            + " total requests: " + serverMetrics.getRequestCount()));
    // Per-region read/write request counts are available via serverMetrics.getRegionMetrics()
}
Summary: HBase API Usage Principles
- Connection management: reuse a single Connection; it is thread-safe
- Resource cleanup: use try-with-resources so Table/Admin instances are always closed
- Batch operations: prefer batched Put/Delete calls over per-row RPCs (see the sketch below)
- Scan tuning: set Caching/Batch/MaxResultSize appropriately
- Row key design: avoid hotspots and distribute writes across regions
- Asynchronous processing: use the async API for highly concurrent workloads
- Error handling: implement robust retry logic
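A minimal end-to-end sketch combining several of these principles (shared connection, try-with-resources, batched writes); table, column family, and row keys are illustrative:
Configuration config = HBaseConfiguration.create();
config.set("hbase.zookeeper.quorum", "zk1,zk2,zk3");
// One shared, thread-safe connection; a Table handle per unit of work
try (Connection connection = ConnectionFactory.createConnection(config);
     Table table = connection.getTable(TableName.valueOf("users"))) {
    List<Put> puts = new ArrayList<>();
    for (int i = 0; i < 100; i++) {
        Put put = new Put(Bytes.toBytes(String.format("user%03d", i)));
        put.addColumn(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes("user-" + i));
        puts.add(put);
    }
    table.put(puts); // a single batched call instead of 100 individual puts
}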
With these APIs and the practices above, you can build high-performance applications on HBase that handle petabyte-scale data while keeping access latency low.