Reading HBase from Spark (a NewHadoopAPI example)

This article shows one way to query HBase from Spark. A complete code example walks through configuring the Spark environment, fetching the task parameters, running the HBase query, and parsing the results.
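Before the full project code, here is a minimal, self-contained sketch of the core pattern it relies on: serialize a Scan into the job configuration and read the table through SparkContext.newAPIHadoopRDD with TableInputFormat. It is only a sketch using the same HBase 1.x-era helpers (ProtobufUtil, Base64) as the code below; the app name, master, table name "demo_table", column family "cf1" and ZooKeeper settings are placeholder assumptions, not values from the original project.

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.spark.{SparkConf, SparkContext}

object MinimalHbaseRead {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("minimalHbaseRead").setMaster("local[2]"))

    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "localhost")        // placeholder quorum
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "demo_table")   // placeholder table name

    // Narrow the read with a Scan, serialized the same way the full example does
    val scan = new Scan()
    scan.addFamily(Bytes.toBytes("cf1"))                        // placeholder column family
    hbaseConf.set(TableInputFormat.SCAN, Base64.encodeBytes(ProtobufUtil.toScan(scan).toByteArray))

    // Each record is a (row key, Result) pair
    val rdd = sc.newAPIHadoopRDD(
      hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    println("row count: " + rdd.count())
    sc.stop()
  }
}

The full project code follows.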
package cn.piesat.controller

import java.text.{DecimalFormat, SimpleDateFormat}
import java.util
import java.util.concurrent.{CountDownLatch, Executors, Future}

import ba.common.log.enums.{LogLevel, LogType}
import ba.common.log.utils.LogUtil
import cn.piesat.constants.{HbaseZookeeperConstant, RowkeyConstant}
import cn.piesat.domain._
import cn.piesat.service.impl.{MsgServiceImpl, SparkTaskServiceImpl}
import cn.piesat.thread.HbaseQueryThread
import com.google.gson.Gson
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.filter.{Filter, FilterList}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.{Base64, Bytes}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import pie.storage.db.domain._
import pie.storage.db.enums.{CompareOp, DataBaseType}

/**
 * @author liujie
 * Entry class for querying HBase from Spark
 */
object HbaseReader {
  val sparkTaskService = new SparkTaskServiceImpl
  val msgService = new MsgServiceImpl
  val sparkAppName = "sparkApp"
  val sparkMaster = "local[6]"
  var taskId = 8
  // number of rowkey prefix partitions (00 .. serviceNum-1) scanned per message
  val serviceNum = 76
  val systemId = 12011
  val systemName = "8888"
  val cf = "cf1"
  val cell = "content"
  val zookeeperHost = "bigdata03,bigdata04,bigdata05"
  val zookeeperPort = "2181"
  // thread pool used to submit the per-partition HBase scans
  val executor = Executors.newCachedThreadPool()

  def main(args: Array[String]): Unit = {
    try {
      if (args.length > 0) {
        taskId = args(0).toInt
      }
      /**
       * Step 1: obtain the SparkContext
       */
      val sc = getSparkContext
      /**
       * Step 2: build the list of query parameters
       */
      val taskParamList = getTaskParam(taskId, sc)
      /**
       * Step 3: query the HBase data
       */
      val rowkeyRDD = queryHbaseData(taskParamList, sc)

      rowkeyRDD.saveAsTextFile("file://") // output path placeholder from the original
      println("rowkeyRDD count: " + rowkeyRDD.count())
      val rowkey = rowkeyRDD.first()
      println("first value: " + util.Arrays.toString(rowkey._2.getValue(cf.getBytes(), cell.getBytes())))

      /**
       * Step 4: parse the data (left as a TODO here; see the parseAndSave sketch
       * at the end of this object)
       */

      /**
       * Step 5: write the results to text, at the path carried in the
       * taskParamList from step 2 (also left as a TODO)
       */

    } catch {
      case e: Exception =>
        e.printStackTrace()
    } finally {
      executor.shutdown()
    }
  }

  /**
   * Get the task id from the command-line arguments (-1 if absent or invalid)
   *
   * @param args
   * @return
   */
  private def getTaskId(args: Array[String]): Int = {
    if (args == null || args.length <= 0) {
      -1
    } else {
      try {
        args.apply(0).toInt
      } catch {
        case e: Exception =>
          -1
      }
    }
  }

  /**
   * Build the SparkContext
   *
   * @return
   */
  private def getSparkContext(): SparkContext = {
    val sparkConf = new SparkConf().setAppName(sparkAppName).setMaster(sparkMaster)
    sparkConf.set("spark.broadcast.factory", "org.apache.spark.broadcast.HttpBroadcastFactory")
    sparkConf.set("spark.network.timeout", "300")
    sparkConf.set("spark.streaming.unpersist", "true")
    sparkConf.set("spark.scheduler.listenerbus.eventqueue.size", "100000")
    sparkConf.set("spark.storage.memoryFraction", "0.5")
    sparkConf.set("spark.shuffle.consolidateFiles", "true")
    sparkConf.set("spark.shuffle.file.buffer", "64")
    sparkConf.set("spark.shuffle.memoryFraction", "0.3")
    sparkConf.set("spark.reducer.maxSizeInFlight", "24")
    sparkConf.set("spark.shuffle.io.maxRetries", "60")
    sparkConf.set("spark.shuffle.io.retryWait", "60")
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    new SparkContext(sparkConf)
  }

  /**
   * Build the list of query parameters for the Spark task
   *
   * @param taskId
   * @return
   */
  private def getTaskParam(taskId: Int, sc: SparkContext): List[Tuple4[String, String, String, util.List[Filter]]] = {
    var list: List[Tuple4[String, String, String, util.List[Filter]]] = List()
    val sparkTask = sparkTaskService.getSparkTaskByTaskId(taskId)
    val params = sparkTask.getQueryParam
    val gson = new Gson
    val sparkQueryParams = gson.fromJson(params, classOf[SparkQueryParams])
    try {
      //1. **
      val systemId = sparkQueryParams.getSystemId
      //2. start time
      val startTime = sparkQueryParams.getStartTime
      //3. end time
      val endTime = sparkQueryParams.getEndTime
      //4. **
      val stationId = sparkQueryParams.getStationId
      val paramList = sparkQueryParams.getParams
      for (i <- 0 until paramList.size()) {
        val param = paramList.get(i)
        //5. **
        val msgId = param.getMsgId
        //6. **
        val sinkId = param.getSinkId
        //7. **
        val sourceId = param.getSourceId
        //8. table name
        val tableName = msgService.getTieYuanMsgTableNameById(msgId)
        for (num <- 0 until serviceNum) {
          val rowkeyAndFilters = getRowkeyAndFilters(num, systemId, startTime, endTime, stationId, msgId, sinkId, sourceId, tableName)
          list = rowkeyAndFilters :: list
        }
      }
      list
    } catch {
      case e: Exception =>
        LogUtil.writeLog(systemId, LogLevel.ERROR, LogType.NORMAL_LOG, systemName + " Error Info: task parameters are invalid. " + e)
        null
    }
  }

  /**
   * Query the HBase data: submit one scan per task parameter to the thread pool
   * and union the resulting RDDs
   */
  private def queryHbaseData(taskParamList: List[(String, String, String, util.List[Filter])], sc: SparkContext): RDD[(ImmutableBytesWritable, Result)] = {
    var rdd: RDD[(ImmutableBytesWritable, Result)] = null
    val latch: CountDownLatch = new CountDownLatch(taskParamList.length)
    val list: util.List[Future[RDD[Tuple2[ImmutableBytesWritable, Result]]]] = new util.ArrayList[Future[RDD[Tuple2[ImmutableBytesWritable, Result]]]]()
    for (taskParam <- taskParamList) {
      list.add(executor.submit(new HbaseQueryThread(taskParam, sc, latch)))
    }
    import scala.collection.JavaConversions._
    for (li <- list) {
      if (rdd == null) {
        rdd = li.get()
      } else {
        rdd = rdd.++(li.get())
      }
    }
    // Future.get() above already blocks until each task finishes; the latch is
    // counted down by the worker threads
    latch.await()
    rdd
  }


  /**
   * Build the start/stop row keys and the filter list for one scan
   *
   * @param num
   * @param systemId
   * @param startTime
   * @param endTime
   * @param stationId
   * @param msgId
   * @param sinkId
   * @param sourceId
   * @return
   */
  private def getRowkeyAndFilters(num: Int, systemId: Int, startTime: String,
                                  endTime: String, stationId: Int, msgId: Int,
                                  sinkId: Int, sourceId: Int,
                                  tableName: String): Tuple4[String, String, String, util.List[Filter]] = {
    // SimpleDateFormat is not thread-safe, so new instances are created on each call
    val simpleDateFormat1 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss:SSS")
    val simpleDateFormat2 = new SimpleDateFormat("yyyyMMddHHmmssSSS")
    val decimalFormat = new DecimalFormat("00")
    val queryDef = new QueryDef
    //1. set the database type
    queryDef.setDataBaseType(DataBaseType.HBASE)
    //2. set the table name
    queryDef.setTableName(tableName)
    //3. set the query parameters
    //3.1 ** id parameter
    val systemIdParam = new QueryParam
    systemIdParam.setField(new Field(new FieldInfo(RowkeyConstant.SYSTEM_ID), new FieldValue(systemId)))
    systemIdParam.setCompareOp(CompareOp.EQUAL)
    //3.2 **
    val msgIdParam = new QueryParam
    msgIdParam.setField(new Field(new FieldInfo(RowkeyConstant.MSG_ID), new FieldValue(msgId)))
    msgIdParam.setCompareOp(CompareOp.EQUAL)
    //3.3 start-time parameter
    val startTimeParam = new QueryParam
    val startTimeFormat = simpleDateFormat2.format(simpleDateFormat1.parse(startTime))
    startTimeParam.setField(new Field(new FieldInfo(RowkeyConstant.TIME), new FieldValue(startTimeFormat)))
    startTimeParam.setCompareOp(CompareOp.GREATER)
    //3.4 end-time parameter
    val endTimeParam = new QueryParam
    val endTimeFormat = simpleDateFormat2.format(simpleDateFormat1.parse(endTime))
    endTimeParam.setField(new Field(new FieldInfo(RowkeyConstant.TIME), new FieldValue(endTimeFormat)))
    endTimeParam.setCompareOp(CompareOp.LESS)
    //3.5 **
    val sourceParam = new QueryParam
    sourceParam.setField(new Field(new FieldInfo(RowkeyConstant.SINK_ID), new FieldValue(sinkId)))
    sourceParam.setCompareOp(CompareOp.EQUAL)
    //3.6 **
    val sinkParam = new QueryParam
    sinkParam.setField(new Field(new FieldInfo(RowkeyConstant.SOURCE_ID), new FieldValue(sourceId)))
    sinkParam.setCompareOp(CompareOp.EQUAL)
    val queryParamList = util.Arrays.asList(systemIdParam, msgIdParam, startTimeParam, endTimeParam, sourceParam, sinkParam)
    queryDef.setListQueryParam(queryParamList)
    val startRowkey = decimalFormat.format(num) + queryDef.getStartRowKey(classOf[String])
    val endRowkey = decimalFormat.format(num) + queryDef.getStopRowKey(classOf[String])
    val filters = queryDef.getFilters(2, num, classOf[String])
    new Tuple4(tableName, startRowkey, endRowkey, filters)
  }

  /**
   * Run one HBase scan as an RDD via TableInputFormat
   *
   * @param taskParam
   * @param sc
   */
  def getHbaseQueryRDD(taskParam: (String, String, String, util.List[Filter]), sc: SparkContext): RDD[(ImmutableBytesWritable, Result)] = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set(HbaseZookeeperConstant.HBASE_ZOOKEEPER_QUORUM, zookeeperHost)
    hbaseConf.set(HbaseZookeeperConstant.HBASE_ZOOKEEPER_PROPERTY_CLIENTPORT, zookeeperPort)
    hbaseConf.set(TableInputFormat.INPUT_TABLE, taskParam._1)
    val scan = new Scan()
    scan.setStartRow(Bytes.toBytes(taskParam._2))
    scan.setStopRow(Bytes.toBytes(taskParam._3))
    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, taskParam._4)
    scan.setFilter(filterList)
    hbaseConf.set(TableInputFormat.SCAN, convertScanToString(scan))
    val rs = sc.newAPIHadoopRDD(
      hbaseConf,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    // TODO: parsing of the Result values is left to the caller; see the parseAndSave
    // sketch at the end of this object
    rs
  }

  private def convertScanToString(scan: Scan) = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray)
  }
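
  /**
   * Sketch for steps 4 and 5 in main (parsing and saving the results), which the
   * original leaves as TODOs. This is only one possible completion, not the
   * original author's implementation: it assumes the payload of interest is the
   * raw bytes of the cf1:content cell and that a tab-separated text dump is
   * acceptable. The output path is supplied by the caller, e.g.
   * "file:///tmp/hbase-query-result" (a placeholder).
   */
  def parseAndSave(rowkeyRDD: RDD[(ImmutableBytesWritable, Result)], outputPath: String): Unit = {
    val cfBytes = Bytes.toBytes(cf)
    val cellBytes = Bytes.toBytes(cell)
    val lines = rowkeyRDD.map { case (key, result) =>
      val rowkey = Bytes.toString(key.copyBytes())        // row key as a string
      val value = result.getValue(cfBytes, cellBytes)     // raw bytes of cf1:content
      rowkey + "\t" + (if (value == null) "" else Bytes.toStringBinary(value))
    }
    lines.saveAsTextFile(outputPath)
  }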
}

Reposted from: https://www.cnblogs.com/runnerjack/p/9976112.html
