The versions of this example found online are all fairly dated, so this is a record of the Scala read/write HBase example I currently use. Scala 2.11.8; Spark 2.1.0. Only verified on my local cluster; for reference.
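For reference, a matching sbt build definition might look like the sketch below. The HBase version (1.2.6) is an assumption; use whatever your cluster runs. In the HBase 1.x line, TableInputFormat/TableOutputFormat live in the hbase-server artifact:

scalaVersion := "2.11.8"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.1.0",
  "org.apache.hbase" % "hbase-client" % "1.2.6", // assumed version -- match your cluster
  "org.apache.hbase" % "hbase-server" % "1.2.6"  // provides TableInputFormat / TableOutputFormat
)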
package test
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase._
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.sql.SparkSession
object TestHBase {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("LinkStart").master("local").getOrCreate()
    val sc = spark.sparkContext
    // HBase connection settings: ZooKeeper quorum and client port (default 2181)
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "ip1,ip2,ip3") // comma-separated addresses of the HBase cluster nodes (any one works)
    conf.set("hbase.zookeeper.property.clientPort", "2181") // ZooKeeper client port
    conf.set("hbase.master", "master:port")
    // Set the table to read from
    conf.set(TableInputFormat.INPUT_TABLE, "test2019:bulletin")
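    // Optionally narrow the scan before reading. These TableInputFormat keys exist in
    // the HBase mapreduce API; the column names and row keys below are illustrative
    // assumptions based on this table's "docs" family -- adjust to your schema:
    // conf.set(TableInputFormat.SCAN_COLUMNS, "docs:insert_time docs:latest")
    // conf.set(TableInputFormat.SCAN_ROW_START, "someStartRowKey")
    // conf.set(TableInputFormat.SCAN_ROW_STOP, "someStopRowKey")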
    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])
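    // hBaseRDD is an RDD[(ImmutableBytesWritable, Result)]: wrapped row key plus row contents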
    // Cache before triggering actions so the count and the foreach below reuse the same scan
    hBaseRDD.cache()
    val count = hBaseRDD.count()
    println("Bulletin RDD count: " + count)
//遍历输出
    // Iterate and print each row
    hBaseRDD.foreach { case (_, result) =>
      val key = Bytes.toString(result.getRow)
      val oldData = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("insert_time")))
      val newData = Bytes.toString(result.getValue(Bytes.toBytes("docs"), Bytes.toBytes("latest")))
println("Row key:"+key+" OLD:"+o