Reading and Writing HBase Data with Spark

This post walks through three common data-interaction tasks in big-data development: creating an HBase table and defining its properties with Spark, reading HBase data and writing it to Elasticsearch, and writing data from Hive into HBase.
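All three examples below are Scala jobs and assume that the HBase client API (the 1.x API used in these snippets), the Spark SQL/Hive modules, and the elasticsearch-hadoop connector are on the classpath. A minimal build.sbt sketch follows; the artifact versions are assumptions, so match them to your own Spark/HBase/Elasticsearch deployment:

// build.sbt -- illustrative only; version numbers are assumptions
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.8" % "provided",
  "org.apache.spark" %% "spark-sql"  % "2.4.8" % "provided",
  "org.apache.spark" %% "spark-hive" % "2.4.8" % "provided",
  "org.apache.hbase"  % "hbase-client" % "1.4.13",
  "org.apache.hbase"  % "hbase-common" % "1.4.13",
  "org.apache.hbase"  % "hbase-server" % "1.4.13",
  "org.elasticsearch" %% "elasticsearch-spark-20" % "7.10.2"
)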


1. Creating an HBase table with Spark and defining its properties

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HConstants, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Admin, ConnectionFactory, Put}
import org.apache.hadoop.hbase.util.Bytes

object HBaseCreateTable {
  def main(args: Array[String]): Unit = {
    val TABLE_NAME = "test_yuan"
    val hBaseConf = HBaseConfiguration.create()
    hBaseConf.set(HConstants.ZOOKEEPER_QUORUM, "bq2.bq.cn,bq1.bq.cn")
    hBaseConf.set(HConstants.ZOOKEEPER_CLIENT_PORT, "2181")
    val connect = ConnectionFactory.createConnection(hBaseConf)
    val admin = connect.getAdmin
    try {
      // 1. Drop the table if it already exists
      if (admin.tableExists(TableName.valueOf(TABLE_NAME))) {
        admin.disableTable(TableName.valueOf(TABLE_NAME))
        admin.deleteTable(TableName.valueOf(TABLE_NAME))
      }
      // 2. Build the table descriptor
      val h_table = new HTableDescriptor(TableName.valueOf(TABLE_NAME))
      val column = new HColumnDescriptor("base".getBytes())
      //column.setBlockCacheEnabled(true)
      //column.setBlocksize(2222222)
      // Add the column families
      h_table.addFamily(column)
      h_table.addFamily(new HColumnDescriptor("gps".getBytes()))
      // 3. Create the table
      admin.createTable(h_table)
      val table = connect.getTable(TableName.valueOf(TABLE_NAME))

      // Insert 5 rows
      for (i <- 1 to 5) {
        // The Put constructor takes the row key
        val put = new Put(Bytes.toBytes("row" + i))
        // The column family must already exist; the qualifier can be arbitrary
        put.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"), Bytes.toBytes("value " + i))
        put.addColumn(Bytes.toBytes("base"), Bytes.toBytes("famm"), Bytes.toBytes("value " + i))
        table.put(put)
      }
      table.close()
    } catch {
      case ex: Exception => ex.printStackTrace()
    } finally {
      releaseConn(admin)
    }
  }

  def releaseConn(admin: Admin): Unit = {
    try {
      if (admin != null) {
        admin.close()
      }
    } catch {
      case ex: Exception => ex.getMessage
    }
  }
}
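To sanity-check the inserts, the rows can be read back with a Get before the connection is released. The fragment below is only a sketch meant to sit inside main() right before table.close(), reusing the table, TABLE_NAME and Bytes values already in scope:

// Hypothetical verification step: read row1 back and print its base:name cell
val get = new org.apache.hadoop.hbase.client.Get(Bytes.toBytes("row1"))
val res = table.get(get)
val name = Bytes.toString(res.getValue(Bytes.toBytes("base"), Bytes.toBytes("name")))
println(s"row1 base:name = $name")   // expected output: value 1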

2. Reading HBase data and writing it to Elasticsearch

import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants}
import org.apache.hadoop.hbase.client.Result
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.elasticsearch.spark.rdd.EsSpark

object HbaseToES {
  def main(args: Array[String]): Unit = {
    val zookeeper_quorum = "bq2.bq.cn,bq1.bq.cn"
    val zookeeper_client_port = "2181"
    // ConfigUtil is a project-local helper that loads the application configuration
    val config = ConfigUtil.getConfig
    val sparkConf = new SparkConf().setAppName("HbaseToES")
      .set("es.nodes", config.getString("app.es.ips"))
      .set("es.port", config.getString("app.es.port"))
      .set("es.index.auto.create", "true")
      .set("es.net.http.auth.user", config.getString("app.es.es_user_name"))
      .set("es.net.http.auth.pass", config.getString("app.es.es_user_pass"))

    val ssc = SparkSession.builder().appName("SparkFromHBase").master("local[*]").config(sparkConf).getOrCreate()
    val sc = ssc.sparkContext

    val tableName = "test_yuan"
    val hBaseConf = HBaseConfiguration.create()
    hBaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeper_quorum)
    hBaseConf.set(HConstants.ZOOKEEPER_CLIENT_PORT, zookeeper_client_port)
    hBaseConf.set(TableInputFormat.INPUT_TABLE, tableName)
    // Read the table into an RDD; TableInputFormat must come from the org.apache.hadoop.hbase.mapreduce package
    val hbaseRDD = sc.newAPIHadoopRDD(hBaseConf, classOf[TableInputFormat], classOf[ImmutableBytesWritable], classOf[Result])
    val result = hbaseRDD.map(x => x._2).map { result =>
        (result.getRow,
          result.getValue(Bytes.toBytes("base"), Bytes.toBytes("name")),
          result.getValue(Bytes.toBytes("base"), Bytes.toBytes("address")),
          result.getValue(Bytes.toBytes("gps"), Bytes.toBytes("geohash")))
      }.map(row => testInsert(new String(row._1), new String(row._2), new String(row._3), new String(row._4)))
    println("row count: " + result.count())
    //result.take(10).foreach(println)

    EsSpark.saveToEs(result, "test/hbase")
  }

  case class testInsert(row_id: String,
                        name: String,
                        address: String,
                        geohash: String)

}
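By default elasticsearch-hadoop assigns its own document ids, so re-running the job duplicates documents under the test/hbase resource. If the HBase row key should become the Elasticsearch _id instead, saveToEs also accepts a per-call settings map; a small sketch, assuming the row_id field of the case class above:

// Sketch: use the row_id field as the document id so repeated runs overwrite
// existing documents instead of duplicating them
EsSpark.saveToEs(result, "test/hbase", Map("es.mapping.id" -> "row_id"))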

3. Writing data from Hive into HBase

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{HTable, Put}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import scala.util.Try

object HiveToHBase {
  def main(args: Array[String]): Unit = {
    val zookeeper_quorum = "bq2.bq.cn,bq1.bq.cn"
    val zookeeper_client_port = "2181"
    val TABLE_NAME = "test_yuan"
    val sparkConf = new SparkConf().setAppName("HiveToHBase")
      .setMaster("local[*]")
    val ssc = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()
    val dataFrame = ssc.sql("select mobiletelephone,customername,address,gps,geohash from graph.user_07_10 where mobiletelephone is not null limit 10")
    dataFrame.show(10)
    dataFrame.rdd.map(x => {
      val phone = Try(x(0).asInstanceOf[String]).getOrElse("0")
      val name = Try(x(1).asInstanceOf[String]).getOrElse("")
      val address = Try(x(2).asInstanceOf[String]).getOrElse("")
      val gps = Try(x(3).asInstanceOf[String]).getOrElse("")
      val geohash = Try(x(4).asInstanceOf[String]).getOrElse("")
      // Row key: the phone number
      // Each addColumn takes column family, qualifier, value
      val p = new Put(Bytes.toBytes(phone))
      p.addColumn(Bytes.toBytes("base"), Bytes.toBytes("name"), Bytes.toBytes(name))
      p.addColumn(Bytes.toBytes("base"), Bytes.toBytes("address"), Bytes.toBytes(address))
      p.addColumn(Bytes.toBytes("gps"), Bytes.toBytes("gps"), Bytes.toBytes(gps))
      p.addColumn(Bytes.toBytes("gps"), Bytes.toBytes("geohash"), Bytes.toBytes(geohash))
    }).foreachPartition(iter => {
      // Initialize the JobConf; TableOutputFormat must come from the org.apache.hadoop.hbase.mapred package!
      val jobConf = new JobConf(HBaseConfiguration.create())
      jobConf.set("hbase.zookeeper.quorum", zookeeper_quorum)
      jobConf.set("hbase.zookeeper.property.clientPort", zookeeper_client_port)
      // Route the writes through the MapReduce OutputFormat
      jobConf.setOutputFormat(classOf[TableOutputFormat])
      val table = new HTable(jobConf, TableName.valueOf(TABLE_NAME))
      import scala.collection.JavaConversions._
      table.put(seqAsJavaList(iter.toSeq))
      table.close()
    })
  }
}
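Opening an HTable per partition works for small batches, but an alternative worth noting is to hand the RDD of Puts directly to TableOutputFormat via saveAsHadoopDataset, letting Hadoop manage the table connections. The sketch below uses the same assumptions as above (HBase 1.x API, same quorum and table); putRdd is an assumed name for the RDD[Put] produced by the map over dataFrame.rdd:

// Alternative sketch: write the Puts through TableOutputFormat instead of a manual HTable
import org.apache.hadoop.hbase.io.ImmutableBytesWritable

val jobConf = new JobConf(HBaseConfiguration.create())
jobConf.set("hbase.zookeeper.quorum", zookeeper_quorum)
jobConf.set("hbase.zookeeper.property.clientPort", zookeeper_client_port)
jobConf.set(TableOutputFormat.OUTPUT_TABLE, TABLE_NAME)
jobConf.setOutputFormat(classOf[TableOutputFormat])

putRdd
  .map(p => (new ImmutableBytesWritable, p))   // TableOutputFormat expects (key, Put) pairs
  .saveAsHadoopDataset(jobConf)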