Reading HBase from Spark with Scala

This article shows how to read data from HBase with Spark under Scala 2.10.6. It covers configuring the Spark, Hadoop, and HBase dependencies in pom.xml and setting up the HBase configuration in queryHbase.scala, in particular setting 'hbase.nameserver.address' to speed up the job. Note that mismatched dependency versions can trigger an IncompatibleClassChangeError.


Scala version: 2.10.6

1.pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.graph</groupId>
    <artifactId>GraphGNN</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!--<scala.version>2.10.6</scala.version>-->
        <spark.version>2.1.0</spark.version>
        <hadoop.version>2.6.5</hadoop.version>
        <hbase.version>1.2.6</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>${hbase.version}</version>
            <type>pom</type>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                        <configuration>
                            <includes>
                                <include>**/*.scala</include>
                            </includes>
                        </configuration>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>queryHbase</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

2.queryHbase.scala

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Result, Scan}
import org.apache.hadoop.hbase.filter.{CompareFilter, Filter, RegexStringComparator, RowFilter}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory


object queryHbase {
  val log = LoggerFactory.getLogger(this.getClass)

  /**
    * Build the HBase configuration for scanning a table.
    *
    * @param tableName  HBase table to read
    * @param serverHost ZooKeeper quorum host(s)
    * @param serverPort ZooKeeper client port
    * @return the populated HBase configuration
    */
  def getHbaseConfig(tableName: String, serverHost: String, serverPort: String): Configuration = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", serverHost)
    hbaseConf.set("hbase.zookeeper.property.clientPort", serverPort)
    hbaseConf.set("hbase.rpc.timeout", "3600000")
    hbaseConf.set("hbase.client.scanner.timeout.period", "3600000")
    hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName)
    hbaseConf
  }
  /**
    * Scan HBase and return the matching rows as an RDD.
    *
    * @param sc          the SparkContext
    * @param tableName   HBase table to read
    * @param serverHost  ZooKeeper quorum host(s)
    * @param serverPort  ZooKeeper client port
    * @param prop        property name (currently unused)
    * @param rowKeyRegex regular expression matched against row keys
    * @param partionNum  partition count (currently unused)
    * @return an RDD of (row key, Result) pairs
    */
  def queryHbase(sc: SparkContext, tableName: String, serverHost: String, serverPort: String, prop: String, rowKeyRegex: String, partionNum: Int): RDD[(ImmutableBytesWritable, Result)] = {
    val hbaseConf = getHbaseConfig(tableName, serverHost, serverPort)
    log.info("HBase configuration created successfully")
    val scan: Scan = new Scan()
    val rowFilter: Filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(rowKeyRegex))
    scan.setFilter(rowFilter)
    hbaseConf.set(TableInputFormat.SCAN, convertScanToString(scan))
    val hbaseRDD = sc.newAPIHadoopRDD(
      hbaseConf,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    println(hbaseRDD.count())
    hbaseRDD.take(1).foreach(println) // print the first (row key, Result) pair
    hbaseRDD
  }

  /**
    * Serialize a Scan into the Base64 string expected by TableInputFormat.SCAN.
    *
    * @param scan the Scan to serialize
    * @return the Base64-encoded protobuf representation of the Scan
    */
  def convertScanToString(scan: Scan): String = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray)
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val tableName = "entity_table_check_mon_2"
    val serverHost = "...." // ZooKeeper quorum IP
    val serverPort = "2181"
    val prop = "prop"
    val rowKeyRegex = ".*:6$"
    val partionNum = 1
    val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = queryHbase(sc, tableName, serverHost, serverPort, prop, rowKeyRegex, partionNum)

    println("done1")
    sc.stop()

  }
}
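
The elements of hbaseRDD are (ImmutableBytesWritable, Result) pairs, so pulling actual cell values out takes one more step. A minimal sketch, assuming a column family "cf" with a qualifier "name" (both hypothetical; substitute your own schema):

import org.apache.hadoop.hbase.util.Bytes

// Map each (row key, Result) pair to plain (String, String) values.
// "cf" and "name" are placeholder column family / qualifier names.
val rows: RDD[(String, String)] = hbaseRDD.map { case (key, result) =>
  val rowKey = Bytes.toString(key.copyBytes())
  val value = Bytes.toString(result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("name")))
  (rowKey, value)
}
rows.take(10).foreach(println)

Converting to plain strings before any collect or take also sidesteps serialization issues, since Result itself is not Serializable.
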
Note 1: If the Spark, Hadoop, and HBase versions configured in the pom are mismatched, you may see an error like the following:

Exception in thread "main" java.lang.IncompatibleClassChangeError: Implementing class
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)

Note 2:

Setting the name server address in getHbaseConfig can speed up the job:

hbaseConf.set("hbase.nameserver.address", "name1,name2")

Here name1 and name2 are the hostnames that the hosts file maps to the serverHost addresses, e.g.:

10.112.23.23 name1
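
For reference, a minimal sketch of getHbaseConfig with this setting applied (name1 and name2 are placeholders for your own hostnames):

def getHbaseConfig(tableName: String, serverHost: String, serverPort: String): Configuration = {
  val hbaseConf = HBaseConfiguration.create()
  hbaseConf.set("hbase.zookeeper.quorum", serverHost)
  hbaseConf.set("hbase.zookeeper.property.clientPort", serverPort)
  // Placeholder hostnames: each must match an entry in the hosts file,
  // so HBase can resolve server names without repeated DNS lookups.
  hbaseConf.set("hbase.nameserver.address", "name1,name2")
  hbaseConf.set("hbase.rpc.timeout", "3600000")
  hbaseConf.set("hbase.client.scanner.timeout.period", "3600000")
  hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName)
  hbaseConf
}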