Scala version: 2.10.6
1.pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.graph</groupId>
    <artifactId>GraphGNN</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!--<scala.version>2.10.6</scala.version>-->
        <spark.version>2.1.0</spark.version>
        <hadoop.version>2.6.5</hadoop.version>
        <hbase.version>1.2.6</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.10</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase</artifactId>
            <version>${hbase.version}</version>
            <type>pom</type>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                        <configuration>
                            <includes>
                                <include>**/*.scala</include>
                            </includes>
                        </configuration>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <!-- Main-Class of the shaded jar; must match the entry-point object in queryHbase.scala -->
                                    <mainClass>queryHbase</mainClass>
                                </transformer>
                            </transformers>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
2.queryHbase.scala
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{TableName, CellUtil, HBaseConfiguration}
import org.apache.hadoop.hbase.client.{ResultScanner, HTable, Result, Scan}
import org.apache.hadoop.hbase.filter.{CompareFilter, Filter, RegexStringComparator, RowFilter}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.protobuf.ProtobufUtil
import org.apache.hadoop.hbase.util.Base64
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory

object queryHbase {
  val log = LoggerFactory.getLogger(this.getClass)

  /**
   * Build the HBase connection configuration.
   *
   * @param tableName  HBase table to read
   * @param serverHost ZooKeeper quorum host(s)
   * @param serverPort ZooKeeper client port
   * @return configured HBase Configuration
   */
  def getHbaseConfig(tableName: String, serverHost: String, serverPort: String): Configuration = {
    val hbaseConf = HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", serverHost)
    hbaseConf.set("hbase.zookeeper.property.clientPort", serverPort)
    hbaseConf.set("hbase.rpc.timeout", "3600000")
    hbaseConf.set("hbase.client.scanner.timeout.period", "3600000")
    hbaseConf.set(TableInputFormat.INPUT_TABLE, tableName)
    hbaseConf
  }

  /**
   * Query HBase, keeping only the rows whose row key matches the given regex.
   *
   * @param sc           SparkContext
   * @param tableName    HBase table to read
   * @param serverHost   ZooKeeper quorum host(s)
   * @param serverPort   ZooKeeper client port
   * @param prop         unused in this example
   * @param rowKeyRegex  regex that the row key must match
   * @param partitionNum unused in this example
   * @return RDD of (ImmutableBytesWritable, Result) pairs
   */
  def queryHbase(sc: SparkContext, tableName: String, serverHost: String, serverPort: String, prop: String, rowKeyRegex: String, partitionNum: Int): RDD[(ImmutableBytesWritable, Result)] = {
    val hbaseConf = getHbaseConfig(tableName, serverHost, serverPort)
    log.info("=========== HBase configuration created successfully")
    val scan: Scan = new Scan()
    val rowFilter: Filter = new RowFilter(CompareFilter.CompareOp.EQUAL, new RegexStringComparator(rowKeyRegex))
    scan.setFilter(rowFilter)
    hbaseConf.set(TableInputFormat.SCAN, convertScanToString(scan))
    val hbaseRDD = sc.newAPIHadoopRDD(
      hbaseConf,
      classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])
    println(hbaseRDD.count())
    hbaseRDD.take(1).foreach(println)
    hbaseRDD
  }

  /**
   * Convert the Scan into a String so it can be passed to TableInputFormat as a configuration property.
   *
   * @param scan the Scan to serialize
   * @return Base64-encoded protobuf representation of the Scan
   */
  def convertScanToString(scan: Scan) = {
    val proto = ProtobufUtil.toScan(scan)
    Base64.encodeBytes(proto.toByteArray)
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local").setAppName("demo")
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(conf)
    val tableName = "entity_table_check_mon_2"
    val serverHost = "...." // ZooKeeper quorum address (IP or hostname)
    val serverPort = "2181"
    val prop = "prop"
    val rowKeyRegex = ".*:6$"
    val partitionNum = 1
    val hbaseRDD: RDD[(ImmutableBytesWritable, Result)] = queryHbase(sc, tableName, serverHost, serverPort, prop, rowKeyRegex, partitionNum)
    println("done1")
    sc.stop()
  }
}
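The RDD returned by queryHbase still holds raw (ImmutableBytesWritable, Result) pairs. If the actual cell contents are needed, a minimal sketch like the following could be appended at the end of main; it assumes the stored values are plain UTF-8 strings (adjust the decoding to the real schema of entity_table_check_mon_2) and additionally requires import org.apache.hadoop.hbase.util.Bytes:

// Decode each Result into (rowKey, List[(family, qualifier, value)]).
// Bytes.toString assumes UTF-8 string values; swap in a different decoder
// if the table stores numbers or serialized objects.
val rows = hbaseRDD.map { case (_, result) =>
  val rowKey = Bytes.toString(result.getRow)
  val cells = result.rawCells().map { cell =>
    (Bytes.toString(CellUtil.cloneFamily(cell)),
     Bytes.toString(CellUtil.cloneQualifier(cell)),
     Bytes.toString(CellUtil.cloneValue(cell)))
  }.toList
  (rowKey, cells)
}
rows.take(10).foreach(println)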
Note 1: If the Spark, Hadoop, or HBase versions configured in the pom do not match each other (or the cluster), the job may fail with an error like the following:
Exception in thread "main" java.lang.IncompatibleClassChangeError: Implementing class
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:763)
Note 2:
Setting the name server in getHbaseConfig can speed up the program:
hbaseConf.set("hbase.nameserver.address","name1,name2")
Here name1 and name2 are the hostnames assigned to the serverHost machines in the hosts file, e.g.
10.112.23.23 name1
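For reference, a sketch of where this setting fits in getHbaseConfig above (name1 and name2 are placeholders for whatever hostnames your hosts file actually defines):

// Inside getHbaseConfig, next to the other hbaseConf.set(...) calls.
// Assumes the hosts file maps the names, e.g. "10.112.23.23 name1".
hbaseConf.set("hbase.nameserver.address", "name1,name2")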