import com.alibaba.fastjson.JSON
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.spark.HBaseContext
import org.apache.hadoop.hbase.spark.HBaseRDDFunctions.GenericHBaseRDDFunctions
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import java.text.SimpleDateFormat
object SparkHbaseWrite {
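// parse a "yyyy-MM-dd HH:mm:ss" string into epoch milliseconds (used below as the HBase cell timestamp);
// note: throws ParseException if created_at is empty or malformed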
val typedate = (dats: String) => new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").parse(dats).getTime
def main(args: Array[String]): Unit = {
if (args.length < 4) {
System.err.println(
s"""
|Usage: SparkHbaseWrite <brokers> <topics> <groupid> <tableName>
|  <brokers> is a list of one or more Kafka brokers
|  <topics> is a list of one or more Kafka topics to consume from
|  <groupid> is the Kafka consumer group id
|  <tableName> is the target HBase table name
""".stripMargin)
System.exit(1)
}
val Array(brokers, topics, groupid, tableName) = args
val topicSet = topics.split(",").toSet
val sparkConf = new SparkConf()
.setAppName("AA") //测试
.setMaster("local[6]")
val sc = new SparkContext(sparkConf)
val config = HBaseConfiguration.create()
config.set("hbase.client.ipc.pool.type", "Reusable")
config.set("hbase.client.ipc.pool.size", "10")
config.set("hbase.zookeeper.quorum", "ZK01,ZK02,ZK03")
config.set("zookeeper.znode.parent", "/hbase")
val hbaseContext = new HBaseContext(sc, config)
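// 200 ms micro-batches; increase the interval if batches cannot keep up with the topic's throughput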
val ssc = new StreamingContext(sc, Milliseconds(200))
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> brokers,
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> groupid,
"auto.offset.reset" -> "earliest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
// consume from Kafka with a direct stream (offsets are managed by this job, not the broker)
val dpp_Message = KafkaUtils.createDirectStream[String, String](
ssc,
PreferConsistent,
Subscribe[String, String](topicSet, kafkaParams)
)
dpp_Message.foreachRDD { rdd: RDD[ConsumerRecord[String, String]] =>
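// capture this batch's offset ranges up front, before any transformation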
val range = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val rdds = rdd.mapPartitions(kpd => {
kpd.flatMap(f => {
val lineJson = JSON.parseObject(f.value())
val dataList = lineJson.getJSONArray("data")
// keep only records whose service_status is "3" and whose patient_id is non-trivial;
// note: the contains() check matches the raw JSON text, which is fragile if serialization changes
dataList.toArray()
.filter(_.toString.contains("\"service_status\":\"3\""))
.filter(k => JSON.parseObject(k.toString).getOrDefault("patient_id", "").toString.length > 1)
.map(line => {
val keydata = JSON.parseObject(line.toString)
val dateTime = keydata.getOrDefault("created_at", "").toString
val doctor_id = keydata.getOrDefault("doctor_id", "").toString
val patient_id = keydata.getOrDefault("patient_id", "").toString
(patient_id, typedate(dateTime), "cs", doctor_id, dateTime)
})
})
})
// bulk-write to HBase through the hbase-spark connector API
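// each tuple maps to one cell:
//   rowkey = patient_id, cell timestamp = parsed created_at millis,
//   column family = "cs", qualifier = doctor_id, value = created_at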
rdds.hbaseBulkPut(hbaseContext, TableName.valueOf(tableName),
(putRecord) => {
val put = new Put(Bytes.toBytes(putRecord._1), putRecord._2)
put.addColumn(Bytes.toBytes(putRecord._3), Bytes.toBytes(putRecord._4), Bytes.toBytes(putRecord._5))
put
})
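// hbaseBulkPut runs as an action, so the batch has been written before we commit;
// committing offsets only after the write gives at-least-once delivery into HBase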
dpp_Message.asInstanceOf[CanCommitOffsets].commitAsync(range)
}
ssc.start()
ssc.awaitTermination()
}
}
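Because all of the parsing happens inside the stream, it helps to exercise the transformation offline first. Below is a minimal sketch that reuses the same fastjson calls and the job's typedate helper; the TransformSketch name, the sample payload, and all values in it are illustrative assumptions, not real data:
import com.alibaba.fastjson.JSON
object TransformSketch {
def main(args: Array[String]): Unit = {
// hypothetical sample payload matching the shape the job expects
val payload = """{"data":[{"service_status":"3","patient_id":"p42","doctor_id":"d7","created_at":"2021-06-01 10:30:00"}]}"""
val tuples = JSON.parseObject(payload).getJSONArray("data").toArray()
.filter(_.toString.contains("\"service_status\":\"3\""))
.filter(k => JSON.parseObject(k.toString).getOrDefault("patient_id", "").toString.length > 1)
.map(line => {
val o = JSON.parseObject(line.toString)
val createdAt = o.getOrDefault("created_at", "").toString
(o.getOrDefault("patient_id", "").toString, SparkHbaseWrite.typedate(createdAt), "cs",
o.getOrDefault("doctor_id", "").toString, createdAt)
})
// prints the (rowkey, cellTimestamp, family, qualifier, value) tuple
tuples.foreach(println)
}
}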
Maven pom file:
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.gaojihealth</groupId>
<artifactId>spark-hbase-streaming</artifactId>
<version>0.1</version>
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<spark.version>2.4.8</spark.version>
<downloadSources>true</downloadSources>
</properties>
<repositories>
<repository>
<id>aliyun-public</id>
<name>aliyun</name>
<url>https://maven.aliyun.com/repository/public</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
<repository>
<id>cloudera</id>
<url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
</repository>
</repositories>
<dependencies>
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-xml_2.11</artifactId>
<version>1.1.0</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
<version>${spark.version}</version>
<!-- <scope>provided</scope>-->
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase.connectors.spark</groupId>
<artifactId>hbase-spark</artifactId>
<version>1.0.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<!-- 1.2.47 has known deserialization vulnerabilities; use a patched 1.2.x release -->
<version>1.2.83</version>
<!-- <scope>provided</scope>-->
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-resources-plugin</artifactId>
<version>2.6</version>
<configuration>
<encoding>UTF-8</encoding>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.6</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>