Reading Kafka Messages with Python 3 and Writing Them to HBase

This article shows three ways to consume data from Kafka and write it into HBase: (1) consume and write directly with a plain Python script; (2) process the stream with Spark Streaming and write each RDD straight to HBase; (3) collect the Spark Streaming RDD back to the driver and then write it to HBase.

Full-Stack Engineer Development Handbook (Author: Luan Peng)
Architecture series articles

For installing and using HBase on Ubuntu, see: https://blog.youkuaiyun.com/luanpeng825485697/article/details/81027601

For an introductory Kafka tutorial, see: https://blog.youkuaiyun.com/luanpeng825485697/article/details/81036028

The Kafka message format is (None, [JSON string]).
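
For concreteness, a single consumed record might look like the following. This is a hypothetical payload; the exact fields depend on the producer, but the code below expects at least the time and pool keys.

# Hypothetical example of one Kafka record as seen by the consumers below:
# the key is None and the value is a JSON string.
example_record = (
    None,
    '{"time": 1531800000, "pool": "pool-a", "level": "INFO", "msg": "heartbeat"}',
)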

There are three ways to write Kafka messages into HBase with Python 3:

1. Consume Kafka messages directly and write them to HBase:

from kafka import KafkaConsumer
import time
import happybase
import json
 
hbase_ip='192.168.xxx.xxx'
hbase_port=9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)
 
# Insert data into tableName
def hbase_load(tableName, lists):
    with pool.connection() as connection:
        if tableName not in str(connection.tables()):
            create_table(connection, tableName)
        table = connection.table(tableName)
        b = table.batch(batch_size=1024)
        for li in lists:
            try:
                rowkey = li['info']
                data_dicts = {}
                for d, x in li.items():
                    key = "ss:" + d
                    value = str(x)
                    data_dicts[key] = value
                b.put(row=rowkey, data=data_dicts)
                print("rowkey:" + rowkey + " data append success")
            except Exception as ex:
                print(str(ex) + " failed to insert data")
        # flush any puts still buffered in the batch
        b.send()
 
# Create the HBase table
def create_table(conn, table):
    try:
        conn.create_table(
            table,
            {
                "ss": dict(max_versions=10)
            }
        )
    except Exception as ex:
        print(str(ex) + " table exists !!!")
 
# Print a log line with a timestamp
def log(msg):
    t = time.strftime(r"%Y-%m-%d_%H-%M-%S", time.localtime())
    print("[%s]%s" % (t, msg))
 
lst = []
log('start consumer')
# Consume the 'logfile' topic on 192.168.xxx.xxx:9092, using the consumer group test-consumer-group
consumer = KafkaConsumer('logfile', group_id='test-consumer-group', bootstrap_servers=['192.168.xxx.xxx:9092'])
for msg in consumer:
    recv = "%s:%d:%d: key=%s value=%s" % (msg.topic, msg.partition, msg.offset, msg.key, msg.value)
    log(recv)
    dict_data = json.loads(msg.value)
    # Build the row key from the 'time' and 'pool' fields
    dict_data['info'] = str(dict_data['time']) + '-' + dict_data['pool']
    lst.append(dict_data)
    # Note: this re-writes every accumulated record on each new message; puts to the
    # same rowkey are idempotent, but passing [dict_data] alone would be cheaper.
    hbase_load('logfile_zf', lst)
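
After a few messages have been consumed, you can check that the rows landed in HBase with a quick happybase scan. This is a minimal sketch, assuming the same Thrift host as above and the logfile_zf table created by the script:

import happybase

# Connect to the same HBase Thrift server used above (replace with your host)
connection = happybase.Connection('192.168.xxx.xxx', port=9090)
table = connection.table('logfile_zf')

# Print the first few rows and their 'ss' column family cells
for rowkey, data in table.scan(limit=5):
    print(rowkey, data)

connection.close()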

2. Use Spark Streaming to write the RDD directly to HBase:

For the HBase write configuration, see: http://dblab.xmu.edu.cn/blog/1715-2/

Note that Spark 2.0 does not ship the JAR that converts HBase data into a format Python can read, so it has to be downloaded separately.

Download spark-examples_2.11-1.6.0-typesafe-001.jar.

#!/usr/bin/env python3
 
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
import json
import time
 
conf = SparkConf().setAppName("logSparkStreaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 5)
# HBase table; it must be created in HBase beforehand
table = 'logfile_stream2'
broker = "192.168.xxx.xxx:9092"
# Kafka topic
topic = "logfile"
# HBase ZooKeeper quorum
hbaseZK = "192.168.xxx.xxx"
keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
hbaseConf = {"hbase.zookeeper.quorum": hbaseZK, "hbase.mapred.outputtable": table,
        "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
        "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
        "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
 
# Print a log line with a timestamp
def log(msg):
    t = time.strftime(r"%Y-%m-%d %H:%M:%S", time.localtime())
    print("[%s]%s" % (t, msg))
 
# Format one RDD element (expected to be a dict) into HBase put tuples
def fmt_data(msg_dict):
    if msg_dict is not None:
        msg_dict['info'] = str(msg_dict['time']) + '-' + msg_dict['pool']
        rowkey = msg_dict['info']
        lst = []
        for d, x in msg_dict.items():
            col_name = d
            col_value = str(x)
            col_family = 'ss'
            # Each key/value pair of the dict becomes a tuple of the form
            # (rowkey, [row key, column family, column name, value]) for the HBase write
            msg_tuple = (rowkey, [rowkey, col_family, col_name, col_value])
            print("rowkey:" + rowkey + "\ndata " + str(msg_tuple) + " append success")
            lst.append(msg_tuple)
        return lst
 
# Process the RDD and write it to HBase
def connectAndWrite(data):
    if not data.isEmpty():
        # Each received element has the form (None, [JSON string]), so deserialize the second field into a dict
        msg_list = data.map(lambda x: json.loads(x[1]))
        # Print the RDD for inspection: it looks like a list whose elements are dicts
        log(msg_list.collect())
        try:
            # Convert each dict into the tuple format required for the HBase write
            msg_row = msg_list.map(lambda x: fmt_data(x))
            # Flatten the per-record lists of tuples and save them to HBase
            msg_row.flatMap(lambda x: x).saveAsNewAPIHadoopDataset(conf=hbaseConf, keyConverter=keyConv, valueConverter=valueConv)
        except Exception as ex:
            print(str(ex) + " failed to insert data")
 
 
kafkaStreams = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams={"metadata.broker.list": broker})
# kafkaStreams.pprint()
kafkaStreams.foreachRDD(connectAndWrite)
 
 
log('start consumer')
ssc.start()
ssc.awaitTermination()

Submit the Spark job with the following command:

$SPARK_HOME/bin/spark-submit --master local --packages org.apache.spark:spark-streaming-kafka_2.11:1.6.0  --jars spark-examples_2.11-1.6.0-typesafe-001.jar /home/user/spark/sparkstreaming_kafka2.py > /home/user/spark/sparkstreaming_kafka.log

Note: spark-examples_2.11-1.6.0-typesafe-001.jar is the JAR that converts HBase data into a format Python can read.
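
To make the converter input concrete, the following standalone snippet shows what fmt_data produces for one hypothetical message dict. Each key/value pair becomes a (rowkey, [row key, column family, column name, value]) tuple, which the StringListToPutConverter then turns into an HBase Put:

# Hypothetical message dict; 'info' is derived the same way as in fmt_data above
msg = {"time": 1531800000, "pool": "pool-a", "level": "INFO"}
msg["info"] = str(msg["time"]) + "-" + msg["pool"]

rowkey = msg["info"]
# One tuple per column: (rowkey, [row key, column family, column name, value])
tuples = [(rowkey, [rowkey, "ss", k, str(v)]) for k, v in msg.items()]
for t in tuples:
    print(t)
# e.g. ('1531800000-pool-a', ['1531800000-pool-a', 'ss', 'time', '1531800000'])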

3. Collect the Spark Streaming RDD data and write it to HBase: this approach collects each micro-batch to the driver and writes it with happybase, which is simpler but limits throughput to what the driver can handle.

#!/usr/bin/env python3
 
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
import json
import time
import happybase
 
hbase_ip='192.168.xxx.xxx'
hbase_port=9090
ip = hbase_ip
port = hbase_port
pool = happybase.ConnectionPool(size=3, host=ip)
 
 
# Create the HBase table
def create_table(conn, table):
    try:
        conn.create_table(
            table,
            {
                "ss": dict(max_versions=10)
            }
        )
    except Exception as ex:
        print(str(ex) + " table exists !!!")
 
# Print a log line with a timestamp
def log(msg):
    t = time.strftime(r"%Y-%m-%d_%H-%M-%S", time.localtime())
    print("[%s]%s" % (t, msg))
 
 
 
 
def writeHbase(msg):
    if not msg.isEmpty():
        # Each element has the form (None, [JSON string]); deserialize the value into a dict
        msg_rdd = msg.map(lambda x: json.loads(x[1]))
        # Collect the RDD data back to the driver so it can be written with happybase
        msg_list = msg_rdd.collect()
        with pool.connection() as connection:
            if table not in str(connection.tables()):
                create_table(connection, table)
            hbaseTable = connection.table(table)
            b = hbaseTable.batch(batch_size=1024)
            for msg_dict in msg_list:
                try:
                    # Build the row key from the 'time' and 'pool' fields
                    msg_dict['info'] = str(msg_dict['time']) + '-' + msg_dict['pool']
                    rowkey = msg_dict['info']
                    data_dict = {}
                    for d, x in msg_dict.items():
                        key = "ss:" + d
                        value = str(x)
                        data_dict[key] = value
                    b.put(row=rowkey, data=data_dict)
                    print("rowkey:" + rowkey + "\ndata " + str(data_dict) + " append success")
                except Exception as ex:
                    print(str(ex) + " failed to insert data")
            # flush any puts still buffered in the batch
            b.send()
 
 
 
conf = SparkConf().setAppName("logSparkStreaming")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)
sqc = SQLContext(sc)
table = 'logfile_stream'
 
broker = "192.168.xxx.xxx:9092"
topic = "logfile"
 
 
kafkaStreams = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams={"metadata.broker.list": broker})
# kafkaStreams.pprint()
kafkaStreams.foreachRDD(writeHbase)
 
 
 
log('start consumer')
ssc.start()
ssc.awaitTermination()

Spark submit command:

$SPARK_HOME/bin/spark-submit --master local[3] --packages org.apache.spark:spark-streaming-kafka_2.11:1.6.0  /home/user/spark/sparkstreaming_kafka.py > /home/user/spark/sparkstreaming_kafka.log