Spark Structured Streaming: two ways to consume Kafka messages containing JSON arrays
The raw message body as it arrives from Kafka:
{"gamecode":"abcd","resultguid":"81_18148_184_-1699285363_4","startguid":"81_18148_184_1573391420_4","records":[{"cards":[40],"optype":0,"playtime":1573391438014,"type":1,"userid":53435,"waittime":17344},{"cards":[54],"optype":0,"playtime":1573391445155,"type":1,"userid":4354,"waittime":7141},{"optype":1,"playtime":1573391447514,"type":0,"userid":4546,"waittime":2359}]}
1. Configure the Kafka parameters
These options are easy to ignore while data volume is small, but once it grows, not a single one of them can be treated carelessly.
# Create the SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, get_json_object, lower, regexp_replace

spark = SparkSession.builder.enableHiveSupport().getOrCreate()

# Configure the Kafka source
kafka_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", bootstrap_servers)    # Kafka cluster
    .option("subscribe", topic)                               # Kafka topic
    .option("group.id", groupid)                              # consumer group id, handy for bookkeeping, not strictly required
    .option("failOnDataLoss", "false")                        # whether the query fails when data is lost (topic deleted, or offsets out of range)
    .option("startingOffsets", starting_offsets)              # e.g. "earliest" to consume from the beginning
    .option("includeTimestamp", True)                         # include the Kafka timestamp
    .option("maxOffsetsPerTrigger", max_offsets_per_trigger)  # cap on offsets consumed per micro-batch
    .load()
)
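Before plugging in the real sink, it can be worth dumping a few micro-batches to the console to confirm the options above behave as expected. A minimal development-only sketch (the console sink is not for production, and debug_query is my own name):

# Development check: print the raw Kafka columns for each micro-batch
debug_query = (
    kafka_df.selectExpr("CAST(value AS STRING)", "topic", "partition", "offset", "timestamp")
    .writeStream
    .format("console")
    .option("truncate", "false")
    .trigger(processingTime="10 seconds")
    .start()
)
# debug_query.awaitTermination()  # block the driver while watching the output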
2. Process the data
2.1 Parsing the JSON body with a schema
If the schema does not match the incoming messages, this approach can silently lose data (fields that fail to parse come back as null).
# Specify the schema (here it is inferred from a sample document)
json_data = '''
{
"resultguid": "123_123_123",
"startguid": "222_222_222",
"gamecode": "hagd",
"records": [{
"userid": 706778,
"playtime": 12345,
"waittime": 2,
"optype": 0,
"type": 12,
"cards": [1, 2, 3, 4]
},
{
"userid": 706772,
"playtime": 12345,
"waittime": 2,
"optype": 0,
"type": 12,
"cards": [1, 2, 3, 4]
},
{
"userid": 706778,
"playtime": 12346,
"waittime": 2,
"optype": 0,
"type": 12,
"cards": [8, 35, 23, 24]
}
]
}
'''
data_prefix = "data."

# Infer the schema by reading the sample document as a one-element RDD
schema_df = spark.read.json(spark.sparkContext.parallelize([json_data]))

# Parse the Kafka value with the inferred schema and register it as a view
json_to_df = kafka_df.select(
    from_json(lower(kafka_df.value.cast("string")), schema_df.schema).alias(data_prefix.split(".")[0]))
json_to_df.printSchema()
json_to_df.createOrReplaceTempView('origin')

# Register the card lookup table (card_enum.card_enum comes from a separate module)
from pandas import DataFrame  # DataFrame(card) below assumes a pandas DataFrame

card = card_enum.card_enum
df_card_enum = DataFrame(card)
df_card_enum = spark.createDataFrame(df_card_enum)
df_card_enum.createOrReplaceTempView("card_enum")
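The contents of card_enum.card_enum are not shown here; judging from the broadcast join below, it appears to be a small lookup keyed by id that carries bigshape, color and num for every card. A purely hypothetical stand-in, only to make the example self-contained:

# Hypothetical card lookup; the real values live in the card_enum module.
# One entry per card id that can appear in records[*].cards.
card_enum_sample = [
    {"id": 1, "bigshape": "normal", "color": "spade", "num": 1},
    {"id": 2, "bigshape": "normal", "color": "heart", "num": 1},
    # ...
]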
execute_1 = '''
select /*+ BROADCASTJOIN(t2) */
t1.resultguid,
t1.startguid,
t1.game,
t1.gamecode,
t1.room,
t1.tableno,
t1.starttime_unix,
t1.start_date,
t1.start_time,
t1.playercount,
t1.uid,
t1.playtime_unix,
t1.play_date,
t1.play_time,
t1.waittime,
t1.optype,
t1.type,
case when t1.card=-1 then null
else t1.card end as card,
t2.bigshape,
t2.color,
t2.num
from
(
select
resultguid,
startguid,
game,
gamecode,
room,
tableno,
starttime_unix,
start_date,
start_time,
playercount,
uid,
playtime_unix,
play_date,
play_time,
waittime,
optype,
type,
explode(card) as card
from
(
select
%(replace)sresultguid as resultguid,
%(replace)sstartguid as startguid,
split(%(replace)sresultguid, '_')[0] as game,
%(replace)sgamecode as gamecode,
split(%(replace)sresultguid, '_')[1] as room,
split(%(replace)sresultguid, '_')[2] as tableno,
split(%(replace)sstartguid, '_')[3] as starttime_unix,
from_unixtime(split(%(replace)sstartguid, '_')[3],'yyyyMMdd') as start_date,
from_unixtime(split(%(replace)sstartguid, '_')[3],'HHmmss') as start_time,
split(%(replace)sstartguid, '_')[4] as playercount,
my_view1.records.UserId as uid,
my_view1.records.PlayTime as playtime_unix,
from_unixtime(my_view1.records.PlayTime/1000,'yyyyMMdd') as play_date,
from_unixtime(my_view1.records.PlayTime/1000,'HHmmss') as play_time,
my_view1.records.WaitTime as waittime,
my_view1.records.OpType as optype,
my_view1.records.Type as type,
case when my_view1.records.cards is null then array(-1)
else my_view1.records.cards
end as card
from
origin
LATERAL VIEW explode(%(replace)srecords) my_view1 as records
) a
) t1
left join card_enum t2
on
t1.card=t2.id
''' % {"replace": data_prefix}
transfer_data_df = spark.sql(execute_1)
transfer_data_df.printSchema()
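Instead of inferring the schema from a sample document, it can also be declared explicitly with a StructType, which makes mismatches easier to spot. A hedged sketch of what such a schema might look like for the message above (field names follow the lower-cased JSON keys):

from pyspark.sql.types import (StructType, StructField, StringType,
                               LongType, IntegerType, ArrayType)

# Explicit schema for the sample message; an alternative to inferring it
# from json_data with spark.read.json().
record_schema = StructType([
    StructField("userid", LongType()),
    StructField("playtime", LongType()),
    StructField("waittime", LongType()),
    StructField("optype", IntegerType()),
    StructField("type", IntegerType()),
    StructField("cards", ArrayType(IntegerType())),
])
message_schema = StructType([
    StructField("resultguid", StringType()),
    StructField("startguid", StringType()),
    StructField("gamecode", StringType()),
    StructField("records", ArrayType(record_schema)),
])
# message_schema could then replace schema_df.schema in the from_json() call above.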
2.2 Brute-force string parsing
This suits messages like the one above, where a JSON object embeds a JSON array.
# Process the consumed data
source_df = kafka_df.select(
    lower(kafka_df.value.cast("string")).alias("kafka_value"),      # lower-case the whole message
    kafka_df.offset.cast("bigint").alias("kafka_offset"),           # keep the Kafka offset
    kafka_df.partition.cast("bigint").alias("kafka_partition"),     # keep the Kafka partition
    kafka_df.timestamp.cast("timestamp").alias("kafka_timestamp"),  # when the message landed in Kafka
    kafka_df.topic.cast("string").alias("kafka_topic")              # keep the Kafka topic
)
middle_df = source_df.select(
    get_json_object(source_df.kafka_value, "$.resultguid").cast("string").alias("resultguid"),
    get_json_object(source_df.kafka_value, "$.startguid").cast("string").alias("startguid"),
    get_json_object(source_df.kafka_value, "$.gamecode").cast("string").alias("game_code"),
    get_json_object(source_df.kafka_value, "$.records").cast("string").alias("records"),
    source_df.kafka_value.cast("string").alias("message_content"),
    "kafka_offset",
    "kafka_partition",
    "kafka_timestamp",
    "kafka_topic"
)
# Strip the array brackets and turn each "},{" separator into "}#v#{",
# leaving the records column as individual {...} chunks joined by "#v#"
df = middle_df.select(
    regexp_replace(regexp_replace(regexp_replace("records", "\\[\\{", "\\{"), "\\}]", "\\}"),
                   "\\}\\,\\{", "\\}\\#\\v\\#\\{").alias('records'),
    'message_content',
    "kafka_offset",
    "kafka_partition",
    "kafka_timestamp",
    "kafka_topic",
    "resultguid",
    "startguid",
    "game_code",
)
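At this point each element of the original records array is a {...} chunk separated by "#v#". A hedged sketch of how those chunks could be fanned out to one row per player action (split + explode, then get_json_object per field); records_df and the casts are my own choices, mirroring the column names used elsewhere:

from pyspark.sql.functions import split, explode, col, get_json_object

# One row per record chunk, with the per-record fields pulled out of each chunk
records_df = df.select(
    explode(split(col("records"), "#v#")).alias("record"),
    "resultguid", "startguid", "game_code",
    "kafka_offset", "kafka_partition", "kafka_timestamp", "kafka_topic"
).select(
    get_json_object("record", "$.userid").cast("bigint").alias("uid"),
    get_json_object("record", "$.playtime").cast("bigint").alias("playtime_unix"),
    get_json_object("record", "$.waittime").cast("bigint").alias("waittime"),
    get_json_object("record", "$.optype").cast("int").alias("optype"),
    get_json_object("record", "$.type").cast("int").alias("type"),
    get_json_object("record", "$.cards").alias("cards"),
    "resultguid", "startguid", "game_code",
    "kafka_offset", "kafka_partition", "kafka_timestamp", "kafka_topic"
)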
3. Write to HDFS
# Write to a temporary directory
(
    df.writeStream
    .format("orc")
    .option("path", hdfspath)                 # where the data is written
    .option("checkpointLocation", hdfspath)   # where Spark keeps its own offset/checkpoint state
    .trigger(processingTime='10 seconds')
    .outputMode("append")
    .start()
)
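One practical note: .start() returns a StreamingQuery, and a standalone PySpark script will exit as soon as it reaches the end of the file unless the driver blocks on that handle. A minimal sketch, repeating the sink above only to capture the handle (query is my own name):

# Capture the StreamingQuery instead of discarding it, then block on it.
query = df.writeStream \
    .format("orc") \
    .option("path", hdfspath) \
    .option("checkpointLocation", hdfspath) \
    .trigger(processingTime='10 seconds') \
    .outputMode("append") \
    .start()

# Blocks until the stream stops and re-raises any streaming exception,
# rather than letting the script end while the query is still starting up.
query.awaitTermination()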