实现了从 Kafka 读取 nginx 日志，将其保存为文本文件，并在本地打印出来。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/01/09 18:00
# @Author : xuanda
# @Site :
# @File : kafka_to_sparkstreaming.py
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
if __name__ == '__main__':
    # Script entry point: read nginx access-log messages from Kafka with the
    # receiver-based (ZooKeeper) API, write every micro-batch out as text
    # files, and print a sample of each batch to the driver's stdout.

    # ZooKeeper address handed to KafkaUtils.createStream.
    zkQuorum = '192.168.1.20:2181'
    # Topic name -> number of partitions to consume for that topic.
    topic = {'nginx-access-log': 1}
    # Kafka consumer group id used by this job.
    groupid = "kafka-to-sparkstreaming"
    appName = "KafkaToSparkstreaming"
    # Batch interval passed to StreamingContext, in seconds.
    timecell = 5

    sc = SparkContext(master="spark://192.168.1.20:7077", appName=appName)
    ssc = StreamingContext(sc, timecell)

    # Records arrive as pairs; index 1 is kept as the log line itself
    # (presumably (key, message) tuples -- confirm against the Kafka
    # integration guide for the Spark version in use).
    kafka_stream = KafkaUtils.createStream(ssc, zkQuorum, groupid, topic)
    messages = kafka_stream.map(lambda record: record[1])

    # One output per batch is written using this path as the prefix.
    messages.saveAsTextFiles("/tmp/kafka/nginx")
    messages.pprint()

    # Nothing is consumed until start(); awaitTermination() then blocks the
    # driver until the streaming job is stopped or fails.
    ssc.start()
    ssc.awaitTermination()
运行
spark-2.4.0-bin-hadoop2.7/bin/spark-sub