PySpark: Receiving Data from Kafka

Reading from Kafka with createStream

createStream() is the receiver-based API in spark-streaming-kafka-0-8: it connects through ZooKeeper, and a long-running receiver pulls the records into Spark before they are processed.

Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
One way to get data from Kafka is createStream(); the other is
createDirectStream().

This is an example of createStream() that only prints what it consumes
from Kafka.
"""
import os

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ['PYSPARK_SUBMIT_ARGS'] \
    = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 ' \
      'pyspark-shell'

sc = SparkContext("local[2]", "Streaming")
sc.setLogLevel("WARN")
ssc = StreamingContext(sc, 2)  # 2-second batch interval

zookeeper = "127.0.0.1:2181"
# The receiver-based createStream() discovers the brokers through
# ZooKeeper, so no broker list is needed here; the optional kafkaParams
# argument only carries extra Kafka consumer properties.
topic = {"test": 1}  # topic name -> number of consumer threads
group_id = "test_2018"

lines = KafkaUtils.createStream(ssc, zookeeper, group_id, topic)
print(lines)  # prints the DStream object itself, not its contents
lines_tmp = lines.map(lambda x: x[1])  # each record is a (key, value) pair
lines_tmp.pprint()

ssc.start()
ssc.awaitTermination(10)  # run for at most 10 seconds
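
To see the stream print anything, something has to produce messages on the "test" topic. Below is a minimal companion sketch, which is an addition to the original post: it assumes the third-party kafka-python package (pip install kafka-python) and a broker at 127.0.0.1:9092.

Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Hypothetical helper (not from the original post): push a few messages
# into the "test" topic so the streaming job above has input.
# Assumes the kafka-python package and a broker at 127.0.0.1:9092.
from kafka import KafkaProducer

producer = KafkaProducer(bootstrap_servers="127.0.0.1:9092")
for i in range(10):
    # the default serializer expects bytes
    producer.send("test", "hello kafka {}".format(i).encode("utf-8"))
producer.flush()
producer.close()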
 

Reading from Kafka with createDirectStream

createDirectStream() is the direct, receiver-less API: each batch queries the Kafka brokers for its offset ranges itself, so no ZooKeeper connection or receiver thread is needed.

Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Word count with Kafka: consume data from the Kafka topic "test" and do
a word count with a batch duration of 1 second.
"""
from __future__ import print_function
import os

from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

os.environ["PYSPARK_PYTHON"] = "/usr/bin/python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/usr/bin/python3"
os.environ['PYSPARK_SUBMIT_ARGS'] = \
    '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 ' \
    'pyspark-shell'

spark = SparkSession\
    .builder\
    .appName("word_count")\
    .master("local[*]")\
    .getOrCreate()

sc = spark.sparkContext
ssc = StreamingContext(sparkContext=sc, batchDuration=1)

# the topic(s) to subscribe to
topic_to_sub = ["test"]
# the Kafka broker address; separate with commas if there are several
bootstrap_servers = "localhost:9092"
# Kafka config: the direct stream fetches broker metadata itself
kafka_params = {"metadata.broker.list": bootstrap_servers}


def main():
    # initialize a direct stream that consumes from Kafka without a receiver
    kafka_stream = KafkaUtils\
        .createDirectStream(ssc=ssc,
                            topics=topic_to_sub,
                            kafkaParams=kafka_params)

    kafka_stream.pprint()  # prints the raw (key, value) records
    # each record is a (key, value) pair; keep the value
    lines = kafka_stream.map(lambda x: x[1])

    # word count
    counts = lines\
        .flatMap(lambda line: line.split(" "))\
        .map(lambda word: (word, 1))\
        .reduceByKey(lambda x, y: x + y)

    counts.pprint()

    ssc.start()
    ssc.awaitTermination()


if __name__ == "__main__":
    main()
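
Both scripts pull in the Kafka connector through PYSPARK_SUBMIT_ARGS; launching them with spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 and the script path works the same way.

The counts above start from zero in every 1-second batch. If you want counts over a longer span, a sliding window is the usual next step. Below is a sketch, not part of the original post, using reduceByKeyAndWindow(), which also needs a checkpoint directory for its internal state; the checkpoint path here is an arbitrary assumption.

Python
# Hypothetical extension of main() above: count words over a 30-second
# window, recomputed every 10 seconds. /tmp/spark_checkpoint is an
# assumed, arbitrary path.
ssc.checkpoint("/tmp/spark_checkpoint")  # required for windowed state

windowed = lines\
    .flatMap(lambda line: line.split(" "))\
    .map(lambda word: (word, 1))\
    .reduceByKeyAndWindow(lambda x, y: x + y,  # fold new batches in
                          lambda x, y: x - y,  # subtract batches sliding out
                          30,                  # window length, seconds
                          10)                  # slide interval, seconds
windowed.pprint()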
 


