spark.createDataFrame(rowRdd)
from __future__ import print_function

import sys
import json

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import Row, SparkSession


def getSparkSessionInstance(sparkConf):
    # Lazily create a singleton SparkSession
    if 'sparkSessionSingletonInstance' not in globals():
        globals()['sparkSessionSingletonInstance'] = SparkSession \
            .builder \
            .config(conf=sparkConf) \
            .getOrCreate()
    return globals()['sparkSessionSingletonInstance']


sc = SparkContext("local[2]", "NetWordCount")
ssc = StreamingContext(sc, 1)

topic = "connect-test"  # the Kafka topic to consume
# Read data from Kafka (receiver-based stream via ZooKeeper)
kvs = KafkaUtils.createStream(ssc, "localhost:2181", "spark-streaming-consumer", {topic: 1})
words = kvs.map(lambda x: x[1])  # keep only the message value
# words = kvs.map(lambda line: line.split(","))
# words = kvs.flatMap(lambda line: line.split(" "))


# Convert RDDs of the words DStream to DataFrame and run SQL query
def process(time, rdd):
    print("========= %s =========" % str(time))
    try:
        # Get the singleton instance of SparkSession
        spark = getSparkSessionInstance(rdd.context.getConf())

        # Convert RDD[String] to RDD[Row] to DataFrame
        rowSplitted = rdd.flatMap(lambda line: line.split(","))
        rowRdd = rowSplitted.map(lambda w: Row(word=w))
        # rowRdd = rdd.flatMap(lambda line: line.split(","))
        wordsDataFrame = spark.createDataFrame(rowRdd)

        # Create a temporary view using the DataFrame
        wordsDataFrame.createOrReplaceTempView("words")

        # Do word count on the view using SQL and print it
        wordCountsDataFrame = spark.sql("select word, count(*) as total from words group by word")
        wordCountsDataFrame.show()
    except:
        pass


words.pprint()
words.foreachRDD(process)  # foreachRDD applies a function to each RDD in this DStream (stream processing)

ssc.start()
ssc.awaitTermination()
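For reference, the conversion that process() performs on every micro-batch can be tried on its own outside the streaming job. Below is a minimal sketch (the master, app name, and sample lines are illustrative, not from the original code) that does the same RDD[String] -> RDD[Row] -> DataFrame conversion, but passes an explicit schema to createDataFrame instead of letting Spark infer the column type from the Row objects:

from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.master("local[2]").appName("CreateDataFrameSketch").getOrCreate()

# Illustrative stand-in for one micro-batch of Kafka message values
lines = spark.sparkContext.parallelize(["spark,kafka,spark", "streaming,kafka"])

# Same conversion as in process(): split each line, wrap every word in a Row
rowRdd = lines.flatMap(lambda line: line.split(",")).map(lambda w: Row(word=w))

# Supplying a schema avoids the sampling pass Spark otherwise needs to infer the column type
schema = StructType([StructField("word", StringType(), True)])
wordsDataFrame = spark.createDataFrame(rowRdd, schema)

wordsDataFrame.createOrReplaceTempView("words")
spark.sql("select word, count(*) as total from words group by word").show()

Passing a schema works the same way inside process(), and spares the job from re-inferring the schema on every batch.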
Converting to a DataFrame with toDF()
>>> from pyspark.sql.types import Row
>>> def f(x):
...     rel = {}
...     rel['name'] = x[0]
...     rel['age'] = x[1]
...     return rel
...
>>> peopleDF = sc.textFile("file:///usr/local/spark/examples/src/main/resources/people.txt") \
...     .map(lambda line: line.split(',')) \
...     .map(lambda x: Row(**f(x))).toDF()
>>> peopleDF.createOrReplaceTempView("people")  # must be registered as a temporary view before the query below can use it
>>> personsDF = spark.sql("select * from people")
>>> personsDF.rdd.map(lambda t: "Name:" + t[0] + "," + "Age:" + t[1]).foreach(print)
Name: 19,Age:Justin
Name: 29,Age:Michael
Name: 30,Age:Andy
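Note that the output above looks swapped: in Spark 2.x a Row built from keyword arguments (Row(**f(x))) sorts its fields alphabetically, so t[0] is age and t[1] is name. Below is a minimal sketch of one way to keep the column order explicit, assuming the same people.txt file and the same sc/spark shell session as above; converting to plain tuples and passing the column names to toDF() is my addition, not part of the original example:

>>> peopleDF = sc.textFile("file:///usr/local/spark/examples/src/main/resources/people.txt") \
...     .map(lambda line: line.split(',')) \
...     .map(lambda x: (x[0], x[1].strip())) \
...     .toDF(["name", "age"])   # column names given explicitly, in tuple order; strip() drops the space after the comma
>>> peopleDF.createOrReplaceTempView("people")
>>> spark.sql("select * from people").rdd \
...     .map(lambda p: "Name:" + p['name'] + ",Age:" + p['age']) \
...     .foreach(print)

Accessing the fields by name (p['name'], p['age']) instead of by position also works with the original Row-based version, regardless of how the columns end up ordered.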