Difference between map and mapPartitions
- map applies the given function to every element of the RDD.
- mapPartitions applies the given function to the iterator of each partition of the RDD, and the function must itself return an iterable. It is more efficient because the function is invoked once per partition rather than once per element, but a whole partition's data is passed in at once, which can cause OOM; if partitions are too large, you can call repartition() beforehand to split them into smaller ones, as in the sketch that follows this list.
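As a minimal sketch of the difference, assuming an already-created SparkContext named sc and an illustrative +1 transformation (both are assumptions, not part of the examples below), the same per-element work can be written with either operator; mapPartitions simply receives each partition's iterator at once:

# Minimal sketch; assumes sc is an existing SparkContext.
rdd = sc.parallelize([1, 2, 3, 4, 5, 6], 2)

# map: the function is called once per element.
print(rdd.map(lambda x: x + 1).collect())  # [2, 3, 4, 5, 6, 7]

# mapPartitions: the function is called once per partition and must return an iterable.
print(rdd.mapPartitions(lambda it: (x + 1 for x in it)).collect())  # [2, 3, 4, 5, 6, 7]

# If a single partition is too large to process in memory, repartition into more,
# smaller partitions before calling mapPartitions.
print(rdd.repartition(4).mapPartitions(lambda it: (x + 1 for x in it)).collect())

The complete, runnable example below applies the same idea, using mapPartitions to sum the elements of each partition: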
# coding=utf-8
"""
Function: test mapPartitions()
Time: 2021.06.07
"""
from pyspark import SparkConf, SparkContext


def process(partitions):
    # Called once per partition: sum the partition's elements and yield a single value.
    total = 0
    for item in partitions:
        total += item
    yield total


if __name__ == '__main__':
    conf = SparkConf().setMaster("local[2]").setAppName("test_mapPartitions")
    sc = SparkContext(conf=conf)
    data = [1, 2, 3, 4, 5, 6, 7, 8]
    rdd = sc.parallelize(data, 3)
    rdd.cache()
    print(f"Number of partitions: {rdd.getNumPartitions()}\t{rdd.glom().collect()}")
    rdd = rdd.mapPartitions(process)
    print(rdd.collect())
    sc.stop()
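With PySpark's default slicing of the eight elements into three partitions, glom() should report the grouping [[1, 2], [3, 4, 5], [6, 7, 8]], so the final collect() returns one partial sum per partition, i.e. [3, 12, 21].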
Broadcasting a large variable + mapPartitions to compute a set difference
# coding=utf-8
"""
Function: test mapPartitions()
Time: 2021.06.07
"""
from pyspark import SparkConf, SparkContext


def test(partitions):
    # Read the broadcast value on the executor side and keep only the pairs
    # whose key does not appear in the broadcast dict.
    data_dict = dict(broads.value)  # {'rowkey1': 'url1', 'rowkey4': 'url4'}
    for item in partitions:
        if item[0] not in data_dict:
            yield item


if __name__ == '__main__':
    conf = SparkConf().setMaster("local[2]").setAppName("test_mapPartitions")
    sc = SparkContext(conf=conf)
    data1 = [('rowkey1', 'url1'), ('rowkey2', 'url2'), ('rowkey3', 'url3'), ('rowkey4', 'url4')]
    data2 = [('rowkey1', 'url1'), ('rowkey4', 'url4')]
    rdd1 = sc.parallelize(data1)
    rdd2 = sc.parallelize(data2)
    # Collect the small RDD to the driver and broadcast it to every executor.
    res = rdd2.collect()
    broads = sc.broadcast(res)
    # Filter each partition of rdd1 against the broadcast dict: rdd1 - rdd2 by key.
    rdd3 = rdd1.mapPartitions(test)
    print(rdd3.collect())
    sc.stop()
Output:
[('rowkey2', 'url2'), ('rowkey3', 'url3')]
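The broadcast approach avoids shuffling rdd1 and works well when rdd2 is small enough to be collected to the driver and held in memory on every executor. For comparison, a rough sketch of the same by-key difference using PySpark's built-in subtractByKey, which shuffles instead of broadcasting (assumes the same sc, data1 and data2 as above):

# Sketch: same by-key difference via subtractByKey (shuffle-based, no broadcast needed).
rdd1 = sc.parallelize(data1)
rdd2 = sc.parallelize(data2)
print(rdd1.subtractByKey(rdd2).collect())  # [('rowkey2', 'url2'), ('rowkey3', 'url3')], order may vary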