0. 写在前面
Spark总共有两类算子,分别是Transformation算子和Action算子。Transformation算子变换不触发提交作业,而Action算子会触发SparkContext提交Job作业,下面主要使用pySpark API来作为事例,图解Spark的Transformation算子。
1. join
# joinx = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])z = x.join(y)print(x.collect())print(y.collect())print(z.collect())[('C', 4), ('B', 3), ('A', 2), ('A', 1)][('A', 8), ('B', 7), ('A', 6), ('D', 5)][('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('B', (3, 7))]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
2. leftOuterJoin
# leftOuterJoinx = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])z = x.leftOuterJoin(y)print(x.collect())print(y.collect())print(z.collect())[('C', 4), ('B', 3), ('A', 2), ('A', 1)][('A', 8), ('B', 7), ('A', 6), ('D', 5)][('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('C', (4, None)), ('B', (3, 7))]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
3. rightOuterJoin
# rightOuterJoinx = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])y = sc.parallelize([('A',8),('B',7),('A',6),('D',5)])z = x.rightOuterJoin(y)print(x.collect())print(y.collect())print(z.collect())[('C', 4), ('B', 3), ('A', 2), ('A', 1)][('A', 8), ('B', 7), ('A', 6), ('D', 5)][('A', (2, 8)), ('A', (2, 6)), ('A', (1, 8)), ('A', (1, 6)), ('B', (3, 7)), ('D', (None, 5))]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
4. partitionBy
# partitionByx = sc.parallelize([(0,1),(1,2),(2,3)],2)y = x.partitionBy(numPartitions = 3, partitionFunc = lambda x: x) # only key is passed to paritionFuncprint(x.glom().collect())print(y.glom().collect())[[(0, 1)], [(1, 2), (2, 3)]][[(0, 1)], [(1, 2)], [(2, 3)]]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
5. combineByKey
# combineByKeyx = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])createCombiner = (lambda el: [(el,el**2)]) mergeVal = (lambda aggregated, el: aggregated + [(el,el**2)]) # append to aggregatedmergeComb = (lambda agg1,agg2: agg1 + agg2 ) # append agg1 with agg2y = x.combineByKey(createCombiner,mergeVal,mergeComb)print(x.collect())print(y.collect())[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)][('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
6. aggregateByKey
# aggregateByKeyx = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])zeroValue = [] # empty list is 'zero value' for append operationmergeVal = (lambda aggregated, el: aggregated + [(el,el**2)])mergeComb = (lambda agg1,agg2: agg1 + agg2 )y = x.aggregateByKey(zeroValue,mergeVal,mergeComb)print(x.collect())print(y.collect())[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)][('A', [(3, 9), (4, 16), (5, 25)]), ('B', [(1, 1), (2, 4)])]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
7. foldByKey
# foldByKeyx = sc.parallelize([('B',1),('B',2),('A',3),('A',4),('A',5)])zeroValue = 1 # one is 'zero value' for multiplicationy = x.foldByKey(zeroValue,lambda agg,x: agg*x ) # computes cumulative product within each keyprint(x.collect())print(y.collect())[('B', 1), ('B', 2), ('A', 3), ('A', 4), ('A', 5)][('A', 60), ('B', 2)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
8. groupByKey
# groupByKeyx = sc.parallelize([('B',5),('B',4),('A',3),('A',2),('A',1)])y = x.groupByKey()print(x.collect())print([(j[0],[i for i in j[1]]) for j in y.collect()])[('B', 5), ('B', 4), ('A', 3), ('A', 2), ('A', 1)][('A', [3, 2, 1]), ('B', [5, 4])]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
9. flatMapValues
# flatMapValuesx = sc.parallelize([('A',(1,2,3)),('B',(4,5))])y = x.flatMapValues(lambda x: [i**2 for i in x]) # function is applied to entire value, then result is flattenedprint(x.collect())print(y.collect())[('A', (1, 2, 3)), ('B', (4, 5))][('A', 1), ('A', 4), ('A', 9), ('B', 16), ('B', 25)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
10. mapValues
# mapValuesx = sc.parallelize([('A',(1,2,3)),('B',(4,5))])y = x.mapValues(lambda x: [i**2 for i in x]) # function is applied to entire valueprint(x.collect())print(y.collect())[('A', (1, 2, 3)), ('B', (4, 5))][('A', [1, 4, 9]), ('B', [16, 25])]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
11. groupWith
# groupWithx = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])y = sc.parallelize([('B',(7,7)),('A',6),('D',(5,5))])z = sc.parallelize([('D',9),('B',(8,8))])a = x.groupWith(y,z)print(x.collect())print(y.collect())print(z.collect())print("Result:")for key,val in list(a.collect()): print(key, [list(i) for i in val])[('C', 4), ('B', (3, 3)), ('A', 2), ('A', (1, 1))][('B', (7, 7)), ('A', 6), ('D', (5, 5))][('D', 9), ('B', (8, 8))]Result:D [[], [(5, 5)], [9]]C [[4], [], []]B [[(3, 3)], [(7, 7)], [(8, 8)]]A [[2, (1, 1)], [6], []]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
12. cogroup
# cogroupx = sc.parallelize([('C',4),('B',(3,3)),('A',2),('A',(1,1))])y = sc.parallelize([('A',8),('B',7),('A',6),('D',(5,5))])z = x.cogroup(y)print(x.collect())print(y.collect())for key,val in list(z.collect()): print(key, [list(i) for i in val])[('C', 4), ('B', (3, 3)), ('A', 2), ('A', (1, 1))][('A', 8), ('B', 7), ('A', 6), ('D', (5, 5))]A [[2, (1, 1)], [8, 6]]C [[4], []]B [[(3, 3)], [7]]D [[], [(5, 5)]]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
13. sampleByKey
# sampleByKeyx = sc.parallelize([('A',1),('B',2),('C',3),('B',4),('A',5)])y = x.sampleByKey(withReplacement=False, fractions={'A':0.5, 'B':1, 'C':0.2})print(x.collect())print(y.collect())[('A', 1), ('B', 2), ('C', 3), ('B', 4), ('A', 5)][('B', 2), ('C', 3), ('B', 4)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
14. subtractByKey
# subtractByKeyx = sc.parallelize([('C',1),('B',2),('A',3),('A',4)])y = sc.parallelize([('A',5),('D',6),('A',7),('D',8)])z = x.subtractByKey(y)print(x.collect())print(y.collect())print(z.collect())[('C', 1), ('B', 2), ('A', 3), ('A', 4)][('A', 5), ('D', 6), ('A', 7), ('D', 8)][('C', 1), ('B', 2)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
15. subtract
# subtractx = sc.parallelize([('C',4),('B',3),('A',2),('A',1)])y = sc.parallelize([('C',8),('A',2),('D',1)])z = x.subtract(y)print(x.collect())print(y.collect())print(z.collect())[('C', 4), ('B', 3), ('A', 2), ('A', 1)][('C', 8), ('A', 2), ('D', 1)][('A', 1), ('C', 4), ('B', 3)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
16. keyBy
# keyByx = sc.parallelize([1,2,3])y = x.keyBy(lambda x: x**2)print(x.collect())print(y.collect())[1, 2, 3][(1, 1), (4, 2), (9, 3)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
17. repartition
# repartitionx = sc.parallelize([1,2,3,4,5],2)y = x.repartition(numPartitions=3)print(x.glom().collect())print(y.glom().collect())[[1, 2], [3, 4, 5]][[], [1, 2, 3, 4], [5]]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
18. coalesce
# coalescex = sc.parallelize([1,2,3,4,5],2)y = x.coalesce(numPartitions=1)print(x.glom().collect())print(y.glom().collect())[[1, 2], [3, 4, 5]][[1, 2, 3, 4, 5]]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
19. zip
# zipx = sc.parallelize(['B','A','A'])# zip expects x and y to have same #partitions and #elements/partitiony = x.map(lambda x: ord(x)) z = x.zip(y)print(x.collect())print(y.collect())print(z.collect())['B', 'A', 'A'][66, 65, 65][('B', 66), ('A', 65), ('A', 65)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
20. zipWithIndex
# zipWithIndexx = sc.parallelize(['B','A','A'],2)y = x.zipWithIndex()print(x.glom().collect())print(y.collect())[['B'], ['A', 'A']][('B', 0), ('A', 1), ('A', 2)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
21. zipWithUniqueId
# zipWithUniqueIdx = sc.parallelize(['B','A','A'],2)y = x.zipWithUniqueId()print(x.glom().collect())print(y.collect())[['B'], ['A', 'A']][('B', 0), ('A', 1), ('A', 3)]
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
【完】
再分享一下我老师大神的人工智能教程吧。零基础!通俗易懂!风趣幽默!还带黄段子!希望你也加入到我们人工智能的队伍中来!https://blog.youkuaiyun.com/jiangjunshow