文章目录
• 1 pyspark version
• 2 map
• 3 flatMap
• 4 mapPartitions
• 5 mapPartitionsWithIndex
• 6 getNumPartitions
• 7 filter
• 8 distinct
• 9 sample
• 10 takeSample
• 11 union
• 12 intersection
• 13 sortByKey
pyspark version
# print Spark version
print("pyspark version:" + str(sc.version))

pyspark version:1.2.2
map

# map
# sc = spark context, parallelize creates an RDD from the passed object
x = sc.parallelize([1, 2, 3])
y = x.map(lambda x: (x, x ** 2))
# collect copies RDD elements to a list on the driver
print(x.collect())
print(y.collect())

[1, 2, 3]
[(1, 1), (2, 4), (3, 9)]
flatMap

# flatMap
x = sc.parallelize([1, 2, 3])
y = x.flatMap(lambda x: (x, 100 * x, x ** 2))
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 100, 1, 2, 200, 4, 3, 300, 9]
mapPartitions

# mapPartitions
x = sc.parallelize([1, 2, 3], 2)
def f(iterator):
    yield sum(iterator)
y = x.mapPartitions(f)
# glom() groups the elements of each partition into a list
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[1], [5]]
mapPartitionsWithIndex

# mapPartitionsWithIndex
x = sc.parallelize([1, 2, 3], 2)
def f(partitionIndex, iterator):
    yield (partitionIndex, sum(iterator))
y = x.mapPartitionsWithIndex(f)
# glom() groups the elements of each partition into a list
print(x.glom().collect())
print(y.glom().collect())

[[1], [2, 3]]
[[(0, 1)], [(1, 5)]]
getNumPartitions

# getNumPartitions
x = sc.parallelize([1, 2, 3], 2)
y = x.getNumPartitions()
print(x.glom().collect())
print(y)

[[1], [2, 3]]
2
filter

# filter
x = sc.parallelize([1, 2, 3])
y = x.filter(lambda x: x % 2 == 1)  # filters out even elements
print(x.collect())
print(y.collect())

[1, 2, 3]
[1, 3]
distinct

# distinct
x = sc.parallelize(['A', 'A', 'B'])
y = x.distinct()
print(x.collect())
print(y.collect())

['A', 'A', 'B']
['A', 'B']
sample

# sample
x = sc.parallelize(range(7))
# call 'sample' 5 times
ylist = [x.sample(withReplacement=False, fraction=0.5) for i in range(5)]
print('x = ' + str(x.collect()))
for cnt, y in zip(range(len(ylist)), ylist):
    print('sample:' + str(cnt) + ' y = ' + str(y.collect()))

x = [0, 1, 2, 3, 4, 5, 6]
sample:0 y = [0, 2, 5, 6]
sample:1 y = [2, 6]
sample:2 y = [0, 4, 5, 6]
sample:3 y = [0, 2, 6]
sample:4 y = [0, 3, 4]
takeSample

# takeSample
x = sc.parallelize(range(7))
# call 'takeSample' 5 times
ylist = [x.takeSample(withReplacement=False, num=3) for i in range(5)]
print('x = ' + str(x.collect()))
for cnt, y in zip(range(len(ylist)), ylist):
    print('sample:' + str(cnt) + ' y = ' + str(y))  # no collect on y

x = [0, 1, 2, 3, 4, 5, 6]
sample:0 y = [0, 2, 6]
sample:1 y = [6, 4, 2]
sample:2 y = [2, 0, 4]
sample:3 y = [5, 4, 1]
sample:4 y = [3, 1, 4]
union

# union
x = sc.parallelize(['A', 'A', 'B'])
y = sc.parallelize(['D', 'C', 'A'])
z = x.union(y)
print(x.collect())
print(y.collect())
print(z.collect())

['A', 'A', 'B']
['D', 'C', 'A']
['A', 'A', 'B', 'D', 'C', 'A']
intersection

# intersection
x = sc.parallelize(['A', 'A', 'B'])
y = sc.parallelize(['A', 'C', 'D'])
z = x.intersection(y)
print(x.collect())
print(y.collect())
print(z.collect())

['A', 'A', 'B']
['A', 'C', 'D']
['A']
sortByKey

# sortByKey
x = sc.parallelize([('B', 1), ('A', 2), ('C', 3)])
y = x.sortByKey()
print(x.collect())
print(y.collect())

[('B', 1), ('A', 2), ('C', 3)]
[('A', 2), ('B', 1), ('C', 3)]