from pyspark import SparkContext
sc = SparkContext()
data = sc.parallelize([('amber',22),('alfred',23),('skye',4),('albert',12),('amber',9)])
data
ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:194
data_file = sc.textFile('./data/VS14MORT.txt.gz',4)
data_heterogenous = sc.parallelize([('Ferrari','fast'),{'Porsche':100000},['Spain','visited',4504]]).collect()
data_heterogenous
[('Ferrari', 'fast'), {'Porsche': 100000}, ['Spain', 'visited', 4504]]
data_file.take(2)
def extractInformation(row):
    import re
    import numpy as np

    selected_indices = [
        2,4,5,6,7,9,10,11,12,13,14,15,16,17,18,
        19,21,22,23,24,25,27,28,29,30,32,33,34,
        36,37,38,39,40,41,42,43,44,45,46,47,48,
        49,50,51,52,53,54,55,56,58,60,61,62,63,
        64,65,66,67,68,69,70,71,72,73,74,75,76,
        77,78,79,81,82,83,84,85,87,89
    ]

    # fixed-width layout of a single mortality record
    record_split = re.compile(
        r'([\s]{19})([0-9]{1})([\s]{40})([0-9\s]{2})([0-9\s]{1})([0-9]{1})([0-9]{2})' +
        r'([\s]{2})([FM]{1})([0-9]{1})([0-9]{3})([0-9\s]{1})([0-9]{2})([0-9]{2})' +
        r'([0-9]{2})([0-9\s]{2})([0-9]{1})([SMWDU]{1})([0-9]{1})([\s]{16})([0-9]{4})' +
        r'([YNU]{1})([0-9\s]{1})([BCOU]{1})([YNU]{1})([\s]{34})([0-9\s]{1})([0-9\s]{1})' +
        r'([A-Z0-9\s]{4})([0-9]{3})([\s]{1})([0-9\s]{3})([0-9\s]{3})([0-9\s]{2})([\s]{1})' +
        r'([0-9\s]{2})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})([A-Z0-9\s]{7})' +
        r'([A-Z0-9\s]{7})([\s]{36})([A-Z0-9\s]{2})([\s]{1})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})' +
        r'([A-Z0-9\s]{5})([A-Z0-9\s]{5})([A-Z0-9\s]{5})([\s]{1})([0-9\s]{2})([0-9\s]{1})' +
        r'([0-9\s]{1})([0-9\s]{1})([0-9\s]{1})([\s]{33})([0-9\s]{3})([0-9\s]{1})([0-9\s]{1})'
    )

    try:
        # split the record on the pattern and keep only the selected fields
        rs = np.array(record_split.split(row))[selected_indices]
    except:
        # rows that do not match the layout are replaced with -99 markers
        rs = np.array(['-99'] * len(selected_indices))
    return rs
data_file_conv = data_file.map(extractInformation)
data_file_conv.map(lambda row: row).take(1)
Global versus local scope
Spark can run in two modes: local and cluster.
In cluster mode, when a job is submitted, it is sent to the driver (or master) node. The driver node creates a DAG for the job and decides which executor (worker) nodes will run specific tasks.
The driver then instructs the workers to execute their tasks and to return the results to it when done. Before that happens, however, the driver prepares each task's closure: a set of variables and methods present on the driver that the workers need in order to run their tasks on the RDD.
This set of variables and methods is inherently static within the executors' context; that is, each executor gets its own copy of the variables and methods from the driver. If, while running a task, an executor alters these variables or overwrites the methods, it does so without affecting either the other executors' copies or the driver's variables and methods. This can lead to unexpected behavior and runtime errors that are often very hard to track down.
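A minimal sketch of this pitfall (the counter variable and increment function below are illustrative, not part of the original example): a variable defined on the driver and mutated inside a transformation is only changed in each worker's local copy, so the driver never sees the update.
counter = 0

def increment(x):
    # each worker mutates its own copy of counter, not the driver's
    global counter
    counter += 1
    return x

sc.parallelize(range(10)).map(increment).count()
print(counter)  # typically prints 0 on the driver -- the workers' updates never come back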
The .map(…) transformation
data_20 = data_file_conv.map(lambda row: row[16])
data_20.take(10)
data_20_2 = data_file_conv.map(
lambda row: (row[16], int(row[16])))
data_20_2.take(5)
The .filter(…) transformation
data_filtered = data_file_conv.filter(
    lambda row: row[16] == '2014' and row[21] == '0')
data_filtered.count()
The .flatMap(…) transformation
data_20_flat = data_file_conv.flatMap(
lambda row: (row[16], int(row[16]) + 1))
data_20_flat.take(10)
The .distinct(…) transformation
distinct_gender = data_file_conv.map(
lambda row: row[5]).distinct()
distinct_gender.collect()
The .sample(…) transformation
fraction = 0.1
data_sample = data_file_conv.sample(False, fraction, 666)
print('Original dataset: {0}, sample: {1}'.format(
    data_file_conv.count(), data_sample.count()))
The .leftOuterJoin(…) transformation
rdd1 = sc.parallelize([('a',1), ('b',4), ('c',10)])
rdd2 = sc.parallelize([('a',4), ('a',1), ('b','6'), ('d',15)])
rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3.collect()
[('a', (1, 4)), ('a', (1, 1)), ('b', (4, '6')), ('c', (10, None))]
rdd4 = rdd1.join(rdd2)
rdd4.collect()
[('a', (1, 4)), ('a', (1, 1)), ('b', (4, '6'))]
rdd1.repartition(4)
MapPartitionsRDD[27] at coalesce at NativeMethodAccessorImpl.java:0
len(rdd1.glom().collect())
4
rdd1.map(lambda row: row[1]).reduce(lambda x,y: x + y)
15
data_reduce = sc.parallelize([1, 2, 5, .1, 5, .2], 1)
works = data_reduce.reduce(lambda x, y: x / y)
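Note that the function passed to .reduce(…) should be associative and commutative: Spark reduces each partition independently and then combines the partial results, so a non-associative operator such as division can yield different answers depending on how the data is partitioned. A minimal sketch of the pitfall (the partition count of 3 is an arbitrary illustrative choice):
# the same data spread across 3 partitions instead of 1
data_reduce_3 = sc.parallelize([1, 2, 5, .1, 5, .2], 3)

# each partition is reduced first and the partial results are then divided,
# so this generally differs from the single-partition result above
data_reduce_3.reduce(lambda x, y: x / y)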
data_key = sc.parallelize(
    [('a', 4), ('b', 4), ('c', 2), ('a', 8), ('d', 2), ('b', 1),
     ('d', 3)], 4
)