# Use a text file as the data source
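# Note: the "conf" object used throughout this section is assumed to come from
# an earlier section. A minimal sketch of such a setup (the app name and master
# here are placeholders, not the author's actual values):
from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName('rdd-demo').setMaster('local[*]')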
sc = SparkContext.getOrCreate(conf)
rows = sc.textFile("file:///Users/chuzhengkai/Desktop/test.txt")
print(rows.first())
print(rows.take(2))
print(rows.count())
print(rows.top(2))
sc.stop()
today
['today', 'is']
7
['weather', 'too']
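# For reference: first() returns the first element, take(n) the first n elements,
# and top(n) the n largest elements in descending order (lexicographic for strings),
# which is why top(2) prints ['weather', 'too'] above. A minimal sketch on
# hypothetical in-memory data:
sc = SparkContext.getOrCreate(conf)
print(sc.parallelize(['today', 'is', 'weather', 'too']).top(2))  # ['weather', 'too']
sc.stop()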
# Use multiple text files
# to run a word frequency count
sc = SparkContext.getOrCreate(conf)
# Read multiple text files into a single RDD
filesRDD = sc.wholeTextFiles('file:///Users/chuzhengkai/Desktop/*.txt')
# RDD of the file contents
fileConRDD = filesRDD.map(lambda x:x[1])
# Split the string on newline characters into a list
def sp(x):
    return x.split('\n')
# Map over each file's content; the result is one content list per file,
# i.e., a nested two-dimensional structure
strRDD = fileConRDD.map(sp)
# The same mapping, but the result is flattened into a one-dimensional structure
wordRDD = fileConRDD.flatMap(sp)
# (The result of wholeTextFiles is a list of tuples, one tuple per file)
# Word-count map step
wordDictRDD = wordRDD.map(lambda x:(x,1))
#Reduce
r = wordDictRDD.reduceByKey(lambda x,y:x+y)
#print(strRDD.collect())
#print(wordRDD.collect())
#print(wordDictRDD.collect())
print(r.collect())
sc.stop()
[('good', 6), ('weather', 3), ('is', 3), ('day', 2), ('has', 2), ('today', 3), ('too', 1)]
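# For reference, a minimal sketch of the map vs. flatMap difference on
# hypothetical in-memory data (not the files used above):
sc = SparkContext.getOrCreate(conf)
demo = sc.parallelize(['a b', 'c d e'])
print(demo.map(lambda s: s.split()).collect())      # [['a', 'b'], ['c', 'd', 'e']] -- nested
print(demo.flatMap(lambda s: s.split()).collect())  # ['a', 'b', 'c', 'd', 'e'] -- flattened
sc.stop()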
sc = SparkContext.getOrCreate(conf)
rdd = sc.textFile('file:///Users/chuzhengkai/Desktop/beijing_companies_poi.csv')
# Count the number of companies in each district
rdda = rdd.map(lambda x:x.split(','))
# Columns: ['名称', '大类', '中类', '小类', '地址', '省', '市', '区', 'WGS84_经度', 'WGS84_纬度']
# (name, major category, mid category, sub category, address, province, city, district, longitude, latitude)
rddb = rdda.map(lambda x:(x[7],1))
rddc = rddb.reduceByKey(lambda x,y:x+y)
rddc.cache()
rddd = rddc.sortBy(lambda x:x[1],ascending=False)
#rddd = rddc.map()
print(rddd.collect())
sc.stop()
[('朝阳区', 36782), ('海淀区', 21882), ('丰台区', 12485), ('大兴区', 12028), ('通州区', 9210), ('昌平区', 8446), ('西城区', 6781), ('东城区', 6287), ('顺义区', 6200), ('房山区', 3803), ('石景山区', 2207), ('怀柔区', 1757), ('密云区', 1721), ('平谷区', 1463), ('延庆区', 698), ('门头沟区', 695), ('北京市', 165), ('8层"', 1), ('区', 1), ('320室"', 1), ('1门9层"', 1)]
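# The trailing entries such as ('区', 1) and ('8层"', 1) come from the header row
# and from addresses containing commas inside quoted fields, which a plain
# split(',') cannot handle. One possible fix (a sketch, reusing the same path and
# column index as above) is to parse each partition with Python's csv module;
# the header row would still need to be filtered out separately:
import csv
sc = SparkContext.getOrCreate(conf)
rdd = sc.textFile('file:///Users/chuzhengkai/Desktop/beijing_companies_poi.csv')
rows = rdd.mapPartitions(lambda lines: csv.reader(lines))
counts = rows.map(lambda x: (x[7], 1)).reduceByKey(lambda x, y: x + y)
print(counts.sortBy(lambda x: x[1], ascending=False).take(5))
sc.stop()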
# Set operations on RDDs
spark = SparkContext.getOrCreate(conf)
rdd1 = spark.parallelize([('a',(2,4)),('b',3),('c',5)])
rdd2 = spark.parallelize([('x',1),('a',5),('t',9)])
#rdd3 = rdd1.union(rdd2)
#rdd4 = rdd1.intersection(rdd2)
#rdd5 = rdd1.subtract(rdd2)
#rdd6 = rdd1.cartesian(rdd2)
rdd7 = rdd1.join(rdd2)
print(rdd7.collect())
spark.stop()
[('a', ((2, 4), 5))]
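# A sketch of leftOuterJoin, which, unlike join, also keeps rdd1 keys that have
# no match in rdd2 (the missing side comes back as None); same example data:
spark = SparkContext.getOrCreate(conf)
rdd1 = spark.parallelize([('a', (2, 4)), ('b', 3), ('c', 5)])
rdd2 = spark.parallelize([('x', 1), ('a', 5), ('t', 9)])
print(rdd1.leftOuterJoin(rdd2).collect())
# e.g. [('a', ((2, 4), 5)), ('b', (3, None)), ('c', (5, None))] (order may vary)
spark.stop()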
# Word frequency count for The Little Prince and Hamlet
import re
spark = SparkContext.getOrCreate(conf)
hRDD = spark.textFile('file:///Users/chuzhengkai/Desktop/work/大数据课件/data/Hamlet.txt')
pRDD = spark.textFile('file:///Users/chuzhengkai/Desktop/work/大数据课件/data/Little_prince.txt')
#
def spBySpace(s):  # split on whitespace (spaces and other blank characters)
    return s.split()
def filt(s):  # use a regular expression to strip unwanted characters, keeping only English words
    r1 = u'[0-9’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    # custom characters to filter out can also be added here
    return re.sub(r1,'',s).lower()  # convert everything to lowercase
def m2t(s):  # map a word to a (word, 1) key-value pair
    return (s,1)
def rdc(v1,v2):  # add up the counts
    return v1+v2
def odr(x):  # sort-key function: sort by the value of the key-value pair
    return x[1]
def swapKV(x):  # swap the key and the value of a key-value pair
    return (x[1],x[0])
# Split into words on whitespace, clean each word into a lowercase English word,
# then turn it into a key-value pair with value 1 (the word appeared once)
rh = hRDD\
.flatMap(spBySpace)\
.map(filt)\
.map(m2t)
rp = pRDD\
.flatMap(spBySpace)\
.map(filt)\
.map(m2t)
# Reduce, grouping by key
# Sort by the given key function; ascending means ascending order
rhR = rh.reduceByKey(rdc)\
.sortBy(odr,ascending=False)
rhR.cache()
rpR = rp.reduceByKey(rdc)\
.map(swapKV)\
.sortByKey(ascending=False)\
.map(swapKV)
rpR.cache()
rR = rhR.union(rpR)\
.reduceByKey(rdc)
hc = rh.count()
pc = rp.count()
def rate(s):  # each word's share of the combined word count of the two books
    return (s[0],s[1]/(hc+pc))
rateRDD = rR.map(rate)
print(rhR.count())
print(rpR.count())
print(rR.sortBy(odr,ascending=False).take(10))
print(rateRDD.take(10))
rateRDD.saveAsTextFile('file:///Users/chuzhengkai/Desktop/rate.txt')
spark.stop()
4968
5880
[('the', 4010), ('and', 3572), ('to', 2559), ('she', 2264), ('a', 2173), ('of', 1924), ('her', 1766), ('in', 1498), ('i', 1491), ('it', 1467)]
[('of', 0.01905573108045203), ('i', 0.014767201164736994), ('in', 0.01483653074767003), ('ham', 0.003545712955718205), ('but', 0.005922727227708063), ('he', 0.0076559668010340014), ('king', 0.0019511325482583419), ('have', 0.004496518664514148), ('do', 0.0033872453375855477), ('no', 0.002753374865054919)]
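# Note: saveAsTextFile writes a directory of part-* files rather than a single
# file. A sketch of reading that output back in (same path as above); each line
# comes back as the string form of a saved tuple:
spark = SparkContext.getOrCreate(conf)
back = spark.textFile('file:///Users/chuzhengkai/Desktop/rate.txt')
print(back.take(3))
spark.stop()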