[博学谷 Study Notes] A Thorough Summary, Shared with Care | Spark Getting-Started Example: Reading a File from HDFS and Sorting the Results
1. Upload a words.txt file to HDFS
vim words.txt
Press i to enter insert mode, then add the following content:
hadoop hive zookeeper hive hadoop
hadoop hadoop sqoop hive sqoop
hive sqoop sqoop hive hive
hadoop hive sqoop zookeeper
Upload the file to HDFS:
hdfs dfs -mkdir -p /pyspark/wd/input
hdfs dfs -put words.txt /pyspark/wd/input
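To confirm the upload succeeded (an optional check; adjust the path if you chose a different directory), you can list and print the file from the command line:
hdfs dfs -ls /pyspark/wd/input
hdfs dfs -cat /pyspark/wd/input/words.txt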
2. Read the file from HDFS and implement the WordCount example (method chaining)
import os
from pyspark import SparkContext, SparkConf

# Point PySpark at the local Spark installation and Python interpreter
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("WordCount example: method chaining")

    # Run locally, using all available cores
    conf = SparkConf().setAppName("wordcount_01").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # Read the file, split each line into words, map every word to (word, 1),
    # sum the counts per word, then collect the result to the driver
    res = sc.textFile(name="hdfs://node1:8020/pyspark/wd/input/words.txt") \
        .flatMap(lambda line: line.split()) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda agg, curr: agg + curr) \
        .collect()
    print(res)

    sc.stop()
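Running this locally against the sample words.txt should print something like the following (the order of the tuples may vary, since collect() returns the partitions' results without any global ordering):
[('hadoop', 5), ('hive', 7), ('sqoop', 5), ('zookeeper', 2)]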
3. Read data from HDFS and sort the results
import os
from pyspark import SparkContext, SparkConf

# Point PySpark at the local Spark installation and Python interpreter
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("PySpark getting-started example: read data from HDFS and sort the results")

    conf = SparkConf().setAppName("wordcount_01").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    rdd_init = sc.textFile(name=path)

    # Split each line into words, map to (word, 1), then sum the counts per word
    rdd_flatMap = rdd_init.flatMap(lambda line: line.split())
    rdd_map = rdd_flatMap.map(lambda word: (word, 1))
    rdd_res = rdd_map.reduceByKey(lambda agg, curr: agg + curr)

    # Take the 5 words with the highest counts, ordered descending by count
    res = rdd_res.top(5, lambda res_tuple: res_tuple[1])
    print(res)

    sc.stop()
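top(5, ...) pulls only the five highest-count pairs straight to the driver. If you instead want the full result sorted (descending by count) while it is still an RDD, a minimal sketch using sortBy, assuming the same rdd_res as above, could look like this:

    # Sort all (word, count) pairs by count in descending order, then collect
    rdd_sorted = rdd_res.sortBy(lambda res_tuple: res_tuple[1], ascending=False)
    print(rdd_sorted.collect())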
4. Write the counting results out
import os
from pyspark import SparkContext, SparkConf

# Point PySpark at the local Spark installation and Python interpreter
os.environ['SPARK_HOME'] = '/export/server/spark'
os.environ['PYSPARK_PYTHON'] = '/root/anaconda3/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/root/anaconda3/bin/python3'

if __name__ == '__main__':
    print("PySpark getting-started example: read data from HDFS and write the result")

    conf = SparkConf().setAppName("wordcount_01").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    path = "hdfs://node1:8020/pyspark/wd/input/words.txt"
    rdd_init = sc.textFile(name=path)

    # Split each line into words, map to (word, 1), then sum the counts per word
    rdd_flatMap = rdd_init.flatMap(lambda line: line.split())
    rdd_map = rdd_flatMap.map(lambda word: (word, 1))
    rdd_res = rdd_map.reduceByKey(lambda agg, curr: agg + curr)

    # Save the result as text files under the given HDFS directory
    # (one part-* file per partition)
    rdd_res.saveAsTextFile(path="hdfs://node1:8020/pyspark/wd/output")

    sc.stop()
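Note that saveAsTextFile fails if the output directory already exists, so remove it before re-running the job. Once the job finishes, the result can be inspected from the command line, for example:
hdfs dfs -rm -r /pyspark/wd/output      # only needed when re-running the job
hdfs dfs -cat /pyspark/wd/output/part-*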