Raw data format

# -*- coding: utf-8 -*-
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.types import *
from pyspark.sql import Row

# Custom test function: scores below 0.1 map to 0.01, everything else to 100
def ceshi(score):
    if score < 0.1:
        return 0.01
    else:
        return 100

# Delete the output directory first, so saveAsTextFile does not fail if it already exists
def save_file(rdd, output_path):
    try:
        os.system('hdfs dfs -rm -r ' + output_path)
    except Exception as e:
        print(e)
    rdd.saveAsTextFile(output_path)

if __name__ == "__main__":
    conf = SparkConf().setMaster("yarn").setAppName("My App")
    sc = SparkContext(conf=conf)
    #spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()
    #spark = SparkSession.builder.config('spark.driver.memory', '2g').getOrCreate()
    spark = SparkSession.builder.enableHiveSupport().getOrCreate()
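A minimal sketch of how the session might be used from here, continuing inside the __main__ block: register ceshi as a UDF, score a table, and write the result out with save_file. The Hive table tmp.score_table, its score column, and the output path are placeholder names assumed for illustration, not from the original.

    # --- Hypothetical continuation: table, column, and path names are placeholders ---
    from pyspark.sql.functions import udf

    # Wrap ceshi as a Spark UDF; casting to float keeps the return value
    # consistent with the declared DoubleType (ceshi can return an int)
    ceshi_udf = udf(lambda s: float(ceshi(s)), DoubleType())

    df = spark.sql("SELECT * FROM tmp.score_table")               # placeholder Hive table
    scored = df.withColumn("score_adj", ceshi_udf(df["score"]))   # placeholder column name
    lines = scored.rdd.map(lambda row: '\t'.join(str(v) for v in row))
    save_file(lines, "/tmp/score_output")                         # placeholder HDFS path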
