pyspark入门学习demo
最近数据集太大,用pandas处理耗时太久,于是学习用pyspark处理数据。
pyspark创建Dataframe
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn
from pyspark.sql import Window
from pyspark.sql.functions import current_date
from pyspark.sql.functions import datediff
from pyspark.sql.functions import lit
from pyspark.sql.functions import col,when, max
# Build the Spark configuration step by step: the application is named
# "spark_1" and the master is set to "local[2]".
conf = SparkConf()
conf.setAppName("spark_1")
conf.setMaster("local[2]")

# Obtain the SparkSession for this script from the configuration above.
ss = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)
# 创建DataFrame
df1 = ss.createDataFrame([