Retail Sales Statistical Analysis with PySpark

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Create the SparkSession
spark = SparkSession.builder.appName("RetailAnalysis").getOrCreate()

# Assume the prod and sales tables are already registered as temporary views
# (in practice, load them from a data source first):
# spark.read.csv(...).createOrReplaceTempView("prod")
# spark.read.csv(...).createOrReplaceTempView("sales")
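# A minimal loading sketch, assuming two headered CSV files on HDFS. The file
# paths and the inferSchema option are illustrative assumptions, not from the
# original post:
# prod_raw = spark.read.csv("hdfs://localhost:9000/datas/prod.csv",
#                           header=True, inferSchema=True)
# prod_raw.createOrReplaceTempView("prod")
# sales_raw = spark.read.csv("hdfs://localhost:9000/datas/sales.csv",
#                            header=True, inferSchema=True)
# sales_raw.createOrReplaceTempView("sales")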

# ============== Analysis tasks ==============

# 1. Top 5 highest-priced products per sub-category (cataB)
df_top5_product = spark.sql("""
    SELECT *, 
           dense_rank() OVER (PARTITION BY cataB ORDER BY price DESC) AS rank 
    FROM prod
""").filter("rank <= 5")

# 2. Monthly sales statistics (quantity and amount)
df_month_sale = spark.sql("""
    SELECT od_month,
           SUM(od_quantity) AS total_quantity,
           SUM(od_amount) AS total_amount
    FROM sales
    GROUP BY od_month
    ORDER BY od_month
""")

# 3. Top 20 cities by order quantity
df_city_sale = spark.sql("""
    SELECT cust_city,
           SUM(od_quantity) AS total_quantity
    FROM sales
    GROUP BY cust_city
    ORDER BY total_quantity DESC
    LIMIT 20
""")

# 4. Demand analysis by category (the dataset is assumed to contain only
#    cosmetics products, so no category filter is applied)
df_best_seller = spark.sql("""
    SELECT p.cataA, p.cataB,
           SUM(s.od_quantity) AS total_quantity
    FROM sales s
    JOIN prod p ON s.prod_id = p.prod_id
    GROUP BY p.cataA, p.cataB
    ORDER BY p.cataA ASC, total_quantity DESC
""")

# 5. Ranking of provinces by cosmetics demand
df_province_sale = spark.sql("""
    SELECT cust_province,
           SUM(od_quantity) AS total_quantity
    FROM sales
    GROUP BY cust_province
    ORDER BY total_quantity DESC
""")

# 6. Customer RFM (Recency / Frequency / Monetary) analysis
df_rfm_base = spark.sql("""
    SELECT cust_id,
           MAX(od_date) AS od_latest,
           COUNT(order_id) AS total_count,
           SUM(od_amount) AS total_amount
    FROM sales
    GROUP BY cust_id
""").withColumn("cust_all", F.lit(1))  # constant helper column (unused below)

df_rfm_base.createOrReplaceTempView("customer")

df_rfm_score = spark.sql("""
    SELECT cust_id, od_latest, total_count, total_amount,
           percent_rank() OVER (ORDER BY od_latest) AS R,
           percent_rank() OVER (ORDER BY total_count) AS F,
           percent_rank() OVER (ORDER BY total_amount) AS M
    FROM customer
""")

df_customerRFM = df_rfm_score.withColumn(
    "score", 
    F.round(F.col("R")*20 + F.col("F")*30 + F.col("M")*50, 1)
).orderBy(F.desc("score"))
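# Since R, F and M each lie in [0, 1] and the weights sum to 100, the combined
# score falls in [0, 100]. An optional illustration (not from the original):
# bucket the score into coarse tiers; the thresholds are arbitrary.
df_customer_tiers = df_customerRFM.withColumn(
    "tier",
    F.when(F.col("score") >= 80, "high value")
     .when(F.col("score") >= 50, "mid value")
     .otherwise("low value")
)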

# ============== Exporting results ==============
# Map internal column names to the Chinese display headers used in the exports
colmap = {
    'prod_id': '商品编号', 'product': '商品名称',
    'cataB': '商品小类', 'cataA': '商品大类',
    'price': '销售单价', 'order_id': '订单编码',
    'od_date': '订单日期', 'cust_id': '客户编码',
    'cust_region': '所在区域', 'cust_province': '所在省份',
    'cust_city': '所在地市', 'od_quantity': '订购数量',
    'od_price': '订购单价', 'od_amount': '金额',
    'total_quantity': '订购数量', 'od_month': '订单月份',
    'total_amount': '金额', 'od_latest': '最近一次购买时间',
    'total_count': '消费频率', 'score': '综合分数'
}

# Unified CSV export helper
def export_csv(df, path, rename_map=None):
    if rename_map:
        for old_name, new_name in rename_map.items():
            df = df.withColumnRenamed(old_name, new_name)
    df.coalesce(1).write.csv(
        f"hdfs://localhost:9000/datas/{path}",
        header=True,
        sep=",",
        mode="overwrite"
    )
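# Note: Spark writes each result as a directory of part files; coalesce(1)
# merges the output into a single part file inside that directory, which is
# convenient for small result sets but forces a single writer task.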

export_csv(df_top5_product.drop("rank"), "result.top5_product", {
    "prod_id": colmap["prod_id"],
    "product": colmap["product"],
    "cataB": colmap["cataB"],
    "cataA": colmap["cataA"],
    "price": colmap["price"]
})

export_csv(df_month_sale, "result.month_sale", {
    "od_month": colmap["od_month"],
    "total_quantity": colmap["total_quantity"],
    "total_amount": colmap["total_amount"]
})

# ... export the remaining result sets (df_city_sale, df_best_seller,
#     df_province_sale, df_customerRFM) in the same way ...

# ============== Sanity checks ==============
print("==== 商品TOP5样例 ====")
df_top5_product.show(5)

print("==== 月度销售统计样例 ====")
df_month_sale.show(5)

# Stop the Spark session
spark.stop()
