Spark in Practice: An End-to-End Workflow for Structured Data
The following complete case study shows how to use Spark to process structured and semi-structured data, covering complex queries, filtering, aggregation, and joins, as well as reading and writing multiple data formats.
Scenario: E-commerce Sales Analysis
Datasets
- Order data (JSON format)
  {"order_id": 1001, "customer_id": 201, "order_date": "2023-01-15", "total_amount": 150.50}
  {"order_id": 1002, "customer_id": 202, "order_date": "2023-01-16", "total_amount": 89.99}
- Customer data (CSV format)
  customer_id,name,location,join_date
  201,"Alice","New York","2020-05-10"
  202,"Bob","Los Angeles","2021-02-15"
  203,"Charlie","Chicago","2022-11-30"
- Product data (Parquet format)
  - Columnar storage containing product ID, name, category, and price (see the schema sketch below)
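For reference, here is a minimal sketch of explicit schemas matching the sample records above. The column names and types for the product table are assumptions (the Parquet file is only described, not shown); adjust them to your actual data.

from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType

# Orders: field names taken from the JSON sample; order_date is parsed to a date later
orders_schema = StructType([
    StructField("order_id", LongType()),
    StructField("customer_id", LongType()),
    StructField("order_date", StringType()),
    StructField("total_amount", DoubleType()),
])

# Customers: field names taken from the CSV header
customers_schema = StructType([
    StructField("customer_id", LongType()),
    StructField("name", StringType()),
    StructField("location", StringType()),
    StructField("join_date", StringType()),
])

# Products: assumed column names; "id" is renamed to "product_id" during preprocessing
products_schema = StructType([
    StructField("id", LongType()),
    StructField("product_name", StringType()),
    StructField("category", StringType()),
    StructField("price", DoubleType()),
])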
Complete Implementation (Python)
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Initialize the SparkSession (with Hive support enabled)
spark = SparkSession.builder \
.appName("EcommerceAnalysis") \
.config("spark.sql.parquet.compression.codec", "snappy") \
.enableHiveSupport() \
.getOrCreate()
# 1. Load data from multiple source formats
def load_data():
"""从不同格式加载数据集"""
# JSON 订单数据(半结构化)
orders_df = spark.read.json("hdfs:///data/ecommerce/orders/", multiLine=True)
# CSV 客户数据(带 Schema 推断)
customers_df = spark.read.csv(
"hdfs:///data/ecommerce/customers.csv",
header=True,
inferSchema=True
)
    # Parquet product data (columnar storage)
products_df = spark.read.parquet("hdfs:///data/ecommerce/products/")
return orders_df, customers_df, products_df
# 2. Data preprocessing
def preprocess_data(orders_df, customers_df, products_df):
"""数据清洗和转换"""
# 订单数据预处理
orders_clean = orders_df.filter(col("total_amount") > 0) \
.withColumn("order_date", to_date(col("order_date"))) \
.withColumn("order_month", date_format(col("order_date"), "yyyy-MM"))
    # Customers: derive the join year and drop rows with a missing name
customers_clean = customers_df \
.withColumn("join_year", year(col("join_date"))) \
.dropna(subset=["name"])
    # Products: drop non-positive prices and normalize the ID column name
products_clean = products_df.filter(col("price") > 0) \
.withColumnRenamed("id", "product_id")
return orders_clean, customers_clean, products_clean
# 3. Complex queries: joins
def create_enriched_orders(orders, customers, products):
"""创建增强订单视图"""
# 订单-客户连接(左外连接)
orders_with_customers = orders.join(
customers,
orders.customer_id == customers.customer_id,
"left"
)
    # Orders-products join: a real pipeline would go through an order-items table;
    # here we attach a random product_id purely for demonstration
from pyspark.sql.functions import rand
orders_with_products = orders_with_customers \
.withColumn("product_id", (rand() * 100).cast("int"))
    # Final join against the product dimension
enriched_orders = orders_with_products.join(
products,
"product_id",
"left"
).select(
        "order_id",
        "order_date",
        "order_month",
        "customer_id",
        "name",
        "location",
        "product_id",
        "product_name",
        "category",
        "price",
        "total_amount"
)
return enriched_orders
# 4. Aggregation and analysis
def perform_analytics(enriched_orders):
"""执行多维分析"""
# 按月份和类别的销售聚合
monthly_category_sales = enriched_orders.groupBy(
"order_month", "category"
).agg(
sum("total_amount").alias("total_sales"),
countDistinct("order_id").alias("order_count"),
avg("total_amount").alias("avg_order_value")
).orderBy("order_month", col("total_sales").desc())
    # Customer value analysis (RFM: recency, frequency, monetary)
    from pyspark.sql.window import Window
    today = current_date()
    rfm_analysis = enriched_orders.groupBy("customer_id", "name", "location") \
.agg(
max("order_date").alias("last_order_date"),
count("order_id").alias("frequency"),
sum("total_amount").alias("monetary")
    ).withColumn("recency", datediff(today, col("last_order_date"))) \
     .withColumn(
         "rfm_score",
         # Higher ntile = better: the most recent, most frequent, and
         # highest-spending customers land in bucket 5 on each dimension
         ntile(5).over(Window.orderBy(desc("recency"))) +
         ntile(5).over(Window.orderBy("frequency")) +
         ntile(5).over(Window.orderBy("monetary"))
     )
    # Regional sales analysis
region_sales = enriched_orders.groupBy(
"location", "category"
).pivot(
"order_month",
["2023-01", "2023-02", "2023-03"] # 实际应动态获取
).agg(
sum("total_amount").alias("sales")
).fillna(0)
return {
"monthly_category": monthly_category_sales,
"customer_rfm": rfm_analysis,
"region_sales": region_sales
}
# 5. Writing the results
def save_results(results):
"""将结果保存为不同格式"""
# 保存为 Parquet(分析基础)
results["monthly_category"].write.mode("overwrite") \
.parquet("hdfs:///output/ecommerce/monthly_sales.parquet")
    # ORC (integrates well with Hive)
results["customer_rfm"].write.mode("overwrite") \
.format("orc") \
.save("hdfs:///output/ecommerce/customer_rfm.orc")
    # JSON (for consumption by other systems)
results["region_sales"].write.mode("overwrite") \
.json("hdfs:///output/ecommerce/region_sales.json")
    # Hive table (assumes the `ecommerce` database already exists in the metastore)
results["customer_rfm"].write.mode("overwrite") \
.saveAsTable("ecommerce.customer_rfm")
    # CSV (for analysis in Excel)
results["region_sales"].write.mode("overwrite") \
.option("header", "true") \
.csv("hdfs:///output/ecommerce/region_sales.csv")
# Main driver
if __name__ == "__main__":
    # Load the data
orders, customers, products = load_data()
    # Preprocess
orders_clean, customers_clean, products_clean = preprocess_data(
orders, customers, products
)
    # Register temp views for SQL queries
orders_clean.createOrReplaceTempView("orders")
customers_clean.createOrReplaceTempView("customers")
products_clean.createOrReplaceTempView("products")
    # Run a SQL query (SQL and the DataFrame API can be mixed freely)
high_value_orders = spark.sql("""
SELECT o.*, c.name, c.location
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE o.total_amount > 1000
AND o.order_date >= '2023-01-01'
""")
    # Build the enriched order view
enriched_orders = create_enriched_orders(
orders_clean, customers_clean, products_clean
)
    # Run the analyses
analytics_results = perform_analytics(enriched_orders)
    # Persist the results
save_results(analytics_results)
    # Ad-hoc SQL against the Hive table written above
spark.sql("""
SELECT
location,
AVG(monetary) AS avg_spent,
PERCENTILE(monetary, 0.5) AS median_spent
FROM ecommerce.customer_rfm
GROUP BY location
ORDER BY avg_spent DESC
""").show()
    # Stop the Spark session
spark.stop()
Key Techniques Explained
1. Multi-format reads and writes
- JSON: semi-structured data; nested structure is preserved
- CSV: with a header row and schema inference
- Parquet: columnar storage with efficient compression
- ORC: Hive-optimized columnar format
- Hive: data warehouse integration (a combined round-trip sketch follows this list)
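As a quick reference, the sketch below round-trips a DataFrame through each of these formats. The temporary paths and table name are placeholders; the calls themselves are the standard DataFrameReader/DataFrameWriter APIs used throughout this article.

# Hypothetical round trip through the formats listed above
df = spark.read.json("hdfs:///data/ecommerce/orders/")                              # JSON (one object per line)
df.write.mode("overwrite").parquet("hdfs:///tmp/fmt_demo/parquet")                  # Parquet
df.write.mode("overwrite").format("orc").save("hdfs:///tmp/fmt_demo/orc")           # ORC
df.write.mode("overwrite").option("header", True).csv("hdfs:///tmp/fmt_demo/csv")   # CSV with header
df.write.mode("overwrite").saveAsTable("demo_db.orders_copy")                       # Hive table (database must exist)
df_back = spark.read.parquet("hdfs:///tmp/fmt_demo/parquet")                        # read one of them back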
2. Complex data operations
- Joins:
  # Left outer join
  orders.join(customers, "customer_id", "left")
  # Join on an arbitrary condition
  orders.join(products, orders.product_id == products.id)
- Aggregation:
  # Multi-dimensional aggregation
  df.groupBy("category", "month").agg(
      sum("sales").alias("total_sales"),
      avg("price").alias("avg_price")
  )
  # Window function (as used in the RFM analysis)
  Window.orderBy(desc("monetary"))
- Pivot tables:
df.groupBy("location").pivot("month").sum("sales")
3. Mixing the DataFrame API and SQL
- DataFrame DSL:
df.filter(col("amount") > 100).select("id", "name")
- SQL:
spark.sql("SELECT * FROM orders WHERE total_amount > 1000")
4. Performance optimization techniques
- Columnar storage: use Parquet or ORC
df.write.parquet("path", compression="snappy")
- Predicate pushdown: filters are pushed into the Parquet scan automatically
spark.read.parquet("path").filter("date > '2023-01-01'")
- Partitioned writes:
df.write.partitionBy("year", "month").parquet("path")
Advanced Practice Suggestions
1. Working with semi-structured data
# Parse nested JSON
from pyspark.sql.functions import from_json
json_schema = StructType([
StructField("user", StructType([
StructField("id", StringType()),
StructField("name", StringType())
])),
StructField("items", ArrayType(StructType([
StructField("product_id", StringType()),
StructField("quantity", IntegerType())
])))
])
# Read as raw text so each line lands in a single string column named "value"
df = spark.read.text("complex_data.json") \
.withColumn("parsed", from_json(col("value"), json_schema)) \
.select("parsed.user.*", "parsed.items")
2. Incremental data processing
# Use Structured Streaming
streaming_df = spark.readStream \
.format("kafka") \
.option("kafka.bootstrap.servers", "localhost:9092") \
.option("subscribe", "orders") \
.load()
# Schema of the incoming order events (matches the order dataset above)
order_schema = StructType([
    StructField("order_id", LongType()),
    StructField("customer_id", LongType()),
    StructField("order_date", StringType()),
    StructField("total_amount", DoubleType())
])
parsed_df = streaming_df.select(
from_json(col("value").cast("string"), order_schema).alias("data")
).select("data.*")
# Write incrementally as Parquet
query = parsed_df.writeStream \
.format("parquet") \
.option("path", "hdfs:///incremental/orders") \
.option("checkpointLocation", "/checkpoints") \
.trigger(processingTime="1 minute") \
.start()
3. Data quality checks
from pyspark.sql import functions as F
# Define data quality rules (each condition describes what a valid row looks like)
data_quality_rules = {
"total_amount_positive": F.col("total_amount") > 0,
"valid_customer": F.col("customer_id").isNotNull(),
"future_orders": F.col("order_date") <= F.current_date()
}
# Apply the rules (df is the DataFrame under test, e.g. orders_clean)
for rule_name, condition in data_quality_rules.items():
invalid_count = df.filter(~condition).count()
if invalid_count > 0:
print(f"数据质量问题: {rule_name}, 无效记录: {invalid_count}")
# 写入问题数据
df.filter(~condition).write.mode("append") \
.parquet(f"hdfs:///data_issues/{rule_name}/")
4. Cross-format performance comparison
import time
formats = ["parquet", "orc", "json", "csv"]
for fmt in formats:
start = time.time()
    # Write
df.write.format(fmt).save(f"hdfs:///perf_test/{fmt}_write")
    # Read back (count() forces a full scan)
test_df = spark.read.format(fmt).load(f"hdfs:///perf_test/{fmt}_write")
test_df.count()
duration = time.time() - start
print(f"格式 {fmt}: {duration:.2f} 秒")
Best Practices Summary
- Format selection:
  - Analytics: Parquet/ORC
  - Data exchange: JSON/CSV
  - Streaming workloads: Delta Lake/Iceberg
  - Data lake: Parquet + Hive Metastore (see the sketch after this list)
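For the data-lake pattern in the last bullet, here is a minimal sketch of landing partitioned Parquet data as a table in the Hive Metastore. The table name is a placeholder and the target database is assumed to exist already.

# Hypothetical example: partitioned Parquet table registered in the Hive Metastore
(enriched_orders.write
    .mode("overwrite")
    .format("parquet")
    .partitionBy("order_month")                  # partition by time, as recommended below
    .saveAsTable("ecommerce.enriched_orders"))   # assumes the ecommerce database exists

# Downstream jobs and SQL engines can then query the table by name
spark.sql("""
    SELECT order_month, SUM(total_amount) AS sales
    FROM ecommerce.enriched_orders
    GROUP BY order_month
""").show()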
- Keys to performance:
  - Partitioning strategy: partition by time
  - Compression codec: Snappy (speed) or zlib (compression ratio)
  - Columnar storage: read only the columns you need
  - Caching: call cache() on datasets that are reused (see the sketch after this list)
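A short sketch combining these points; the paths are placeholders.

# Read only the needed columns; Parquet prunes the rest (column pruning)
orders_slim = spark.read.parquet("hdfs:///data/ecommerce/orders_parquet/") \
    .select("order_id", "order_month", "total_amount")

# Cache a DataFrame that several analyses will reuse
orders_slim.cache()
orders_slim.count()  # materialize the cache

# Write partitioned by time, compressed with Snappy
orders_slim.write \
    .mode("overwrite") \
    .option("compression", "snappy") \
    .partitionBy("order_month") \
    .parquet("hdfs:///output/ecommerce/orders_by_month/")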
- Code maintainability:
  - Mix the DataFrame DSL and SQL where each reads best
  - Encapsulate transformation logic in functions
  - Manage schemas in one place (see the sketch after this list)
  - Add data quality checkpoints
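One way to keep schemas in a single place is a small shared module of named StructTypes that every job imports; the module and variable names here are illustrative.

# schemas.py (hypothetical shared module)
from pyspark.sql.types import StructType, StructField, LongType, StringType, DoubleType

ORDERS_SCHEMA = StructType([
    StructField("order_id", LongType()),
    StructField("customer_id", LongType()),
    StructField("order_date", StringType()),
    StructField("total_amount", DoubleType()),
])

# In a pipeline job: read with the shared schema instead of inferring it
orders_df = spark.read.schema(ORDERS_SCHEMA).json("hdfs:///data/ecommerce/orders/")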
- Production settings:
  # Enable dynamic partitioning for Hive writes
  spark.conf.set("hive.exec.dynamic.partition", "true")
  spark.conf.set("hive.exec.dynamic.partition.mode", "nonstrict")
  # Tune the number of shuffle partitions
  spark.conf.set("spark.sql.shuffle.partitions", "200")
  # Enable the vectorized ORC reader
  spark.conf.set("spark.sql.orc.enableVectorizedReader", "true")
Working through this case study gives you the core skills for processing structured and semi-structured data with Spark, including complex queries, multi-format reads and writes, and performance tuning, and lays a solid foundation for building efficient data pipelines.