精彩专栏推荐订阅:在下方主页👇🏻👇🏻👇🏻👇🏻
💖🔥作者主页:计算机毕设木哥🔥 💖
一、项目介绍
随着我国汽车保有量持续增长,二手车交易市场规模不断扩大,据中国汽车流通协会数据显示,2022年全国二手车交易量达1773.88万辆,交易额为1.37万亿元,同比增长约13.3%。在这个快速发展的市场中,消费者面临信息不对称、定价不透明等问题,而懂车帝作为国内知名的汽车资讯平台,其二手车数据包含了大量有价值的市场信息。传统的数据分析方法已难以应对如此庞大且复杂的数据集,大数据技术的应用成为必然趋势。基于Spark的懂车帝二手车数据分析系统正是在这一背景下应运而生,该系统利用Hadoop和Spark的分布式计算能力,结合Python、Django、Vue和Echarts等技术,旨在从海量二手车交易数据中提取有价值的市场洞察,为相关决策提供数据支持。
本系统的开发具有重要的实际意义和技术价值。实际应用层面,该系统能够帮助消费者了解二手车市场行情,识别影响车辆价格的关键因素,做出更明智的购买决策;对于二手车平台和经销商,系统提供的市场分析可优化定价策略和库存管理,提高运营效率。技术层面看,系统整合了Hadoop、Spark等大数据处理框架与Python数据分析技术,实现了从数据采集、存储、处理到可视化的完整流程,展示了大数据技术在垂直行业中的应用价值。学术层面,系统通过K-Means聚类算法对二手车市场进行细分,揭示了不同车辆群体的特征,为二手车市场研究提供了新的分析视角和方法论,推动了大数据技术与传统行业的深度融合。
二、视频展示
计算机大数据毕设选题推荐-基于大数据的懂车帝二手车数据分析系统
三、开发环境
- 大数据技术:Hadoop、Spark、Hive
- 开发技术:Python、Django框架、Vue、Echarts
- 软件工具:Pycharm、DataGrip、Anaconda
- 可视化工具:Echarts
四、系统展示
登录模块:
可视化模块展示:
五、代码展示
# Core feature 1: used-car value drivers — relationship between car age and price
def analyze_car_age_price_relationship(spark_session):
    """Analyze how resale price and value retention vary with car age.

    Reads the pre-processed used-car dataset from HDFS, aggregates per-age
    price statistics with Spark SQL, fits an exponential-decay depreciation
    model to the average retention rate, persists the per-age rows to the
    Django ``CarAgeAnalysis`` model, and returns data plus model parameters.

    Args:
        spark_session: An active ``pyspark.sql.SparkSession``.

    Returns:
        dict: ``{'data': [...], 'model_params': {...}}`` on success, or
        ``{'error': str, 'data': [...]}`` if curve fitting fails.
    """
    import numpy as np
    from scipy.optimize import curve_fit

    # Load the cleaned dataset from HDFS and expose it to Spark SQL.
    car_data = spark_session.read.parquet(
        "hdfs://localhost:9000/user/hadoop/car_data_processed.parquet"
    )
    car_data.createOrReplaceTempView("car_data")

    # Per-age price statistics; retention rate = resale / official price (%).
    age_price_analysis = spark_session.sql("""
        SELECT
            car_age,
            ROUND(AVG(sh_price), 2) as avg_price,
            ROUND(MIN(sh_price), 2) as min_price,
            ROUND(MAX(sh_price), 2) as max_price,
            ROUND(STDDEV(sh_price), 2) as price_stddev,
            COUNT(*) as car_count,
            ROUND(AVG(sh_price) / AVG(official_price) * 100, 2) as avg_retention_rate
        FROM car_data
        WHERE car_age BETWEEN 0 AND 15
            AND sh_price > 0
            AND official_price > 0
        GROUP BY car_age
        ORDER BY car_age
    """)
    # Small aggregate (<= 16 rows), safe to collect to the driver.
    age_price_df = age_price_analysis.toPandas()

    def depreciation_model(x, a, b, c):
        # Exponential decay: value(%) = a * exp(-b * age) + c, where c is
        # the asymptotic floor value of an old car.
        return a * np.exp(-b * x) + c

    try:
        params, _ = curve_fit(
            depreciation_model,
            age_price_df['car_age'],
            age_price_df['avg_retention_rate'],
            p0=[100, 0.2, 20],                       # initial parameter guess
            bounds=([50, 0.1, 10], [150, 0.5, 40]),  # parameter bounds
        )
        # Goodness of fit (R squared). Guard against ss_tot == 0 (constant
        # retention rate across ages) to avoid a ZeroDivisionError.
        age_price_df['predicted_rate'] = depreciation_model(
            age_price_df['car_age'], *params
        )
        residuals = age_price_df['avg_retention_rate'] - age_price_df['predicted_rate']
        ss_res = np.sum(residuals ** 2)
        ss_tot = np.sum(
            (age_price_df['avg_retention_rate']
             - np.mean(age_price_df['avg_retention_rate'])) ** 2
        )
        r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0

        # Persist all rows in one query instead of one INSERT per row.
        from car_analysis.models import CarAgeAnalysis
        CarAgeAnalysis.objects.bulk_create([
            CarAgeAnalysis(
                car_age=row['car_age'],
                avg_price=row['avg_price'],
                min_price=row['min_price'],
                max_price=row['max_price'],
                price_stddev=row['price_stddev'],
                car_count=row['car_count'],
                retention_rate=row['avg_retention_rate'],
                predicted_rate=row['predicted_rate'],
            )
            for _, row in age_price_df.iterrows()
        ])

        # Return both the per-age data and the fitted model parameters.
        return {
            'data': age_price_df.to_dict('records'),
            'model_params': {
                'a': params[0],
                'b': params[1],
                'c': params[2],
                'r_squared': r_squared,
                'formula': f"Value(%) = {params[0]:.2f} * exp(-{params[1]:.4f} * age) + {params[2]:.2f}"
            }
        }
    except Exception as e:
        # Best effort: still return the raw aggregates when fitting fails.
        return {'error': str(e), 'data': age_price_df.to_dict('records')}
# Core feature 2: brand competitiveness — value-retention ranking of major brands
def analyze_brand_value_retention(spark_session):
    """Rank mainstream brands by value retention and profile them by age band.

    Computes per-brand retention rates (resale / official price), an
    age-adjusted retention rate, classifies brands against the industry
    average, breaks retention down by car-age bracket, persists the per-brand
    rows to the Django ``BrandRetentionAnalysis`` model, and returns all
    results.

    Args:
        spark_session: An active ``pyspark.sql.SparkSession``.

    Returns:
        dict with keys ``overall_analysis``, ``industry_averages`` and
        ``retention_by_age``.
    """
    # Load the cleaned dataset from HDFS and expose it to Spark SQL.
    car_data = spark_session.read.parquet(
        "hdfs://localhost:9000/user/hadoop/car_data_processed.parquet"
    )
    car_data.createOrReplaceTempView("car_data")

    # Per-brand retention; the adjusted rate normalizes out fleet age using
    # an assumed 20%-per-year depreciation baseline (POWER(0.8, avg_car_age)).
    # Brands with fewer than 50 listings are excluded as too noisy.
    brand_retention_analysis = spark_session.sql("""
        WITH brand_stats AS (
            SELECT
                brand_name,
                COUNT(*) as car_count,
                AVG(sh_price) as avg_sh_price,
                AVG(official_price) as avg_official_price,
                AVG(car_age) as avg_car_age,
                AVG(car_mileage) as avg_mileage
            FROM car_data
            WHERE brand_name IS NOT NULL
                AND sh_price > 0
                AND official_price > 0
                AND car_age BETWEEN 0 AND 10
            GROUP BY brand_name
            HAVING COUNT(*) >= 50
        )
        SELECT
            brand_name,
            car_count,
            ROUND(avg_sh_price, 2) as avg_sh_price,
            ROUND(avg_official_price, 2) as avg_official_price,
            ROUND(avg_car_age, 1) as avg_car_age,
            ROUND(avg_mileage, 1) as avg_mileage,
            ROUND((avg_sh_price / avg_official_price) * 100, 2) as retention_rate,
            ROUND((avg_sh_price / avg_official_price) * 100 / (POWER(0.8, avg_car_age)), 2) as adjusted_retention_rate
        FROM brand_stats
        ORDER BY adjusted_retention_rate DESC
    """)
    brand_df = brand_retention_analysis.toPandas()

    # Industry benchmark: unweighted mean across qualifying brands.
    industry_avg_retention = brand_df['retention_rate'].mean()
    industry_avg_adjusted = brand_df['adjusted_retention_rate'].mean()

    # Classify each brand vs. the benchmark: >110% high, <90% low, else medium.
    brand_df['retention_category'] = brand_df['adjusted_retention_rate'].apply(
        lambda x: 'high' if x > industry_avg_adjusted * 1.1 else
        ('low' if x < industry_avg_adjusted * 0.9 else 'medium')
    )
    # Percentage deviation of each brand from the industry average.
    brand_df['retention_diff_pct'] = (
        (brand_df['retention_rate'] - industry_avg_retention)
        / industry_avg_retention
    ) * 100

    # Retention by car-age bracket. Brackets are half-open [min, max) so a
    # car at a boundary age (1, 3, 5, 8) is counted in exactly one bracket;
    # the last bracket is closed at the top to keep age 10 covered.
    # (min_age/max_age are integer literals, so the f-string SQL is safe.)
    age_brackets = [(0, 1), (1, 3), (3, 5), (5, 8), (8, 10)]
    retention_by_age = {}
    last_upper = age_brackets[-1][1]
    for min_age, max_age in age_brackets:
        upper_op = "<=" if max_age == last_upper else "<"
        age_analysis = spark_session.sql(f"""
            SELECT
                brand_name,
                COUNT(*) as car_count,
                ROUND(AVG(sh_price / official_price) * 100, 2) as retention_rate
            FROM car_data
            WHERE brand_name IS NOT NULL
                AND sh_price > 0
                AND official_price > 0
                AND car_age >= {min_age} AND car_age {upper_op} {max_age}
            GROUP BY brand_name
            HAVING COUNT(*) >= 20
            ORDER BY retention_rate DESC
        """)
        retention_by_age[f"{min_age}-{max_age}"] = (
            age_analysis.toPandas().to_dict('records')
        )

    # Persist all brand rows in one query instead of one INSERT per row.
    from car_analysis.models import BrandRetentionAnalysis
    BrandRetentionAnalysis.objects.bulk_create([
        BrandRetentionAnalysis(
            brand_name=row['brand_name'],
            car_count=row['car_count'],
            avg_sh_price=row['avg_sh_price'],
            avg_official_price=row['avg_official_price'],
            avg_car_age=row['avg_car_age'],
            avg_mileage=row['avg_mileage'],
            retention_rate=row['retention_rate'],
            adjusted_retention_rate=row['adjusted_retention_rate'],
            retention_category=row['retention_category'],
            retention_diff_pct=row['retention_diff_pct'],
        )
        for _, row in brand_df.iterrows()
    ])

    # Return the complete analysis payload.
    return {
        'overall_analysis': brand_df.to_dict('records'),
        'industry_averages': {
            'retention_rate': industry_avg_retention,
            'adjusted_retention_rate': industry_avg_adjusted
        },
        'retention_by_age': retention_by_age
    }
# Core feature 3: market-supply profiling — K-Means clustering of listings
def perform_car_clustering_analysis(spark_session):
    """Segment the used-car supply with K-Means and profile each cluster.

    Selects age/mileage/price features, standardizes them, picks the best k
    in [2, 10] by silhouette score, labels each cluster with a descriptive
    Chinese tag, extracts the top-5 brands per cluster, persists cluster
    summaries to the Django ``CarClusterAnalysis`` model, and returns all
    results.

    Args:
        spark_session: An active ``pyspark.sql.SparkSession``.

    Returns:
        dict with keys ``silhouette_scores``, ``best_k``,
        ``cluster_centers``, ``cluster_analysis`` and ``brand_distribution``.
    """
    from pyspark.ml.feature import VectorAssembler, StandardScaler
    from pyspark.ml.clustering import KMeans
    from pyspark.ml.evaluation import ClusteringEvaluator

    # Load the cleaned dataset from HDFS.
    car_data = spark_session.read.parquet(
        "hdfs://localhost:9000/user/hadoop/car_data_processed.parquet"
    )

    # Clustering features; prices and mileage appear to be in units of
    # 万 (10k RMB / 10k km) given the thresholds below — TODO confirm.
    feature_cols = ['car_age', 'car_mileage', 'sh_price', 'official_price']

    # Drop nulls and implausible records (age > 15 years, mileage > 300k km).
    car_data_filtered = car_data.filter(
        (car_data.car_age.isNotNull()) &
        (car_data.car_mileage.isNotNull()) &
        (car_data.sh_price > 0) &
        (car_data.official_price > 0) &
        (car_data.car_age <= 15) &
        (car_data.car_mileage <= 30)  # 300,000 km cap
    )

    # Assemble a feature vector, then standardize (zero mean, unit variance)
    # so no single feature dominates the Euclidean distance.
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
    car_data_features = assembler.transform(car_data_filtered)
    scaler = StandardScaler(
        inputCol="features", outputCol="scaled_features",
        withStd=True, withMean=True
    )
    scaler_model = scaler.fit(car_data_features)
    car_data_scaled = scaler_model.transform(car_data_features)

    # Choose k by silhouette score (higher is better), trying k = 2..10.
    evaluator = ClusteringEvaluator(
        predictionCol="prediction",
        featuresCol="scaled_features",
        metricName="silhouette",
    )
    silhouette_scores = []
    for k in range(2, 11):
        candidate = KMeans(k=k, seed=42, featuresCol="scaled_features")
        candidate_model = candidate.fit(car_data_scaled)
        score = evaluator.evaluate(candidate_model.transform(car_data_scaled))
        silhouette_scores.append((k, score))
    best_k = max(silhouette_scores, key=lambda item: item[1])[0]

    # Final clustering with the best k (fixed seed for reproducibility).
    kmeans = KMeans(k=best_k, seed=42, featuresCol="scaled_features")
    model = kmeans.fit(car_data_scaled)
    cluster_centers = model.clusterCenters()
    predictions = model.transform(car_data_scaled)

    # Per-cluster averages and counts. Only numeric columns are aggregated,
    # so nothing else needs to be selected here.
    cluster_analysis = predictions.select(
        "prediction", "car_age", "car_mileage", "sh_price", "official_price"
    ).groupBy("prediction").agg(
        {"car_age": "avg", "car_mileage": "avg",
         "sh_price": "avg", "official_price": "avg", "*": "count"}
    ).orderBy("prediction")
    cluster_df = cluster_analysis.toPandas()

    # Descriptive label per cluster, from its average age / price / mileage.
    cluster_labels = []
    for _, row in cluster_df.iterrows():
        avg_age = row['avg(car_age)']
        avg_price = row['avg(sh_price)']
        avg_mileage = row['avg(car_mileage)']
        if avg_age < 3 and avg_price > 20:
            label = "高端准新车"
        elif avg_age < 3 and avg_price <= 20:
            label = "经济型准新车"
        elif 3 <= avg_age < 6 and avg_price > 15:
            label = "中高端中年车"
        elif 3 <= avg_age < 6 and avg_price <= 15:
            label = "经济型中年车"
        elif avg_age >= 6 and avg_mileage > 10:
            label = "高里程老车"
        else:
            label = "低里程老车"
        cluster_labels.append(label)
    cluster_df['cluster_label'] = cluster_labels

    # Map cluster id -> label explicitly instead of relying on row order
    # matching the prediction index.
    label_by_cluster = dict(zip(cluster_df['prediction'], cluster_df['cluster_label']))

    # Top-5 brands within each cluster.
    brand_distribution = []
    for cluster_id in range(best_k):
        top_brands = predictions.filter(predictions.prediction == cluster_id) \
            .groupBy("brand_name") \
            .count() \
            .orderBy("count", ascending=False) \
            .limit(5) \
            .toPandas()
        brand_distribution.append({
            'cluster': cluster_id,
            'label': label_by_cluster.get(cluster_id, ""),
            'top_brands': top_brands.to_dict('records')
        })

    # Persist cluster summaries in one query instead of one INSERT per row.
    from car_analysis.models import CarClusterAnalysis
    CarClusterAnalysis.objects.bulk_create([
        CarClusterAnalysis(
            cluster_id=row['prediction'],
            cluster_label=row['cluster_label'],
            car_count=row['count(1)'],  # Spark's column name for count(*)
            avg_car_age=row['avg(car_age)'],
            avg_mileage=row['avg(car_mileage)'],
            avg_sh_price=row['avg(sh_price)'],
            avg_official_price=row['avg(official_price)'],
        )
        for _, row in cluster_df.iterrows()
    ])

    # Return the complete clustering payload.
    return {
        'silhouette_scores': silhouette_scores,
        'best_k': best_k,
        'cluster_centers': [center.tolist() for center in cluster_centers],
        'cluster_analysis': cluster_df.to_dict('records'),
        'brand_distribution': brand_distribution
    }
六、项目文档展示
七、项目总结
本文设计并实现了基于Spark的懂车帝二手车数据分析系统,该系统充分利用Hadoop和Spark的分布式计算能力,结合Python、Django、Vue和Echarts等技术,构建了一个完整的二手车市场数据分析平台。系统从四个核心维度展开分析:二手车市场宏观特征分析、价值核心影响因素分析、主流汽车品牌市场竞争力分析以及市场供给画像与聚类分析。通过对车龄分布、里程分布、价格关系等多维度数据的深入挖掘,系统成功构建了车辆折旧曲线模型,揭示了不同品牌的保值率差异,并利用K-Means聚类算法实现了对二手车市场的精细化分类。实验结果表明,该系统能够有效识别影响二手车价格的关键因素,为消费者提供科学的购车决策支持,同时为二手车平台和经销商的定价策略与库存管理提供数据依据。本系统的开发不仅展示了大数据技术在垂直行业中的应用价值,也为二手车市场研究提供了新的分析视角和方法论,推动了大数据技术与传统汽车行业的深度融合,具有重要的实际应用价值和技术创新意义。
大家可以帮忙点赞、收藏、关注、评论啦 👇🏻
💖🔥作者主页:计算机毕设木哥🔥 💖