文章目录
一、算法和背景介绍
关于XGBoost的算法原理,已经进行了介绍与总结,相关内容可参考【机器学习(一)】分类和回归任务-XGBoost算法-Sentosa_DSML社区版一文。本文以预测二手车的交易价格为目标,通过Python代码和Sentosa_DSML社区版分别实现构建XGBoost回归预测模型,并对模型进行评估,包括评估指标的选择与分析。最后得出实验结论,确保模型在二手汽车价格回归预测中的有效性和准确性。
数据集介绍
以预测二手车的交易价格为任务,数据来自某交易平台的二手车交易记录,总数据量超过40万(400,000)条,包含31列变量信息,其中15列为匿名变量。数据集概况介绍:
二、Python代码和Sentosa_DSML社区版算法实现对比
(一) 数据读入与统计分析
1、python代码实现
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from matplotlib import rcParams
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
数据读入
# --- Data loading ---
# Input CSV (used-car price records) and the directory for generated outputs.
file_path = r'.\二手汽车价格.csv'
output_dir = r'.\xgb'

# Fail fast with a clear message when the input file is missing.
# NOTE: the original split this f-string across a line break inside the
# replacement field, which is a SyntaxError on Python < 3.12; joined here.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"文件未找到: {file_path}")

# exist_ok=True replaces the check-then-create pair and avoids the race
# between the existence test and the directory creation.
os.makedirs(output_dir, exist_ok=True)

df = pd.read_csv(file_path)
# Quick sanity checks: per-column null counts and the first rows.
print(df.isnull().sum())
print(df.head())
>> SaleID name regDate model ... v_11 v_12 v_13 v_14
0 0 736 20040402 30.0 ... 2.804097 -2.420821 0.795292 0.914763
1 1 2262 20030301 40.0 ... 2.096338 -1.030483 -1.722674 0.245522
2 2 14874 20040403 115.0 ... 1.803559 1.565330 -0.832687 -0.229963
3 3 71865 19960908 109.0 ... 1.285940 -0.501868 -2.438353 -0.478699
4 4 111080 20120103 110.0 ... 0.910783 0.931110 2.834518 1.923482
统计分析
# Configure a sans-serif CJK font so Chinese labels render correctly in plots.
rcParams.update({
    'font.family': 'sans-serif',
    'font.sans-serif': ['SimHei'],
})

# Column layout for the descriptive-statistics table; one row per dataframe
# column will be appended in the analysis loop below.
_STAT_COLUMNS = [
    '列名', '数据类型', '最大值', '最小值', '平均值', '非空值数量', '空值数量',
    '众数', 'True数量', 'False数量', '标准差', '方差', '中位数', '峰度', '偏度',
    '极值数量', '异常值数量',
]
stats_df = pd.DataFrame(columns=_STAT_COLUMNS)
def detect_extremes_and_outliers(column, extreme_factor=3, outlier_factor=5):
    """Count IQR-based extreme values and outliers in a numeric Series.

    A value is counted when it falls outside [Q1 - k*IQR, Q3 + k*IQR],
    with k = ``extreme_factor`` for "extremes" and k = ``outlier_factor``
    for "outliers".

    Returns:
        (extreme_count, outlier_count) as ints, or (None, None) when the
        column is not numeric.
    """
    if not np.issubdtype(column.dtype, np.number):
        return None, None

    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile

    def _count_outside(factor):
        # Values strictly below/above the fenced interval for this factor.
        low = first_quartile - factor * spread
        high = third_quartile + factor * spread
        return int(((column < low) | (column > high)).sum())

    return _count_outside(extreme_factor), _count_outside(outlier_factor)
for col in df.columns:
col_data = df[col]
dtype = col_data.dtype
if np.issubdtype(dtype, np.number):
max_value = col_data.max()
min_value = col_data.min()
mean_value = col_data.mean()
std_value = col_data.std()
var_value = col_data.var()
median_value = col_data.median()
kurtosis_value = col_data.kurt()
skew_value = col_data.skew()
extreme_count, outlier_count = detect_extremes_and_outliers(col_data)
else:
max_value = min_value = mean_value = std_value = var_value = median_value = kurtosis_value = skew_value = None
extreme_count = outlier_count = None
non_null_count = col_data.count()
null_count = col_data.isna().sum()
mode_value = col_data.mode().iloc[0] if not col_data.mode().empty else None
true_count = col_data[col_data == True].count() if dtype == 'bool' else None
false_count = col_data[col_data == False].count() if dtype == 'bool' else None
new_row = pd.DataFrame({
'列名': [col],
'数据类型': [dtype],
'最大值': [max_value],
'最小值': [min_value],
'平均值': [mean_value],
'非空值数量': [non_null_count],
'空值数量': [null_count],
'众数'