import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from diffprivlib.mechanisms import Laplace
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVR
import joblib
# 加载数据
df_a = pd.read_csv('A_cars.csv')
df_b = pd.read_csv('B_cars.csv')
df = pd.concat([df_a, df_b], ignore_index=True)
# 数据清洗
# 1. 清洗 mileage 列
df['milage'] = df['milage'].str.replace(',', '').str.extract(r'(\d+)').astype(float)
# 2. 生成 age 属性(防止除0错误)
current_year = 2023
df['age'] = current_year - df['model_year']
df.loc[df['age'] == 0, 'age'] = 1 # 将age=0替换为1,避免除0错误
# 3. 生成 mileage_per_year 属性(处理无穷大)
df['mileage_per_year'] = df['milage'] / df['age']
df.replace([np.inf, -np.inf], np.nan, inplace=True) # 替换无穷大为NaN
# 4. 生成其他属性
df['is_accident_free'] = df['accident'].apply(lambda x: 'None reported' in str(x))
df['is_clean_title'] = df['clean_title'].apply(lambda x: bool(x))
df['engine_power'] = df['engine'].str.extract(r'(\d+\.?\d*)\s*HP').astype(float)
df.drop(columns=['accident', 'clean_title'], inplace=True)
# 5. 处理缺失值(改进的鲁棒填充)
def robust_fillna(series):
# 移除异常值(使用IQR方法)
q1 = series.quantile(0.25)
q3 = series.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# 创建过滤条件
valid_mask = (series >= lower_bound) & (series <= upper_bound)
# 确保有有效值可用
if valid_mask.any():
return series.fillna(series[valid_mask].mean())
return series.fillna(0) # 没有有效值时填充0
numeric_columns = ['milage', 'age', 'mileage_per_year', 'engine_power']
for col in numeric_columns:
df[col] = robust_fillna(df[col])
# 6. 应用差分隐私(正确实现)
epsilon = 1.0 # 隐私预算
for col in numeric_columns:
# 计算敏感度(使用全局范围)
sensitivity = df[col].max() - df[col].min()
laplace = Laplace(epsilon=epsilon, sensitivity=sensitivity)
# 为每个值添加噪声
df[col] = df[col].apply(lambda x: laplace.randomise(x))
# 7. 对分类变量进行独热编码
categorical_columns = ['brand', 'model']
df = pd.get_dummies(df, columns=categorical_columns)
# 8. 标准化数值特征(添加检查)
# 确保没有NaN或无穷大
assert not df[numeric_columns].isnull().values.any(), "存在NaN值"
assert not np.isinf(df[numeric_columns].values).any(), "存在无穷大值"
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])
# 9. 模型训练与评估
X = df.drop(columns=['price'])
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = SVR(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train, y_train)
# 交叉验证
scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validation MSE scores: {-scores}")
# 保存结果
joblib.dump(model, 'car_price_model.pkl')
df.to_csv('cleaned_cars_private.csv', index=False)
运行后:
Traceback (most recent call last):
File "D:\peixunruanjian0106\python\pythonprojects\cd03\实训开始\二手车(第二周)\data_cleaning.py", line 84, in <module>
model.fit(X_train, y_train)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\sklearn\base.py", line 1363, in wrapper
return fit_method(estimator, *args, **kwargs)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\sklearn\svm\_base.py", line 197, in fit
X, y = validate_data(
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\sklearn\utils\validation.py", line 2971, in validate_data
X, y = check_X_y(X, y, **check_params)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\sklearn\utils\validation.py", line 1368, in check_X_y
X = check_array(
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\sklearn\utils\validation.py", line 971, in check_array
array = array.astype(new_dtype)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\generic.py", line 6643, in astype
new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\internals\managers.py", line 430, in astype
return self.apply(
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\internals\managers.py", line 363, in apply
applied = getattr(b, f)(**kwargs)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\internals\blocks.py", line 758, in astype
new_values = astype_array_safe(values, dtype, copy=copy, errors=errors)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\dtypes\astype.py", line 237, in astype_array_safe
new_values = astype_array(values, dtype, copy=copy)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\dtypes\astype.py", line 182, in astype_array
values = _astype_nansafe(values, dtype, copy=copy)
File "D:\peixunruanjian0106\python\python3.10.2\lib\site-packages\pandas\core\dtypes\astype.py", line 133, in _astype_nansafe
return arr.astype(dtype, copy=True)
ValueError: could not convert string to float: 'Gasoline'