python 数据预处理之处理缺失值报错:ModuleNotFoundError: No module named 'sklearn.impute'

报错:ModuleNotFoundError: No module named ‘sklearn.impute’

刚开始学习数据预处理,学习插值法,网上找的代码都是各种报错,这个报错是这样的:
01
在网上找了一段代码,引包的时候报了这个错误,关于这个错误的解说很少,后来经过仔细查找,发现是版本更新的问题:以前Scikit-Learn的Imputer模块在0.20以上版本变成了SimpleImputer(旧名Imputer在0.22版本被彻底移除),我自己环境配置的是0.18.1的。你可以查看一下自己的版本是多少,可以在自己的Anaconda下面找,我的路径是:D:\Anaconda3\Lib\site-packages\sklearn\__init__.py,打开这个文件,可以看到sklearn的版本,如图所示:02

# Scikit-learn renamed its imputation API: `sklearn.preprocessing.Imputer`
# (pre-0.20) became `sklearn.impute.SimpleImputer` in 0.20, and the old name
# was removed entirely in 0.22. Importing the legacy name on a modern install
# raises ImportError, and importing the new one on an old install raises the
# ModuleNotFoundError this article is about. Try the modern path first and
# fall back, so the snippet works on either version.
try:
    from sklearn.impute import SimpleImputer as Imputer  # scikit-learn >= 0.20
except ImportError:
    from sklearn.preprocessing import Imputer  # scikit-learn < 0.20 (e.g. 0.18.1)

# Median imputation: replace each column's NaNs with that column's median.
imputer = Imputer(strategy='median')

用这个方式就不会报错了。

"""Random-forest regression of effluent COD for a wastewater plant.

Loads `wastewater_data.csv` when present, otherwise simulates data; imputes
missing values (median), standardizes features, grid-searches a
RandomForestRegressor, evaluates it, and plots feature importances and
predicted-vs-actual values.
"""
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

# Fix the random seed so simulated data and results are reproducible.
np.random.seed(42)

# 1. Data loading -- use the real file if it exists, otherwise simulate.
data_file = 'wastewater_data.csv'
if os.path.exists(data_file):
    print(f"加载真实数据: {data_file}")
    data = pd.read_csv(data_file)
else:
    print("未找到数据文件,生成模拟数据...")
    n_samples = 1000
    data = pd.DataFrame({
        'Influent_COD': np.random.uniform(200, 800, n_samples),    # influent COD (mg/L)
        'Influent_Flow': np.random.uniform(500, 2000, n_samples),  # influent flow (m³/h)
        'DO': np.random.uniform(1.0, 4.0, n_samples),              # dissolved oxygen (mg/L)
        'MLSS': np.random.uniform(2000, 6000, n_samples),          # mixed liquor suspended solids (mg/L)
        'Temperature': np.random.uniform(15, 30, n_samples),       # temperature (°C)
        'HRT': np.random.uniform(4, 12, n_samples),                # hydraulic retention time (h)
    })
    # Target (effluent COD): a noisy linear combination of the features.
    data['Effluent_COD'] = (
        20
        + 0.08 * data['Influent_COD']
        + 0.002 * data['Influent_Flow']
        - 2.5 * data['DO']
        + 0.001 * data['MLSS']
        + 0.3 * data['Temperature']
        - 1.2 * data['HRT']
        + np.random.normal(0, 5, n_samples)  # additive noise
    )
    # Inject ~5% missing values into every feature column (never the target).
    for col in data.columns[:-1]:
        idx = np.random.choice(n_samples, size=int(n_samples * 0.05), replace=False)
        data.loc[idx, col] = np.nan

print("\n数据概览:")
print(f"数据集形状: {data.shape}")
print(f"前5行数据:\n{data.head()}")
print("\n数据统计描述:\n", data.describe())
print("\n缺失值统计:\n", data.isnull().sum())

# 2. Preprocessing
print("\n=== 数据预处理 ===")

# 2.1 Split features / target.
X = data.drop('Effluent_COD', axis=1)
y = data['Effluent_COD']

# 2.2 Train/test split (80/20, seeded).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# 2.3 Median imputation -- fit on the training set only to avoid leakage.
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)
print("\n训练集缺失值处理结果:\n", X_train.isnull().sum())
print("\n测试集缺失值处理结果:\n", X_test.isnull().sum())

# 2.4 Standardize features (zero mean, unit variance; fit on train only).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Model training and hyperparameter optimization
print("\n=== 模型训练与优化 ===")
rf = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold grid-search CV; negated MSE so "higher is better" for sklearn.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5,
                           n_jobs=-1, verbose=1,
                           scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

best_rf = grid_search.best_estimator_
print(f"\n最佳参数: {grid_search.best_params_}")
print(f"最佳模型交叉验证分数: {-grid_search.best_score_:.4f}")

# 4. Model evaluation
print("\n=== 模型评估 ===")
y_train_pred = best_rf.predict(X_train_scaled)
y_test_pred = best_rf.predict(X_test_scaled)


def evaluate_model(y_true, y_pred, set_name):
    """Print MSE/RMSE/MAE/R² for one data split; return (rmse, r2)."""
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{set_name}评估结果:")
    print(f"MSE: {mse:.4f}, RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    return rmse, r2


train_rmse, train_r2 = evaluate_model(y_train, y_train_pred, "训练集")
test_rmse, test_r2 = evaluate_model(y_test, y_test_pred, "测试集")

# 5. Feature importance analysis
print("\n=== 特征重要性分析 ===")
feature_importances = best_rf.feature_importances_
features = X.columns
importance_df = pd.DataFrame({'Feature': features,
                              'Importance': feature_importances})
importance_df = importance_df.sort_values('Importance', ascending=False)
print("\n特征重要性排序:")
print(importance_df)

# Horizontal bar chart of feature importances, saved to disk.
plt.figure(figsize=(10, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.xlabel('特征重要性')
plt.title('随机森林特征重要性分析')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.show()

# 6. Predicted vs. actual scatter (with the ideal y=x reference line).
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_test_pred, alpha=0.5)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel('实际COD值')
plt.ylabel('预测COD值')
plt.title('实际值与预测值对比')
plt.tight_layout()
plt.savefig('prediction_vs_actual.png')
plt.show()

print("\n模型训练与评估完成!")
这个代码能运行吗
06-21
警告: 目标列包含 44 个缺失值,已删除相应行 检测到数值特征: 1342个 检测到分类特征: 14个 Traceback (most recent call last): File "F:\Python_task\Lasso1.py", line 73, in <module> pipeline.fit(X, y) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 390, in fit Xt = self._fit(X, y, **fit_params_steps) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 348, in _fit X, fitted_transformer = fit_transform_one_cached( File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\joblib\memory.py", line 326, in __call__ return self.func(*args, **kwargs) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one res = transformer.fit_transform(X, y, **fit_params) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform result = self._fit_transform(X, y, _fit_transform_one) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\compose\_column_transformer.py", line 606, in _fit_transform return Parallel(n_jobs=self.n_jobs)( File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\joblib\parallel.py", line 1986, in __call__ return output if self.return_generator else list(output) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\joblib\parallel.py", line 1914, in _get_sequential_output res = func(*args, **kwargs) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\utils\fixes.py", line 216, in __call__ return self.function(*args, **kwargs) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one res = transformer.fit_transform(X, y, **fit_params) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 426, in fit_transform Xt = self._fit(X, y, **fit_params_steps) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 348, in _fit X, fitted_transformer = 
fit_transform_one_cached( File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\joblib\memory.py", line 326, in __call__ return self.func(*args, **kwargs) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one res = transformer.fit_transform(X, y, **fit_params) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\base.py", line 855, in fit_transform return self.fit(X, y, **fit_params).transform(X) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\impute\_base.py", line 319, in fit X = self._validate_input(X, in_fit=True) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\impute\_base.py", line 287, in _validate_input raise ve File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\impute\_base.py", line 270, in _validate_input X = self._validate_data( File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\base.py", line 566, in _validate_data X = check_array(X, **check_params) File "C:\Users\lenovo\anaconda3\envs\radi\lib\site-packages\sklearn\utils\validation.py", line 805, in check_array raise ValueError( ValueError: Found array with 0 sample(s) (shape=(0, 1342)) while a minimum of 1 is required. 报错并修改,且我需要将筛选完的文件保存到和读取文件一样的文件夹里
07-22
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值