inv_selection 中判断选中行的数量和所选中的行

本文概述了AI音视频处理领域的关键技术,包括视频分割、语义识别、自动驾驶、AR、SLAM等,并探讨了其在不同场景的应用。

inv_rowselect.of_selectedcount(ref long al_selectedrows[]) returns long

返回选中的行数,参数,形式参数,数组,返回选中行所在下标的值

import os import geopandas as gpd import numpy as np import rasterio from rasterio.windows import Window from sklearn.ensemble import RandomForestRegressor from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error from sklearn.feature_selection import RFECV import matplotlib.pyplot as plt import matplotlib.colors as colors from tqdm import tqdm import time import joblib import warnings from pathlib import Path import pandas as pd from scipy.spatial.distance import cdist # 确保能正常保存Excel文件 try: import openpyxl except ImportError: print("⚠️ 未安装openpyxl库,将尝试安装...") import subprocess import sys subprocess.check_call([sys.executable, "-m", "pip", "install", "openpyxl"]) import openpyxl warnings.filterwarnings('ignore') plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei'] plt.rcParams['axes.unicode_minus'] = False class YouyangAvailablePhosphorusPredictor: def __init__(self): # 文件路径 - 请根据实际情况修改这些路径! self.point_shp_path = r"C:\成果编制\酉阳\酉阳数据处理后(最终)\属性\有效磷" # 有效磷点数据路径 self.raster_folder = r"C:\成果编制\酉阳\酉阳数据处理后(最终)\环境变量数据" # 环境因子栅格文件夹 self.output_folder = r"C:\成果编制\酉阳\结果文件" # 创建输出文件夹 os.makedirs(self.output_folder, exist_ok=True) # 38个环境变量名称 self.expected_rasters = [ 'YX', 'Win', 'wencha', 'Valley_Dep', 'TZDM', 'TWI', 'TRI', 'TPI', 'Total_Catc', 'Tem', 'SPI', 'SLP', 'SER3', 'SER2', 'SER1', 'SAVI', 'RPI', 'Rad', 'PW', 'Prs', 'Pre', 'PRC', 'PLC', 'NDVI', 'MNDWI', 'LSWI', 'LS_Factor', 'LAIgy', 'ASP', 'Evp', 'AH', 'EVI', 'DLBM', 'DEM', 'DCDM', 'CND', 'CNBL', 'CI_' ] # 随机森林参数 - 针对有效磷特性调整 self.rf_params = dict( n_estimators=200, max_depth=15, min_samples_split=8, min_samples_leaf=4, max_features=0.7, max_samples=0.8, bootstrap=True, random_state=42, n_jobs=-1 ) self.nodata = -9999 self.block_size = 512 # RFK相关参数 self.variogram_model = 'spherical' self.n_lags = 15 self.max_distance = None # 十折交叉验证设置 self.cv = KFold(n_splits=10, shuffle=True, random_state=42) # 特征择参数 self.min_features_to_select = 8 self.step = 1 def verify_paths(self): """验证所有必要路径是否存在""" print("🔍 验证文件路径...") # 验证点数据路径 if not os.path.exists(self.point_shp_path): raise FileNotFoundError( f"点数据文件不存在: {self.point_shp_path}\n" f"请检查路径是否正确,或修改__init__方法中的point_shp_path参数" ) # 验证栅格文件夹路径 if not os.path.exists(self.raster_folder): raise NotADirectoryError( f"栅格文件夹不存在: {self.raster_folder}\n" f"请检查路径是否正确,或修改__init__方法中的raster_folder参数" ) # 检查栅格文件夹是否为空 if not os.listdir(self.raster_folder): raise ValueError( f"栅格文件夹为空: {self.raster_folder}\n" f"请确保该文件夹中包含.tif或.tiff格式的环境因子栅格文件" ) print("✅ 所有路径验证通过") def load_environment_rasters(self): """加载环境因子栅格,增加错误处理提示""" print("📁 加载环境因子栅格...") # 先验证路径 self.verify_paths() raster_files = {} found_files = [] # 查找所有栅格文件,包括子文件夹 for root, _, files in os.walk(self.raster_folder): for file in files: if file.lower().endswith(('.tif', '.tiff')): full_path = os.path.join(root, file) found_files.append((file, full_path)) print(f" 发现栅格文件: {os.path.basename(full_path)}") print(f"📂 共找到 {len(found_files)} 个栅格文件") if not found_files: raise ValueError( f"在 {self.raster_folder} 及其子文件夹中没有找到任何.tif或.tiff文件\n" f"请确认环境因子文件是否存在,且格式为.tif或.tiff" ) # 创建文件名到路径的映射(更灵活的匹配) file_mapping = {} for file, full_path in found_files: file_stem = Path(file).stem # 添加多种可能的匹配形式 variations = [ file_stem, file_stem.replace(' ', ''), file_stem.replace('_', ''), file_stem.lower(), file_stem.upper(), file_stem.replace(' ', '').lower(), file_stem.replace('_', '').lower() ] for var in variations: if var not in file_mapping: file_mapping[var] = full_path # 尝试匹配预期的环境因子 matched_count = 0 print("\n🎯 开始匹配环境因子...") for expected_name in self.expected_rasters: matched_path = None # 生成多种可能的匹配关键词 possible_keys = [ expected_name, expected_name.replace('_', ''), expected_name.lower(), expected_name.upper(), expected_name.replace('_', '').lower() ] for key in possible_keys: if key in file_mapping: matched_path = file_mapping[key] break if matched_path: try: with rasterio.open(matched_path) as src: if src.count == 1: # 确保是单波段栅格 raster_files[expected_name] = matched_path matched_count += 1 print(f" ✅ {expected_name} -> {os.path.basename(matched_path)}") except Exception as e: print(f" ❌ {expected_name}: 文件读取失败 - {str(e)[:50]}...") else: print(f" ❌ {expected_name}: 未找到对应文件") print(f"\n🎯 成功匹配 {matched_count}/{len(self.expected_rasters)} 个环境因子") if not raster_files: raise ValueError( "未能匹配到任何环境因子栅格文件\n" "请检查栅格文件命名是否与预期的环境因子名称相符" ) return raster_files def load_point_data(self): """加载点数据""" print("📊 加载点数据...") try: points = gpd.read_file(self.point_shp_path) print(f"✅ 成功加载 {len(points)} 个样本点") print(f" 点数据坐标系统: {points.crs}") return points except Exception as e: raise RuntimeError( f"加载点数据失败: {str(e)}\n" f"请检查点数据文件是否完好,路径是否正确: {self.point_shp_path}" ) def prepare_data(self, points, raster_files): """准备训练数据""" print("🔧 准备训练数据...") # 查找有效磷字段 phosphorus_fields = [col for col in points.columns if '有效磷' in col or '速效磷' in col or 'availablephosphorus' in col.lower() or 'p_available' in col.lower() or 'ap' in col.lower() and not 'shape' in col.lower()] if not phosphorus_fields: numeric_fields = points.select_dtypes(include=[np.number]).columns.tolist() if numeric_fields: phosphorus_fields = [numeric_fields[0]] print(f"⚠️ 未找到明确的有效磷字段,使用第一个数值字段: {phosphorus_fields[0]}") else: raise ValueError("点数据中未找到任何数值字段作为有效磷数据") target_field = phosphorus_fields[0] print(f"🎯 使用目标字段: {target_field}") # 匹配可用的环境因子 available_features = [] point_cols_lower = [col.lower() for col in points.columns] for raster_feat in raster_files.keys(): raster_lower = raster_feat.lower() for i, point_col_lower in enumerate(point_cols_lower): if (raster_lower in point_col_lower or point_col_lower in raster_lower or raster_lower.replace(' ', '') == point_col_lower.replace(' ', '') or raster_lower.replace('_', '') == point_col_lower.replace('_', '')): original_point_col = list(points.columns)[i] if original_point_col not in available_features: available_features.append(original_point_col) if not available_features: raise ValueError( "点数据中没有找到与环境因子匹配的字段\n" "请检查点数据属性表是否包含与栅格对应的环境因子属性" ) print(f"📊 找到 {len(available_features)} 个匹配的环境因子") print(f"📋 环境因子列表: {available_features}") # 仅移除缺失值 initial_count = len(points) points_clean = points.dropna(subset=[target_field] + available_features).copy() cleaned_count = len(points_clean) print(f"🧹 数据清洗: 移除缺失值 → {initial_count} → {cleaned_count} 个样本") if cleaned_count < 50: # 降低样本数量要求,便于测试 print(f"⚠️ 样本数量较少 ({cleaned_count}个),可能影响模型精度") # raise ValueError(f"有效样本数不足 (当前{cleaned_count},建议≥100)") # 准备训练数据 X = points_clean[available_features].values y = points_clean[target_field].values print(f"📈 训练数据统计:") print(f" 特征矩阵形状: {X.shape}") print(f" 有效磷范围: {y.min():.2f}~{y.max():.2f} mg/kg, 均值: {y.mean():.2f} mg/kg") return points_clean, X, y, available_features, target_field def optimized_data_split(self, X, y, coords, test_size=0.15): """优化数据分割""" print("🎯 进优化数据分割...") n_bins = min(10, len(y) // 25) if n_bins < 3: n_bins = 3 # 处理可能的分箱错误 try: bins = pd.qcut(y, n_bins, labels=False, duplicates='drop') except: print("⚠️ 分箱失败,使用随机分割") bins = None X_train, X_test, y_train, y_test, coords_train, coords_test = train_test_split( X, y, coords, test_size=test_size, random_state=42, stratify=bins ) print(f"📊 数据分割结果:") print(f" 训练集: {len(X_train)} 样本") print(f" 测试集: {len(X_test)} 样本") return X_train, X_test, y_train, y_test, coords_train, coords_test def calculate_variogram(self, coords, residuals): """计算经验变异函数""" print("📊 计算经验变异函数...") n = len(coords) if n < 10: print("⚠️ 样本太少,无法计算变异函数") return np.array([]), np.array([]), np.array([]) distances = cdist(coords, coords, metric='euclidean') residual_pairs = np.abs(residuals[:, None] - residuals[None, :]) if self.max_distance is None: self.max_distance = np.max(distances) * 0.7 # 样本数过多时抽样 sample_ratio = 1.0 if n > 1000: sample_ratio = 0.3 sample_idx = np.random.choice(n, int(n*sample_ratio), replace=False) distances = distances[np.ix_(sample_idx, sample_idx)] residual_pairs = residual_pairs[np.ix_(sample_idx, sample_idx)] print(f"⚠️ 样本数过多({n}),抽样{sample_ratio*100}%计算变异函数") # 创建滞后距离 lags = np.linspace(0, self.max_distance, self.n_lags + 1) gamma = np.zeros(self.n_lags) pairs_count = np.zeros(self.n_lags) for i in range(self.n_lags): mask = (distances > lags[i]) & (distances <= lags[i+1]) if np.any(mask): upper_mask = mask & np.triu(np.ones_like(mask, dtype=bool), k=1) if np.sum(upper_mask) > 0: gamma[i] = 0.5 * np.mean(residual_pairs[upper_mask] **2) pairs_count[i] = np.sum(upper_mask) # 移除没有数据的滞后 valid_lags = pairs_count > 5 lags_center = (lags[:-1] + lags[1:]) / 2 lags_center = lags_center[valid_lags] gamma = gamma[valid_lags] pairs_count = pairs_count[valid_lags] return lags_center, gamma, pairs_count def fit_variogram_model(self, lags, gamma): """拟合变异函数模型""" print(f"🔧 拟合 {self.variogram_model} 变异函数模型...") if len(lags) < 3: print("⚠️ 样本点太少,无法拟合变异函数模型,使用默认参数") return {'nugget': 0.1, 'sill': 1.0, 'range': 1500} if self.variogram_model == 'spherical': nugget = max(0.01, gamma[0]) sill = max(0.01, np.max(gamma) - nugget) range_param = lags[np.argmax(gamma > 0.9 * np.max(gamma))] if np.any(gamma > 0.9 * np.max(gamma)) else np.max(lags) params = { 'nugget': nugget, 'sill': sill, 'range': max(1.0, range_param) } print(f"✅ 变异函数模型参数: {params}") return params def krige_residuals(self, train_coords, train_residuals, pred_coords, variogram_params): """普通克里格插值""" n_train = len(train_coords) n_pred = len(pred_coords) if n_train < 5: print("⚠️ 训练样本太少,无法进克里格插值") return np.zeros(n_pred) chunk_size = 800 pred_residuals = np.zeros(n_pred, dtype=np.float32) try: for i in range(0, n_pred, chunk_size): chunk_end = min(i + chunk_size, n_pred) pred_coords_chunk = pred_coords[i:chunk_end] if i == 0: K = np.zeros((n_train + 1, n_train + 1), dtype=np.float32) K[:n_train, :n_train] = self.variogram_function( cdist(train_coords, train_coords, metric='euclidean'), variogram_params ) K[:n_train, n_train] = 1.0 K[n_train, :n_train] = 1.0 K[n_train, n_train] = 0.0 k_vector_chunk = np.zeros((chunk_end - i, n_train + 1), dtype=np.float32) dist_chunk = cdist(pred_coords_chunk, train_coords, metric='euclidean') k_vector_chunk[:, :-1] = self.variogram_function(dist_chunk, variogram_params) k_vector_chunk[:, -1] = 1.0 try: if i == 0: K_inv = np.linalg.inv(K) weights_chunk = k_vector_chunk @ K_inv except np.linalg.LinAlgError: if i == 0: K_inv = np.linalg.pinv(K) weights_chunk = k_vector_chunk @ K_inv pred_residuals[i:chunk_end] = weights_chunk[:, :n_train] @ train_residuals except Exception as e: print(f"⚠️ 克里格插值出错: {e},将使用RF结果") pred_residuals = np.zeros(n_pred) return pred_residuals def variogram_function(self, h, params): """变异函数计算""" nugget = params['nugget'] sill = params['sill'] range_param = params['range'] if self.variogram_model == 'spherical': if np.isscalar(h): h = np.array([h]) result = np.where(h <= range_param, nugget + sill * (1.5 * h / range_param - 0.5 * (h / range_param)**3), nugget + sill) return result[0] if len(result) == 1 else result return np.zeros_like(h) def calculate_mad(self, y_true, y_pred): """计算平均绝对偏差""" return np.mean(np.abs(y_pred - np.mean(y_pred))) def optimize_rf_hyperparameters(self, X, y): """优化随机森林超参数""" print("🔍 开始随机森林超参数优化...") # 减少迭代次数,加快小样本情况下的训练速度 n_iter = 200 if len(X) > 500 else 20 param_dist = { 'n_estimators': [150, 200, 250], 'max_depth': [10, 12, 15, None], 'min_samples_split': [4, 6, 8], 'min_samples_leaf': [2, 3, 4], 'max_features': [0.6, 0.7, 0.8, 'sqrt'], 'max_samples': [0.7, 0.8, 0.9] } rf = RandomForestRegressor(random_state=42, n_jobs=-1, bootstrap=True) random_search = RandomizedSearchCV( estimator=rf, param_distributions=param_dist, n_iter=n_iter, cv=5, scoring='r2', n_jobs=-1, verbose=1, random_state=42 ) random_search.fit(X, y) print(f"✅ 最佳参数: {random_search.best_params_}") print(f"✅ 最佳十折交叉验证R²: {random_search.best_score_:.4f}") return random_search.best_estimator_, random_search.best_params_ def select_optimal_features(self, X, y, features): """使用RFECV进最优特征择""" print("🔍 开始最优特征择...") rf = RandomForestRegressor(** self.rf_params) rfecv = RFECV( estimator=rf, step=self.step, cv=self.cv, scoring='r2', min_features_to_select=self.min_features_to_select, n_jobs=-1, verbose=1 ) rfecv.fit(X, y) # 适配scikit-learn新版本 cv_scores = rfecv.cv_results_['mean_test_score'] max_cv_score = np.max(cv_scores) print(f"✅ 最优特征数量: {rfecv.n_features_}") print(f"✅ 最优特征R²得分: {max_cv_score:.4f}") selected_indices = rfecv.support_ selected_features = [features[i] for i, selected in enumerate(selected_indices) if selected] print(f"✅ 选中的特征: {selected_features}") self.plot_feature_selection_results(rfecv, features, cv_scores) self.save_feature_selection_results(rfecv, features, selected_features, max_cv_score) return selected_features, rfecv.transform(X), rfecv def plot_feature_selection_results(self, rfecv, features, cv_scores): """绘制特征择结果图""" plt.figure(figsize=(12, 6)) plt.subplot(1, 2, 1) n_features_list = range(self.min_features_to_select, len(cv_scores) + self.min_features_to_select) plt.plot(n_features_list, cv_scores) plt.xlabel('特征数量') plt.ylabel('交叉验证R²分数') plt.title('特征数量与模型性能关系') plt.grid(True, alpha=0.3) optimal_idx = np.argmax(cv_scores) optimal_n_features = n_features_list[optimal_idx] plt.scatter(optimal_n_features, cv_scores[optimal_idx], color='red', s=100, zorder=5) plt.subplot(1, 2, 2) sorted_indices = np.argsort(rfecv.ranking_) ranks = rfecv.ranking_[sorted_indices] feature_names = [features[i] for i in sorted_indices] plt.barh(range(len(sorted_indices)), ranks, color=plt.cm.Set3(np.linspace(0, 1, len(features)))) plt.yticks(range(len(sorted_indices)), feature_names) plt.xlabel('特征排名 (1表示被选中)') plt.title('特征重要性排名') plt.grid(True, alpha=0.3, axis='x') plt.tight_layout() plot_path = os.path.join(self.output_folder, "特征择结果图.png") plt.savefig(plot_path, dpi=300, bbox_inches='tight') print(f"💾 特征择结果图保存至: {plot_path}") plt.close() def save_feature_selection_results(self, rfecv, features, selected_features, max_cv_score): """保存特征择结果到Excel""" ranking_data = [] for i, feature in enumerate(features): ranking_data.append({ '特征名称': feature, '排名': rfecv.ranking_[i], '是否被选中': '是' if rfecv.support_[i] else '否' }) ranking_df = pd.DataFrame(ranking_data) ranking_df = ranking_df.sort_values('排名') summary_df = pd.DataFrame({ '汇总项': ['最优特征数量', '最优交叉验证R²分数', '总特征数量', '选中特征数量'], '数值': [rfecv.n_features_, max_cv_score, len(features), len(selected_features)] }) excel_path = os.path.join(self.output_folder, "特征择结果.xlsx") with pd.ExcelWriter(excel_path, engine='openpyxl') as writer: ranking_df.to_excel(writer, sheet_name='特征排名详情', index=False) summary_df.to_excel(writer, sheet_name='择结果汇总', index=False) print(f"💾 特征择结果保存至: {excel_path}") def train_rfk_model(self, X, y, coords, features): """训练随机森林-克里格模型""" print("🤖 训练随机森林-克里格模型...") selected_features, X_selected, rfecv = self.select_optimal_features(X, y, features) print(f"🎯 已择 {len(selected_features)} 个最优特征") X_train, X_test, y_train, y_test, coords_train, coords_test = self.optimized_data_split( X_selected, y, coords, test_size=0.15 ) print("⚙️ 进随机森林超参数优化...") rf_model, best_params = self.optimize_rf_hyperparameters(X_train, y_train) # 计算残差 y_train_pred_rf = rf_model.predict(X_train) residuals_train = y_train - y_train_pred_rf # 检查空间自相关性 spatial_corr = self.check_spatial_autocorrelation(coords_train, residuals_train) print(f"📊 残差空间自相关性: {spatial_corr:.4f}") use_kriging = spatial_corr > 0.12 if use_kriging: print("✅ 检测到空间自相关,使用RFK模型") lags, gamma, pairs_count = self.calculate_variogram(coords_train, residuals_train) if len(lags) > 2: variogram_params = self.fit_variogram_model(lags, gamma) residuals_train_kriged = self.krige_residuals( coords_train, residuals_train, coords_train, variogram_params ) y_train_pred_rfk = y_train_pred_rf + residuals_train_kriged residuals_test_kriged = self.krige_residuals( coords_train, residuals_train, coords_test, variogram_params ) y_test_pred_rfk = rf_model.predict(X_test) + residuals_test_kriged else: print("⚠️ 变异函数计算失败,仅使用RF模型") use_kriging = False variogram_params = None y_train_pred_rfk = y_train_pred_rf y_test_pred_rfk = rf_model.predict(X_test) else: print("⚠️ 空间自相关性不足,仅使用RF模型") use_kriging = False variogram_params = None y_train_pred_rfk = y_train_pred_rf y_test_pred_rfk = rf_model.predict(X_test) # 计算评估指标 metrics = {} for name, y_true, y_pred in [ ('训练集_RF', y_train, y_train_pred_rf), ('训练集_RFK', y_train, y_train_pred_rfk), ('测试集_RF', y_test, rf_model.predict(X_test)), ('测试集_RFK', y_test, y_test_pred_rfk) ]: metrics[name] = { 'R2': r2_score(y_true, y_pred), 'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)), 'MAE': mean_absolute_error(y_true, y_pred), 'MAD': self.calculate_mad(y_true, y_pred) } # 执十折交叉验证 print("📊 执十折交叉验证...") cv_r2_scores = [] cv_rmse_scores = [] cv_mae_scores = [] for fold, (train_idx, val_idx) in enumerate(self.cv.split(X_selected)): X_cv_train, X_cv_val = X_selected[train_idx], X_selected[val_idx] y_cv_train, y_cv_val = y[train_idx], y[val_idx] fold_model = RandomForestRegressor(** best_params, random_state=42, n_jobs=-1) fold_model.fit(X_cv_train, y_cv_train) y_cv_pred = fold_model.predict(X_cv_val) r2 = r2_score(y_cv_val, y_cv_pred) rmse = np.sqrt(mean_squared_error(y_cv_val, y_cv_pred)) mae = mean_absolute_error(y_cv_val, y_cv_pred) cv_r2_scores.append(r2) cv_rmse_scores.append(rmse) cv_mae_scores.append(mae) print(f" 第{fold+1}折: R²={r2:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}") # 计算十折交叉验证的平均指标 cv_mean_r2 = np.mean(cv_r2_scores) cv_std_r2 = np.std(cv_r2_scores) cv_mean_rmse = np.mean(cv_rmse_scores) cv_std_rmse = np.std(cv_rmse_scores) cv_mean_mae = np.mean(cv_mae_scores) cv_std_mae = np.std(cv_mae_scores) print(f"📊 十折交叉验证结果:") print(f" R²: {cv_mean_r2:.4f} (±{cv_std_r2:.4f})") print(f" RMSE: {cv_mean_rmse:.4f} (±{cv_std_rmse:.4f})") print(f" MAE: {cv_mean_mae:.4f} (±{cv_std_mae:.4f})") # 保存交叉验证结果 cv_results = { 'r2_scores': cv_r2_scores, 'rmse_scores': cv_rmse_scores, 'mae_scores': cv_mae_scores, 'mean_r2': cv_mean_r2, 'std_r2': cv_std_r2, 'mean_rmse': cv_mean_rmse, 'std_rmse': cv_std_rmse, 'mean_mae': cv_mean_mae, 'std_mae': cv_std_mae } # 保存交叉验证结果到Excel cv_df = pd.DataFrame({ '折数': range(1, 11), 'R²': cv_r2_scores, 'RMSE(mg/kg)': cv_rmse_scores, 'MAE(mg/kg)': cv_mae_scores }) cv_df.loc[10] = ['平均值', cv_mean_r2, cv_mean_rmse, cv_mean_mae] cv_df.loc[11] = ['标准差', cv_std_r2, cv_std_rmse, cv_std_mae] cv_excel_path = os.path.join(self.output_folder, "十折交叉验证结果.xlsx") cv_df.to_excel(cv_excel_path, index=False, engine='openpyxl') print(f"💾 十折交叉验证结果保存至: {cv_excel_path}") # 绘制交叉验证结果图 self.plot_cv_results(cv_r2_scores, cv_rmse_scores, cv_mae_scores) # 输出评估结果 print("=" * 70) print(f"📊 最优特征RFK模型评估结果(有效磷,{len(selected_features)}个特征)") print("=" * 70) print(f"{'数据集':<15} {'模型':<8} {'R²':<8} {'RMSE(mg/kg)':<12} {'MAE(mg/kg)':<12} {'MAD(mg/kg)':<12}") print("-" * 70) for dataset in ['训练集', '测试集']: for model in ['RF', 'RFK']: key = f"{dataset}_{model}" m = metrics[key] print(f"{dataset:<15} {model:<8} {m['R2']:.4f} {m['RMSE']:.4f} {m['MAE']:.4f} {m['MAD']:.4f}") print("=" * 70) # 模型择建议 test_rf_r2 = metrics['测试集_RF']['R2'] test_rfk_r2 = metrics['测试集_RFK']['R2'] if test_rfk_r2 > test_rf_r2 + 0.01: print("🎯 推荐使用RFK模型 (测试集R²有提升)") best_model_type = 'RFK' else: print("🎯 推荐使用RF模型 (RFK提升不明显)") best_model_type = 'RF' use_kriging = False # 创建评估图表 self.create_rfk_evaluation_plot(y_train, y_train_pred_rf, y_train_pred_rfk, y_test, rf_model.predict(X_test), y_test_pred_rfk, metrics, best_model_type, cv_results) # 保存模型数据 model_data = { 'rf_model': rf_model, 'selected_features': selected_features, 'variogram_params': variogram_params if use_kriging else None, 'coords_train': coords_train, 'residuals_train': residuals_train, 'use_kriging': use_kriging, 'best_model_type': best_model_type, 'cv_results': cv_results, 'feature_selector': rfecv } return model_data, metrics def plot_cv_results(self, r2_scores, rmse_scores, mae_scores): """绘制十折交叉验证结果图""" fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6)) ax1.boxplot(r2_scores) ax1.set_title('十折交叉验证R²分布') ax1.set_ylabel('R²值') ax1.set_xticklabels(['RF模型']) ax1.grid(True, alpha=0.3) for i, val in enumerate(r2_scores): ax1.text(1, val, f'{val:.3f}', ha='center', va='bottom' if i < 5 else 'top') ax2.boxplot(rmse_scores) ax2.set_title('十折交叉验证RMSE分布') ax2.set_ylabel('RMSE (mg/kg)') ax2.set_xticklabels(['RF模型']) ax2.grid(True, alpha=0.3) for i, val in enumerate(rmse_scores): ax2.text(1, val, f'{val:.3f}', ha='center', va='bottom' if i < 5 else 'top') ax3.boxplot(mae_scores) ax3.set_title('十折交叉验证MAE分布') ax3.set_ylabel('MAE (mg/kg)') ax3.set_xticklabels(['RF模型']) ax3.grid(True, alpha=0.3) for i, val in enumerate(mae_scores): ax3.text(1, val, f'{val:.3f}', ha='center', va='bottom' if i < 5 else 'top') plt.tight_layout() cv_plot_path = os.path.join(self.output_folder, "十折交叉验证结果图.png") plt.savefig(cv_plot_path, dpi=300, bbox_inches='tight') print(f"💾 十折交叉验证结果图保存至: {cv_plot_path}") plt.close() def check_spatial_autocorrelation(self, coords, values): """检查空间自相关性""" try: from libpysal.weights import DistanceBand from esda.moran import Moran w = DistanceBand(coords, threshold=np.percentile(cdist(coords, coords), 25)) moran = Moran(values, w) return moran.I except ImportError: print("⚠️ 未安装libpysalesda,使用简化的空间自相关计算") distances = cdist(coords, coords) np.fill_diagonal(distances, np.inf) min_distances = np.min(distances, axis=1) corr = np.corrcoef(values, min_distances)[0, 1] return abs(corr) except Exception as e: print(f"⚠️ 空间自相关计算出错: {e}") return 0 def create_rfk_evaluation_plot(self, y_train, y_train_pred_rf, y_train_pred_rfk, y_test, y_test_pred_rf, y_test_pred_rfk, metrics, best_model_type, cv_results): """创建RFK评估图表""" print("📈 生成RFK模型评估图表...") fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12)) ax1.scatter(y_train, y_train_pred_rf, alpha=0.6, color='blue', label='RF', s=30) ax1.scatter(y_train, y_train_pred_rfk, alpha=0.6, color='red', label='RFK', s=30) ax1.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'k--', lw=2) ax1.set_xlabel('实测有效磷 (mg/kg)') ax1.set_ylabel('预测有效磷 (mg/kg)') ax1.set_title('训练集: RF vs RFK') ax1.legend() ax1.grid(True, alpha=0.3) ax2.scatter(y_test, y_test_pred_rf, alpha=0.6, color='blue', label='RF', s=30) ax2.scatter(y_test, y_test_pred_rfk, alpha=0.6, color='red', label='RFK', s=30) ax2.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=2) ax2.set_xlabel('实测有效磷 (mg/kg)') ax2.set_ylabel('预测有效磷 (mg/kg)') ax2.set_title(f'测试集: RF vs RFK (推荐: {best_model_type})') ax2.legend() ax2.grid(True, alpha=0.3) residuals_train_rf = y_train - y_train_pred_rf residuals_train_rfk = y_train - y_train_pred_rfk residuals_test_rf = y_test - y_test_pred_rf residuals_test_rfk = y_test - y_test_pred_rfk ax3.boxplot([residuals_train_rf, residuals_train_rfk, residuals_test_rf, residuals_test_rfk], labels=['训练集_RF', '训练集_RFK', '测试集_RF', '测试集_RFK']) ax3.set_ylabel('残差 (mg/kg)') ax3.set_title('残差分布比较') ax3.grid(True, alpha=0.3) models = ['训练集_RF', '训练集_RFK', '测试集_RF', '测试集_RFK', '十折CV平均'] r2_values = [ metrics['训练集_RF']['R2'], metrics['训练集_RFK']['R2'], metrics['测试集_RF']['R2'], metrics['测试集_RFK']['R2'], cv_results['mean_r2'] ] x = np.arange(len(models)) colors_list = ['lightblue', 'lightcoral', 'blue', 'red', 'green'] ax4.bar(x, r2_values, color=colors_list) ax4.set_xlabel('模型') ax4.set_ylabel('R²') ax4.set_title('模型R²性能比较(含十折CV)') ax4.set_xticks(x) ax4.set_xticklabels(models, rotation=45) ax4.grid(True, alpha=0.3) for i, v in enumerate(r2_values): ax4.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom') plt.tight_layout() eval_plot_path = os.path.join(self.output_folder, "最优特征RFK模型评估图表(有效磷).png") plt.savefig(eval_plot_path, dpi=300, bbox_inches='tight') print(f"💾 RFK评估图表保存至: {eval_plot_path}") plt.close() def spatial_prediction_rfk(self, model_data, features, raster_files): """RFK空间预测""" print("🌍 开始RFK空间预测...") raster_handles = {} try: for feature in model_data['selected_features']: if feature in raster_files: src = rasterio.open(raster_files[feature]) raster_handles[feature] = src print(f"📁 加载栅格: {feature} -> {os.path.basename(raster_files[feature])}") else: raise ValueError(f"特征 {feature} 对应的栅格文件不存在") except Exception as e: for handle in raster_handles.values(): handle.close() raise e # 输出文件 if model_data['use_kriging']: out_tif = Path(self.output_folder) / f"最优特征RFK有效磷预测结果({len(model_data['selected_features'])}特征).tif" else: out_tif = Path(self.output_folder) / f"最优特征RF有效磷预测结果({len(model_data['selected_features'])}特征).tif" if out_tif.exists(): try: os.remove(out_tif) print(f"🗑️ 删除旧文件: {out_tif}") except Exception as e: print(f'⚠️ 无法删除旧文件 {out_tif}: {e}') # 使用第一个特征栅格作为模板 first_feature = model_data['selected_features'][0] src_template = raster_handles[first_feature] meta = src_template.meta.copy() height, width = src_template.height, src_template.width transform = src_template.transform crs = src_template.crs meta.update({ 'count': 1, 'dtype': 'float32', 'nodata': self.nodata, 'compress': 'lzw', 'crs': crs }) print(f"📐 栅格尺寸: {width} x {height} 像素") print(f"🎯 输出文件: {out_tif}") # 创建输出栅格 with rasterio.open(out_tif, 'w', **meta) as dst: print("✅ 创建输出栅格文件成功") # 分块处理 total_blocks = ((height + self.block_size - 1) // self.block_size) * \ ((width + self.block_size - 1) // self.block_size) print(f"📦 分块处理: {total_blocks} 个块") processed_blocks = 0 valid_pixels = 0 with tqdm(total=total_blocks, desc="空间预测") as pbar: for row in range(0, height, self.block_size): for col in range(0, width, self.block_size): win_height = min(self.block_size, height - row) win_width = min(self.block_size, width - col) window = Window(col, row, win_width, win_height) try: block_data = [] valid_mask = None for feature in model_data['selected_features']: src = raster_handles[feature] data = src.read(1, window=window, boundless=False) if valid_mask is None: valid_mask = (data != src.nodata) & (~np.isnan(data)) & (data != -9999) else: valid_mask = valid_mask & (data != src.nodata) & (~np.isnan(data)) & (data != -9999) block_data.append(data) if valid_mask is not None and np.any(valid_mask): X_block = np.stack([data[valid_mask] for data in block_data], axis=1) # RF预测 y_pred_rf = model_data['rf_model'].predict(X_block) # 有效磷范围约束 y_pred_rf = np.clip(y_pred_rf, 0.0, 200.0) # RFK预测 if model_data['use_kriging'] and model_data['variogram_params'] is not None: rows, cols = np.mgrid[row:row+win_height, col:col+win_width] xs, ys = rasterio.transform.xy(transform, rows, cols) coords_block = np.column_stack([np.array(xs).ravel(), np.array(ys).ravel()]) coords_block_valid = coords_block[valid_mask.ravel()] if len(coords_block_valid) > 0: residuals_pred = self.krige_residuals( model_data['coords_train'], model_data['residuals_train'], coords_block_valid, model_data['variogram_params'] ) y_pred = y_pred_rf + residuals_pred y_pred = np.clip(y_pred, 0.0, 200.0) else: y_pred = y_pred_rf else: y_pred = y_pred_rf out_block = np.full((win_height, win_width), self.nodata, dtype='float32') out_block[valid_mask] = y_pred with rasterio.open(out_tif, 'r+') as dst: dst.write(out_block, 1, window=window) valid_pixels += np.sum(valid_mask) else: out_block = np.full((win_height, win_width), self.nodata, dtype='float32') with rasterio.open(out_tif, 'r+') as dst: dst.write(out_block, 1, window=window) except Exception as e: print(f"⚠️ 处理块 ({row},{col}) 时出错: {e}") out_block = np.full((win_height, win_width), self.nodata, dtype='float32') with rasterio.open(out_tif, 'r+') as dst: dst.write(out_block, 1, window=window) processed_blocks += 1 pbar.update(1) # 关闭所有栅格句柄 for handle in raster_handles.values(): handle.close() print(f"✅ 空间预测完成:") print(f" - 处理块数: {processed_blocks}") print(f" - 有效像素: {valid_pixels}") print(f" - 输出文件: {out_tif}") # 验证输出文件 if os.path.exists(out_tif): try: with rasterio.open(out_tif) as src: data = src.read(1) valid_count = np.sum(data != self.nodata) print(f"✅ 输出文件验证成功:") print(f" - 文件大小: {os.path.getsize(out_tif) / (1024*1024):.2f} MB") print(f" - 有效值数量: {valid_count}") print(f" - 有效磷范围: {data[data != self.nodata].min():.2f} ~ {data[data != self.nodata].max():.2f} mg/kg") except Exception as e: print(f"⚠️ 输出文件验证失败: {e}") else: print("❌ 输出文件未生成!") return str(out_tif) def create_prediction_map(self, prediction_path): """创建预测结果可视化地图""" print("🎨 生成预测结果可视化地图...") if not os.path.exists(prediction_path): print("❌ 预测结果文件不存在,无法生成可视化地图") return try: with rasterio.open(prediction_path) as src: data = src.read(1) transform = src.transform extent = [src.bounds.left, src.bounds.right, src.bounds.bottom, src.bounds.top] fig, ax = plt.subplots(figsize=(12, 10)) masked_data = np.ma.masked_where(data == self.nodata, data) cmap = plt.cm.plasma vmin, vmax = np.nanpercentile(masked_data, 2), np.nanpercentile(masked_data, 98) # 去除极端值影响 norm = colors.Normalize(vmin=vmin, vmax=vmax) im = ax.imshow(masked_data, extent=extent, cmap=cmap, norm=norm, aspect='auto') cbar = plt.colorbar(im, ax=ax, shrink=0.8) cbar.set_label('有效磷含量 (mg/kg)', fontsize=12) model_type = "RFK" if "RFK" in prediction_path else "RF" feature_count = prediction_path.split('(')[1].split('特征')[0] ax.set_title(f'酉阳土壤有效磷{model_type}预测结果({feature_count}个最优特征)', fontsize=16, fontweight='bold') ax.set_xlabel('经度', fontsize=12) ax.set_ylabel('纬度', fontsize=12) ax.grid(True, alpha=0.3) map_output_path = os.path.join(self.output_folder, f"最优特征{model_type}有效磷预测图({feature_count}特征).png") plt.savefig(map_output_path, dpi=300, bbox_inches='tight') plt.close() print(f"✅ 预测结果可视化地图保存至: {map_output_path}") except Exception as e: print(f"❌ 生成可视化地图失败: {e}") def create_feature_importance_plot(self, model, features): """创建特征重要性图并保存为Excel文件""" print("📊 生成特征重要性图并保存为Excel...") importances = model.feature_importances_ indices = np.argsort(importances)[::-1] plt.figure(figsize=(14, 10)) bars = plt.bar(range(len(features)), importances[indices], color=plt.cm.Set3(np.linspace(0, 1, len(features)))) for i, (bar, importance) in enumerate(zip(bars, importances[indices])): plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001, f'{importance:.3f}', ha='center', va='bottom', fontsize=8) plt.xticks(range(len(features)), [features[i] for i in indices], rotation=45, ha='right') plt.xlabel('环境因子', fontsize=12) plt.ylabel('重要性', fontsize=12) plt.title(f'有效磷预测环境因子重要性排序({len(features)}个最优特征)', fontsize=16, fontweight='bold') plt.grid(True, alpha=0.3, axis='y') plt.tight_layout() plot_output_path = os.path.join(self.output_folder, "有效磷最优特征重要性.png") plt.savefig(plot_output_path, dpi=300, bbox_inches='tight') plt.close() print(f"✅ 特征重要性图保存至: {plot_output_path}") importance_data = { '环境因子': [features[i] for i in indices], '重要性值': [importances[i] for i in indices], '排名': list(range(1, len(features) + 1)) } importance_df = pd.DataFrame(importance_data) excel_output_path = os.path.join(self.output_folder, "有效磷最优特征重要性结果.xlsx") importance_df.to_excel(excel_output_path, index=False, engine='openpyxl') print(f"✅ 环境因子重要性结果保存至Excel: {excel_output_path}") def run(self): """运完整RFK流程""" start_time = time.time() try: print("=" * 70) print("🧪 酉阳土壤有效磷空间预测系统 - 无数据清洗版(十折交叉验证)") print("=" * 70) print(f"⏰ 开始时间: {time.strftime('%Y-%m-%d %H:%M:%S')}") print(f"📂 点数据路径: {self.point_shp_path}") print(f"📂 栅格文件夹路径: {self.raster_folder}") print(f"📂 结果输出路径: {self.output_folder}") print("=" * 70) # 步骤1: 加载数据 raster_files = self.load_environment_rasters() points = self.load_point_data() # 步骤2: 准备数据 points_clean, X, y, features, target_field = self.prepare_data(points, raster_files) # 获取坐标信息 coords = np.array([(geom.x, geom.y) for geom in points_clean.geometry]) # 步骤3: 训练RFK模型 model_data, metrics = self.train_rfk_model(X, y, coords, features) # 步骤4: 特征重要性图Excel保存 self.create_feature_importance_plot(model_data['rf_model'], model_data['selected_features']) # 步骤5: 空间预测 prediction_path = self.spatial_prediction_rfk(model_data, features, raster_files) # 步骤6: 创建预测结果可视化地图 self.create_prediction_map(prediction_path) # 步骤7: 保存模型结果 model_path = os.path.join(self.output_folder, f"最优特征{model_data['best_model_type']}_有效磷预测模型({len(model_data['selected_features'])}特征).pkl") joblib.dump(model_data, model_path) print(f"💾 模型保存至: {model_path}") total_time = time.time() - start_time print(f"\n🎉 所有任务完成! 总耗时: {total_time:.1f} 秒") print(f"⏰ 结束时间: {time.strftime('%Y-%m-%d %H:%M:%S')}") print(f"📁 结果保存在: {self.output_folder}") # 输出生成的文件列表 print("\n📋 生成的文件列表:") for file in os.listdir(self.output_folder): file_path = os.path.join(self.output_folder, file) if os.path.isfile(file_path): size = os.path.getsize(file_path) / (1024 * 1024) # MB print(f" - {file} ({size:.2f} MB)") except Exception as e: print(f"\n❌ 程序执失败: {e}") import traceback traceback.print_exc() finally: print("\n程序已结束运") # 运程序 if __name__ == "__main__": # 修改这里的路径为你的实际文件路径! predictor = YouyangAvailablePhosphorusPredictor() # 如需修改路径,可以在这里重新设置 # predictor.point_shp_path = r"你的点数据路径.shp" # predictor.raster_folder = r"你的栅格文件夹路径" # predictor.output_folder = r"你的结果输出路径" predictor.run() 以上是用随机森林模型来预测有效磷的代码,帮我优化代码,减少过拟合的情况,并提高测试集R²精度
10-23
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值