Code for generating the forest_train training files

This post describes a preparation and processing workflow for an image dataset: reading the image files from a given path, applying basic resizing where needed, and recording each image's basic information into text files for later use when training a machine-learning model.
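For orientation, the paths hard-coded below imply a directory layout roughly like the following. The tougue and other folder names come straight from the script; the image names are invented for illustration:

hough_forests_db\
    train_pos.txt      (generated: list of positive samples)
    train_neg.txt      (generated: list of negative samples)
    tougue\            (positive images, e.g. 001.jpg, 002.jpg, ...)
    other\             (negative images, e.g. 101.jpg, 102.jpg, ...)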


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% forest_train: positive samples (train_pos.txt)
% NOTE: MATLAB char literals do not escape backslashes, so single '\' is correct.
main_path = 'D:\workspace\xx\机器学习训练框架\hough_forests_db\tougue\';
main_path_1 = 'D:\workspace\xx\机器学习训练框架\hough_forests_db\';
train_path = [main_path_1, 'train_pos.txt'];
% Truncate (or create) the list file; all later writes append to it
fid = fopen(train_path, 'w');
fclose(fid);
dat = dir(main_path);
% Count the usable images first so the header matches the lines written below.
% Thumbs.db must be excluded here too, otherwise the header over-counts.
icount = 0;
for j = 1 : length(dat)
    if isequal(dat(j).name, '.') || isequal(dat(j).name, '..') || ...
       ~isempty(strfind(dat(j).name, 'Thumbs.db'))
        continue;
    end
    icount = icount + 1;
end
% Header line: the number of positive samples followed by a constant 1
fid = fopen(train_path, 'a+');
fprintf(fid, '%d 1\n', icount);
fclose(fid);
for j = 1 : length(dat)
    % Skip the '.' and '..' directory entries
    if isequal(dat(j).name, '.') || isequal(dat(j).name, '..')
        continue;
    end
    % Skip Windows thumbnail caches; everything else is read as an image
    if ~isempty(strfind(dat(j).name, 'Thumbs.db'))
        continue;
    end
    datapath = fullfile(main_path, dat(j).name);
    img = imread(datapath);
    tt = size(img);   % tt(1) = height (rows), tt(2) = width (columns)
    % Optional resizing to a fixed training size, disabled here:
    %tt(1) = 96;
    %tt(2) = 160;
    %img1 = imresize(img, [tt(1) tt(2)]);
    %imwrite(img1, datapath);
    % One line per positive sample: name, bounding box top-left (0 0),
    % width, height, then floor(height/2) and floor(width/2) as the centre.
    % The centre is written row-first; check this against the (cx, cy)
    % order your trainer expects.
    fid = fopen(train_path, 'a+');
    fprintf(fid, '%s 0 0 %d %d %d %d\n', dat(j).name, tt(2), tt(1), ...
        floor(tt(1)/2), floor(tt(2)/2));
    fclose(fid);
end
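As a concrete example of the format this produces: for a positive folder holding a single hypothetical 160×96 image named 001.jpg (a made-up file, purely for illustration), train_pos.txt would contain:

1 1
001.jpg 0 0 160 96 48 80

That is: the sample-count header, then the file name, the 0 0 top-left corner, the 160×96 extent, and the two floor(size/2) centre coordinates.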
% forest_train: negative samples (train_neg.txt)
main_path = 'D:\workspace\xx\机器学习训练框架\hough_forests_db\other\';
train_path = [main_path_1, 'train_neg.txt'];
fid = fopen(train_path, 'w');
fclose(fid);
dat = dir(main_path);
% Count the usable images, again excluding '.', '..' and Thumbs.db
icount = 0;
for j = 1 : length(dat)
    if isequal(dat(j).name, '.') || isequal(dat(j).name, '..') || ...
       ~isempty(strfind(dat(j).name, 'Thumbs.db'))
        continue;
    end
    icount = icount + 1;
end
% Header line: the number of negative samples followed by a constant 1
fid = fopen(train_path, 'a+');
fprintf(fid, '%d 1\n', icount);
fclose(fid);
for j = 1 : length(dat)
    if isequal(dat(j).name, '.') || isequal(dat(j).name, '..')
        continue;
    end
    if ~isempty(strfind(dat(j).name, 'Thumbs.db'))
        continue;
    end
    datapath = fullfile(main_path, dat(j).name);
    img = imread(datapath);
    tt = size(img);
    % Negative lines carry only the bounding box, no object centre
    fid = fopen(train_path, 'a+');
    fprintf(fid, '%s 0 0 %d %d\n', dat(j).name, tt(2), tt(1));
    fclose(fid);
end
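The positive and negative sections above differ only in the target file and in whether a centre point is written, so the duplicated logic could be collapsed into one helper. Below is a minimal MATLAB sketch under that assumption; write_train_list is a hypothetical function name, not part of the original script:

function write_train_list(img_dir, out_path, with_center)
% Write a Hough-forest style training list for every image file in img_dir.
% with_center: true for positive samples (appends the image centre).
dat = dir(img_dir);
names = {dat(~[dat.isdir]).name};            % plain files only ('.'/'..' are dirs)
names = names(~strcmp(names, 'Thumbs.db'));  % drop Windows thumbnail caches
fid = fopen(out_path, 'w');
fprintf(fid, '%d 1\n', numel(names));        % header: sample count + constant 1
for k = 1 : numel(names)
    tt = size(imread(fullfile(img_dir, names{k})));
    if with_center
        fprintf(fid, '%s 0 0 %d %d %d %d\n', names{k}, tt(2), tt(1), ...
            floor(tt(1)/2), floor(tt(2)/2));
    else
        fprintf(fid, '%s 0 0 %d %d\n', names{k}, tt(2), tt(1));
    end
end
fclose(fid);
end

With this in place the two sections reduce to two calls, e.g. write_train_list(main_path, [main_path_1 'train_pos.txt'], true) for positives and write_train_list([main_path_1 'other\'], [main_path_1 'train_neg.txt'], false) for negatives.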