Creating an empty DataFrame and appending np.ndarrays one by one

This post shows how to use Python's pandas library to append multiple numpy arrays, one at a time, to an initially empty DataFrame: first create an empty DataFrame with the desired column names, then loop over the computed numpy arrays, wrap each one in a DataFrame, and append it to the initial DataFrame.

Occasionally you need to add elements one by one to an initially empty DataFrame. I ran into exactly that today, so here is how to do it.

I'll skip the import block at the top; the code below assumes pandas is imported as pd.

Create the empty DataFrame:

colname = ['a', 'b', 'c']
test = pd.DataFrame(columns=colname)

This creates an empty DataFrame named test with columns 'a', 'b', and 'c'.
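
As a quick sanity check (standard pandas behavior, added here as a hedged example rather than anything from the original post), printing the frame confirms it has the three columns and zero rows:

print(test.shape)  # (0, 3): zero rows, three columns
print(test)
# Empty DataFrame
# Columns: [a, b, c]
# Index: []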

Now suppose I have computed n np.ndarrays, each of shape (3,), whose three elements correspond to the values of a, b, and c, and I want to fill these n arrays into the DataFrame one after another. Then:

for i in range(n):
    data = ...  # whatever computation produces the shape-(3,) np.ndarray
    df_i = pd.DataFrame([data], columns=colname)  # the [] matters: it turns data into a single row
    test = test.append(df_i)  # the = matters: unlike list.append, this returns a new frame

And the rows are added. (Pass ignore_index=True to append if you want a clean 0..n-1 index instead of every row being labeled 0.)
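
One compatibility note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the loop above raises an AttributeError. Here is a minimal equivalent sketch using pd.concat instead; compute_row and n are hypothetical stand-ins for your own calculation, not part of the original post:

import numpy as np
import pandas as pd

def compute_row(i):
    # hypothetical stand-in for the real np.ndarray computation
    return np.array([i, i + 1.0, i + 2.0])

n = 5
colname = ['a', 'b', 'c']
test = pd.DataFrame(columns=colname)

for i in range(n):
    data = compute_row(i)  # shape-(3,) np.ndarray
    df_i = pd.DataFrame([data], columns=colname)
    # concat also returns a new frame, so the reassignment still matters
    test = pd.concat([test, df_i], ignore_index=True)

Also worth knowing: growing a DataFrame row by row copies the whole frame on every iteration, so for large n it is usually much faster to collect the arrays in a list and build the frame once at the end:

rows = [compute_row(i) for i in range(n)]  # list of shape-(3,) arrays
test = pd.DataFrame(np.vstack(rows), columns=colname)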
