from sklearn.metrics import confusion_matrix
import seaborn as sns
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from glob import glob
# Root directory of the image dataset.
base_dir = "D:/dataset"
# Build the dataset DataFrame; labels come from the class sub-folder name.
def create_dataframe(dataset_path):
    """Index every PNG under ``<dataset_path>/<group>/<label>/``.

    The directory that directly contains an image names its class: ``0`` for
    negative samples and ``1`` for positive samples. Any first-level folder
    name is accepted as ``<group>``.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        A ``pandas.DataFrame`` with the columns ``path`` (image file path) and
        ``label`` (``0`` or ``1``). All label-0 rows come first, then all
        label-1 rows. The frame is empty when no image matches.
    """
    records = []
    for label in (0, 1):
        pattern = os.path.join(dataset_path, "*", str(label), "*.png")
        # glob() returns matches in arbitrary order; sorting keeps the row
        # order (and therefore any seeded split) stable across runs.
        for img_path in sorted(glob(pattern)):
            # glob matching is case-insensitive on Windows; keep the strict
            # lowercase suffix filter so the accepted file set is unchanged.
            if img_path.endswith('.png'):
                records.append([img_path, label])
    return pd.DataFrame(records, columns=['path', 'label'])
# Build the dataset DataFrame once (a second identical scan of the same
# directory was redundant) and print a short summary.
df = create_dataframe(base_dir)
print("总样本数:", len(df))
print(df['label'].value_counts())
print(df.head())

# Check the class distribution.
print(f"阴性样本数(0): {len(df[df['label'] == 0])}")
print(f"阳性样本数(1): {len(df[df['label'] == 1])}")
# Split into training and test sets (80% / 20%), stratified on the label so
# both subsets keep the same class ratio; the fixed seed makes it repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)
# Custom data generator: loads images straight from their file paths.
class CustomDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves image batches listed in a DataFrame.

    Every batch is a pair ``(images, labels)``. Each image is loaded from
    disk, resized to ``img_size``, scaled by 1/255 and, when ``augment`` is
    set, passed through one random augmentation.

    Args:
        df: DataFrame with a ``path`` column (image file paths) and a
            ``label`` column.
        batch_size: Number of samples per batch; the last batch may be smaller.
        img_size: Size forwarded to ``load_img(target_size=...)``.
        shuffle: Shuffle the sample order at construction and whenever
            ``on_epoch_end`` is called.
        augment: Apply random augmentation to every loaded image.
    """

    def __init__(self, df, batch_size=32, img_size=(50, 50), shuffle=True, augment=False):
        # Newer Keras releases expect the base initializer to run; it is a
        # harmless no-op on older ones.
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment

        # Augmentation pipeline, only built when augmentation is requested.
        self.augmenter = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest',
        ) if augment else None

        # Populate self.paths / self.labels (shuffled if requested).
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (last one may be partial)."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        """Load and return batch ``index`` as ``(images, labels)`` arrays."""
        start = index * self.batch_size
        stop = start + self.batch_size
        batch_paths = self.paths[start:stop]
        batch_labels = self.labels[start:stop]

        batch_images = []
        for path in batch_paths:
            img = load_img(path, target_size=self.img_size)
            img_array = img_to_array(img) / 255.0  # scale pixel values by 1/255
            if self.augment and self.augmenter is not None:
                img_array = self.augmenter.random_transform(img_array)
            batch_images.append(img_array)

        return np.array(batch_images), np.array(batch_labels)

    def on_epoch_end(self):
        """Reset the path/label arrays from ``df`` and reshuffle if enabled."""
        self.paths = self.df['path'].values
        self.labels = self.df['label'].values
        if self.shuffle:
            # One shared permutation keeps each path aligned with its label.
            indices = np.arange(len(self.paths))
            np.random.shuffle(indices)
            self.paths = self.paths[indices]
            self.labels = self.labels[indices]
# Image size (following the Kaggle dataset) and batch size.
img_width, img_height = 50, 50
batch_size = 32
# Create the data generators.
# NOTE(review): img_size is forwarded to load_img(target_size=...), which Keras
# documents as (height, width); width and height are equal here, so the
# (img_width, img_height) order has no effect.
train_generator = CustomDataGenerator(
    train_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    augment=True,  # augment the training set only
)
test_generator = CustomDataGenerator(
    test_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    shuffle=False,  # keep test samples in DataFrame order
)
# Build the CNN: three conv/max-pool stages followed by a dense classifier
# head with a single sigmoid output for binary classification.
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
# Compile the model. The metric order below fixes the order of the values
# returned by model.evaluate() after the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
        tf.keras.metrics.AUC(name='auc'),
    ],
)
# Early-stopping callback: stop after 5 epochs without val_loss improvement
# and roll back to the best weights seen.
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model.
# NOTE(review): the test generator doubles as the validation set, so early
# stopping is tuned on the same data that is later reported as test results.
history = model.fit(
    train_generator,
    epochs=30,
    validation_data=test_generator,
    callbacks=[early_stop],
)
# Evaluate on the test set. test_results is [loss, accuracy, precision,
# recall, auc], following the metric order passed to model.compile().
test_results = model.evaluate(test_generator)
print(
    f"测试集准确率: {test_results[1]:.4f}, 精确率: {test_results[2]:.4f}, "
    f"召回率: {test_results[3]:.4f}, AUC: {test_results[4]:.4f}"
)

# Save the trained model.
model.save('breast_cancer_cnn.h5')
# Plot the training history: accuracy on the left, loss on the right.
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='训练准确率')
plt.plot(history.history['val_accuracy'], label='验证准确率')
plt.title('模型准确率')
plt.ylabel('准确率')
plt.xlabel('轮次')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='训练损失')
plt.plot(history.history['val_loss'], label='验证损失')
plt.title('模型损失')
plt.ylabel('损失')
plt.xlabel('轮次')
plt.legend()

# Save before show(): the figure may be cleared once the window is closed.
plt.savefig('training_history.png')
plt.show()
# True labels of the test set, in the order the generator serves them.
# Reading them from the generator avoids decoding every test image a second
# time just to collect labels; the order is stable because the test
# generator is created with shuffle=False.
test_labels = np.asarray(test_generator.labels)

# Model predictions (probabilities).
pred_probs = model.predict(test_generator)

# Convert to binary class labels (threshold 0.5).
pred_labels = (pred_probs > 0.5).astype(int).flatten()

# Confusion matrix; labels=[0, 1] pins a 2x2 shape even if one class is
# missing from the labels or the predictions.
cm = confusion_matrix(test_labels, pred_labels, labels=[0, 1])
# Plot and save the confusion-matrix heatmap.
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['IDC(-)', 'IDC(+)'],
    yticklabels=['IDC(-)', 'IDC(+)'],
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()
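# NOTE(review): "最新发布" ("Latest posts") is residue from the web page this script was copied from; it is not part of the program.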