import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from glob import glob
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix
from scikeras.wrappers import KerasClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import EarlyStopping
# Dataset root directory
base_dir = "D:/dataset"
# Build a DataFrame of (image path, label) rows; the label comes from each image's parent directory
def create_dataframe(dataset_path):
    data = []
    # Negative patches live under .../<patient_id>/0/, positive ones under .../<patient_id>/1/,
    # so the directory name itself supplies the label (0 = IDC negative, 1 = IDC positive)
    for img_file in glob(dataset_path + '/*/0/*.png'):
        data.append([img_file, 0])
    for img_file in glob(dataset_path + '/*/1/*.png'):
        data.append([img_file, 1])
    return pd.DataFrame(data, columns=['path', 'label'])
# Build the dataset DataFrame
df = create_dataframe(base_dir)
print("总样本数:", len(df))
print(df['label'].value_counts())
print(df.head())
# 检查数据集分布
print(f"阴性样本数(0): {len(df[df['label'] == 0])}")
print(f"阳性样本数(1): {len(df[df['label'] == 1])}")
# 划分训练集和测试集 (80%训练, 20%测试)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
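# Optional sketch, not part of the original pipeline: IDC patch datasets are often
# imbalanced, and if this one is, inverse-frequency class weights can be passed to
# model.fit via its class_weight argument. compute_class_weight('balanced', ...)
# is sklearn's standard heuristic for this.
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=train_df['label'].values)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}  # e.g. model.fit(..., class_weight=class_weight_dict)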
# Custom data generator that loads images directly from file paths
class CustomDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, df, batch_size=32, img_size=(50, 50), shuffle=True, augment=False):
        super().__init__()
        self.df = df
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.augment = augment
        self.on_epoch_end()
        # Augmentation pipeline (used for the training set only)
        self.augmenter = ImageDataGenerator(
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            shear_range=0.2,
            zoom_range=0.2,
            horizontal_flip=True,
            fill_mode='nearest'
        ) if augment else None

    def __len__(self):
        return int(np.ceil(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        batch_paths = self.paths[index * self.batch_size:(index + 1) * self.batch_size]
        batch_labels = self.labels[index * self.batch_size:(index + 1) * self.batch_size]
        batch_images = []
        for path in batch_paths:
            img = load_img(path, target_size=self.img_size)
            img_array = img_to_array(img) / 255.0  # scale pixel values to [0, 1]
            if self.augment and self.augmenter:
                # Apply a random augmentation to this image
                img_array = self.augmenter.random_transform(img_array)
            batch_images.append(img_array)
        return np.array(batch_images), np.array(batch_labels)

    def on_epoch_end(self):
        # Re-read paths/labels and reshuffle between epochs
        self.paths = self.df['path'].values
        self.labels = self.df['label'].values
        if self.shuffle:
            indices = np.arange(len(self.paths))
            np.random.shuffle(indices)
            self.paths = self.paths[indices]
            self.labels = self.labels[indices]
# Image dimensions (the Kaggle IDC dataset uses 50x50 patches)
img_width, img_height = 50, 50
batch_size = 32
# Create the data generators
train_generator = CustomDataGenerator(
    train_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    augment=True  # augment the training set only
)
test_generator = CustomDataGenerator(
    test_df,
    batch_size=batch_size,
    img_size=(img_width, img_height),
    shuffle=False  # keep the test order fixed for evaluation
)
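# Optional sanity check, added here for illustration: pull one batch and confirm its
# shape matches (batch_size, img_height, img_width, 3) before starting a long training run.
sample_images, sample_labels = train_generator[0]
print("Batch images:", sample_images.shape, "Batch labels:", sample_labels.shape)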
# Build the CNN model
model = Sequential([
Conv2D(32, (3, 3), activation='relu', input_shape=(img_width, img_height, 3)),
MaxPooling2D((2, 2)),
Conv2D(64, (3, 3), activation='relu'),
MaxPooling2D((2, 2)),
Conv2D(128, (3, 3), activation='relu'),
MaxPooling2D((2, 2)),
Flatten(),
Dense(256, activation='relu'),
Dropout(0.5),
Dense(1, activation='sigmoid')
])
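# Optional: print the layer-by-layer architecture and parameter counts.
model.summary()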
# Compile the model
model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy',
tf.keras.metrics.Precision(name='precision'),
tf.keras.metrics.Recall(name='recall'),
tf.keras.metrics.AUC(name='auc')])
# Early-stopping callback: stop when validation loss stops improving
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Train the model
history = model.fit(
train_generator,
epochs=30,
validation_data=test_generator,
callbacks=[early_stop]
)
# Evaluate on the test set
test_results = model.evaluate(test_generator)
print(
    f"Test accuracy: {test_results[1]:.4f}, precision: {test_results[2]:.4f}, "
    f"recall: {test_results[3]:.4f}, AUC: {test_results[4]:.4f}")
# Save the trained model
model.save('breast_cancer_cnn.h5')
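# Illustrative sketch, not in the original script: reloading the saved model and
# scoring a single patch. 'some_patch.png' is a hypothetical path; substitute a real file.
# reloaded = tf.keras.models.load_model('breast_cancer_cnn.h5')
# patch = img_to_array(load_img('some_patch.png', target_size=(img_width, img_height))) / 255.0
# prob = reloaded.predict(np.expand_dims(patch, axis=0))[0, 0]  # sigmoid probability of IDC(+)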
# Plot the training history
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train accuracy')
plt.plot(history.history['val_accuracy'], label='Validation accuracy')
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.savefig('training_history.png')
plt.show()
# Gather the true test labels, batch by batch, from the (unshuffled) test generator
test_labels = []
for i in range(len(test_generator)):
    _, labels = test_generator[i]
    test_labels.extend(labels)
test_labels = np.array(test_labels)
# Predict probabilities on the test set
pred_probs = model.predict(test_generator)
# Convert to binary labels (threshold 0.5)
pred_labels = (pred_probs > 0.5).astype(int).flatten()
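# Optional, added for completeness: a per-class precision/recall/F1 summary via
# sklearn's classification_report, using the same labels and threshold as above.
from sklearn.metrics import classification_report
print(classification_report(test_labels, pred_labels, target_names=['IDC(-)', 'IDC(+)']))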
# Compute the confusion matrix
cm = confusion_matrix(test_labels, pred_labels)
# Plot and save the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=['IDC(-)', 'IDC(+)'],
yticklabels=['IDC(-)', 'IDC(+)'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.savefig('confusion_matrix.png')
plt.show()
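# Optional sketch, not in the original script: plot the ROC curve from the predicted
# probabilities to complement the scalar AUC metric tracked during training.
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(test_labels, pred_probs.ravel())
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f'ROC (AUC = {auc(fpr, tpr):.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.savefig('roc_curve.png')
plt.show()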
# 1. Model-building function (wrapped by KerasClassifier below)
def create_model(optimizer='adam', filters=32, kernel_size=3, dense_units=128):
    model = Sequential([
        Conv2D(filters, kernel_size, activation='relu', input_shape=(28, 28, 1)),
        MaxPooling2D(2),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dense(10, activation='softmax')
    ])
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

# 2. Wrap the Keras model as a scikit-learn compatible estimator
# (scikeras takes the builder via `model=`; the old `build_fn` name belongs to the
# deprecated tf.keras.wrappers.scikit_learn API)
model = KerasClassifier(model=create_model, verbose=0)
# 3. Hyperparameter space for the random search
# (scikeras routes arguments of the model-building function via the `model__` prefix)
param_dist = {
    'model__optimizer': ['adam', 'rmsprop', 'sgd'],
    'model__filters': [16, 32, 64],
    'model__kernel_size': [3, 5],
    'model__dense_units': [64, 128, 256],
    'epochs': [5, 10],       # training epochs
    'batch_size': [32, 64]   # batch size
}
# 4. Configure the random search
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=10,          # number of sampled parameter combinations
    cv=3,               # cross-validation folds
    scoring='accuracy',
    n_jobs=-1           # use all CPU cores
)
# 5. Load the dataset (MNIST as an example)
from tensorflow.keras.datasets import mnist
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255
# 6. Run the random search
random_search.fit(X_train, y_train)
# 7. Report the best parameters and cross-validated score
print(f"Best parameters: {random_search.best_params_}")
print(f"Best CV accuracy: {random_search.best_score_:.4f}")
# 8. Evaluate the best model on the test set
best_model = random_search.best_estimator_
test_acc = best_model.score(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")