import os
# 在代码最开头添加以下环境变量(放在所有import之前)
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # 明确指定GPU
os.environ["FLAGS_cudnn_deterministic"] = "1" # 固定cuDNN行为
os.environ["FLAGS_conv_workspace_size_limit"] = "1024" # 限制cuDNN工作空间
import paddle
import paddle.nn as nn
from paddle.vision import transforms, datasets
from paddle.io import DataLoader
import numpy as np
import os
import cv2
from datetime import datetime
from collections import Counter
import warnings
import sys
warnings.filterwarnings('ignore')
# The seven emotion class names, in alphabetical order. The list index is
# expected to match the integer label paddle's DatasetFolder assigns (it maps
# sorted subdirectory names to 0..N-1) — the script prints class_to_idx at
# startup so this assumption can be confirmed against the actual data layout.
emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
class EmotionCNN(nn.Layer):
    """Convolutional classifier for 7-way emotion recognition.

    Expects 3x128x128 inputs. Four conv stages (conv3x3 -> BN -> ReLU ->
    maxpool2) halve the spatial size each time: 128 -> 64 -> 32 -> 16 -> 8,
    ending at 512 channels. A two-layer head with dropout produces the logits.
    """

    def __init__(self, num_classes=7):
        super(EmotionCNN, self).__init__()

        def conv_stage(in_ch, out_ch):
            # One feature-extraction stage: conv -> batchnorm -> relu -> pool.
            return [
                nn.Conv2D(in_ch, out_ch, kernel_size=3, padding=1),
                nn.BatchNorm2D(out_ch),
                nn.ReLU(),
                nn.MaxPool2D(2),
            ]

        # Channel progression 3 -> 64 -> 128 -> 256 -> 512, one stage per step.
        widths = [3, 64, 128, 256, 512]
        stages = []
        for narrow, wide in zip(widths[:-1], widths[1:]):
            stages.extend(conv_stage(narrow, wide))
        self.features = nn.Sequential(*stages)

        # After four 2x2 pools a 128x128 input is 8x8, hence 512*8*8 features.
        self.classifier = nn.Sequential(
            nn.Linear(512 * 8 * 8, 1024),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(1024, num_classes),
        )

    def forward(self, x):
        feature_maps = self.features(x)
        flattened = paddle.flatten(feature_maps, 1)
        return self.classifier(flattened)
def image_loader(path):
    """Read an image file and return it as a 128x128 RGB uint8 array.

    Any read/convert failure is logged and replaced with an all-zero image,
    so one corrupt file cannot abort a whole training run.
    """
    try:
        raw = cv2.imread(path)
        if raw is None:
            raise ValueError(f"无法读取图像: {path}")
        rgb = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB)
        # Force every image to the same size the network expects.
        return cv2.resize(rgb, (128, 128))
    except Exception as e:
        print(f"加载图像错误: {path}, 错误: {str(e)}")
        return np.zeros((128, 128, 3), dtype=np.uint8)
def get_class_distribution(dataset):
    """Return a Counter mapping each label in ``dataset.samples`` to its count."""
    return Counter(label for _, label in dataset.samples)
def check_data_consistency(dataset):
    """Return the set of distinct image shapes across the whole dataset.

    A single-element result means all images load to the same shape. Note this
    re-reads every file through ``image_loader``, so it is O(dataset) in disk I/O.
    """
    return {image_loader(path).shape for path, _ in dataset.samples}
def _build_transforms():
    """Return (train_transform, eval_transform).

    Random augmentation belongs only in the training pipeline; the original
    code also flipped/rotated/jittered validation images, which makes the
    reported validation accuracy noisy and non-reproducible.
    """
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    train_tf = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.2),
        transforms.ToTensor(),
        normalize,
    ])
    eval_tf = transforms.Compose([
        transforms.Resize((128, 128)),
        transforms.ToTensor(),
        normalize,
    ])
    return train_tf, eval_tf


def _classes_match_emotions(dataset, split_name):
    """Fail-fast check that the split's class directories are exactly `emotions`.

    DatasetFolder derives labels from the sorted subdirectories it finds. Any
    extra folder (a hidden dir, a backup dir, an 8th emotion...) extends the
    label range — this is the root cause of batches arriving with label 7
    while the model only has classes 0..6.
    """
    found = sorted(dataset.class_to_idx.keys())
    expected = sorted(emotions)
    if found != expected:
        print(f"错误: {split_name} 目录下的类别与预期不符!")
        print(f"期望的类别目录: {expected}")
        print(f"实际发现的目录: {found}")
        print("请删除多余的子目录(包括隐藏目录)或补齐缺失的类别目录后重试。")
        return False
    return True


def _make_collate_fn():
    """Build a collate_fn that drops samples with missing data or bad labels."""
    def collate_fn(batch):
        valid = [(x, y) for x, y in batch
                 if x is not None and 0 <= y < len(emotions)]
        if not valid:
            # Signal "nothing usable" to the training loop.
            return None, None
        images = paddle.stack([paddle.to_tensor(x) for x, _ in valid])
        labels = paddle.to_tensor([y for _, y in valid], dtype='int64')
        return images, labels
    return collate_fn


def _evaluate(model, val_loader, criterion):
    """Run one validation pass.

    Returns (accuracy_percent, avg_loss, n_samples); n_samples == 0 means no
    usable validation batch was produced.
    """
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0.0
    batches = 0
    with paddle.no_grad():
        for inputs, labels in val_loader:
            if inputs is None or labels is None:
                continue
            if inputs.shape[1] != 3 or inputs.shape[2] != 128 or inputs.shape[3] != 128:
                continue
            labels = paddle.cast(labels, 'int64')
            outputs = model(inputs)
            loss_sum += criterion(outputs, labels).item()
            batches += 1
            predicted = paddle.argmax(outputs, axis=1)
            total += labels.shape[0]
            correct += paddle.sum(predicted == labels).item()
    if total == 0:
        return 0.0, 0.0, 0
    return 100. * correct / total, loss_sum / batches, total


def train_model(data_dir, model_save_path):
    """Train EmotionCNN on <data_dir>/train and validate on <data_dir>/val.

    Saves the best checkpoint (by validation accuracy) to
    `<model_save_path>.pdparams` / `.pdopt`, and stops early after `patience`
    epochs without improvement. Returns None; all progress is printed.

    Fixes vs. the previous version:
    - validates the dataset's class directories against `emotions` up front
      (root cause of the "invalid label value 7" skipped batches);
    - selects the device BEFORE building the model;
    - label range check uses len(emotions) instead of a hard-coded 7;
    - epoch loss is accumulated separately from the 10-batch progress window
      (the old report divided a partially-reset accumulator by len(loader));
    - the try/except is scoped to a single batch, so one failure no longer
      abandons an entire epoch's metrics;
    - guards against ZeroDivisionError when every batch is skipped;
    - dropped the redundant full-dataset size scan (image_loader already
      resizes everything to 128x128).
    """
    start_time = datetime.now()
    print(f"训练开始时间: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Hyper-parameters.
    num_epochs = 20
    batch_size = 32
    learning_rate = 0.001
    weight_decay = 1e-4
    patience = 3

    train_tf, eval_tf = _build_transforms()

    # Load both splits; abort cleanly on any dataset error.
    try:
        train_dataset = datasets.DatasetFolder(
            os.path.join(data_dir, 'train'),
            loader=image_loader,
            extensions=['.jpg', '.jpeg', '.png'],
            transform=train_tf
        )
        val_dataset = datasets.DatasetFolder(
            os.path.join(data_dir, 'val'),
            loader=image_loader,
            extensions=['.jpg', '.jpeg', '.png'],
            transform=eval_tf
        )
    except Exception as e:
        print(f"加载数据集时出错: {str(e)}")
        return

    print("\n类别到标签的映射:")
    print(train_dataset.class_to_idx)

    # Fail fast on the directory-layout problem that produces out-of-range labels.
    if not _classes_match_emotions(train_dataset, 'train'):
        return
    if not _classes_match_emotions(val_dataset, 'val'):
        return

    # Belt-and-braces: verify the label range actually stored in the samples.
    all_labels = [label for _, label in train_dataset.samples + val_dataset.samples]
    print(f"标签最小值: {min(all_labels)}, 最大值: {max(all_labels)}")
    invalid_labels = [l for l in all_labels if l < 0 or l >= len(emotions)]
    if invalid_labels:
        print(f"发现无效标签: {set(invalid_labels)}")
        print("请检查数据集目录结构,确保只有以下目录:")
        print(emotions)
        return

    # Per-class sample counts.
    train_counts = get_class_distribution(train_dataset)
    val_counts = get_class_distribution(val_dataset)
    print("\n数据分布统计:")
    for i, emotion in enumerate(emotions):
        print(f"{emotion}: 训练集 {train_counts[i]}张, 验证集 {val_counts[i]}张")

    collate_fn = _make_collate_fn()
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        collate_fn=collate_fn
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        num_workers=0,
        collate_fn=collate_fn
    )

    # Select the device BEFORE creating the model so its parameters are
    # allocated on that device from the start.
    use_gpu = paddle.is_compiled_with_cuda()
    if use_gpu:
        paddle.set_device('gpu')
        print("模型将在GPU上运行")
    else:
        paddle.set_device('cpu')
        print("模型将在CPU上运行")

    model = EmotionCNN(num_classes=len(emotions))

    # Smoke-test one forward pass before committing to the full run.
    try:
        test_input = paddle.randn([1, 3, 128, 128])
        output = model(test_input)
        print(f"\nGPU测试前向传播成功! 输出形状: {output.shape}, 设备: {output.place}")
    except Exception as e:
        print(f"\nGPU测试前向传播失败: {str(e)}")
        print("CUDA是否可用:", paddle.is_compiled_with_cuda())
        print("当前设备:", paddle.get_device())
        return

    print("\n模型结构:")
    print(model)

    criterion = nn.CrossEntropyLoss()
    optimizer = paddle.optimizer.Adam(
        learning_rate=learning_rate,
        parameters=model.parameters(),
        weight_decay=weight_decay
    )

    best_val_acc = 0.0
    no_improve = 0
    print(f"\n训练数据量: {len(train_dataset)}, 验证数据量: {len(val_dataset)}")
    print(f"训练参数: 学习率={learning_rate}, 批量大小={batch_size}, 训练轮次={num_epochs}")
    print("开始训练...\n")

    for epoch in range(num_epochs):
        model.train()
        window_loss = 0.0  # loss over the last <=10 batches, for progress prints
        epoch_loss = 0.0   # loss over the whole epoch, for the epoch summary
        epoch_batches = 0
        correct = 0
        total = 0

        for batch_idx, (inputs, labels) in enumerate(train_loader):
            if inputs is None or labels is None:
                print("跳过无效批次")
                continue
            if inputs.shape[1] != 3 or inputs.shape[2] != 128 or inputs.shape[3] != 128:
                print(f"无效的输入形状: {inputs.shape}, 跳过此batch")
                continue
            # len(emotions) instead of the old hard-coded 7.
            if paddle.any(labels < 0) or paddle.any(labels >= len(emotions)):
                print(f"无效的标签值: {labels}, 跳过此batch")
                continue

            try:
                labels = paddle.cast(labels, 'int64')
                optimizer.clear_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            except Exception as e:
                # Scoped to one batch: a single failure no longer discards the
                # whole epoch (the old try wrapped the entire batch loop).
                print(f"训练过程中发生错误: {str(e)}")
                if use_gpu:
                    print("尝试清理CUDA缓存并继续...")
                    paddle.device.cuda.empty_cache()
                continue

            batch_loss = loss.item()
            window_loss += batch_loss
            epoch_loss += batch_loss
            epoch_batches += 1
            predicted = paddle.argmax(outputs, axis=1)
            total += labels.shape[0]
            correct += paddle.sum(predicted == labels).item()

            if batch_idx % 10 == 9:
                print(f'Epoch: {epoch+1}/{num_epochs}, Batch: {batch_idx+1}, '
                      f'Loss: {window_loss / 10:.4f}, Acc: {100. * correct / total:.2f}%')
                window_loss = 0.0

        if total == 0:
            # Every batch was skipped: avoid ZeroDivisionError and stop.
            print("本轮没有有效训练数据, 停止训练")
            break

        train_acc = 100. * correct / total
        train_loss = epoch_loss / epoch_batches
        print(f'Epoch {epoch+1} 训练准确率: {train_acc:.2f}%, 训练损失: {train_loss:.4f}')

        val_acc, avg_val_loss, val_total = _evaluate(model, val_loader, criterion)
        if val_total == 0:
            print("验证集无有效数据!")
            break
        print(f'Epoch {epoch+1} 验证准确率: {val_acc:.2f}%, 验证损失: {avg_val_loss:.4f}')

        # Checkpoint on improvement; early-stop after `patience` stagnant epochs.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            no_improve = 0
            paddle.save(model.state_dict(), model_save_path + '.pdparams')
            paddle.save(optimizer.state_dict(), model_save_path + '.pdopt')
            print(f'模型已保存,验证准确率: {best_val_acc:.2f}%')
        else:
            no_improve += 1
            if no_improve >= patience:
                print(f"验证集准确率{patience}轮未提升,提前停止训练")
                break

    end_time = datetime.now()
    print(f"\n训练完成! 总耗时: {end_time - start_time}")
    print(f"最佳验证准确率: {best_val_acc:.2f}%")
if __name__ == '__main__':
    # Expected layout: data/{train,val}/<emotion>/*.jpg|jpeg|png
    data_dir = 'data'
    model_save_path = 'emotion_model'

    os.makedirs('models', exist_ok=True)

    # Verify both splits exist; for each split also check that every emotion
    # subdirectory is present. (The check now runs per split — previously it
    # sat after the loop and only inspected the last dir_path, i.e. 'val'.)
    for dir_name in ['train', 'val']:
        dir_path = os.path.join(data_dir, dir_name)
        if not os.path.exists(dir_path):
            print(f"错误: 目录不存在 - {dir_path}")
            sys.exit(1)  # sys.exit instead of the builtin exit() in scripts
        for emotion in emotions:
            emotion_path = os.path.join(dir_path, emotion)
            if not os.path.exists(emotion_path):
                print(f"警告: 情绪目录不存在 - {emotion_path}")

    # Pick the compute device once, up front.
    if paddle.is_compiled_with_cuda():
        paddle.set_device('gpu')
        print("使用GPU进行训练")
    else:
        paddle.set_device('cpu')
        print("使用CPU进行训练")
    train_model(data_dir, model_save_path)

# NOTE(review): a question was appended here reporting that training skipped
# batches with "invalid label value" tensors full of 7s. Valid labels are 0..6
# for the seven emotions, so a label of 7 means DatasetFolder discovered an
# 8th subdirectory under data/train (it assigns labels from the sorted list of
# subfolders it finds — any stray, hidden, or backup directory counts).
# Remedy: remove the extra directory so each split contains exactly the seven
# emotion folders, and consider validating dataset.class_to_idx against
# `emotions` before training so the mismatch fails fast with a clear message.