import os
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models, utils, callbacks
from tensorflow.keras.regularizers import l2
# Font configuration (SimHei renders CJK glyphs; optional if all labels are English)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# ==============================================
# Path configuration
# ==============================================
DATASET_PATH = "E:/genres"
TEST_AUDIO_PATH = "D:/218.wav"
# ==============================================
# 1. Enhanced feature extraction
# ==============================================
def extract_features(file_path, max_pad_len=174, augment=False):
    try:
        # Fixed sample rate and duration
        audio, sample_rate = librosa.load(file_path, sr=22050, duration=30)
        # Richer augmentation: randomly combine several transforms
        if augment:
            if np.random.random() > 0.5:
                # Pitch shift
                n_steps = np.random.uniform(-2.0, 2.0)
                audio = librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=n_steps)
            if np.random.random() > 0.5:
                # Time stretch
                rate = np.random.uniform(0.8, 1.2)
                audio = librosa.effects.time_stretch(audio, rate=rate)
                # Restore the original 30 s length
                if len(audio) > sample_rate * 30:
                    audio = audio[:sample_rate * 30]
                else:
                    audio = np.pad(audio, (0, sample_rate * 30 - len(audio)))
            if np.random.random() > 0.5:
                # Additive noise scaled to the signal level
                noise = np.random.normal(0, 0.005 * np.std(audio), len(audio))
                audio = audio + noise
            if np.random.random() > 0.5:
                # Random 25 s crop, padded back to 30 s
                start = np.random.randint(0, max(1, len(audio) - sample_rate * 25))
                audio = audio[start:start + sample_rate * 25]
                audio = np.pad(audio, (0, sample_rate * 30 - len(audio)))
        # Extract MFCCs (note: 174 frames at hop 512 cover only about 4 s of audio)
        mfccs = librosa.feature.mfcc(
            y=audio,
            sr=sample_rate,
            n_mfcc=40,
            n_fft=2048,
            hop_length=512
        )
        # Unify feature length
        if mfccs.shape[1] < max_pad_len:
            # Reflection padding looks more natural than edge padding
            pad_width = max_pad_len - mfccs.shape[1]
            mfccs = np.pad(mfccs, ((0, 0), (0, pad_width)), mode='reflect')
        else:
            # Random crop (training) or fixed head crop (testing)
            if augment and mfccs.shape[1] > max_pad_len:
                start = np.random.randint(0, mfccs.shape[1] - max_pad_len)
                mfccs = mfccs[:, start:start + max_pad_len]
            else:
                mfccs = mfccs[:, :max_pad_len]
        # Robust per-coefficient standardization
        mean = np.mean(mfccs, axis=1, keepdims=True)
        std = np.std(mfccs, axis=1, keepdims=True)
        mfccs = (mfccs - mean) / (std + 1e-8)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None
    return mfccs
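# Quick usage sketch (hypothetical path; GTZAN clips are named like blues.00000.wav):
#   feats = extract_features("E:/genres/blues/blues.00000.wav")
#   print(feats.shape)  # expected: (40, 174)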
# ==============================================
# 2. Dataset loading (split before augmentation, with class distribution report)
# ==============================================
def load_dataset(dataset_path, augment_train=False):
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
              'jazz', 'metal', 'pop', 'reggae', 'rock']
    file_paths, file_labels = [], []
    # Count samples per class
    class_counts = {genre: 0 for genre in genres}
    for genre_idx, genre in enumerate(genres):
        genre_path = os.path.join(dataset_path, genre)
        if not os.path.exists(genre_path):
            continue
        print(f"Scanning: {genre}")
        for audio_file in os.listdir(genre_path):
            if audio_file.endswith('.wav'):
                file_paths.append(os.path.join(genre_path, audio_file))
                file_labels.append(genre_idx)
                class_counts[genre] += 1
    # Report the class distribution
    print("\nClass distribution:")
    for genre, count in class_counts.items():
        print(f"{genre}: {count} samples")
    # Stratified split at the FILE level, before augmentation, so augmented
    # copies of a clip can never leak from the training set into the test set
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(sss.split(file_paths, file_labels))
    X_train, y_train, X_test, y_test = [], [], [], []
    for i in train_idx:
        mfccs = extract_features(file_paths[i])
        if mfccs is None:
            continue
        X_train.append(mfccs)
        y_train.append(file_labels[i])
        # Augment training files only (1-3 extra copies each)
        if augment_train:
            for _ in range(np.random.randint(1, 4)):
                mfccs_aug = extract_features(file_paths[i], augment=True)
                if mfccs_aug is not None:
                    X_train.append(mfccs_aug)
                    y_train.append(file_labels[i])
    for i in test_idx:
        mfccs = extract_features(file_paths[i])
        if mfccs is not None:
            X_test.append(mfccs)
            y_test.append(file_labels[i])
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
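# Optional sketch (not in the original code): if the class counts printed above
# are uneven, class weights can be passed to model.fit to counter the imbalance.
#   from sklearn.utils.class_weight import compute_class_weight
#   weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
#   model.fit(..., class_weight=dict(enumerate(weights)))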
# ==============================================
# 3. Improved model architecture (stronger regularization)
# ==============================================
def build_and_compile_model(input_shape):
    model = models.Sequential([
        layers.InputLayer(input_shape=input_shape),
        # Conv block 1
        layers.Conv2D(32, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.0005)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.SpatialDropout2D(0.2),  # spatial dropout suits conv feature maps
        # Conv block 2
        layers.Conv2D(64, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.0005)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.SpatialDropout2D(0.3),
        # Conv block 3
        layers.Conv2D(128, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.0005)),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.SpatialDropout2D(0.4),
        # Conv block 4 (extra depth)
        layers.Conv2D(256, (3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.0005)),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
        # Dense head
        layers.Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        # Output layer (label smoothing is applied in the loss below)
        layers.Dense(10, activation='softmax')
    ])
    # Optimizer: low learning rate plus gradient clipping
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=0.0003,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        clipnorm=1.0  # gradient clipping guards against exploding gradients
    )
    # Cross-entropy with label smoothing
    loss = tf.keras.losses.CategoricalCrossentropy(label_smoothing=0.1)
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy']
    )
    return model
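# Example call (matches main below; features are 40 MFCCs x 174 frames):
#   model = build_and_compile_model((40, 174, 1))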
# ==============================================
# 4. Training and evaluation (with a validation split)
# ==============================================
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    # Carve a validation set out of the training set. Note: X_train already
    # mixes original and augmented clips, so augmented siblings of a training
    # clip may land in validation; for a stricter protocol, split before
    # augmenting, as load_dataset does for the test set.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
    train_idx, val_idx = next(sss.split(X_train, y_train))
    X_train_split = X_train[train_idx]
    y_train_split = y_train[train_idx]
    X_val = X_train[val_idx]
    y_val = y_train[val_idx]
    print(f"Train: {len(X_train_split)}, Val: {len(X_val)}, Test: {len(X_test)}")
    # Add a channel dimension
    X_train_split = X_train_split[..., np.newaxis]
    X_val = X_val[..., np.newaxis]
    X_test = X_test[..., np.newaxis]
    # One-hot encode the labels
    y_train_split = utils.to_categorical(y_train_split, 10)
    y_val = utils.to_categorical(y_val, 10)
    y_test = utils.to_categorical(y_test, 10)
    # Callbacks
    callbacks_list = [
        # Early stopping on validation accuracy
        callbacks.EarlyStopping(
            monitor='val_accuracy',
            patience=15,
            restore_best_weights=True,
            verbose=1
        ),
        # Learning-rate decay on plateau
        callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=1
        ),
        # Checkpoint the best model
        callbacks.ModelCheckpoint(
            'best_model.keras',
            monitor='val_accuracy',
            save_best_only=True,
            mode='max',
            verbose=1
        ),
        # Log the training history
        callbacks.CSVLogger('training_history.csv')
    ]
    # Train the model
    history = model.fit(
        X_train_split, y_train_split,
        validation_data=(X_val, y_val),
        epochs=150,  # generous budget; early stopping usually ends training sooner
        batch_size=16,  # smaller batches tend to generalize better
        callbacks=callbacks_list,
        verbose=1
    )
    # Reload the best checkpoint
    model = tf.keras.models.load_model('best_model.keras')
    # Evaluate
    train_loss, train_acc = model.evaluate(X_train_split, y_train_split, verbose=0)
    val_loss, val_acc = model.evaluate(X_val, y_val, verbose=0)
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    print("\n=== Evaluation ===")
    print(f"Train - accuracy: {train_acc:.4f}, loss: {train_loss:.4f}")
    print(f"Val   - accuracy: {val_acc:.4f}, loss: {val_loss:.4f}")
    print(f"Test  - accuracy: {test_acc:.4f}, loss: {test_loss:.4f}")
    # Plot training curves
    plot_training_history(history)
    # Plot the confusion matrix
    plot_confusion_matrix(model, X_test, y_test)
    return model, history
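# Reuse sketch: the best checkpoint saved above can be reloaded later without
# retraining:
#   model = tf.keras.models.load_model('best_model.keras')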
# ==============================================
# 5. Visualization helpers
# ==============================================
def plot_training_history(history):
    plt.figure(figsize=(12, 5))
    # Accuracy curves
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='train accuracy')
    plt.plot(history.history['val_accuracy'], label='val accuracy')
    plt.title('Model accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    # Loss curves
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='train loss')
    plt.plot(history.history['val_loss'], label='val loss')
    plt.title('Model loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(model, X_test, y_test):
    genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
              'jazz', 'metal', 'pop', 'reggae', 'rock']
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_true = np.argmax(y_test, axis=1)
    # Per-class precision/recall/F1 (uses the imported classification_report)
    print(classification_report(y_true, y_pred_classes, target_names=genres))
    cm = confusion_matrix(y_true, y_pred_classes)
    plt.figure(figsize=(10, 8))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix')
    plt.colorbar()
    tick_marks = np.arange(len(genres))
    plt.xticks(tick_marks, genres, rotation=45)
    plt.yticks(tick_marks, genres)
    # Annotate each cell with its count
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
# ==============================================
# Main
# ==============================================
def main():
    # Check the dataset path
    if not os.path.exists(DATASET_PATH):
        print(f"Error: dataset path does not exist!\nCurrent path: {os.path.abspath(DATASET_PATH)}")
        return
    # Load the data
    print("\n=== Loading data ===")
    X_train, y_train, X_test, y_test = load_dataset(DATASET_PATH, augment_train=True)
    # Build the model
    print("\n=== Building model ===")
    model = build_and_compile_model((X_train.shape[1], X_train.shape[2], 1))
    model.summary()
    # Train and evaluate
    print("\n=== Training ===")
    model, history = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    # Predict on a single test clip
    if os.path.exists(TEST_AUDIO_PATH):
        print("\n=== Test prediction ===")
        mfccs = extract_features(TEST_AUDIO_PATH)
        if mfccs is not None:
            mfccs = mfccs[np.newaxis, ..., np.newaxis]
            pred = model.predict(mfccs)
            genres = ['blues', 'classical', 'country', 'disco', 'hiphop',
                      'jazz', 'metal', 'pop', 'reggae', 'rock']
            print("\nPredicted probability distribution:")
            for i, prob in enumerate(pred[0]):
                print(f"{genres[i]:<10}: {prob*100:.2f}%")
            print(f"\nFinal prediction: {genres[np.argmax(pred)]}")

if __name__ == "__main__":
    main()

This code is built around the GTZAN "genres" dataset. Please help me modify it to raise training accuracy while keeping the model from overfitting or underfitting.
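One commonly suggested direction, offered only as a sketch: on GTZAN, richer inputs often help more than extra layers. First- and second-order MFCC deltas could replace the plain MFCC block inside extract_features; the padding and normalization code stays the same, and the model input shape then becomes (120, 174, 1) instead of (40, 174, 1):

    # Sketch only, assuming the audio/sample_rate variables of extract_features
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40,
                                 n_fft=2048, hop_length=512)
    delta1 = librosa.feature.delta(mfccs)            # first-order deltas
    delta2 = librosa.feature.delta(mfccs, order=2)   # second-order deltas
    mfccs = np.concatenate([mfccs, delta1, delta2], axis=0)  # (120, n_frames)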