This code is a complete machine learning project for an audio classification task. The main parts and their functionality are described below:
Code: Baby Cry Classification
1. Import the Required Libraries
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from IPython.display import clear_output
import torchaudio
import torch
import torch.nn as nn
from sklearn import model_selection
from sklearn import metrics
from tabulate import tabulate
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import warnings
warnings.filterwarnings("ignore")
These libraries provide data handling, machine learning, deep learning, and visualization functionality.
2. Index the Audio Files and Build the CSV
import os
import pandas as pd
# List of label classes
LABEL_LIST = [
    'awake',
    'diaper',
    'hug',
    'hungry',
    'sleepy',
    'uncomfortable'
]
FILE_PATH = "/kaggle/input/yingertiku/"
# Data directory (replace with the actual path)
DATA_PATH = "train"
# List used to collect the results
data = []
# Walk through each class subfolder
for label in LABEL_LIST:
    label_path = os.path.join(FILE_PATH + DATA_PATH, label)
    if not os.path.exists(label_path):
        print(f"Warning: Path {label_path} does not exist!")
        continue
    # Collect all files for the current class
    for file_name in os.listdir(label_path):
        if file_name.endswith(".wav"):  # keep only WAV files
            file_path = os.path.join(label_path, file_name)
            data.append({"label": label, "file_path": file_path})
# Convert to a DataFrame
df = pd.DataFrame(data)
# Save to a CSV file
output_csv_path = "dataset.csv"  # replace with the desired CSV file name
df.to_csv(output_csv_path, index=False)
print(f"Done. CSV file saved to: {output_csv_path}")
This part scans the audio files and stores their paths and labels in a CSV file.
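As a quick sanity check on the generated index (a minimal sketch; it only relies on the dataset.csv produced above and its label/file_path columns):

# Read the index back and inspect the class balance
df = pd.read_csv("dataset.csv")
print(df["label"].value_counts())  # number of files per class
print(df.head())                   # a few example rows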
3. Data Preprocessing
class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, file_path, class_id):
        self.file_path = file_path
        self.class_id = class_id

    def __len__(self):
        return len(self.file_path)

    def __getitem__(self, idx):
        path = self.file_path[idx]
        waveform, sr = torchaudio.load(path)  # load audio (normalize=True is the default)
        audio_mono = torch.mean(waveform, dim=0, keepdim=True)  # convert stereo to mono
        tempData = torch.zeros([1, 160000])
        if audio_mono.numel() < 160000:  # shorter than 160000 samples: zero-pad
            tempData[:, :audio_mono.numel()] = audio_mono
        else:  # longer: truncate to the first 160000 samples
            tempData = audio_mono[:, :160000]
        audio_mono = tempData
        mel_specgram = torchaudio.transforms.MelSpectrogram(sr)(audio_mono)  # (channel, n_mels, time)
        mel_specgram_norm = (mel_specgram - mel_specgram.mean()) / mel_specgram.std()  # normalization
        mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(audio_mono)  # (channel, n_mfcc, time)
        mfcc_norm = (mfcc - mfcc.mean()) / mfcc.std()  # MFCC normalization
        new_feat = torch.cat([mel_specgram_norm, mfcc_norm], dim=1)  # concatenate the normalized features
        return {
            "specgram": new_feat[0].permute(1, 0).float(),  # (time, n_mels + n_mfcc)
            "label": torch.tensor(self.class_id[idx], dtype=torch.long)
        }
def collate_fn(data):
    specs = []
    labels = []
    for d in data:
        spec = d["specgram"].to(device)
        label = d["label"].to(device)
        specs.append(spec)
        labels.append(label)
    # Pad variable-length spectrogram sequences to the longest one in the batch
    spec = torch.nn.utils.rnn.pad_sequence(specs, batch_first=True, padding_value=0.)
    labels = torch.stack(labels)
    return spec, labels
# Device check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the baby-cry dataset
# Map label names to class ids
LABEL_DICT = {
    'awake': 0,
    'diaper': 1,
    'hug': 2,
    'hungry': 3,
    'sleepy': 4,
    'uncomfortable': 5
}
df = pd.read_csv("dataset.csv")
folder_folds = df["file_path"].values
labels_folds = df["label"].values
labels_id = [LABEL_DICT[label] for label in labels_folds]
# Split into train and test sets
X_train, X_test, y_train, y_test = model_selection.train_test_split(folder_folds, labels_id, random_state=42, test_size=0.3)
train_dataset = AudioDataset(
    file_path=X_train,
    class_id=y_train
)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, drop_last=True, collate_fn=collate_fn
)
test_dataset = AudioDataset(
    file_path=X_test,
    class_id=y_test
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=128, shuffle=False, drop_last=True, collate_fn=collate_fn
)
# Test the data loader
for features, labels in train_loader:
    print(features.shape, labels.shape)
    break
The AudioDataset class loads and preprocesses the audio data, and the collate_fn function batches it.
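To make the shapes concrete, here is a quick check of a single item (a minimal sketch; it relies on the torchaudio defaults of MelSpectrogram and MFCC, n_mels=128 and n_mfcc=40, which is where the 168 feature channels used by the model below come from):

# Inspect one preprocessed sample
sample = train_dataset[0]
print(sample["specgram"].shape)  # (time, 168): 128 mel bands + 40 MFCC coefficients
print(sample["label"])           # class id from LABEL_DICT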
4. Define the Model
# Model
class AudioLSTM(nn.Module):
    def __init__(self, n_feature=5, out_feature=5, n_hidden=256, n_layers=2, drop_prob=0.3):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_feature = n_feature
        self.lstm = nn.LSTM(self.n_feature, self.n_hidden, self.n_layers, dropout=self.drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(int(n_hidden), int(n_hidden / 2))
        self.fc2 = nn.Linear(int(n_hidden / 2), out_feature)

    def forward(self, x, hidden):
        # x.shape: (batch, seq_len, n_feature)
        l_out, l_hidden = self.lstm(x, hidden)
        # l_out.shape: (batch, seq_len, n_hidden)
        out = self.dropout(l_out)
        out = self.relu(self.fc1(out))
        # Classify from the last timestep; out.shape: (batch, out_feature)
        out = self.fc2(out[:, -1, :])
        # Return the final output and the hidden state
        return out, l_hidden

    def init_hidden(self, batch_size):
        # Zero-initialized hidden and cell states on the same device as the model
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        return hidden
The AudioLSTM class defines an LSTM model for the audio classification task. Note that the original init_hidden never returned the hidden state and hard-coded .cuda(); the version above returns the tuple and allocates it on the model's own device, so it also works on CPU.
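A quick shape check with random data (a sketch; the batch size 4 and sequence length 801 are arbitrary, and 168 matches the feature channels produced by the dataset):

# Dummy forward pass to verify output shapes
model = AudioLSTM(n_feature=168, out_feature=6)
x = torch.randn(4, 801, 168)  # (batch, seq_len, n_feature)
out, hidden = model(x, model.init_hidden(4))
print(out.shape)  # torch.Size([4, 6]): one logit per class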
5. Train and Test the Model
# TensorBoard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

def save_model(state, filename):
    torch.save(state, filename)
    print("-> Model Saved")
# Training loop
def train(data_loader, model, epoch, optimizer, device):
    losses = []
    accuracies = []
    labels = []
    preds = []
    model.train()
    loop = tqdm(data_loader)  # progress bar
    for batch_idx, (data, target) in enumerate(loop):
        data = data.to(device)
        target = target.to(device)
        model.zero_grad()
        output, hidden_state = model(data, model.init_hidden(data.size(0)))
        loss = nn.CrossEntropyLoss()(output, target)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        probs = torch.softmax(output, dim=1)
        winners = probs.argmax(dim=1)
        corrects = (winners == target)
        accuracy = corrects.sum().float() / float(target.size(0))
        accuracies.append(accuracy.item())
        labels += torch.flatten(target).cpu()
        preds += torch.flatten(winners).cpu()
        loop.set_description(f"EPOCH: {epoch} | ITERATION: {batch_idx}/{len(data_loader)} | LOSS: {loss.item()} | ACCURACY: {accuracy}")
        loop.set_postfix(loss=loss.item())
    avg_train_loss = sum(losses) / len(losses)
    avg_train_accuracy = sum(accuracies) / len(accuracies)
    report = metrics.classification_report(torch.stack(labels).numpy(), torch.stack(preds).numpy())
    print(report)
    return avg_train_loss, avg_train_accuracy
# Evaluation loop (optimizer is unused here but kept for signature compatibility)
def test(data_loader, model, optimizer, device):
    model.eval()
    preds = []
    labels = []
    test_accuracies = []
    with torch.no_grad():
        loop = tqdm(data_loader)  # test progress bar
        for batch_idx, (data, target) in enumerate(loop):
            data = data.to(device)
            target = target.to(device)
            output, hidden_state = model(data, model.init_hidden(data.size(0)))
            probs = torch.softmax(output, dim=1)
            winners = probs.argmax(dim=1)
            corrects = (winners == target)
            accuracy = corrects.sum().float() / float(target.size(0))
            test_accuracies.append(accuracy.item())
            labels += torch.flatten(target).cpu()
            preds += torch.flatten(winners).cpu()
    avg_test_acc = sum(test_accuracies) / len(test_accuracies)
    return avg_test_acc
The train and test functions run the training and evaluation loops, respectively.
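Note that the original code passed a hard-coded model.init_hidden(128), which only works because both loaders set drop_last=True; deriving the batch size from data.size(0), as above, also handles partial batches. The SummaryWriter logs to a runs/ directory by default, so the training curves can be viewed with tensorboard --logdir=runs.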
6. Main Function
# Hyperparameters
EPOCH = 50
OUT_FEATURE = 6  # number of classes
PATIENCE = 5

def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AudioLSTM(n_feature=168, out_feature=OUT_FEATURE).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=PATIENCE)
    best_train_acc, best_epoch = 0, 0  # track the best accuracy and epoch
    for epoch in range(EPOCH):
        avg_train_loss, avg_train_acc = train(train_loader, model, epoch, optimizer, device)
        avg_test_acc = test(test_loader, model, optimizer, device)
        scheduler.step(avg_train_acc)
        if avg_train_acc > best_train_acc:
            best_train_acc = avg_train_acc
            best_epoch = epoch
            filename = f"best_model_at_epoch_{best_epoch}.pth.tar"
            checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
            save_model(checkpoint, filename)
        table = [
            ["avg_train_loss", avg_train_loss], ["avg_train_accuracy", avg_train_acc],
            ["best_train_acc", best_train_acc], ["best_epoch", best_epoch]
        ]
        print(tabulate(table))  # print as a table
        test_table = [
            ["Avg test accuracy", avg_test_acc]
        ]
        writer.add_scalar('Loss/train', avg_train_loss, epoch)
        writer.add_scalar('Accuracy/train', avg_train_acc, epoch)
        writer.add_scalar('Accuracy/test', avg_test_acc, epoch)
        print(tabulate(test_table))  # print as a table

if __name__ == "__main__":
    main()  # run training and evaluation
The main function runs the full training and evaluation process. Both the learning-rate scheduler and the checkpointing key off the training accuracy, so the saved "best" model is the one that fit the training set best.
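For clarity on what scheduler.step(avg_train_acc) does: with mode="max", ReduceLROnPlateau multiplies the learning rate by its factor (0.1 by default) once the monitored metric has stopped improving for PATIENCE epochs. A standalone sketch of the mechanism (dummy parameter and a deliberately flat metric; not part of the project code):

import torch

params = [torch.nn.Parameter(torch.zeros(1))]
opt = torch.optim.AdamW(params, lr=1e-3)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", patience=5)
for epoch in range(10):
    sched.step(0.5)  # the metric never improves after the first call
    print(epoch, opt.param_groups[0]["lr"])  # drops from 1e-3 to 1e-4 once patience runs out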
7. Prediction
# unique_label is assumed to be a dict mapping each label name to one example .wav path (defined elsewhere)
# max_epoch is assumed to be the epoch index of the best checkpoint saved during training
state_dict = torch.load(f"/kaggle/working/best_model_at_epoch_{max_epoch}.pth.tar", map_location=torch.device("cpu"))["state_dict"]
model = AudioLSTM(n_feature=168, out_feature=OUT_FEATURE)
model.load_state_dict(state_dict)
model.eval().cpu()
for label, path in unique_label.items():
    waveform, sr = torchaudio.load(path)
    audio_mono = torch.mean(waveform, dim=0, keepdim=True)
    tempData = torch.zeros([1, 160000])
    if audio_mono.numel() < 160000:
        tempData[:, :audio_mono.numel()] = audio_mono
    else:
        tempData = audio_mono[:, :160000]
    audio_mono = tempData
    mel_specgram = torchaudio.transforms.MelSpectrogram(sr)(audio_mono)
    mel_specgram_norm = (mel_specgram - mel_specgram.mean()) / mel_specgram.std()
    mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(audio_mono)
    mfcc_norm = (mfcc - mfcc.mean()) / mfcc.std()
    new_feat = torch.cat([mel_specgram_norm, mfcc_norm], dim=1)  # same normalized features as training
    data = torch.utils.data.DataLoader(new_feat.permute(0, 2, 1))
    with torch.no_grad():
        for x in data:
            x = x.to("cpu")
            output, hidden_state = model(x, model.init_hidden(1))
            pred_id = np.argmax(output.numpy())
            print(f"Predicted id : {pred_id}")
            for i, v in LABEL_DICT.items():
                if pred_id == v:
                    print(f"Predicted Label : {i}")
            print(f"Real Label : {label}")
This part runs predictions on new audio data. Note that the model is loaded once, outside the loop (the original reloaded the checkpoint for every file).
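Since the per-file feature extraction above duplicates the logic in AudioDataset, it can be wrapped in a small helper. A minimal sketch (predict_file is a hypothetical name, not part of the original code; it reuses the loaded model and LABEL_DICT from above):

def predict_file(path, model):
    # Hypothetical helper: classify a single .wav file with the trained model
    waveform, sr = torchaudio.load(path)
    audio_mono = torch.mean(waveform, dim=0, keepdim=True)
    tempData = torch.zeros([1, 160000])
    if audio_mono.numel() < 160000:
        tempData[:, :audio_mono.numel()] = audio_mono
    else:
        tempData = audio_mono[:, :160000]
    mel = torchaudio.transforms.MelSpectrogram(sr)(tempData)
    mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(tempData)
    feat = torch.cat([(mel - mel.mean()) / mel.std(),
                      (mfcc - mfcc.mean()) / mfcc.std()], dim=1)
    with torch.no_grad():
        output, _ = model(feat.permute(0, 2, 1), model.init_hidden(1))
    pred_id = output.argmax(dim=1).item()
    return {v: k for k, v in LABEL_DICT.items()}[pred_id]

# Example usage: print(predict_file("some_clip.wav", model))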
In summary, this code implements a complete audio classification task, covering data preprocessing, model definition, training, testing, and prediction.