UrbanSound-Style Model (Baby Cry)

This code is a complete machine-learning project for an audio classification task: classifying baby cries. The main parts of the code and their functions are described below.
Code: baby cry classification

1. Import the required libraries

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from IPython.display import clear_output
import torchaudio
import torch
import torch.nn as nn
from sklearn import model_selection
from sklearn import metrics
from tabulate import tabulate
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import warnings
warnings.filterwarnings("ignore")

These libraries cover data handling (numpy, pandas), classical machine-learning utilities (scikit-learn), deep learning (PyTorch, torchaudio), progress reporting (tqdm, tabulate), and visualization (matplotlib, TensorBoard).
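torchaudio's API has changed between releases (for example, the keyword arguments accepted by torchaudio.load), so it can help to confirm which versions are in use. A minimal check, not part of the original notebook:

# Print library versions (useful when torchaudio arguments differ between releases)
print("torch", torch.__version__, "| torchaudio", torchaudio.__version__)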

2. Scan the audio files and build the CSV index

import os
import pandas as pd

# List of label classes
LABEL_LIST = [
    'awake',
    'diaper',
    'hug',
    'hungry',
    'sleepy',
    'uncomfortable'
]
FILE_PATH = "/kaggle/input/yingertiku/"
# Data sub-directory; replace with your actual path
DATA_PATH = "train"

# List used to collect (label, file path) records
data = []

# Walk through each class sub-folder
for label in LABEL_LIST:
    label_path = os.path.join(FILE_PATH + DATA_PATH, label)
    if not os.path.exists(label_path):
        print(f"Warning: Path {label_path} does not exist!")
        continue

    # Collect all files for the current class
    for file_name in os.listdir(label_path):
        if file_name.endswith(".wav"):  # keep only WAV files
            file_path = os.path.join(label_path, file_name)
            data.append({"label": label, "file_path": file_path})

# Convert to a DataFrame
df = pd.DataFrame(data)

# Save to a CSV file
output_csv_path = "dataset.csv"  # replace with your preferred CSV file name
df.to_csv(output_csv_path, index=False)

print(f"Done. CSV index saved to: {output_csv_path}")

This part walks the class folders, collects each WAV file's path and label, and saves the resulting index to a CSV file.
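Before training, it is worth checking that every class was actually picked up. A minimal sketch, assuming dataset.csv was written by the snippet above:

# Inspect the generated index: rows per class and a few example entries
df = pd.read_csv("dataset.csv")
print(df["label"].value_counts())
print(df.head())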

3. Data preprocessing

class AudioDataset:
    def __init__(self, file_path, class_id):
        self.file_path = file_path
        self.class_id = class_id

    def __len__(self):
        return len(self.file_path)

    def __getitem__(self, idx):
        path = self.file_path[idx]
        waveform, sr = torchaudio.load(path)  # load audio as a normalized float waveform
        audio_mono = torch.mean(waveform, dim=0, keepdim=True)  # convert stereo to mono
        tempData = torch.zeros([1, 160000])
        if audio_mono.numel() < 160000:  # pad with zeros if shorter than 160000 samples
            tempData[:, :audio_mono.numel()] = audio_mono
        else:
            tempData = audio_mono[:, :160000]  # otherwise truncate to the first 160000 samples
        audio_mono = tempData
        mel_specgram = torchaudio.transforms.MelSpectrogram(sr)(audio_mono)  # (channel, n_mels, time)
        mel_specgram_norm = (mel_specgram - mel_specgram.mean()) / mel_specgram.std()  # normalization
        mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(audio_mono)  # (channel, n_mfcc, time)
        mfcc_norm = (mfcc - mfcc.mean()) / mfcc.std()  # MFCC normalization
        # Note: the normalized tensors above are computed but the unnormalized features are what gets concatenated
        new_feat = torch.cat([mel_specgram, mfcc], dim=1)

        return {
            "specgram": new_feat[0].permute(1, 0).float(),  # (time, n_mels + n_mfcc)
            "label": torch.tensor(self.class_id[idx], dtype=torch.long)
        }

def collate_fn(data):
    specs = []
    labels = []
    for d in data:
        spec = d["specgram"].to(device)
        label = d["label"].to(device)
        specs.append(spec)
        labels.append(label)
    spec = torch.nn.utils.rnn.pad_sequence(specs, batch_first=True, padding_value=0.)
    labels = torch.stack(labels)  # stack the 0-dim label tensors into a 1-D batch tensor
    return spec, labels
# Device check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the baby-cry dataset index built above
# Mapping from label name to class id
LABEL_DICT = {
    'awake': 0,
    'diaper': 1,
    'hug': 2,
    'hungry': 3,
    'sleepy': 4,
    'uncomfortable': 5}
df = pd.read_csv("dataset.csv")

folder_folds = df["file_path"].values
labels_folds = df["label"].values

labels_id = [LABEL_DICT[label] for label in labels_folds]

X_train, X_test, y_train, y_test = model_selection.train_test_split(folder_folds, labels_id, random_state=42, test_size=0.3)

train_dataset = AudioDataset(
    file_path=X_train,
    class_id=y_train
)

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=128, shuffle=True, drop_last=True, collate_fn=collate_fn
)

test_dataset = AudioDataset(
    file_path=X_test,
    class_id=y_test
)

test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=128, shuffle=False, drop_last=True, collate_fn=collate_fn
)

# Sanity-check one batch from the train loader
for features, labels in train_loader:
    print(features.shape, labels.shape)
    break

The AudioDataset class loads and preprocesses the audio, and collate_fn pads the variable-length spectrograms so they can be batched.
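The feature dimension the model expects later (n_feature=168) comes from the torchaudio defaults used above: MelSpectrogram produces 128 mel bins and MFCC produces 40 coefficients with the same frame settings, so concatenating them along the feature axis gives 128 + 40 = 168 channels per time step. A minimal sketch to confirm this, using a dummy 16 kHz clip (not from the original code):

# Confirm the concatenated feature size that later becomes n_feature for the LSTM
sr = 16000
dummy = torch.randn(1, 160000)                              # (channel, samples)
mel = torchaudio.transforms.MelSpectrogram(sr)(dummy)       # default n_mels = 128
mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(dummy)    # default n_mfcc = 40
feat = torch.cat([mel, mfcc], dim=1)
print(mel.shape[1], mfcc.shape[1], feat.shape[1])           # 128 40 168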

4. Define the model

# Model
class AudioLSTM(nn.Module):

    def __init__(self, n_feature=5, out_feature=5, n_hidden=256, n_layers=2, drop_prob=0.3):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.n_feature = n_feature
        self.lstm = nn.LSTM(self.n_feature, self.n_hidden, self.n_layers, dropout=self.drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(int(n_hidden), int(n_hidden / 2))
        self.fc2 = nn.Linear(int(n_hidden / 2), out_feature)

    def forward(self, x, hidden):
        # x.shape: (batch, seq_len, n_feature)
        l_out, l_hidden = self.lstm(x, hidden)
        # l_out.shape: (batch, seq_len, n_hidden)
        out = self.dropout(l_out)
        out = self.fc1(out)
        # take the last time step -> out.shape: (batch, out_feature)
        out = self.fc2(out[:, -1, :])
        # return the final output and the hidden state
        return out, l_hidden

    def init_hidden(self, batch_size):
        # Zero-initialized (h, c), created on the same device as the model parameters
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                  weight.new(self.n_layers, batch_size, self.n_hidden).zero_())
        return hidden

The AudioLSTM class defines a two-layer LSTM followed by two fully connected layers for the audio classification task.
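A quick shape check on random data (a sketch, not part of the original notebook) shows how the pieces fit together: the model consumes (batch, seq_len, 168) features and returns one logit vector per clip.

# Forward-pass smoke test on random data
demo_model = AudioLSTM(n_feature=168, out_feature=6)
dummy = torch.randn(4, 801, 168)                 # (batch, seq_len, n_feature)
out, _ = demo_model(dummy, demo_model.init_hidden(4))
print(out.shape)                                 # torch.Size([4, 6])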

5. Train and evaluate the model

# TensorBoard
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

def save_model(state, filename):
    torch.save(state, filename)
    print("-> Model Saved")

# Training loop
def train(data_loader, model, epoch, optimizer, device):
    losses = []
    accuracies = []
    labels = []
    preds = []
    model.train()
    loop = tqdm(data_loader)  # progress bar
    for batch_idx, (data, target) in enumerate(loop):
        data = data.to(device)
        target = target.to(device)
        model.zero_grad()
        output, hidden_state = model(data, model.init_hidden(data.size(0)))
        loss = nn.CrossEntropyLoss()(output, target)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
        probs = torch.softmax(output, dim=1)
        winners = probs.argmax(dim=1)
        corrects = (winners == target)
        accuracy = corrects.sum().float() / float(target.size(0))
        accuracies.append(accuracy.item())
        labels += torch.flatten(target).cpu()
        preds += torch.flatten(winners).cpu()
        loop.set_description(f"EPOCH: {epoch} | ITERATION: {batch_idx}/{len(data_loader)} | LOSS: {loss.item():.4f} | ACCURACY: {accuracy.item():.4f}")
        loop.set_postfix(loss=loss.item())

    avg_train_loss = sum(losses) / len(losses)
    avg_train_accuracy = sum(accuracies) / len(accuracies)
    report = metrics.classification_report(torch.stack(labels).numpy(), torch.stack(preds).numpy())
    print(report)
    return avg_train_loss, avg_train_accuracy
# Evaluation loop
def test(data_loader, model, optimizer, device):
    model.eval()
    preds = []
    labels = []
    test_accuracies = []
    with torch.no_grad():
        loop = tqdm(data_loader)  # progress bar
        for batch_idx, (data, target) in enumerate(loop):
            data = data.to(device)
            target = target.to(device)
            output, hidden_state = model(data, model.init_hidden(data.size(0)))
            probs = torch.softmax(output, dim=1)
            winners = probs.argmax(dim=1)
            corrects = (winners == target)
            accuracy = corrects.sum().float() / float(target.size(0))

            test_accuracies.append(accuracy.item())
            labels += torch.flatten(target).cpu()
            preds += torch.flatten(winners).cpu()
    avg_test_acc = sum(test_accuracies) / len(test_accuracies)
    return avg_test_acc

The train and test functions run the training loop and the evaluation loop, respectively.
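The test function above only returns the average accuracy. If you also want a per-class breakdown on the test set, a small sketch (an optional addition, not part of the original code) using the already-imported sklearn metrics could look like this:

# Optional: confusion matrix and per-class report for a data loader (assumes a trained model)
def evaluate_report(model, loader, device):
    model.eval()
    all_labels, all_preds = [], []
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)
            output, _ = model(data, model.init_hidden(data.size(0)))
            all_preds += output.argmax(dim=1).cpu().tolist()
            all_labels += target.cpu().tolist()
    print(metrics.confusion_matrix(all_labels, all_preds))
    print(metrics.classification_report(all_labels, all_preds,
                                        labels=list(range(len(LABEL_LIST))),
                                        target_names=LABEL_LIST))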

6. Main function

# Training configuration
EPOCH = 50
OUT_FEATURE = 6  # number of classes
PATIENCE = 5
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AudioLSTM(n_feature=168, out_feature=OUT_FEATURE).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", patience=PATIENCE)
    
    best_train_acc, best_epoch = 0, 0  # track the best training accuracy and its epoch
    
    for epoch in range(EPOCH):
        avg_train_loss, avg_train_acc = train(train_loader, model, epoch, optimizer, device)
        avg_test_acc = test(test_loader, model, optimizer, device)
        scheduler.step(avg_train_acc)
        if avg_train_acc > best_train_acc:
            best_train_acc = avg_train_acc
            best_epoch = epoch
            filename = f"best_model_at_epoch_{best_epoch}.pth.tar"
            checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
            save_model(checkpoint, filename)
        
        table = [
            ["avg_train_loss", avg_train_loss], ["avg_train_accuracy", avg_train_acc],
            ["best_train_acc", best_train_acc], ["best_epoch", best_epoch]
        ]
        print(tabulate(table))  # print the epoch summary as a table
        test_table = [
            ["Avg test accuracy", avg_test_acc]
        ]
        writer.add_scalar('Loss/train', avg_train_loss, epoch)
        writer.add_scalar('Accuracy/train', avg_train_acc, epoch)
        writer.add_scalar('Accuracy/test', avg_test_acc, epoch)
        print(tabulate(test_table))  # print the test summary as a table

if __name__ == "__main__":
    main() # Run function

The main function runs the full training and evaluation procedure, saves the best checkpoint, and logs metrics to TensorBoard.
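SummaryWriter logs to the default runs/ directory, so the curves can be viewed with tensorboard --logdir runs. The checkpoints written by save_model bundle both the model and optimizer state dicts, so an interrupted run can also be resumed. A minimal sketch (the epoch number in the filename depends on your run):

# Resume from a saved checkpoint (replace the epoch number with your best epoch)
checkpoint = torch.load("best_model_at_epoch_0.pth.tar", map_location=device)
model = AudioLSTM(n_feature=168, out_feature=OUT_FEATURE).to(device)
model.load_state_dict(checkpoint["state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
optimizer.load_state_dict(checkpoint["optimizer"])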

7. Prediction
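The loop below references two names that are not defined in the excerpt: unique_label, apparently a dict mapping each class name to one example audio file, and max_epoch, the epoch number in the best checkpoint's filename. A purely hypothetical construction (an assumption, not the author's code) could be:

# Hypothetical setup for the loop below: one test file per class, plus the best-checkpoint epoch
unique_label = {}
for path, cls in zip(X_test, y_test):
    unique_label.setdefault(LABEL_LIST[cls], path)
max_epoch = 0  # replace with the epoch number in your saved best_model_at_epoch_*.pth.tar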

for label, path in unique_label.items():
    waveform, sr = torchaudio.load(path)
    audio_mono = torch.mean(waveform, dim=0, keepdim=True)
    tempData = torch.zeros([1, 160000])
    if audio_mono.numel() < 160000:
        tempData[:, :audio_mono.numel()] = audio_mono
    else:
        tempData = audio_mono[:, :160000]
    audio_mono = tempData
    mel_specgram = torchaudio.transforms.MelSpectrogram(sr)(audio_mono)
    mel_specgram_norm = (mel_specgram - mel_specgram.mean()) / mel_specgram.std()
    mfcc = torchaudio.transforms.MFCC(sample_rate=sr)(audio_mono)
    mfcc_norm = (mfcc - mfcc.mean()) / mfcc.std()
    new_feat = torch.cat([mel_specgram, mfcc], dim=1)

    data = torch.utils.data.DataLoader(new_feat.permute(0, 2, 1))
    new = torch.load(f"/kaggle/working/best_model_at_epoch_{max_epoch}.pth.tar", map_location=torch.device("cpu"))["state_dict"]
    model = AudioLSTM(n_feature=168, out_feature=OUT_FEATURE)
    model.load_state_dict(new)
    model.eval().cpu()
    with torch.no_grad():
        for x in data:
            x = x.to("cpu")
            output, hidden_state = model(x, (torch.zeros(2, 1, 256), torch.zeros(2, 1, 256)))
            print(f"Predicted id : {np.argmax(output.numpy())}")

            for i, v in LABEL_DICT.items():
                if np.argmax(output.numpy()) == v:
                    print(f"Predicted Label : {i}")
    print(f"Real Label : {label}")

This part runs the trained model on one example clip per class and prints the predicted and true labels.

In summary, this code implements a complete audio classification pipeline: data preprocessing, model definition, training, evaluation, and prediction.

Appendix: sensor-based baby-cry detection (implementation outline)

The following is an implementation outline and example framework for baby-cry detection with a sound sensor. The approach is based on audio feature analysis plus a simple machine-learning classifier and is intended as a reference for developers with some programming background.

1. Suggested hardware
- Sound sensor: a microphone module (e.g. MAX9814)
- Controller: a single-board computer with analog input, such as an Arduino or Raspberry Pi
- Development environment: Python with the Librosa audio library

2. Core processing pipeline
```
Audio capture → Preprocessing → Feature extraction → Classification → Alarm output
```

3. Python code framework (to be used with audio files):
```python
import librosa
import numpy as np
from sklearn.svm import SVC

# Audio preprocessing
def preprocess_audio(file_path):
    y, sr = librosa.load(file_path, sr=22050)  # load audio
    y = librosa.effects.preemphasis(y)         # pre-emphasis filter
    y = librosa.util.normalize(y)              # normalization
    return y, sr

# Feature extraction (MFCC + energy)
def extract_features(y, sr):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    energy = librosa.feature.rms(y=y)
    features = np.vstack([mfcc, energy])
    return features.T

# Real-time detection example
def realtime_detection():
    # Initialize the model (must be trained beforehand)
    model = SVC(kernel='rbf')
    # model = joblib.load('baby_cry_model.pkl')  # load a pre-trained classifier

    # Capture the audio stream in real time (pseudocode)
    while True:
        audio_chunk = get_audio_chunk()  # audio capture interface to be implemented
        features = extract_features(audio_chunk, sr=22050)  # assumes 22050 Hz capture
        prediction = model.predict(features.mean(axis=0).reshape(1, -1))
        if prediction == 1:
            trigger_alarm()

# Note: the audio capture interface and model training must be implemented separately
```

4. Key feature parameters (for tuning reference):
$$
\text{Typical baby-cry characteristics:}
\begin{cases}
\text{Fundamental frequency: 300--600 Hz} \\
\text{Harmonic richness: at least 5 clear harmonics} \\
\text{Short-time energy change rate: } \Delta E > 0.35 \\
\text{MFCC distribution: coefficients 1--3 shift noticeably}
\end{cases}
$$

5. Optimization suggestions (see the band-pass sketch after this section):
- Add a band-pass filter (300-3000 Hz)
- Use sliding-window detection (a 200 ms window is suggested)
- Combine short- and long-term energy ratios (STE/LTER)
- Use a deep-learning model (e.g. CNN, RNN) to improve accuracy

Notes:
1. A sufficiently large set of baby-cry samples is needed for model training
2. Real deployments must account for environmental noise removal
3. Pairing the detector with video monitoring for double confirmation is recommended
4. The alarm threshold should be tuned to the actual environment

Suggested development steps:
1. Analyze the cry spectrum with a tool such as Audacity
2. Collect positive and negative samples (at least 200 cry samples is suggested)
3. Explore the features in a Jupyter Notebook
4. Validate on a PC before deploying to an embedded device

For concrete hardware interfacing or algorithm details, specify the target development platform and requirements.
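As a companion to the optimization list above, the suggested 300-3000 Hz band-pass pre-filter could look like the following scipy sketch (an assumption: the list only names the filter, and 22050 Hz audio is assumed):

```python
from scipy.signal import butter, sosfilt

# 300-3000 Hz band-pass pre-filter (assumes audio sampled at 22050 Hz)
def bandpass(y, sr=22050, low=300.0, high=3000.0, order=5):
    sos = butter(order, [low, high], btype="band", fs=sr, output="sos")
    return sosfilt(sos, y)
```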