语音识别

最新推荐文章于 2024-07-27 20:38:11 发布

Chise1

最新推荐文章于 2024-07-27 20:38:11 发布

阅读量240

点赞数

CC 4.0 BY-SA版权

分类专栏： python 文章标签：机器学习

本文链接：https://blog.youkuaiyun.com/weixin_36179862/article/details/85121031

python 专栏收录该内容

57 篇文章

订阅专栏

语音识别

梅尔频率倒谱系数（MFCC）矩阵

首先将音频输入按照时间顺序划分为若干片段，对每个片段做傅里叶变换，得到相应的频率分布，从中提取与人类语言内容相关性最强的十三个特征频率所对应的能量强度，构成一个样本。将各片段得到的样本按行组成一个矩阵，即梅尔频率倒谱系数（MFCC）矩阵。
代码：

import os
import warnings
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import hmmlearn.hmm as hl
# Suppress DeprecationWarning messages and turn off NumPy's reporting of
# floating-point errors (divide, overflow, underflow, invalid).
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
np.seterr(divide='ignore', over='ignore', under='ignore', invalid='ignore')

def search_speeches(directory, speeches):
    """Recursively collect ``.wav`` file paths under *directory* by label.

    The label of a file is the name of the directory that directly contains
    it (the text after the last path separator of the normalized directory).

    Args:
        directory: Directory to scan; normalized with ``os.path.normpath``.
        speeches: Dict mapping label -> list of file paths. It is updated in
            place; nothing is returned.

    Raises:
        IOError: If *directory* is not an existing directory.
    """
    directory = os.path.normpath(directory)
    if not os.path.isdir(directory):
        message = "The directory '" + directory + "' doesn't exist!"
        raise IOError(message)
    # The label depends only on the directory, so compute it once up front.
    label = directory[directory.rfind(os.path.sep) + 1:]
    for name in os.listdir(directory):
        candidate = os.path.join(directory, name)
        if os.path.isdir(candidate):
            # Sub-directories carry their own label; descend into them.
            search_speeches(candidate, speeches)
            continue
        if os.path.isfile(candidate) and candidate.endswith('.wav'):
            speeches.setdefault(label, []).append(candidate)


# Build the training set: for every label, the MFCC matrices of all of its
# recordings are stacked row-wise into one matrix (train_x), with the label
# stored at the same index in train_y.
train_speeches = {}
search_speeches(
    r'C:\Users\Cs\Desktop\机器学习\ML\data\speeches\training',
    train_speeches)
train_x, train_y = [], []
for label, filenames in train_speeches.items():
    stacked = np.array([])
    for wav_path in filenames:
        sample_rate, sigs = wf.read(wav_path)
        features = sf.mfcc(sigs, sample_rate)
        # While the accumulator is still empty, the new matrix replaces it;
        # afterwards new rows are appended below the existing ones.
        stacked = (np.append(stacked, features, axis=0)
                   if len(stacked) != 0 else features)
    train_x.append(stacked)
    train_y.append(label)
# Fit one hidden Markov model with Gaussian emissions per label, using that
# label's stacked MFCC matrix; models maps label -> the value returned by fit().
models = {
    label: hl.GaussianHMM(
        n_components=4,
        covariance_type='diag',
        n_iter=1000,
    ).fit(features)
    for features, label in zip(train_x, train_y)
}
# Build the test set: for every label, the MFCC matrices of all of its test
# recordings are stacked row-wise into one matrix (test_x), with the label
# stored at the same index in test_y.
test_speeches = {}
search_speeches(r'C:\Users\Cs\Desktop\机器学习\ML\data\speeches\testing',
                test_speeches)
test_x, test_y = [], []
# Iterate the *test* recordings. Looping over train_speeches here (as the
# code previously did) ignores the directory scanned just above and scores
# the models on the same data they were fitted on.
for label, filenames in test_speeches.items():
    per_file = []
    for filename in filenames:
        sample_rate, sigs = wf.read(filename)
        per_file.append(sf.mfcc(sigs, sample_rate))
    # Join all recordings once instead of re-copying the accumulated matrix
    # for every file; an empty file list still yields an empty array.
    mfccs = np.concatenate(per_file, axis=0) if per_file else np.array([])
    test_x.append(mfccs)
    test_y.append(label)
# Predict a label for each test matrix: every trained model scores the
# matrix and the label of the highest-scoring model wins (the first model
# seen is kept on ties). Expected and predicted labels are then printed.
pred_test_y = []
for features in test_x:
    winner, top_score = None, None
    for label, model in models.items():
        score = model.score(features)  # similarity score
        # Skip unless this is the first candidate or a strictly better one.
        if top_score is not None and not (top_score < score):
            continue
        winner, top_score = label, score
    pred_test_y.append(winner)
print(test_y)
print(pred_test_y)