首先,我们需要导入语音特征提取模型 MFCC_HTK ,便于我们进行特征提取。
In [1]:


# %run /data/bigfiles/2db08cbb-985f-452f-9ff0-c161efabb182.py
import numpy as np
import os


class MFCC_HTK:
    """
    Class to compute HTK compatible MFCC features from audio.

    It is designed to be as close as possible to the HTK implementation.
    For details on the HTK implementation look for the HTK Book online,
    specifically chapter 5 titled "Speech Input/Output".

    This implementation was somewhat based upon the HTK source code. Most of
    the interesting code can be found in .../HTKLib/HSigP.c file.

    The latest version of HTK available during writing of this class was 3.4.1.

    HTK is licensed by University of Cambridge and isn't in any way related
    to this particular Python implementation (so don't bother them if you
    have problems with this code).

    For more information about HTK go to http://htk.eng.cam.ac.uk/
    """

    # Default processing parameters; __init__ overwrites these from its
    # arguments.  NOTE(review): the class-level default filter_num is 24 but
    # the __init__ default is 26 — kept as-is for backward compatibility.
    win_len = 400
    win_shift = 160
    preemph = 0.97
    filter_num = 24
    lifter_num = 22
    mfcc_num = 12
    lo_freq = 80
    hi_freq = 7500
    samp_freq = 16000
    filter_mat = None
    raw_energy = False
    feat_melspec = False
    feat_mfcc = True
    feat_energy = True

    def load_filter(self, filename):
        """
        Internal load filter method -- you don't need to run this yourself.

        Loads the filter specification from a CSV file: one line per spectral
        point, each line holding (weight, filter id).  Only rising edges are
        stored; falling edges are derived as 1-weight of the next filter.
        Filter id -1 means no filter covers that point.
        """
        if not os.path.isfile(filename):
            raise IOError(filename)

        data = np.genfromtxt(filename, delimiter=',')

        # np.asscalar() was removed in NumPy 1.23 -- plain int() is the
        # equivalent (and simpler) conversion.
        self.filter_num = int(np.max(data[:, 1]))
        self.filter_mat = np.zeros((self.fft_len // 2, self.filter_num))

        for i in range(len(data)):
            wt = data[i, 0]
            # renamed from `bin` to avoid shadowing the builtin
            filt_id = int(data[i, 1])
            if filt_id < 0:
                continue
            if filt_id > 0:
                # rising edge of filter filt_id-1
                self.filter_mat[i, filt_id - 1] = wt
            if filt_id < self.filter_num:
                # falling edge of the previous filter is 1 - rising edge
                self.filter_mat[i, filt_id] = 1 - wt

    def create_filter(self, num):
        """
        Internal create filter method -- you don't need to run this yourself.

        Creates `num` half-overlapping triangular filters spread evenly
        between lo_freq and hi_freq in the mel domain.
        """
        self.filter_num = num
        self.filter_mat = np.zeros((self.fft_len // 2, self.filter_num))

        mel2freq = lambda mel: 700.0 * (np.exp(mel / 1127.0) - 1)
        freq2mel = lambda freq: 1127 * (np.log(1 + freq / 700.0))

        lo_mel = freq2mel(self.lo_freq)
        hi_mel = freq2mel(self.hi_freq)

        # num+2 equally spaced mel points: filter edges and centers
        mel_c = np.linspace(lo_mel, hi_mel, self.filter_num + 2)
        freq_c = mel2freq(mel_c)

        # convert Hz to FFT bin indices
        point_c = freq_c / float(self.samp_freq) * self.fft_len
        point_c = np.floor(point_c).astype('int')

        for f in range(self.filter_num):
            d1 = point_c[f + 1] - point_c[f]
            d2 = point_c[f + 2] - point_c[f + 1]
            # rising and falling triangular edges
            self.filter_mat[point_c[f]:point_c[f + 1] + 1, f] = np.linspace(0, 1, d1 + 1)
            self.filter_mat[point_c[f + 1]:point_c[f + 2] + 1, f] = np.linspace(1, 0, d2 + 1)

    def __init__(self, filter_file=None, win_len=400, win_shift=160,
                 preemph=0.97, filter_num=26, lifter_num=22, mfcc_num=12,
                 lo_freq=80, hi_freq=7500, samp_freq=16000,
                 raw_energy=False, feat_melspec=False, feat_mfcc=True,
                 feat_energy=True):
        """
        Class constructor -- you can set all the processing parameters here.

        Args:
            filter_file (string): load the filter specification from a file.
                This exists to allow binary compatibility with HTK, because
                they implement the filters slightly differently than mentioned
                in their docs.  CSV format: each line contains a weight and
                the id of the filter at the given spectrum point; the number
                of lines equals the number of spectral points computed by FFT
                (e.g. 256 for a 512-point FFT — half, due to Nyquist).  Only
                rising edges are stored; filter id -1 means no filter at that
                point.  If None, a built-in method creates half-overlapping
                triangular filters spread evenly between lo_freq and hi_freq
                in the mel domain.
            win_len (int): frame length in samples.  Default 400 == 25 ms at
                16 kHz.
            win_shift (int): frame shift in samples, i.e. the distance between
                the starts of two consecutive frames.  Default 160 == 10 ms at
                16 kHz, i.e. 100 frames per second.
            preemph (float): preemphasis coefficient, used to compute the
                first-order difference of the signal.
            filter_num (int): number of triangular filters used to reduce the
                spectrum.  Overwritten by the contents of filter_file if that
                is given.
            lifter_num (int): liftering coefficient.
            mfcc_num (int): number of MFCCs computed.  Default 12.
            lo_freq (float): lowest frequency (Hz) used to compute the
                filters; ignored if filters are loaded from file.
            hi_freq (float): highest frequency (Hz) used to compute the
                filters; ignored if filters are loaded from file.
            samp_freq (int): sampling frequency of the audio.  Default 16000.
            raw_energy (boolean): if True, compute the energy from the raw
                signal frame; if False, use the (almost equivalent and much
                cheaper) 0'th cepstral-style sum instead.
            feat_melspec (boolean): include the log filter outputs
                (filter_num values) in the output.
            feat_mfcc (boolean): include the MFCCs (mfcc_num values) in the
                output.
            feat_energy (boolean): include the energy (a single value) in the
                output.
        """
        # Explicit assignment instead of self.__dict__.update(locals()),
        # which also stored `self.self` (a reference cycle) and filter_file.
        self.win_len = win_len
        self.win_shift = win_shift
        self.preemph = preemph
        self.filter_num = filter_num
        self.lifter_num = lifter_num
        self.mfcc_num = mfcc_num
        self.lo_freq = lo_freq
        self.hi_freq = hi_freq
        self.samp_freq = samp_freq
        self.raw_energy = raw_energy
        self.feat_melspec = feat_melspec
        self.feat_mfcc = feat_mfcc
        self.feat_energy = feat_energy

        # FFT length: next power of two above the window length (512 for 400)
        self.fft_len = (2 ** (np.floor(np.log2(self.win_len)) + 1)).astype('int')

        if filter_file:
            self.load_filter(filter_file)
        else:
            self.create_filter(filter_num)

        self.hamm = np.hamming(self.win_len)

        # DCT basis used to turn log-mel energies into cepstral coefficients
        self.dct_base = np.zeros((self.filter_num, self.mfcc_num))
        for m in range(self.mfcc_num):
            self.dct_base[:, m] = np.cos(
                (m + 1) * np.pi / self.filter_num * (np.arange(self.filter_num) + 0.5))

        # HTK-style liftering window
        self.lifter = 1 + (self.lifter_num / 2) * np.sin(
            np.pi * (1 + np.arange(self.mfcc_num)) / self.lifter_num)

        self.mfnorm = np.sqrt(2.0 / self.filter_num)

    def load_raw_signal(self, filename):
        """
        Helper method that loads a 16-bit signed int RAW signal from file.

        Uses system-natural endianness (most likely little-endian).  If you
        have a problem with endianness use the "byteswap()" method on the
        resulting array.
        """
        return np.fromfile(filename, dtype=np.int16).astype(np.double)

    def get_feats(self, signal):
        """
        Gets the features from an audio signal based on the configuration set
        in the constructor.

        Args:
            signal (numpy.ndarray): audio signal

        Returns:
            numpy.ndarray: a WxF matrix, where W is the number of windows in
            the signal and F is the number of chosen features
        """
        sig_len = len(signal)
        win_num = np.floor((sig_len - self.win_len) / self.win_shift).astype('int') + 1
        feats = []
        for w in range(win_num):
            featwin = []

            # extract window
            s = w * self.win_shift
            e = s + self.win_len
            raw_win = signal[s:e]  # unprocessed frame, kept for raw energy
            win = raw_win.copy()

            # preemphasis (first-order difference)
            win -= np.hstack((win[0], win[:-1])) * self.preemph

            # windowing
            win *= self.hamm

            # magnitude spectrum; drop the Nyquist bin to get fft_len/2 points
            win = np.abs(np.fft.rfft(win, n=self.fft_len)[:-1])

            # mel filter bank
            melspec = np.dot(win, self.filter_mat)

            # floor before log to avoid log(0)
            melspec[melspec < 0.001] = 0.001
            melspec = np.log(melspec)

            if self.feat_melspec:
                featwin.append(melspec)

            # DCT -> MFCC
            mfcc = np.dot(melspec, self.dct_base)
            mfcc *= self.mfnorm

            # liftering
            mfcc *= self.lifter

            # sane fixes
            mfcc[np.isnan(mfcc)] = 0
            mfcc[np.isinf(mfcc)] = 0

            if self.feat_mfcc:
                featwin.append(mfcc)

            # energy
            if self.feat_energy:
                if self.raw_energy:
                    # BUG FIX: the original computed np.sum(signal**2) — the
                    # energy of the WHOLE signal, identical for every frame.
                    # HTK computes the log energy of the current raw frame.
                    energy = np.log(np.sum(raw_win ** 2))
                else:
                    energy = np.sum(melspec) * self.mfnorm

                # sane fixes
                if np.isnan(energy) or np.isinf(energy):
                    energy = 0

                featwin.append(energy)

            feats.append(np.hstack(featwin))

        return np.asarray(feats)

    def get_delta(self, feat, deltawin=2):
        """
        Computes delta using the HTK method.

        Args:
            feat (numpy.ndarray): matrix of shape WxF, where W is the number
                of frames and F the number of features.
            deltawin (int): the DELTAWINDOW parameter of the delta
                computation.  Check HTK Book Chapter 5.6 for details.

        Returns:
            numpy.ndarray: a matrix of the same size as `feat` containing
            the deltas of the provided features.
        """
        deltas = []
        norm = 2.0 * (sum(np.arange(1, deltawin + 1) ** 2))
        win_num = feat.shape[0]
        win_len = feat.shape[1]
        for win in range(win_num):
            delta = np.zeros(win_len)
            for t in range(1, deltawin + 1):
                # clamp indices at the signal edges (HTK behaviour)
                tm = max(win - t, 0)
                tp = min(win + t, win_num - 1)
                delta += (t * (feat[tp] - feat[tm])) / norm
            deltas.append(delta)
        return np.asarray(deltas)


In [2]:


# IPython magic: executes an external script (its contents are not visible
# here — presumably it defines helpers used below; verify against the file).
%run /data/bigfiles/5d4a3552-0ba6-456b-9786-ac42e7adc47e.py


In [3]:


import warnings

warnings.filterwarnings('ignore')  # suppress warning output in the notebook

import os
import sys
import numpy as np
import matplotlib.pyplot as plt


1、加载数据并显示波形图像
HTK是英国剑桥大学开发的一套语音处理工具箱,广泛应用于语音识别、语音合成、字符识别和DNA排序等领域。WAV文件是在PC机平台上很常见的、最经典的多媒体音频文件,最早于1991年8月出现在Windows 3.1操作系统上,文件扩展名为WAV,是WaveForm的简写,也称为波形文件,可直接存储声音波形,还原的波形曲线十分逼真。WAV文件的编码包括了两方面内容,一是按一定格式存储数据,二是采用一定的算法压缩数据。文件数据的前44个字节一般存储文档标志、数据长度、格式类型、采样频率等信息。具体可以参看以下链接信息:https://www.cnblogs.com/ranson7zop/p/7657874.html
In [4]:


# Path to the demo WAV file.  NOTE: because the second argument is absolute,
# os.path.join discards the os.getcwd() prefix and returns it unchanged.
wav_file = '/data/bigfiles/8c79319b-d054-424a-beb5-59d33a7f930b.wav'
data_path = os.path.join(os.getcwd(), wav_file)


In [5]:


# Load the raw 16-bit audio and plot its waveform.
mfcc = MFCC_HTK()
signal = mfcc.load_raw_signal(data_path)
signal = signal[100:]  # drop the first 100 samples (presumably the WAV header region — verify)

sig_len = signal.size / 16000  # duration in seconds

t = np.linspace(0, sig_len, signal.size)
plt.figure(figsize=(15, 4))
plt.plot(t, signal)


函数介绍
(1)函数 figure(num=None, figsize=None, dpi=None, facecolor=None, edgecolor=None, frameon=True)的各个参数含义为:
num 图像编号或名称,数字为编号,字符串为名称;
figsize:指定figure的宽和高,单位为英寸;
dpi:参数指定绘图对象的分辨率,即每英寸多少个像素,缺省值为80。 1英寸等于2.54cm;
facecolor:背景颜色;
edgecolor:边框颜色;
frameon:是否显示边框;
(2)函数 numpy.linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None):在指定的间隔范围内返回均匀间隔的数字。在[start, stop]范围内计算,返回num个(默认为50)均匀间隔的样本。
2、绘制语音信号频谱图
In [6]:


# Spectrogram: time on x, frequency on y, amplitude shown as colour.
plt.figure(figsize=(15, 4))
spec = plt.specgram(signal, Fs=16000)
plt.xlim(0, sig_len)


时频图以横轴为时间,纵轴为频率,用颜色表示幅值,幅度用亮色如红色表示高,用深色表示低。利用时频图(也叫语谱图)可以查看指定频率的能量分布,在一幅图中表示信号的频率、幅度随时间的变化。
绘制频谱图函数:
plt.specgram(x, NFFT=None, Fs=None, Fc=None, detrend=None, window=None, noverlap=None, cmap=None, xextent=None, pad_to=None, sides=None, scale_by_freq=None, mode=None, scale=None, vmin=None, vmax=None, *, data=None, **kwargs)
主要参数:x:1-D数组或序列;Fs:采样频率,默认为2;NFFT:FFT中每个片段的数据点数(窗长度),默认为256;noverlap:窗之间的重叠长度。默认值是128。其他参数参考官网说明:https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.specgram.html
3、移除数据中心值(即均值)
In [7]:


# Remove the DC offset (mean) from the signal.
dc = np.mean(signal)
print("Before: " + str(dc))
signal = signal - dc
print("After: " + str(np.mean(signal)))


说明:移除均值,就是去掉语音信号的直流分量,这样可以去除信号中的一部分底噪,让信号在零值上下波动。
4、分帧
In [8]:


win_shift = 160  # frame shift: 10 ms at 16 kHz
win_len = 400    # frame length: 25 ms at 16 kHz
sig_len = len(signal)  # NOTE: rebinds sig_len from seconds to sample count

# number of complete frames that fit in the signal
win_num = np.floor((sig_len - win_len) / win_shift).astype('int') + 1
print("win_num = " + str(win_num))

wins = []
for w in range(win_num):
    # start and end sample of each frame
    s = w * win_shift
    e = s + win_len
    wins.append(signal[s:e].copy())
    # removed: the original re-assigned an unused, loop-invariant
    # t = np.linspace(1, 400, 1) on every iteration — dead code

wins = np.asarray(wins)
wins.shape


将连续三帧波形数据画出,观察帧移。
In [9]:


# Plot three consecutive frames at their original sample offsets to make the
# frame shift (overlap) visible.
plt.figure(figsize=(15, 4))
for start, stop, frame in ((1, 400, wins[25]),
                           (164, 564, wins[26]),
                           (328, 728, wins[27])):
    plt.plot(np.linspace(start, stop, 400), frame)


5、预加重
显示高通滤波器,并对语音信号高频部分进行加重
In [10]:


# Demo: magnitude response of the pre-emphasis (high-pass) filter h = [1, -k].
k = 0.97
h = [1, -k]
f = np.linspace(0, 8000, 257)
plt.plot(f, np.abs(np.fft.rfft(h, n=512)))
plt.xlabel('Frequency')
plt.ylabel('Amplitude correction')

# Apply pre-emphasis (first-order difference) to every frame, in place.
for frame in wins:
    frame -= np.hstack((frame[0], frame[:-1])) * k


6、加窗
显示矩形窗和汉明窗,并对分帧后的语音信号进行加窗处理。
In [11]:


# Demo: rectangular vs. Hamming window shapes.
rect = np.ones(400)
plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 1)
plt.stem(rect)
plt.xlim(-100, 500)
plt.ylim(-0.1, 1.1)
plt.title('Square window')

hamm = np.hamming(400)
plt.figure(figsize=(12, 4))
plt.subplot(2, 1, 2)
plt.stem(hamm)
plt.xlim(-100, 500)
plt.ylim(-0.1, 1.1)
plt.title('Hamming function')

# Apply the Hamming window to every frame, in place.
for frame in wins:
    frame *= hamm


7、快速傅里叶变换
In [12]:


# FFT length: next power of two above the frame length (512 for 400).
fft_len = (2 ** (np.floor(np.log2(win_len)) + 1)).astype('int')

ffts = []
for frame in wins:
    # magnitude spectrum, dropping the Nyquist bin
    spectrum = np.abs(np.fft.rfft(frame, n=fft_len)[:-1])
    ffts.append(spectrum)
ffts = np.asarray(ffts)

plt.figure(figsize=(10, 5))
plt.pcolormesh(ffts.T)
plt.xlim(0, win_num)
plt.ylim(0, fft_len / 2)


8、三角带通滤波器
显示三角带通滤波器
In [13]:


# Display the 26 triangular band-pass filters.
# Removed: the original assigned f = np.linspace(0, 8000, 1000) here, but the
# value was immediately shadowed by the loop variable and never used.
mfcc.create_filter(26)
plt.figure(figsize=(15, 3))
for filt in mfcc.filter_mat.T:
    plt.plot(filt)
plt.xlim(0, 256)


显示快速傅里叶变换的结果与滤波器乘积
In [14]:


# Multiply each FFT frame by the filter bank to get the mel spectrum.
melspec = []
for spectrum in ffts:
    melspec.append(np.dot(spectrum, mfcc.filter_mat))
melspec = np.asarray(melspec)

plt.figure(figsize=(15, 5))
plt.pcolormesh(melspec.T, cmap='gray')
plt.xlim(0, win_num)
plt.ylim(0, 26)


放大数据中的细节同时衰减高光
In [15]:


# Log compression: amplifies low-energy detail while attenuating highlights.
mels = np.log(melspec)

plt.figure(figsize=(15, 5))
plt.pcolormesh(mels.T, cmap='gray')
plt.xlim(0, win_num)
plt.ylim(0, 26)


9、离散余弦变换效果
我们只对获得傅立叶变换的幅度感兴趣,而要计算的数据却少得多(只有26个输入点),因此我们可以使用离散余弦变换而不是FFT来大大简化此过程。
In [16]:


filter_num = 26
mfcc_num = 12

# DCT basis: one cosine per cepstral coefficient, evaluated at the filter
# centres (the k + 0.5 offsets).
dct_base = np.zeros((filter_num, mfcc_num))
for coef in range(mfcc_num):
    dct_base[:, coef] = np.cos(
        (coef + 1) * np.pi / filter_num * (np.arange(filter_num) + 0.5))

plt.figure(figsize=(6, 3))
plt.pcolormesh(dct_base.T, cmap='gray')
plt.xlim(0, 24)
plt.ylim(0, 12)
plt.xlabel('Filters')
plt.ylabel('MFCCs')


10、获得MFCC
将梅尔频谱图与DCT矩阵进行矩阵乘法以获得MFCC。
In [17]:


filter_num = 26
mfcc_num = 12

# Matrix-multiply each log-mel frame by the DCT base to obtain the MFCCs.
mfccs = np.asarray([np.dot(frame, dct_base) for frame in mels])

plt.figure(figsize=(15, 5))
plt.pcolormesh(mfccs.T, cmap='RdBu')  # cmap options: RdBu, Oranges, YlGn, OrRd, gray, ...
plt.xlim(0, win_num)
plt.ylim(0, mfcc_num)
最新发布