用python 绘制语谱图_python画出语谱图-优快云博客

用python 绘制语谱图

1.步骤：

1）导入相关模块
2）读入音频并获取音频参数
3）将音频转化为可处理形式（注意读入的是字节串（bytes）格式，需要转换成int或short型）

代码如下：


 
 
   
   
    
    
   
   
   
   
    
    
     
     import os
     import wave

     import matplotlib.pyplot as plt
     import numpy as np

     # Location of the recording. A raw string keeps the Windows backslashes
     # literal instead of relying on invalid escape sequences such as "\S".
     path = r"E:\SpeechWarehouse\zmkm"
     name = 'zmkm0.wav'
     # Full path of the audio file: E:\SpeechWarehouse\zmkm\zmkm0.wav
     filename = os.path.join(path, name)

     # Open the WAV file; the context manager closes it even if reading fails.
     with wave.open(filename, 'rb') as f:
         # Header fields: channel count, bytes per sample, sampling rate (Hz),
         # and number of frames (samples per channel).
         params = f.getparams()
         nchannels, sampwidth, framerate, nframes = params[:4]
         # Raw PCM payload as a bytes object.
         strData = f.readframes(nframes)

     # The conversion below interprets the payload as 16-bit signed samples,
     # so reject other sample widths with a clear message.
     if sampwidth != 2:
         raise ValueError(
             "only 16-bit PCM WAV files are supported, got sampwidth={}".format(sampwidth))

     # np.frombuffer replaces the deprecated binary mode of np.fromstring.
     # Convert to float before taking abs(): abs(int16(-32768)) would wrap.
     waveData = np.frombuffer(strData, dtype=np.int16).astype(np.float64)

     # Normalise to [-1, 1]; leave empty or all-zero audio untouched so that
     # we never divide by zero.
     peak = np.max(np.abs(waveData)) if waveData.size else 0.0
     if peak > 0:
         waveData = waveData / peak

     # Samples are interleaved per frame; reshape so that each row holds one
     # channel (nchannels rows, nframes columns). .T is the transpose.
     waveData = np.reshape(waveData, [nframes, nchannels]).T

其中getparams方法介绍如下：

getnchannels() -- returns number of audio channels (1 for mono, 2 for stereo)
getsampwidth() -- returns sample width in bytes
getframerate() -- returns sampling frequency
getnframes() -- returns number of audio frames
getparams() -- returns a namedtuple consisting of all of the above in the above order

稍微翻译一下：
nchannels:音频通道数(the number of audio channels),getnchannels()
sampwidth:每个音频样本的字节数(the number of bytes per audio sample),getsampwidth()
framerate:采样率(the sampling frequency),getframerate()
nframes:音频采样点数(the number of audio frames),getnframes()

4）绘制时域波形：

4.1）计算时间：t = n/fs
4.2）绘图


 
 
   
   
    
    
   
   
   
   
    
    
     
     # Plot the time-domain waveform of the first channel.
     # Sample k is taken at t = k / fs, so scale the sample indices by the
     # sampling period and shape the result as a single row (1, nframes).
     sample_period = 1.0 / framerate
     time = (np.arange(0, nframes) * sample_period).reshape(nframes, 1).T

     plt.plot(time[0, :nframes], waveData[0, :nframes], c="b")
     plt.xlabel("time(seconds)")
     plt.ylabel("amplitude")
     plt.title("Original wave")
     plt.show()

时域波形如图：

5）绘制语谱图：

5.1)求出帧长，一般取20~30ms
N = t*fs 每帧点数等于每帧时间乘以采样率
帧叠点数，一般取每帧点数的1/3~1/2
且FFT点数等于每帧点数（即不补零）
5.2)绘制语谱图，利用specgram()方法


 
 
   
   
    
    
   
   
   
   
    
    
     
     # Draw the spectrogram.
     print("plotting spectrogram...")

     # Frame duration in seconds (20-30 ms is the usual range).
     framelength = 0.025
     # Samples per frame, N = t * fs. It must equal NFFT, and NFFT should be a
     # power of two, so the value is snapped to the closest candidate below.
     framesize = framelength * framerate

     # Pick the candidate with the smallest distance to the raw frame size;
     # on a tie the earlier (smaller) candidate wins.
     lists = [32, 64, 128, 256, 512, 1024]
     framesize = int(min(lists, key=lambda candidate: abs(framesize - candidate)))

     # NFFT equals the frame size, i.e. the FFT is computed without zero padding.
     NFFT = framesize
     # Overlap between consecutive frames: about 1/3 (up to 1/2) of a frame,
     # rounded to a whole number of samples.
     overlapSize = int(round(1.0 / 3 * framesize))

     # Compute and draw the spectrogram of the first channel.
     spectrum, freqs, ts, fig = plt.specgram(
         waveData[0],
         NFFT=NFFT,
         Fs=framerate,
         window=np.hanning(M=framesize),
         noverlap=overlapSize,
         mode='default',
         scale_by_freq=True,
         sides='default',
         scale='dB',
         xextent=None,
     )
     plt.ylabel('Frequency')
     plt.xlabel('Time(s)')
     plt.title('Spectrogram')
     plt.show()

specgram()方法概述，详细信息见官网

matplotlib.pyplot.specgram(x, NFFT=None, Fs=None, Fc=None, detrend=None, window=None, noverlap=None, cmap=None, xextent=None, pad_to=None, sides=None, scale_by_freq=None, mode=None, scale=None, vmin=None, vmax=None, *, data=None, **kwargs)
#参数：
x : 信号，一维array或sequence
NFFT：每帧用于FFT的数据点数，默认256。不应通过它来补零（补零请使用pad_to），最好为2的整数次方
Fs：采样率，默认2
Fc：信号x的中心频率，默认为0，用于移动图像，
window : 窗函数，长度必须等于NFFT（帧长），默认为汉宁窗
window_hanning(), window_none(), numpy.blackman(), numpy.hamming(), numpy.bartlett(), scipy.signal(), scipy.signal.get_window(), etc.
sides : {'default', 'onesided', 'twosided'}单边频谱或双边谱
Default gives the default behavior, which returns one-sided for real data and both for complex data.
'onesided' forces the return of a one-sided spectrum,
while 'twosided' forces two-sided.
pad_to : 执行FFT时填充数据的点数，可以与NFFT不同（补零，不会增加频谱分辨率，可以减轻栅栏效应，默认为None,即等于NFFT）
scale_by_freq : bool, optional是否按密度缩放频率，MATLAB默认为真
Specifies whether the resulting density values should be scaled by the scaling frequency, which gives density in units of Hz^-1.
This allows for integration over the returned frequency values. The default is True for MATLAB compatibility.
mode : 使用什么样的频谱，默认为PSD谱（功率谱）{'default', 'psd', 'magnitude', 'angle', 'phase'}
'complex' returns the complex-valued frequency spectrum.
'magnitude' returns the magnitude spectrum.
'angle' returns the phase spectrum without unwrapping.
'phase' returns the phase spectrum with unwrapping.
noverlap : 帧叠点数，默认为128
scale : {'default', 'linear', 'dB'}频谱纵坐标单位,默认为dB
xextent : None or (xmin, xmax)图像x轴范围
cmap : A matplotlib.colors.Colormap instance; if None, use default determined by rc
detrend : {'default', 'constant', 'mean', 'linear', 'none'}
The function applied to each segment before fft-ing, designed to remove the mean or linear trend.
Unlike in MATLAB, where the detrend parameter is a vector, in matplotlib it is a function.
The mlab module defines detrend_none(), detrend_mean(), and detrend_linear(), but you can use a custom function as well.
You can also use a string to choose one of the functions. 'default', 'constant', and 'mean' call detrend_mean(). 'linear' calls detrend_linear(). 'none' calls detrend_none()
#返回：
spectrum:频谱矩阵
freqs：频谱图每行对应的频率
ts：频谱图每列对应的时间
fig ：specgram返回的图像对象（由imshow生成的AxesImage），并不是Figure

结果如图：

2.完整代码


 
 
   
   
    
    
   
   
   
   
    
    
     
     import os
     import wave

     import matplotlib.pyplot as plt
     import numpy as np

     # Location of the recording. A raw string keeps the Windows backslashes
     # literal instead of relying on invalid escape sequences such as "\S".
     path = r"E:\SpeechWarehouse\zmkm"
     name = 'zmkm0.wav'
     # Full path of the audio file: E:\SpeechWarehouse\zmkm\zmkm0.wav
     filename = os.path.join(path, name)

     # ---------------------------------------------------------------#
     # Read the WAV file and convert the raw bytes to numeric samples.
     # The context manager closes the file even if reading fails.
     with wave.open(filename, 'rb') as f:
         # Header fields: channel count, bytes per sample, sampling rate (Hz),
         # and number of frames (samples per channel).
         params = f.getparams()
         nchannels, sampwidth, framerate, nframes = params[:4]
         print("reading wav file......")
         # Raw PCM payload as a bytes object.
         strData = f.readframes(nframes)
     print("file is closed!")

     # The conversion below interprets the payload as 16-bit signed samples,
     # so reject other sample widths with a clear message.
     if sampwidth != 2:
         raise ValueError(
             "only 16-bit PCM WAV files are supported, got sampwidth={}".format(sampwidth))

     # np.frombuffer replaces the deprecated binary mode of np.fromstring.
     # Convert to float before taking abs(): abs(int16(-32768)) would wrap.
     waveData = np.frombuffer(strData, dtype=np.int16).astype(np.float64)

     # Normalise to [-1, 1]; leave empty or all-zero audio untouched so that
     # we never divide by zero.
     peak = np.max(np.abs(waveData)) if waveData.size else 0.0
     if peak > 0:
         waveData = waveData / peak

     # Samples are interleaved per frame; reshape so that each row holds one
     # channel (nchannels rows, nframes columns). .T is the transpose.
     waveData = np.reshape(waveData, [nframes, nchannels]).T

     # ----------------------------------------------------------------#
     # Plot the time-domain waveform of the first channel.
     print("plotting signal wave...")
     # Sample k is taken at t = k / fs; shape the time axis as one row.
     time = np.arange(0, nframes) * (1.0 / framerate)
     time = np.reshape(time, [nframes, 1]).T
     plt.plot(time[0, :nframes], waveData[0, :nframes], c="b")
     plt.xlabel("time")
     plt.ylabel("amplitude")
     plt.title("Original wave")
     plt.show()

     # --------------------------------------------------------------#
     # Plot the spectrogram:
     #   1. derive the frame size and the frame overlap; the FFT length equals
     #      the frame size (no zero padding);
     #   2. draw the spectrogram with plt.specgram.
     print("plotting spectrogram...")
     # Frame duration in seconds (20-30 ms is the usual range).
     framelength = 0.025
     # Samples per frame, N = t * fs. It must equal NFFT, and NFFT should be a
     # power of two, so the value is snapped to the closest candidate below.
     framesize = framelength * framerate

     # Pick the candidate with the smallest distance to the raw frame size;
     # on a tie the earlier (smaller) candidate wins.
     lists = [32, 64, 128, 256, 512, 1024]
     framesize = int(min(lists, key=lambda candidate: abs(framesize - candidate)))

     # NFFT equals the frame size, i.e. the FFT is computed without zero padding.
     NFFT = framesize
     # Overlap between consecutive frames: about 1/3 (up to 1/2) of a frame,
     # rounded to a whole number of samples.
     overlapSize = int(round(1.0 / 3 * framesize))
     print("帧长为{},帧叠为{},傅里叶变换点数为{}".format(framesize,overlapSize,NFFT))

     # Compute and draw the spectrogram of the first channel.
     spectrum, freqs, ts, fig = plt.specgram(
         waveData[0],
         NFFT=NFFT,
         Fs=framerate,
         window=np.hanning(M=framesize),
         noverlap=overlapSize,
         mode='default',
         scale_by_freq=True,
         sides='default',
         scale='dB',
         xextent=None,
     )
     plt.ylabel('Frequency')
     plt.xlabel('Time')
     plt.title("Spectrogram")
     plt.show()