Add the following new requirements to the call-recording QA code below:
1. The audio includes the call-setup phase, i.e. it contains ringback or dialing tones; these must be excluded from recognition.
2. The audio may capture abnormal outcomes such as no answer, rejected calls, or mid-call hang-ups; these must be detected and labeled.
3. The valid conversation is only what follows the moment the call is answered: detect the answer point and recognize only the audio after it, discarding the ringback or dialing portion before the connection (a sketch of this follows the list).
4. The scenario is a customer-agent telephone conversation in which the agent dials out (an outbound call).
5. The environment is noisy, with heavy background speech, above all before the call is answered; noise reduction is required, especially suppression of background human voices (a non-stationary denoising sketch follows the code below).
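As a concrete starting point for requirements 1-3, here is a minimal sketch of answer-point detection and call-status labeling. It assumes the mainland-China ringback tone (roughly a 450 Hz tone, about 1 s on / 4 s off); the helper names detect_answer_point and classify_call_status, the 0.6 tone-dominance threshold, and the timing cutoffs are all hypothetical values that would need tuning on real recordings:

import numpy as np
from scipy.io import wavfile
from scipy.signal import stft

def detect_answer_point(audio_path, tone_hz=450.0, band_hz=40.0, frame_s=0.05):
    """Estimate when the ringback tone ends and the conversation begins.

    Hypothetical helper: assumes a ~450 Hz ringback tone; the thresholds
    below are illustrative, not validated values.
    """
    rate, data = wavfile.read(audio_path)
    if data.ndim > 1:
        data = data[:, 0]
    nperseg = max(256, int(rate * frame_s))
    freqs, times, spec = stft(data.astype(np.float64), fs=rate, nperseg=nperseg)
    power = np.abs(spec) ** 2
    band = (freqs >= tone_hz - band_hz) & (freqs <= tone_hz + band_hz)
    # Fraction of each frame's energy concentrated around the tone frequency.
    tone_ratio = power[band].sum(axis=0) / (power.sum(axis=0) + 1e-12)
    tone_frames = np.where(tone_ratio > 0.6)[0]
    if len(tone_frames) == 0:
        # No ringback found; treat the whole file as conversation.
        return 0.0, False
    # The call is answered roughly where the last tone burst ends.
    return float(times[tone_frames[-1]]), True

def classify_call_status(audio_path, min_talk_s=3.0):
    """Hypothetical helper: coarse call-outcome label from the answer point."""
    rate, data = wavfile.read(audio_path)
    if data.ndim > 1:
        data = data[:, 0]
    total_s = len(data) / rate
    answer_time, had_tone = detect_answer_point(audio_path)
    talk_s = total_s - answer_time
    if had_tone and talk_s < 1.0:
        return "no-answer/rejected", answer_time  # rang, but no conversation followed
    if talk_s < min_talk_s:
        return "suspected mid-call hang-up", answer_time  # answered, then dropped quickly
    return "connected", answer_time

In robust_speech_recognition these helpers could run first: crop the audio at answer_time so only the post-answer conversation reaches the ASR pipeline, and write the status label into the output file. Since the agent dials out (requirement 4), the first speaker after the answer point can plausibly be mapped to the agent role when labeling spk0/spk1.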
The original code follows:
import os
import json
import torch
import numpy as np
from pydub import AudioSegment, effects
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import noisereduce as nr
from scipy.io import wavfile
def extract_voice_activity(audio_path):
    """Detect voice activity and return the start/end times of valid speech."""
    # Use pyannote voice activity detection (requires pyannote.audio).
    try:
        from pyannote.audio import Pipeline
        vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
        output = vad_pipeline(audio_path)
        # Collect all speech segments.
        speech_segments = []
        for speech in output.get_timeline().support():
            speech_segments.append((speech.start, speech.end))
        # If any speech was found, return the span from first to last segment.
        if speech_segments:
            return speech_segments[0][0], speech_segments[-1][1]
    except ImportError:
        print("pyannote not installed, falling back to simple energy detection")
    # Fallback: simple energy-based voice activity detection.
    audio = AudioSegment.from_wav(audio_path)
    samples = np.array(audio.get_array_of_samples())
    sample_rate = audio.frame_rate
    # Smoothed per-sample energy.
    energy = np.convolve(np.abs(samples), np.ones(512) / 512, mode='same')
    # Threshold: treat the top 20% of energy as speech.
    threshold = np.percentile(energy, 80)
    # Locate the first and last samples above the threshold.
    start_idx = np.argmax(energy > threshold)
    end_idx = len(energy) - np.argmax(energy[::-1] > threshold)
    return start_idx / sample_rate, end_idx / sample_rate
def extract_main_voice(audio_path, output_path):
    """Extract the dominant speaker's voice and suppress background noise."""
    # Load audio.
    rate, data = wavfile.read(audio_path)
    # If multichannel, keep the first channel.
    if len(data.shape) > 1:
        data = data[:, 0]
    # Denoise with noisereduce, using the first 0.5 s as the noise profile.
    noise_sample = data[:int(rate * 0.5)]
    reduced_noise = nr.reduce_noise(
        y=data,
        sr=rate,
        y_noise=noise_sample,
        prop_decrease=0.95,  # aggressive reduction
        stationary=True,
        n_std_thresh_stationary=2.0,
        use_tqdm=False
    )
    # Save the processed audio.
    wavfile.write(output_path, rate, reduced_noise.astype(np.int16))
    return output_path
def enhance_telephone_quality(audio_path, output_path):
    """Enhance telephone recording quality by shaping the frequency band."""
    # Load audio.
    audio = AudioSegment.from_wav(audio_path)
    # Telephone band-pass filter (300-3400 Hz).
    audio = audio.high_pass_filter(300).low_pass_filter(3400)
    # Dynamic range compression.
    audio = audio.compress_dynamic_range(threshold=-20.0, ratio=4.0)
    # Volume normalization.
    audio = effects.normalize(audio)
    # Save.
    audio.export(output_path, format="wav")
    return output_path
def robust_speech_recognition(input_wav, output_txt, target_speakers=2):
    """Robust speech recognition with speaker diarization."""
    # Preprocessing steps.
    temp_files = []
    # Step 1: detect the valid speech span.
    start_time, end_time = extract_voice_activity(input_wav)
    print(f"Detected valid speech span: {start_time:.2f}s - {end_time:.2f}s")
    # Step 2: crop to the valid speech span.
    cropped_path = "temp_cropped.wav"
    audio = AudioSegment.from_wav(input_wav)
    cropped_audio = audio[int(start_time * 1000):int(end_time * 1000)]
    cropped_audio.export(cropped_path, format="wav")
    temp_files.append(cropped_path)
    # Step 3: extract the dominant speaker's voice.
    denoised_path = "temp_denoised.wav"
    extract_main_voice(cropped_path, denoised_path)
    temp_files.append(denoised_path)
    # Step 4: telephone quality enhancement.
    enhanced_path = "temp_enhanced.wav"
    enhance_telephone_quality(denoised_path, enhanced_path)
    temp_files.append(enhanced_path)
    # Initialize the ASR model.
    model_dir = r"D:/models/ASR-models/iic/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn"
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=model_dir,
        ngpu=1 if torch.cuda.is_available() else 0,
        # Key tuning parameters.
        vad_batch_size=1,
        punc_batch_size=1,
        chunk_size=8,  # smaller chunks improve accuracy
        param_dict={
            "spk_num": target_speakers,
            "hotword": "",                # disable hotwords
            "vad_threshold": 0.85,        # high threshold to reject noise
            "min_silence_duration": 800,  # split only on longer silences
            "speech_noise_thres": 0.75    # speech/noise decision threshold
        }
    )
    # Run inference.
    result = inference_pipeline(input=enhanced_path)
    # Clean up temporary files.
    for file in temp_files:
        if os.path.exists(file):
            os.remove(file)
    # Parse the result.
    if not result:
        print("No recognition result returned")
        return
    if isinstance(result, list):
        result = result[0]  # take the first result
    output_lines = []
    if "sentence_info" in result:
        # Sort segments by start time.
        segments = sorted(result["sentence_info"], key=lambda x: x["start"])
        # Drop segments that are too short.
        min_duration = 0.5  # at least 0.5 s
        filtered_segments = [
            seg for seg in segments
            if (seg["end"] - seg["start"]) / 1000 > min_duration
        ]
        # Apply the speaker mapping.
        for i, segment in enumerate(filtered_segments):
            speaker_id = segment.get("spk", 0)
            speaker = f"spk{speaker_id % target_speakers}"
            text = segment.get("text", "")
            output_lines.append(f"{speaker}: {text}")
    else:
        print("Warning: unrecognized result format")
        print(json.dumps(result, indent=2, ensure_ascii=False))
        return
    # Write to file.
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(output_lines))
    print(f"Result saved to {output_txt}")
    return result
def post_process_result(input_txt, output_txt):
    """Post-processing: merge short utterances and correct common errors."""
    with open(input_txt, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    # Merge consecutive short utterances from the same speaker.
    merged_lines = []
    current_speaker = None
    current_text = ""
    for line in lines:
        if ':' in line:
            parts = line.split(':', 1)
            speaker = parts[0].strip()
            text = parts[1].strip()
            if speaker == current_speaker:
                current_text += " " + text
            else:
                if current_speaker is not None:
                    merged_lines.append(f"{current_speaker}: {current_text}")
                current_speaker = speaker
                current_text = text
    if current_speaker is not None:
        merged_lines.append(f"{current_speaker}: {current_text}")
    # Common error corrections (customize as needed).
    corrections = {
        "你好": "您好",  # normalize to the polite form of address
        "喂": "您好",    # normalize the opening greeting
    }
    corrected_lines = []
    for line in merged_lines:
        for wrong, correct in corrections.items():
            line = line.replace(wrong, correct)
        corrected_lines.append(line)
    # Write the processed result.
    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(corrected_lines))
    print(f"Post-processing complete, saved to: {output_txt}")
if __name__ == "__main__":
    input_audio = "D:/python/语音情感分析/实际录音/测试/中国移动(10086)_20250506114248_converted.wav"
    raw_output = "raw_diarization_result.txt"
    final_output = "diarization_result.txt"
    # Run robust recognition.
    robust_speech_recognition(input_audio, raw_output, target_speakers=2)
    # Post-process.
    post_process_result(raw_output, final_output)
    # Print the final result.
    print("\nFinal recognition result:")
    with open(final_output, 'r', encoding='utf-8') as f:
        for line in f:
            print(line.strip())
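
One more note on requirement 5: extract_main_voice above performs stationary noise reduction with the first 0.5 s as the noise profile, but on an outbound call that opening window is exactly where background speech is heaviest, so the profile is itself speech-like. Below is a minimal sketch of a non-stationary pass with noisereduce, which re-estimates the noise floor over time; the function name reduce_babble_noise is hypothetical, and prop_decrease and time_constant_s are illustrative values to tune:

import numpy as np
import noisereduce as nr
from scipy.io import wavfile

def reduce_babble_noise(audio_path, output_path):
    """Hypothetical helper: non-stationary reduction of fluctuating background speech."""
    rate, data = wavfile.read(audio_path)
    if data.ndim > 1:
        data = data[:, 0]
    reduced = nr.reduce_noise(
        y=data.astype(np.float64),
        sr=rate,
        stationary=False,     # re-estimate the noise floor over time
        prop_decrease=0.9,    # illustrative strength; tune on real calls
        time_constant_s=2.0,  # how quickly the noise estimate adapts
    )
    wavfile.write(output_path, rate, reduced.astype(np.int16))
    return output_path

This could replace the reduce_noise call inside extract_main_voice, or run as an extra pass on the pre-answer segment where background voices dominate.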