# ===== 语音识别 (Speech recognition) =====
import speech_recognition as sr
def audio_to_text(audio_source, is_file=False):
    """
    Transcribe Mandarin speech to text with the Google Web Speech API.

    :param audio_source: path of the audio file to read when ``is_file`` is
        truthy; not used when recording from the microphone
    :param is_file: truthy -> read ``audio_source`` via ``sr.AudioFile``;
        falsy -> record from the default microphone
    :return: always a string -- the transcript prefixed with "识别结果:" on
        success, otherwise a human-readable failure message (exceptions
        raised while capturing or recognising are turned into messages)
    """
    recognizer = sr.Recognizer()
    try:
        if not is_file:
            # Live capture from the microphone at a 16 kHz sample rate.
            print("请开始说话(录音将持续5秒)...")
            with sr.Microphone(sample_rate=16000) as mic:
                # Sample one second of background noise before listening.
                recognizer.adjust_for_ambient_noise(mic, duration=1)
                # listen() is called with timeout=5 and phrase_time_limit=5.
                captured = recognizer.listen(mic, timeout=5, phrase_time_limit=5)
        else:
            # Whole-file capture from a local audio file.
            with sr.AudioFile(audio_source) as audio_file:
                captured = recognizer.record(audio_file)
        # Online recognition (requires network access); language is zh-CN.
        transcript = recognizer.recognize_google(captured, language="zh-CN")
    except sr.UnknownValueError:
        outcome = "抱歉,无法识别语音(可能模糊或无声音)"
    except sr.RequestError as request_error:
        outcome = f"识别请求失败:{request_error}(请检查网络连接)"
    except Exception as error:
        # Catch-all so the caller always gets a printable message.
        outcome = f"发生错误:{error}"
    else:
        outcome = f"识别结果:{transcript}"
    return outcome
# 测试:根据需求选择模式
if __name__ == "__main__":
    # Mode 1: transcribe a local audio file (prepare a WAV file first and
    # replace the sample path).
    # file_path = "test.wav"
    # print(audio_to_text(file_path, is_file=True))
    # Mode 2 (enabled by default): record from the microphone and transcribe.
    recognized = audio_to_text(None, is_file=False)
    print(recognized)
# ===== 语音合成 (Speech synthesis) =====
import pyttsx3
def text_to_speech(text, save_path=None, rate=150, volume=1.0, voice_id=None):
    """
    Speak ``text`` with pyttsx3 and optionally also write it to an audio file.

    :param text: text to synthesise
    :param save_path: when truthy, ``save_to_file(text, save_path)`` is issued
        in addition to ``say(text)``; when falsy nothing is written
    :param rate: value assigned to the engine's 'rate' property
    :param volume: value assigned to the engine's 'volume' property
    :param voice_id: optional id of an installed voice; if no installed voice
        has this id the engine's current voice is left unchanged
    """
    engine = pyttsx3.init()

    # Apply the numeric settings in the same order as before: rate, then volume.
    for prop_name, prop_value in (('rate', rate), ('volume', volume)):
        engine.setProperty(prop_name, prop_value)

    if voice_id:
        # Pick the first installed voice whose id matches exactly.
        installed_voices = engine.getProperty('voices')
        chosen = next(
            (candidate for candidate in installed_voices if candidate.id == voice_id),
            None,
        )
        if chosen is not None:
            engine.setProperty('voice', chosen.id)

    # Both requests are issued before runAndWait() is called.
    engine.say(text)
    if save_path:
        engine.save_to_file(text, save_path)

    engine.runAndWait()
    engine.stop()
# 示例使用
if __name__ == "__main__":
    # Basic demo: read one sentence aloud with the default settings.
    text = "你好,这是一个使用pyttsx3的语音合成示例。"
    text_to_speech(text=text)
    # Alternative: also save the speech to a file (the original note says this
    # usually works on Windows).
    # text_to_speech(text, save_path="output.mp3", rate=130, volume=0.8)
    # Optional: list the voices installed on this machine.
    # engine = pyttsx3.init()
    # for voice in engine.getProperty('voices'):
    #     print(f"语音ID: {voice.id}, 名称: {voice.name}")
# ===== 人脸识别 (Face recognition) =====
import cv2
import face_recognition
import os
def load_known_faces(known_faces_dir):
    """
    Load reference face images from a folder and compute one encoding per image.

    The person's name is taken from the file name without its extension
    (e.g. "alice.jpg" -> "alice"). Only ``.jpg``/``.jpeg``/``.png`` files are
    considered; the extension check is case-insensitive. When an image
    contains several faces only the first encoding is kept. Images in which
    no face is detected are skipped with a printed notice instead of
    aborting the whole load.

    :param known_faces_dir: folder containing the reference images
    :return: ``(known_encodings, known_names)`` -- two lists of equal length,
        index-aligned
    :raises OSError: if ``known_faces_dir`` cannot be listed
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    known_encodings = []
    known_names = []

    # Sorted so the load order (and therefore "first match" lookups made by
    # callers) does not depend on the file system's listing order.
    for filename in sorted(os.listdir(known_faces_dir)):
        if not filename.lower().endswith(image_suffixes):
            continue

        name = os.path.splitext(filename)[0]
        image_path = os.path.join(known_faces_dir, filename)
        image = face_recognition.load_image_file(image_path)

        encodings = face_recognition.face_encodings(image)
        if not encodings:
            # No face found: indexing [0] here would raise IndexError.
            print(f"跳过 {filename}:未检测到人脸")
            continue

        known_encodings.append(encodings[0])
        known_names.append(name)

    print(f"已加载 {len(known_names)} 张已知人脸")
    return known_encodings, known_names
def recognize_faces(known_encodings, known_names, camera_index=0):
    """
    Run live face recognition on camera frames until 'q' is pressed.

    Each frame is downscaled, searched for faces, and every detected face is
    compared with ``known_encodings``. The first known face within the
    tolerance supplies the label; otherwise the face is labelled "未知人脸".
    The annotated full-size frame is shown in an OpenCV window.

    :param known_encodings: list of known face encodings
    :param known_names: names index-aligned with ``known_encodings``
    :param camera_index: OpenCV capture index (default 0)
    :return: None
    """
    downscale = 0.25                      # analyse frames at quarter size for speed
    upscale = int(round(1 / downscale))   # factor to map boxes back to full size
    tolerance = 0.6                       # smaller is stricter
    unknown_label = "未知人脸"

    video_capture = cv2.VideoCapture(camera_index)
    if not video_capture.isOpened():
        print("无法打开摄像头,请检查索引是否正确")
        return

    try:
        while True:
            ret, frame = video_capture.read()
            if not ret:
                print("无法获取画面")
                break

            small_frame = cv2.resize(frame, (0, 0), fx=downscale, fy=downscale)
            # OpenCV delivers BGR; face_recognition expects RGB. cvtColor
            # returns a new contiguous array, whereas the slice trick
            # ``[:, :, ::-1]`` yields a negative-stride view that some dlib
            # builds reject.
            rgb_small_frame = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB)

            face_locations = face_recognition.face_locations(rgb_small_frame)
            face_encodings = face_recognition.face_encodings(rgb_small_frame, face_locations)

            face_names = []
            for face_encoding in face_encodings:
                matches = face_recognition.compare_faces(
                    known_encodings, face_encoding, tolerance=tolerance
                )
                name = unknown_label
                if True in matches:
                    name = known_names[matches.index(True)]
                face_names.append(name)

            for (top, right, bottom, left), name in zip(face_locations, face_names):
                # Map the box from the downscaled frame back to the full frame.
                top, right, bottom, left = (
                    coord * upscale for coord in (top, right, bottom, left)
                )
                # Green face box.
                cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 2)
                # Filled black label strip with white text.
                cv2.rectangle(frame, (left, bottom - 35), (right, bottom), (0, 0, 0), cv2.FILLED)
                font = cv2.FONT_HERSHEY_DUPLEX
                # NOTE: the Hershey fonts used by cv2.putText cannot render
                # non-ASCII glyphs, so Chinese names will not display
                # legibly; drawing them needs a TrueType renderer.
                cv2.putText(frame, name, (left + 6, bottom - 6), font, 1.0, (255, 255, 255), 1)

            cv2.imshow('人脸识别', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the camera and close windows even if processing raised.
        video_capture.release()
        cv2.destroyAllWindows()
if __name__ == "__main__":
    # Folder holding the reference images (file name = person's name),
    # e.g. "张三.jpg", "李四.png". Replace with your own folder.
    known_faces_directory = "known_faces"
    if os.path.exists(known_faces_directory):
        # Folder is present: load the reference faces and start the camera loop.
        known_encodings, known_names = load_known_faces(known_faces_directory)
        recognize_faces(known_encodings, known_names)
    else:
        # First run: create the folder and ask the user to populate it.
        os.makedirs(known_faces_directory)
        print(f"已创建已知人脸文件夹:{known_faces_directory},请放入人脸图片(文件名即为名字)")
# ===== 文字识别 (Text recognition / OCR) =====
from aip import AipOcr
# 替换为你的百度AI应用信息
# Baidu AI application credentials -- placeholders, replace with your own.
APP_ID, API_KEY, SECRET_KEY = (
    '你的APP_ID',
    '你的API_KEY',
    '你的SECRET_KEY',
)
# Module-level OCR client used by the functions below.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def get_file_content(file_path):
    """Return the raw bytes of the image file at ``file_path``."""
    with open(file_path, mode='rb') as image_file:
        raw_bytes = image_file.read()
    return raw_bytes
def ocr_with_baidu(image_path, type='general_basic'):
    """
    Recognise the text in an image through the Baidu OCR client.

    :param image_path: path of the image to recognise
    :param type: which endpoint to call -- 'general_basic' (general text),
        'accurate_basic' (high accuracy) or 'handwriting'
    :return: the recognised lines joined with newlines, or a message starting
        with "识别失败:" when the response carries no 'words_result'
    :raises ValueError: if ``type`` is not one of the supported values
    """
    # (recognition type, name of the client method implementing it)
    endpoints = (
        ('general_basic', 'basicGeneral'),
        ('accurate_basic', 'basicAccurate'),
        ('handwriting', 'handwriting'),
    )

    # The image is read before the type is validated, as in the original flow.
    image = get_file_content(image_path)

    method_name = next((method for key, method in endpoints if type == key), None)
    if method_name is None:
        raise ValueError("不支持的识别类型")
    result = getattr(client, method_name)(image)

    # Guard clause: a response without 'words_result' is reported as a failure.
    if 'words_result' not in result:
        return f"识别失败:{result.get('error_msg', '未知错误')}"

    recognised_lines = [entry['words'] for entry in result['words_result']]
    return '\n'.join(recognised_lines)
# 示例使用
if __name__ == "__main__":
    # Replace with the path of the image you want to recognise.
    image_path = "test.png"
    try:
        # General text recognition.
        result = ocr_with_baidu(image_path=image_path, type='general_basic')
        print(f"百度AI识别结果:\n{result}")
    except Exception as error:
        # Report any failure (missing file, bad type, client error) and exit.
        print(f"错误:{error}")
# ===== 证件识别 (ID document recognition) =====
from aip import AipOcr
import json
# 替换为你的百度AI应用信息(从控制台获取)
# Baidu AI application credentials (from the Baidu console) -- placeholders,
# replace with your own.
APP_ID, API_KEY, SECRET_KEY = (
    "你的APP_ID",
    "你的API_KEY",
    "你的SECRET_KEY",
)
# Module-level OCR client used by the functions below.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
def get_image_content(image_path):
    """Return the raw bytes of the ID-document image at ``image_path``."""
    with open(image_path, mode='rb') as image_file:
        raw_bytes = image_file.read()
    return raw_bytes
def id_card_ocr(image_path, side="front"):
    """
    Recognise a Chinese ID card image through the Baidu OCR client.

    :param image_path: path of the ID card image
    :param side: which face of the card the image shows -- "front" or "back"
    :return: on failure ``{"status": "error", "message": ...}``; on success
        ``{"status": "success", "side": ..., "image_status": ..., "info": {...}}``
        where ``info`` maps English keys to the recognised strings. Fields
        absent from the response (e.g. back-side fields when the front was
        scanned) are returned as empty strings.
    """
    image = get_image_content(image_path)
    # detect_direction lets the service handle rotated images.
    result = client.idcard(image, side, {"detect_direction": "true"})

    if "error_code" in result:
        # Request-level failure (bad credentials, invalid image, ...).
        return {
            "status": "error",
            "message": f"错误码:{result['error_code']},信息:{result.get('error_msg', '')}",
        }

    # Tolerate a missing/empty 'words_result' instead of raising KeyError.
    info = result.get("words_result") or {}

    def field(*labels):
        """Return the 'words' of the first label present with a non-empty value."""
        for label in labels:
            words = info.get(label, {}).get("words", "")
            if words:
                return words
        return ""

    return {
        "status": "success",
        "side": side,
        # Passed through from the response when present; Baidu's docs describe
        # values such as normal / reversed_side / non_idcard / blurred.
        "image_status": result.get("image_status", ""),
        "info": {
            # Front-side fields
            "name": field("姓名"),
            "gender": field("性别"),
            "nation": field("民族"),
            "birth": field("出生"),
            "address": field("住址"),
            "id_number": field("公民身份号码"),
            # Back-side fields
            "issuer": field("签发机关"),
            "issue_date": field("签发日期"),
            # Baidu's documentation labels the expiry date "失效日期"; the
            # original lookup key "有效期至" is kept as a fallback.
            "valid_period": field("失效日期", "有效期至"),
        },
    }
# 示例使用
if __name__ == "__main__":
    # Replace with the paths of your own ID card images.
    id_card_front_path = "id_card_front.jpg"  # front side
    id_card_back_path = "id_card_back.jpg"    # back side

    # Recognise the front side and pretty-print the result
    # (ensure_ascii=False keeps Chinese characters readable).
    front_result = id_card_ocr(image_path=id_card_front_path, side="front")
    print("身份证正面识别结果:")
    print(json.dumps(front_result, indent=2, ensure_ascii=False))

    # Optional: recognise the back side as well.
    # back_result = id_card_ocr(id_card_back_path, side="back")
    # print("\n身份证反面识别结果:")
    # print(json.dumps(back_result, ensure_ascii=False, indent=2))
# ===== 语义理解 (Semantic understanding) =====
import torch
# `AutoModelForSentenceSimilarity` is not a class exported by transformers
# (importing it fails); the plain `AutoModel` encoder is imported instead.
from transformers import (
    AutoModel,
    AutoModelForSequenceClassification,
    AutoModelForTokenClassification,
    AutoTokenizer,
    pipeline,
)
# --------------------------
# 1. 意图识别(判断用户意图)
# --------------------------
def intent_recognition(text):
    """
    Classify ``text`` and report the highest-scoring label.

    A "text-classification" pipeline is built on every call and asked for the
    scores of all labels; the label with the largest score wins.

    NOTE(review): the model name suggests it was fine-tuned on Dianping
    reviews -- confirm that its labels correspond to the intents expected.

    :param text: sentence to classify
    :return: dict with the input ``text``, the winning label as
        ``top_intent`` and its score rounded to 4 decimals as ``confidence``
    """
    def score_of(candidate):
        return candidate['score']

    intent_classifier = pipeline(
        "text-classification",
        model="uer/roberta-base-finetuned-dianping-chinese",
        return_all_scores=True,  # ask for every label's score, not only the best
    )
    # The first element holds the per-label scores for the single input text.
    per_label_scores = intent_classifier(text)[0]
    best = max(per_label_scores, key=score_of)
    label, confidence = best['label'], round(best['score'], 4)
    return {"text": text, "top_intent": label, "confidence": confidence}
# --------------------------
# 2. 命名实体识别(提取关键实体)
# --------------------------
def _merge_entity_tokens(tokens):
    """Merge per-token NER predictions into whole entities with a mean score."""
    merged = []
    current = None  # entity being assembled: {"entity", "type", "scores"}

    def flush():
        # Finalise the entity being assembled, if any.
        nonlocal current
        if current is not None:
            scores = current["scores"]
            merged.append({
                "entity": current["entity"],
                "type": current["type"],
                "score": sum(scores) / len(scores),  # true arithmetic mean
            })
            current = None

    for token in tokens:
        prefix, _, label = token['entity'].partition('-')
        word = token['word']
        score = float(token['score'])

        if prefix in ('B', 'S'):
            # B- starts a new entity; S- is a complete single-token entity.
            flush()
            current = {"entity": word, "type": label, "scores": [score]}
            if prefix == 'S':
                flush()
        elif prefix in ('I', 'E'):
            if current is not None and current["type"] == label:
                current["entity"] += word
                current["scores"].append(score)
                if prefix == 'E':
                    flush()  # E- marks the last token of the entity
            else:
                # Continuation tag with nothing (compatible) to continue:
                # close whatever is open and ignore the stray token.
                flush()
    flush()
    return merged


def named_entity_recognition(text):
    """
    Extract named entities from ``text``.

    Runs a token-classification ("ner") pipeline and merges the per-token
    predictions into whole entities. Tags are expected in ``PREFIX-TYPE``
    form: ``B-`` starts an entity and ``I-`` continues it; ``E-`` (end) and
    ``S-`` (single) are also understood in case the model uses a BIOES
    scheme. A continuation token is merged only when its type equals the
    current entity's type. The entity score is the arithmetic mean of its
    token scores.

    :param text: sentence to analyse
    :return: ``{"text": text, "entities": [{"entity", "type", "score"}, ...]}``
    """
    ner_pipeline = pipeline(
        "ner",
        model="ckiplab/bert-base-chinese-ner",
        tokenizer=AutoTokenizer.from_pretrained("ckiplab/bert-base-chinese-ner"),
    )
    token_predictions = ner_pipeline(text)
    return {
        "text": text,
        "entities": _merge_entity_tokens(token_predictions),
    }
# --------------------------
# 3. 语义相似度计算(判断句子含义是否相近)
# --------------------------
def semantic_similarity(text1, text2):
    """
    Compute the cosine similarity between the sentence embeddings of two texts.

    Both sentences are encoded with the ``shibing624/text2vec-base-chinese``
    encoder. Each sentence vector is the mean of its token vectors, with
    padding positions excluded through the attention mask.

    :param text1: first sentence
    :param text2: second sentence
    :return: dict with both inputs and ``similarity`` rounded to 4 decimals.
        Cosine similarity lies in [-1, 1]; higher means more similar.
    """
    model_name = "shibing624/text2vec-base-chinese"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # transformers has no ``AutoModelForSentenceSimilarity``; the plain
    # encoder is loaded and pooled manually.
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    # Encode both sentences as one padded batch.
    batch = tokenizer([text1, text2], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        token_embeddings = model(**batch).last_hidden_state  # (2, seq_len, hidden)

    # Mean-pool over real tokens only: zero out padding, divide by token count.
    mask = batch["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    similarity = torch.nn.functional.cosine_similarity(
        sentence_embeddings[0:1], sentence_embeddings[1:2]
    ).item()
    return {
        "text1": text1,
        "text2": text2,
        "similarity": round(similarity, 4),
    }
# 示例使用
if __name__ == "__main__":
    # 1. Intent recognition demo
    text = "明天北京的天气怎么样?"
    intent_result = intent_recognition(text)
    print("意图识别结果:")
    print(intent_result)

    # 2. Named entity recognition demo
    text = "张三计划下周去上海参加腾讯的会议"
    ner_result = named_entity_recognition(text)
    print("\n命名实体识别结果:")
    print(ner_result)

    # 3. Semantic similarity demo
    text1 = "我想查询明天的航班"
    text2 = "帮我看看明天有哪些飞机可以坐"
    similarity_result = semantic_similarity(text1, text2)
    print("\n语义相似度结果:")
    print(similarity_result)
    # The following text was fused onto the end of the print statement above
    # (a syntax error); it is kept here as a comment:
    # 生成可形成演示效果的python代码
    # ("generate Python code that can produce a demo effect")
# 最新发布 ("latest posts" -- appears to be leftover page text, not code)