一、方案架构与核心代码
- 基础环境配置
# Initialise the Huawei Cloud ModelArts SDK session.
from modelarts.session import Session
from modelarts.estimator import Estimator

# NOTE(review): every value below is a placeholder. Load real credentials from
# a secure source rather than committing them, and confirm the region keyword
# name against the installed ModelArts SDK release.
session = Session(
    access_key='your_access_key',
    secret_key='your_secret_key',
    project_id='your_project_id',
    region='cn-north-4',
)
# Configure a single-instance PyTorch estimator bound to the session above;
# logs are written to the OBS location given by ``log_url``.
estimator = Estimator(
    modelarts_session=session,
    train_instance_type='ml.p3.large',
    train_instance_count=1,
    framework_type='PyTorch-1.8',
    framework_version='py3',
    log_url='obs://your-bucket/logs/',
)
- 会议音频处理模块
import numpy as np
from huaweicloud_sis.client.rasr_client import RasrClient
from huaweicloud_sis.bean.rasr_request import RasrRequest
# Huawei Cloud speech-recognition client setup
def init_speech_client():
    """Return a ``RasrClient`` built from the placeholder settings below.

    The access key, secret key, region and project id are passed positionally
    in that order.
    """
    credentials = ('your_ak', 'your_sk')
    location = ('cn-north-4', 'your_project_id')
    return RasrClient(*credentials, *location)
# Speech-to-text conversion
def audio_to_text(audio_path):
    """Send the audio file at ``audio_path`` for recognition.

    Args:
        audio_path: Path of the audio file; it is read fully into memory.

    Returns:
        Whatever ``get_result()`` yields on the recognition response.
    """
    speech_client = init_speech_client()
    rasr_request = RasrRequest()
    rasr_request.set_audio_format('wav')  # original note: wav, mp3, etc. are supported
    rasr_request.set_property('speaker_diarization', 'true')  # turn on speaker diarization
    rasr_request.add_word('公司术语')  # register a custom vocabulary entry
    with open(audio_path, 'rb') as audio_file:
        rasr_request.set_data(audio_file.read())
    recognition = speech_client.short_audio_recognize(rasr_request)
    return recognition.get_result()
# Example usage: transcribe a local recording at import time.
transcript = audio_to_text('meeting_audio.wav')
二、大模型集成代码
- Flexus+DeepSeek模型加载
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the pretrained tokenizer and causal-LM weights in float16, letting
# device_map="auto" decide weight placement.
# NOTE(review): model_path is an obs:// URL; confirm from_pretrained can read
# it directly, otherwise copy the files to a local directory first.
# NOTE(review): trust_remote_code=True runs code shipped with the checkpoint;
# only use it with a trusted model source.
model_path = 'obs://your-bucket/models/flexus-deepseek-meeting/'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
model_path,
device_map="auto",
torch_dtype=torch.float16,
trust_remote_code=True
)
# Prompt template for meeting-minutes generation. Placeholders filled via
# str.format: meeting_topic, participants, meeting_time, transcript.
MEETING_PROMPT = """
作为专业的会议纪要助手,请根据以下会议转录内容生成结构化会议纪要:
会议主题: {meeting_topic}
参会人员: {participants}
会议时间: {meeting_time}
转录内容:
{transcript}
请按照以下格式输出:
### 会议摘要
- 主要讨论点1
- 主要讨论点2
### 决策事项
1. 事项描述 (负责人: xxx, 截止时间: yyyy-mm-dd)
### 待办事项
- [ ] 任务1 (负责人: xxx)
- [ ] 任务2 (负责人: xxx)
"""
- 会议纪要生成核心逻辑
def generate_meeting_minutes(transcript, meeting_info):
    """Generate structured meeting minutes from a transcript.

    Args:
        transcript: Transcript text inserted into the prompt template.
        meeting_info: Mapping with 'topic', 'participants' (an iterable of
            strings, joined with ", ") and 'time' entries.

    Returns:
        The result of ``post_process_output`` applied to the generated text.
    """
    # Fill the prompt template with the meeting metadata and transcript.
    prompt = MEETING_PROMPT.format(
        meeting_topic=meeting_info['topic'],
        participants=", ".join(meeting_info['participants']),
        meeting_time=meeting_info['time'],
        transcript=transcript,
    )

    # The model is loaded with device_map="auto", so follow its device
    # instead of assuming CUDA is available.
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    # NOTE(review): max_length counts the prompt tokens too; a long transcript
    # leaves little or no room for the answer. Consider max_new_tokens.
    outputs = model.generate(
        input_ids,
        max_length=2048,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        num_return_sequences=1,
    )

    # A causal LM returns prompt + continuation. The prompt itself contains
    # the "###" section template, so strip it before parsing; otherwise the
    # template placeholders can be parsed as if they were the answer.
    new_tokens = outputs[0][input_ids.shape[-1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return post_process_output(generated_text)
def post_process_output(text):
    """Split generated minutes into summary, decisions and action items.

    Sections are introduced by "### <title>" lines. A title containing
    '摘要' feeds 'summary', '决策' feeds 'decisions' and '待办' feeds
    'action_items'; other sections are ignored.

    Args:
        text: Raw text produced by the model.

    Returns:
        A dict with the keys 'summary' (list of str), 'decisions' (result of
        ``parse_decisions``) and 'action_items' (result of
        ``parse_action_items``). Missing sections stay empty lists.
    """
    import re

    result = {
        'summary': [],
        'decisions': [],
        'action_items': [],
    }
    pattern = r'### (.*?)\n(.*?)(?=###|$)'
    for title, content in re.findall(pattern, text, re.DOTALL):
        if '摘要' in title:
            result['summary'] = _parse_summary_items(content)
        elif '决策' in title:
            result['decisions'] = parse_decisions(content)
        elif '待办' in title:
            result['action_items'] = parse_action_items(content)
    return result


def _parse_summary_items(content):
    """Return the bullet items of a summary section as stripped strings."""
    import re

    # Split only on hyphens that open a line, so hyphens inside an item
    # (for example a date such as 2023-09-15) do not break it apart.
    pieces = re.split(r'^\s*-\s*', content, flags=re.MULTILINE)[1:]
    return [piece.strip() for piece in pieces if piece.strip()]


def parse_action_items(content):
    """Parse "- [ ] task (负责人: owner)" lines into dicts.

    Each returned dict has 'task' (str), 'owner' (str, or None when the line
    names no owner) and 'done' (True when the checkbox is ticked with x/X).
    Lines that are not checkbox bullets are skipped.
    """
    import re

    item_re = re.compile(
        r'\s*-\s*\[( |x|X)?\]\s*(.*?)'
        r'(?:\s*[\((]\s*负责人\s*[::]\s*(.*?)\s*[\))])?\s*$'
    )
    items = []
    for line in content.splitlines():
        if not line.strip():
            continue
        match = item_re.match(line)
        if not match:
            continue
        mark, task, owner = match.groups()
        items.append({
            'task': task,
            'owner': owner,
            'done': mark in ('x', 'X'),
        })
    return items
def parse_decisions(content):
    """Parse numbered decision lines into structured records.

    Expected line shape: "1. description (负责人: owner, 截止时间: deadline)".
    Leading indentation, optional spaces and full-width punctuation
    (( ) : ,) are tolerated. Lines that do not match are skipped.

    Args:
        content: Text of the decisions section.

    Returns:
        A list of dicts with 'description', 'owner' and 'deadline' strings.
    """
    # Imported locally: the only other ``import re`` in the file is inside
    # post_process_output, so the name is not visible here.
    import re

    decision_re = re.compile(
        r'\d+[.、]\s*(.*?)\s*[\((]\s*负责人\s*[::]\s*(.*?)\s*[,,]'
        r'\s*截止时间\s*[::]\s*(.*?)\s*[\))]'
    )
    decisions = []
    for line in content.split('\n'):
        line = line.strip()
        if not line:
            continue
        match = decision_re.match(line)
        if match:
            desc, owner, deadline = match.groups()
            decisions.append({
                'description': desc,
                'owner': owner,
                'deadline': deadline,
            })
    return decisions
三、企业系统对接代码
- 与华为云会议服务集成
from huaweicloudsdkcore.auth.credentials import BasicCredentials
from huaweicloudsdkmeeting.v1 import *
# Meeting-service client setup
def init_meeting_client():
    """Build a ``MeetingClient`` for region 'cn-north-4'.

    The credentials are the placeholder AK, SK and project id below.
    """
    creds = BasicCredentials('your_ak', 'your_sk', 'your_project_id')
    target_region = MeetingRegion.value_of('cn-north-4')
    builder = MeetingClient.new_builder()
    return (
        builder
        .with_credentials(creds)
        .with_region(target_region)
        .build()
    )
# Fetch the recording list of a meeting
def get_meeting_recordings(meeting_id):
    """Return the ``recordings`` attribute of the list-recordings response.

    Args:
        meeting_id: Value assigned to the request's ``conference_id``.
    """
    meeting_client = init_meeting_client()
    list_request = ListRecordingsRequest()
    list_request.conference_id = meeting_id
    return meeting_client.list_recordings(list_request).recordings
# Download a meeting recording
def download_recording(recording_id, save_path):
    """Stream a recording to ``save_path`` in 8192-byte chunks.

    Args:
        recording_id: Identifier set on the download request.
        save_path: Destination file path, opened in binary write mode.
    """
    meeting_client = init_meeting_client()
    download_request = DownloadRecordingRequest()
    download_request.recording_id = recording_id
    stream = meeting_client.download_recording(download_request, stream=True)
    with open(save_path, 'wb') as out_file:
        out_file.writelines(stream.iter_content(chunk_size=8192))
- 与企业知识库对接
import requests
from datetime import datetime
# Store meeting minutes in the enterprise knowledge base
def save_to_knowledge_base(meeting_minutes, meeting_info):
    """POST the minutes to the knowledge-base API and return its JSON reply.

    Args:
        meeting_minutes: Minutes content; must be JSON-serialisable.
        meeting_info: Mapping with 'topic', 'participants' and 'time';
            an optional 'projects' entry defaults to an empty list.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    payload = {
        "document_type": "meeting_minutes",
        "title": f"{meeting_info['topic']}会议纪要",
        "content": meeting_minutes,
        "metadata": {
            "participants": meeting_info['participants'],
            "meeting_time": meeting_info['time'],
            "created_at": datetime.now().isoformat(),
            "related_projects": meeting_info.get('projects', []),
        },
        "tags": ["auto-generated", "meeting"],
    }
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer your_kb_token",
    }
    response = requests.post(
        "https://your-kb-api/api/v1/documents",
        json=payload,
        headers=headers,
        timeout=30,  # requests waits indefinitely when no timeout is given
    )
    # Fail on HTTP errors instead of returning an error body as if it were
    # a saved document.
    response.raise_for_status()
    return response.json()
四、完整工作流示例
# Main processing pipeline
def process_meeting(meeting_id):
    """Run the recording -> transcript -> minutes -> knowledge-base pipeline.

    Args:
        meeting_id: Identifier of the meeting to process.

    Returns:
        A dict with 'status' set to 'success' and 'minutes_url' taken from
        the knowledge-base response.

    Raises:
        Exception: If the meeting has no recordings.
    """
    import os
    import tempfile

    # 1. Fetch the recordings of the meeting.
    recordings = get_meeting_recordings(meeting_id)
    if not recordings:
        raise Exception("未找到会议录音")

    # 2. Download the most recent recording into the temp directory.
    latest_recording = sorted(recordings, key=lambda x: x.create_time)[-1]
    # basename() keeps a crafted meeting id from escaping the temp directory.
    audio_name = f"{os.path.basename(str(meeting_id))}.wav"
    audio_path = os.path.join(tempfile.gettempdir(), audio_name)
    try:
        download_recording(latest_recording.id, audio_path)
        # 3. Speech to text.
        transcript = audio_to_text(audio_path)
    finally:
        # The audio is only needed for transcription; do not leave it behind.
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # 4. Meeting metadata (implementation not shown in this file).
    meeting_info = get_meeting_info(meeting_id)
    # 5. Generate the minutes.
    minutes = generate_meeting_minutes(transcript, meeting_info)
    # 6. Save them to the knowledge base.
    save_result = save_to_knowledge_base(minutes, meeting_info)
    # 7. Notify the participants.
    notify_participants(meeting_info['participants'], save_result['url'])
    return {
        'status': 'success',
        'minutes_url': save_result['url'],
    }
# Example invocation
if __name__ == "__main__":
    result = process_meeting("meeting123")
    print(f"会议纪要处理完成,访问地址: {result['minutes_url']}")
五、模型训练与优化代码
- 领域适配微调
from modelarts.train import TrainingJob
# Create the fine-tuning training job
def fine_tune_model(train_data_path):
    """Create and start a fine-tuning job on ModelArts.

    Args:
        train_data_path: Location of the training data, passed as the
            'data_url' of the single OBS input.

    Returns:
        The ``TrainingJob`` object after ``create()`` and ``run()`` were called.
    """
    job = TrainingJob(
        name='flexus-deepseek-meeting-ft',
        algorithm='PyTorch-1.8',
        inputs=[
            {
                'data_url': train_data_path,
                'type': 'obs',
            }
        ],
        outputs=[
            {
                'train_url': 'obs://your-bucket/output/',
                'type': 'obs',
            }
        ],
        parameters={
            'learning_rate': 5e-5,
            'epochs': 3,
            'batch_size': 8,
            'max_seq_length': 2048,
        },
        code_dir='obs://your-bucket/code/',
        boot_file='train.py',
        instance_type='ml.p3.8xlarge',
        instance_count=2,
    )
    job.create()
    job.run()
    return job
- 评估脚本示例
# train.py
import torch
from transformers import Trainer, TrainingArguments
# Custom evaluation metrics
def compute_metrics(eval_pred):
    """Compute key-information accuracy and format compliance.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Predictions may be token
            ids or per-token logits (optionally wrapped in a tuple).

    Returns:
        A dict with 'key_info_accuracy' and 'format_score'.
    """
    import numpy as np

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # A 3-D array holds logits (batch, seq, vocab); reduce it to token ids
    # before decoding.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored positions and is not a valid token id, so replace it
    # before decoding.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        # NOTE(review): falls back to the EOS id when no pad token is set.
        pad_id = tokenizer.eos_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Accuracy of extracted key information.
    key_info_acc = calculate_key_info_accuracy(decoded_preds, decoded_labels)
    # Share of outputs that follow the required format.
    format_score = calculate_format_score(decoded_preds)
    return {
        'key_info_accuracy': key_info_acc,
        'format_score': format_score,
    }
# Training configuration
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=50,
    learning_rate=5e-5,
    fp16=True,
    report_to="none",
)
# Build the Trainer with the configuration and metric function and start
# training.
# NOTE(review): model, train_dataset and eval_dataset are not defined in the
# visible part of this script; they must be created before this point.
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics
)
trainer.train()
六、部署与API服务
- ModelArts在线服务部署
from modelarts.deploy import Predictor
# Create the online inference service
def deploy_model(model_path):
    """Create a ModelArts online service for the model at ``model_path``.

    Args:
        model_path: Location of the model to deploy.

    Returns:
        The ``Predictor`` object after ``create()`` was called.
    """
    predictor = Predictor(
        name='meeting-minutes-service',
        model_path=model_path,
        inference_spec='inference.py',
        instance_type='ml.p2.large',
        instance_count=1,
        framework='PyTorch-1.8',
        framework_version='py3',
        wait=True,
    )
    predictor.create()
    return predictor
# inference.py example
from flask import Flask, request, jsonify

# Flask needs the import name of the module; the double underscores of
# __name__ were lost in the original text.
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
    """Generate meeting minutes for the posted transcript.

    Expects a JSON object with 'transcript' and 'meeting_info'. Answers 400
    when the body is not a JSON object or a field is missing; otherwise
    returns the generated minutes as JSON.
    """
    # silent=True yields None for a missing or invalid JSON body instead of
    # raising, so the request can be rejected with a clear message.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"error": "request body must be a JSON object"}), 400
    missing = [key for key in ('transcript', 'meeting_info') if key not in data]
    if missing:
        return jsonify({"error": f"missing fields: {', '.join(missing)}"}), 400

    minutes = generate_meeting_minutes(data['transcript'], data['meeting_info'])
    return jsonify(minutes)
# Start the development server only when run as a script.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
- API调用示例
import requests
def call_meeting_minutes_api(transcript, meeting_info):
    """Call the deployed /predict endpoint and return its JSON reply.

    Args:
        transcript: Transcript text to summarise.
        meeting_info: Meeting metadata sent alongside the transcript.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If the service does not answer within the timeout.
    """
    url = "https://your-endpoint/predict"
    headers = {
        "Content-Type": "application/json",
        "X-Auth-Token": "your_api_token",
    }
    payload = {
        "transcript": transcript,
        "meeting_info": meeting_info,
    }
    # Generation can be slow, so the timeout is generous but still bounded.
    response = requests.post(url, json=payload, headers=headers, timeout=120)
    response.raise_for_status()
    return response.json()
# Example usage
result = call_meeting_minutes_api(transcript, {
    "topic": "Q3产品规划会议",
    "participants": ["张三", "李四", "王五"],
    "time": "2023-09-15 14:00",
})
七、安全与权限控制
# Huawei Cloud IAM permission-check decorator
def iam_required(permission):
    """Return a decorator that guards a view with an IAM token check.

    The token is read from the 'X-Auth-Token' request header and checked with
    ``verify_iam_token(token, permission)``. On failure the wrapped function
    is not called and a 403 JSON error is returned.

    Args:
        permission: Permission name handed to ``verify_iam_token``.
    """
    # Imported locally because the file never imports ``wraps``.
    from functools import wraps

    def decorator(f):
        @wraps(f)
        def decorated_function(*args, **kwargs):
            token = request.headers.get('X-Auth-Token')
            # A missing token is rejected without calling the verifier.
            if not token or not verify_iam_token(token, permission):
                return jsonify({"error": "Unauthorized"}), 403
            return f(*args, **kwargs)
        return decorated_function
    return decorator
# Data encryption helper
# NOTE(review): confirm that this module and class exist in the installed
# huaweicloudsdkcore release.
from huaweicloudsdkcore.auth.encryption_signer import EncryptionSigner


def encrypt_sensitive_data(data):
    """Return ``data`` encrypted by an ``EncryptionSigner``.

    The signer is built with a placeholder key; supply the real key from a
    secure source before use.
    """
    signer = EncryptionSigner('your_encryption_key')
    return signer.encrypt(data)
# Example usage
@iam_required('MeetingMinutes.Write')
def save_minutes():
    """Encrypt the posted 'content' field; guarded by 'MeetingMinutes.Write'.

    The storage step is omitted in the original, so nothing is persisted or
    returned yet.
    """
    data = request.json
    encrypted_data = encrypt_sensitive_data(data['content'])
    # Storage of ``encrypted_data`` goes here (omitted in the original).
本方案提供了从音频处理、大模型集成到企业系统对接的完整代码实现,开发者可根据实际需求调整参数和流程。建议在实际部署前进行充分的测试和性能优化,特别是针对企业特定的会议场景和术语进行模型微调。