# Standalone (script) version of the original web-hosted Gradio demo
# Standard library
import base64
import io
import os
import tempfile
import time

# Third-party
import cv2
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import ollama
import speech_recognition as sr
from matplotlib.font_manager import FontProperties
from PIL import Image
from pydub import AudioSegment
from PyPDF2 import PdfReader
# Configure matplotlib so Chinese text renders correctly in the charts.
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font for Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs with non-ASCII fonts

# Model configuration shared by every processing function below.
MODEL_CONFIG = {
    "model": "gemma3:27b",
    "temperature": 0.7,
    "max_tokens": 1024,
    "top_p": 0.9,
    "system_prompt": "你是一个多模态AI助手,请用中文回答"
}
# 系统自检函数
def system_self_check():
    """Run a startup self-check over every external component.

    Returns:
        list[tuple[str, str, str]]: one (component, status message, color)
        tuple per check; color is "green" on success and "red" on failure,
        ready for HTML rendering by the status button handler.
    """
    results = []

    # Check the Ollama connection ONCE and reuse the listing for the model
    # check below (the original called ollama.list() twice, which could race).
    listing = None
    try:
        listing = ollama.list()
        results.append(("Ollama连接", "✅ 正常", "green"))
    except Exception as e:
        results.append(("Ollama连接", f"❌ 失败: {str(e)}", "red"))

    # Check model availability from the listing obtained above.
    try:
        if listing is None:
            raise RuntimeError("Ollama service unreachable")
        # Older ollama clients expose the name under 'name'; newer ones use
        # 'model'. Accept either so the check works across client versions.
        models = [m.get('name') or m.get('model') for m in listing['models']]
        if MODEL_CONFIG['model'] in models:
            results.append(("模型可用性", f"✅ {MODEL_CONFIG['model']}可用", "green"))
        else:
            results.append(("模型可用性", f"❌ {MODEL_CONFIG['model']}不可用", "red"))
    except Exception as e:
        results.append(("模型可用性", f"❌ 检查失败: {str(e)}", "red"))

    # Check OpenCV (import already succeeded at module load; report version).
    try:
        results.append(("OpenCV", f"✅ 版本 {cv2.__version__}", "green"))
    except Exception as e:
        results.append(("OpenCV", f"❌ 未安装: {str(e)}", "red"))

    # Check PyPDF2 availability.
    try:
        PdfReader
        results.append(("PDF处理", "✅ PyPDF2可用", "green"))
    except Exception as e:
        results.append(("PDF处理", f"❌ PyPDF2未安装: {str(e)}", "red"))

    return results
# ========== 多模态处理函数 ==========
def process_text(prompt, temperature, max_tokens):
    """Send a plain-text prompt to the configured model and return its reply.

    Args:
        prompt: the user's question.
        temperature: sampling temperature (low = deterministic, high = creative).
        max_tokens: response length cap, mapped to Ollama's ``num_predict``.

    Returns:
        str: the model's reply text.
    """
    # FIX: MODEL_CONFIG declares a system prompt and top_p that the original
    # never passed to the model; include both so the configuration takes effect.
    messages = [
        {'role': 'system', 'content': MODEL_CONFIG['system_prompt']},
        {'role': 'user', 'content': prompt},
    ]
    response = ollama.chat(
        model=MODEL_CONFIG['model'],
        messages=messages,
        options={
            'temperature': temperature,
            'num_predict': max_tokens,
            'top_p': MODEL_CONFIG['top_p'],
        }
    )
    return response['message']['content']
def process_image(image, prompt, temperature):
    """Analyze one image with the multimodal model and render the result.

    Args:
        image: file path (str) or an RGB numpy array.
        prompt: analysis instruction for the model.
        temperature: sampling temperature passed to the model.

    Returns:
        tuple[str, str]: (path of the annotated PNG visualization,
        the model's analysis text).
    """
    # Accept either a file path or a raw numpy array from Gradio.
    if isinstance(image, str):
        img = Image.open(image)
    else:
        img = Image.fromarray(image.astype('uint8'), 'RGB')

    # BUG FIX: the original wrote the JPEG to a temp file and then called
    # .read() without seeking back to the start, so the base64 payload sent
    # to the model was always empty. Encode in memory instead; convert to
    # RGB first so images with an alpha channel don't crash the JPEG save.
    buffer = io.BytesIO()
    img.convert('RGB').save(buffer, format="JPEG")
    img_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')

    # Build the multimodal message (Ollama accepts base64-encoded images).
    messages = [
        {
            'role': 'user',
            'content': prompt,
            'images': [img_base64]
        }
    ]
    response = ollama.chat(
        model=MODEL_CONFIG['model'],
        messages=messages,
        options={'temperature': temperature}
    )

    # Visualization: the image with the model's answer captioned beneath it.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.imshow(img)
    ax.set_title("分析图像")
    ax.axis('off')
    plt.figtext(0.5, 0.01, response['message']['content'],
                ha="center", fontsize=10,
                bbox={"facecolor": "orange", "alpha": 0.2, "pad": 5})
    temp_img = tempfile.NamedTemporaryFile(suffix='.png', delete=False)
    plt.savefig(temp_img.name, bbox_inches='tight')
    plt.close()
    return temp_img.name, response['message']['content']
def process_video(video_path, prompt, temperature, frame_interval):
    """Sample frames from a video, analyze each with the model, chart results.

    Args:
        video_path: path to the video file.
        prompt: analysis instruction applied to every sampled frame.
        temperature: sampling temperature passed to the model.
        frame_interval: analyze every Nth frame (larger = faster).

    Returns:
        tuple[list[dict], str]: per-frame records with "frame", "time" and
        "analysis" keys, and the path of the saved summary chart (PNG).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Some containers report 0 FPS; fall back to 30 to avoid a
        # ZeroDivisionError when computing timestamps below.
        fps = cap.get(cv2.CAP_PROP_FPS) or 30.0

        # Extract the key frames up front, then release the capture handle.
        frames_to_process = []
        for i in range(0, total_frames, max(1, int(frame_interval))):
            cap.set(cv2.CAP_PROP_POS_FRAMES, i)
            ret, frame = cap.read()
            if ret:
                frames_to_process.append((i, frame))
    finally:
        cap.release()

    # Analyze each sampled frame via process_image.
    results = []
    for frame_idx, frame in frames_to_process:
        temp_frame = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
        try:
            cv2.imwrite(temp_frame.name, frame)
            _, analysis = process_image(temp_frame.name, f"{prompt} (帧 {frame_idx}/{total_frames})", temperature)
        finally:
            # Always remove the temp frame, even if the analysis raised.
            temp_frame.close()
            os.unlink(temp_frame.name)
        results.append({
            "frame": frame_idx,
            "time": frame_idx / fps,
            "analysis": analysis
        })

    # Summary charts: analysis length over time, and keyword frequency.
    fig, ax = plt.subplots(2, 1, figsize=(10, 8))
    times = [r['time'] for r in results]
    analysis_len = [len(r['analysis']) for r in results]
    ax[0].plot(times, analysis_len, 'o-')
    ax[0].set_title("分析结果长度随时间变化")
    ax[0].set_xlabel("时间 (秒)")
    ax[0].set_ylabel("分析长度 (字符)")

    keywords = ["人", "物体", "运动", "场景", "颜色"]
    keyword_counts = {k: 0 for k in keywords}
    for r in results:
        for k in keywords:
            if k in r['analysis']:
                keyword_counts[k] += 1
    ax[1].bar(keyword_counts.keys(), keyword_counts.values(), color='skyblue')
    ax[1].set_title("关键词频率分析")
    ax[1].set_ylabel("出现次数")

    plt.tight_layout()
    chart_path = tempfile.NamedTemporaryFile(suffix='.png', delete=False).name
    plt.savefig(chart_path)
    plt.close()
    return results, chart_path
def process_audio(audio_path, prompt, temperature):
    """Transcribe an audio file, analyze the transcript, and plot its waveform.

    Args:
        audio_path: path to the uploaded audio file.
        prompt: analysis instruction for the model.
        temperature: sampling temperature passed to the model.

    Returns:
        tuple[str, str, str]: (transcript, model analysis, waveform PNG path).
    """
    # --- Speech-to-text (Google recognizer, Mandarin) ---
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        recorded = recognizer.record(source)
    try:
        transcript = recognizer.recognize_google(recorded, language='zh-CN')
    except sr.UnknownValueError:
        transcript = "无法识别语音"
    except sr.RequestError as e:
        transcript = f"语音识别服务错误: {e}"

    # --- Feed the transcript to the text model ---
    analysis = process_text(f"{prompt}\n语音内容: {transcript}", temperature, 512)

    # --- Waveform visualization ---
    segment = AudioSegment.from_file(audio_path)
    sample_values = np.array(segment.get_array_of_samples())
    plt.figure(figsize=(10, 4))
    plt.plot(sample_values, color='blue')
    plt.title("音频波形图")
    plt.xlabel("采样点")
    plt.ylabel("振幅")
    plt.grid(True, alpha=0.3)
    waveform_path = tempfile.NamedTemporaryFile(suffix='.png', delete=False).name
    plt.savefig(waveform_path)
    plt.close()

    return transcript, analysis, waveform_path
def process_pdf(pdf_path, prompt, temperature, max_tokens):
    """Extract text from a PDF, analyze it with the model, build a word cloud.

    Args:
        pdf_path: path to the uploaded PDF file.
        prompt: analysis instruction for the model.
        temperature: sampling temperature passed to the model.
        max_tokens: response length cap forwarded to process_text.

    Returns:
        tuple[str, str, str]: (truncated document text, model analysis,
        word-cloud PNG path).
    """
    text = ""
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        for page in reader.pages:
            # FIX: extract_text() returns None for image-only pages, which
            # made the original crash with a TypeError on concatenation.
            text += (page.extract_text() or "") + "\n"

    # Keep only the first 2000 characters so the prompt stays short.
    text = text[:2000] + "..." if len(text) > 2000 else text

    response = process_text(f"{prompt}\n文档内容: {text}", temperature, max_tokens)

    # Word cloud (imported lazily so the rest of the app works without it).
    from wordcloud import WordCloud
    # FIX: WordCloud.generate() raises on empty input (e.g. scanned PDFs);
    # substitute a placeholder so the handler still returns a chart.
    cloud_source = text if text.strip() else "空白文档"
    wordcloud = WordCloud(
        font_path='SimHei.ttf',  # NOTE(review): assumes this font file is on disk — confirm
        background_color='white',
        width=800,
        height=400
    ).generate(cloud_source)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("文档关键词词云")
    wordcloud_path = tempfile.NamedTemporaryFile(suffix='.png', delete=False).name
    plt.savefig(wordcloud_path)
    plt.close()
    return text, response, wordcloud_path
# ========== 界面构建 ==========
def create_interface():
    """Build and return the multimodal Gradio Blocks application.

    Returns:
        gr.Blocks: the assembled UI; the caller is responsible for launch().
    """
    with gr.Blocks(
        title="多模态Gemma3应用系统",
        theme=gr.themes.Soft(primary_hue="teal", secondary_hue="pink"),
        css=".gradio-container {background-color: #f5f7fa}"
    ) as app:
        # Header and system-status bar
        gr.Markdown("# 🚀 Gemma3多模态应用系统")
        with gr.Row():
            status_btn = gr.Button("系统状态检查", variant="secondary")
            status_output = gr.HTML()

        # Model parameter configuration
        with gr.Accordion("⚙️ 模型参数配置", open=False):
            with gr.Row():
                temperature = gr.Slider(0.1, 1.0, value=MODEL_CONFIG["temperature"],
                                        label="温度", info="控制随机性 (低=确定, 高=创意)")
                max_tokens = gr.Slider(128, 4096, value=MODEL_CONFIG["max_tokens"], step=128,
                                       label="最大Token数", info="控制响应长度")
                top_p = gr.Slider(0.1, 1.0, value=MODEL_CONFIG["top_p"],
                                  label="Top-p采样", info="控制词汇选择范围")
            system_prompt = gr.Textbox(value=MODEL_CONFIG["system_prompt"],
                                       label="系统提示词", lines=2)
            model_selector = gr.Dropdown(["gemma3:27b", "gemma3:9b", "llama3"],
                                         value=MODEL_CONFIG["model"], label="选择模型")

        # One tab per modality
        with gr.Tabs():
            # Text chat tab
            with gr.Tab("💬 文本对话"):
                with gr.Row():
                    with gr.Column(scale=3):
                        text_input = gr.Textbox(label="输入问题", lines=5,
                                                placeholder="请输入您的问题...")
                        text_btn = gr.Button("发送", variant="primary")
                    with gr.Column(scale=7):
                        text_output = gr.Textbox(label="模型回复", interactive=False, lines=10)

            # Image analysis tab
            with gr.Tab("🖼️ 图像分析"):
                with gr.Row():
                    with gr.Column(scale=4):
                        img_input = gr.Image(label="上传图像", type="filepath")
                        img_prompt = gr.Textbox(label="分析指令",
                                                placeholder="描述您想分析的内容...")
                        img_btn = gr.Button("分析图像", variant="primary")
                    with gr.Column(scale=6):
                        img_output = gr.Image(label="分析结果可视化")
                        img_analysis = gr.Textbox(label="详细分析", interactive=False)

            # Video analysis tab
            with gr.Tab("🎬 视频分析"):
                with gr.Row():
                    with gr.Column(scale=4):
                        video_input = gr.Video(label="上传视频")
                        video_prompt = gr.Textbox(label="分析指令",
                                                  placeholder="输入视频分析指令...")
                        frame_slider = gr.Slider(10, 100, value=30, step=10,
                                                 label="帧采样间隔",
                                                 info="间隔越大处理越快")
                        video_btn = gr.Button("分析视频", variant="primary")
                    with gr.Column(scale=6):
                        video_output = gr.Plot(label="分析结果可视化")
                        video_analysis = gr.JSON(label="帧分析结果")

            # Audio processing tab
            with gr.Tab("🎧 语音处理"):
                with gr.Row():
                    with gr.Column(scale=4):
                        audio_input = gr.Audio(label="上传音频", type="filepath")
                        audio_prompt = gr.Textbox(label="分析指令",
                                                  placeholder="输入语音分析指令...")
                        audio_btn = gr.Button("处理音频", variant="primary")
                    with gr.Column(scale=6):
                        audio_waveform = gr.Image(label="音频波形")
                        audio_transcript = gr.Textbox(label="语音转写", interactive=False)
                        audio_analysis = gr.Textbox(label="分析结果", interactive=False)

            # PDF analysis tab
            with gr.Tab("📄 文档分析"):
                with gr.Row():
                    with gr.Column(scale=4):
                        pdf_input = gr.File(label="上传PDF文档", file_types=[".pdf"])
                        pdf_prompt = gr.Textbox(label="分析指令",
                                                placeholder="输入文档分析指令...")
                        pdf_btn = gr.Button("分析文档", variant="primary")
                    with gr.Column(scale=6):
                        pdf_wordcloud = gr.Image(label="关键词词云")
                        pdf_content = gr.Textbox(label="文档内容摘要", interactive=False)
                        pdf_analysis = gr.Textbox(label="分析结果", interactive=False)

        # ========== Event bindings ==========
        # System status check: render each check as one colored HTML line.
        status_btn.click(
            fn=lambda: "<br>".join(
                [f"<span style='color:{c}'>{n}: {s}</span>" for n, s, c in system_self_check()]
            ),
            outputs=status_output
        )
        # Text chat
        text_btn.click(
            fn=process_text,
            inputs=[text_input, temperature, max_tokens],
            outputs=text_output
        )
        # Image analysis
        img_btn.click(
            fn=process_image,
            inputs=[img_input, img_prompt, temperature],
            outputs=[img_output, img_analysis]
        )
        # Video analysis
        video_btn.click(
            fn=process_video,
            inputs=[video_input, video_prompt, temperature, frame_slider],
            outputs=[video_analysis, video_output]
        )
        # Audio processing
        audio_btn.click(
            fn=process_audio,
            inputs=[audio_input, audio_prompt, temperature],
            outputs=[audio_transcript, audio_analysis, audio_waveform]
        )
        # PDF processing
        pdf_btn.click(
            fn=process_pdf,
            inputs=[pdf_input, pdf_prompt, temperature, max_tokens],
            outputs=[pdf_content, pdf_analysis, pdf_wordcloud]
        )

        # FIX: the original change-handler only echoed the selection back to
        # the dropdown and never touched MODEL_CONFIG, so switching models
        # had no effect. Persist the choice so the handlers above use it.
        def _select_model(name):
            MODEL_CONFIG["model"] = name
            return gr.update(value=name)

        model_selector.change(
            fn=_select_model,
            inputs=model_selector,
            outputs=model_selector
        )

    return app
# ========== 主程序 ==========
if __name__ == "__main__":
    # Run the component self-check before starting the UI.
    print("=== 系统自检 ===")
    for name, status, _ in system_self_check():
        print(f"{name}: {status}")

    # Build and launch the interface.
    app = create_interface()
    launch_kwargs = {
        # NOTE(review): 127.0.0.100 is a valid loopback address but unusual —
        # confirm 127.0.0.1 was not intended (share=True tunnels it anyway).
        "server_name": "127.0.0.100",
        "server_port": 7860,
        "share": True,
    }
    # FIX: only pass the favicon when the file exists; a hard-coded missing
    # path makes Gradio fail at startup.
    if os.path.exists("favicon.ico"):
        launch_kwargs["favicon_path"] = "favicon.ico"
    app.launch(**launch_kwargs)
# 最新发布  (stray web-scrape artifact — commented out so the file parses)