开源介绍
一直想开源一个软件版本的源代码,回馈于互联网,毕竟自己很多学习资料也是从网上各路大神的日常笔记中找到解决方法的。
本次开源的代码是早期的某个版本,在开源之前再次调试能够无错运行,才放出源代码。希望能给大家日常学习,二次开发提供一些有用的灵感与帮助。
功能点
(1)从word文档中提取图文,并使图片与文案建立对应关系。
(2)图片转换成视频画面,文案生成配音音频、生成对应的字幕
(3)添加背景音乐,并嵌入到视频中
应用场景
(1)各种word教程资料视频化
(2)使用图文制作有声绘本视频
调试截图
视频作品
演示视频
默认设置为,当背景音乐时长大于视频时长,会一直播放背景音乐。
源码
main.py
import os
import shutil
import subprocess
import tkinter as tk
from mutagen.mp3 import MP3
from pydub.utils import mediainfo
from tkinter import filedialog
from tkinter import messagebox
import moviepy.editor as mp
import cv2
import docx
import numpy as np
from io import BytesIO
import asyncio
from moviepy.audio.io.AudioFileClip import AudioFileClip
from moviepy.video.io.VideoFileClip import VideoFileClip
from pydub import AudioSegment
from moviepy.editor import CompositeAudioClip
from moviepy.config import change_settings
import threading
import tts as tts
class Application(tk.Tk):
def __init__(self):
super().__init__()
# 获取当前文件所在目录
current_directory = os.path.dirname(os.path.abspath(__file__))
self.title("小米坡 word文档转视频生成工具 v1.0")
self.geometry("1000x800")
self.resizable(False, False) # 禁止调整窗口大小
# 创建两个主容器
self.main_frame = tk.Frame(self)
self.main_frame.pack(fill="both", expand=True)
self.left_frame = tk.LabelFrame(self.main_frame, text="主功能区", padx=10, pady=10)
self.left_frame.pack(side="left", fill="both", expand=True)
self.right_frame = tk.LabelFrame(self.main_frame, text="关于", padx=10, pady=10)
self.right_frame.pack(side="right", fill="both", expand=True)
# 左侧主功能区
self.word_doc_frame = tk.LabelFrame(self.left_frame, text="Word 文档选项", padx=10, pady=10)
self.word_doc_frame.pack(fill="x", padx=5, pady=5)
self.word_doc_path = tk.StringVar()
self.word_doc_entry = tk.Entry(self.word_doc_frame, textvariable=self.word_doc_path, width=40)
self.word_doc_entry.pack(side="left", padx=5)
self.word_doc_button = tk.Button(self.word_doc_frame, text="选择 Word 文档", command=self.choose_word_doc)
self.word_doc_button.pack(side="left", padx=5)
self.background_audio_frame = tk.LabelFrame(self.left_frame, text="背景音乐文件", padx=10, pady=10)
self.background_audio_frame.pack(fill="x", padx=5, pady=5)
self.background_audio_path = tk.StringVar(value="")
self.background_audio_entry = tk.Entry(self.background_audio_frame, textvariable=self.background_audio_path, width=40)
self.background_audio_entry.pack(side="left", padx=5)
self.background_audio_button = tk.Button(self.background_audio_frame, text="选择背景音乐", command=self.choose_background_audio)
self.background_audio_button.pack(side="left", padx=5)
self.export_dir_frame = tk.LabelFrame(self.left_frame, text="视频导出目录", padx=10, pady=10)
self.export_dir_frame.pack(fill="x", padx=5, pady=5)
self.export_dir_path = tk.StringVar()
self.export_dir_entry = tk.Entry(self.export_dir_frame, textvariable=self.export_dir_path, width=40)
self.export_dir_entry.pack(side="left", padx=5)
self.export_dir_button = tk.Button(self.export_dir_frame, text="选择导出目录", command=self.choose_export_dir)
self.export_dir_button.pack(side="left", padx=5)
self.volume_frame = tk.LabelFrame(self.left_frame, text="BGM音量控制", padx=10, pady=10)
self.volume_frame.pack(fill="x", padx=5, pady=5)
self.volume_frame2 = tk.LabelFrame(self.left_frame, text="配音音量控制", padx=10, pady=10)
self.volume_frame2.pack(fill="x", padx=5, pady=5)
self.background_volume_scale = tk.Scale(self.volume_frame, from_=0, to=100, orient=tk.HORIZONTAL, resolution=1, length=400)
self.background_volume_scale.set(50)
self.background_volume_scale.pack(side="left", padx=5)
self.narration_volume_scale = tk.Scale(self.volume_frame2, from_=0, to=100, orient=tk.HORIZONTAL, resolution=1, length=400)
self.narration_volume_scale.set(100)
self.narration_volume_scale.pack(side="left", padx=5)
self.enable_subtitles_frame = tk.LabelFrame(self.left_frame, text="字幕设置", padx=10, pady=10)
self.enable_subtitles_frame.pack(fill="x", padx=5, pady=5)
self.enable_subtitles = tk.BooleanVar(value=True)
self.enable_subtitles_checkbox = tk.Checkbutton(self.enable_subtitles_frame, text="启用字幕", variable=self.enable_subtitles)
self.enable_subtitles_checkbox.pack(side="left", padx=5)
self.progress_frame = tk.LabelFrame(self.left_frame, text="进度", padx=10, pady=10)
self.progress_frame.pack(fill="x", padx=5, pady=5, expand=True)
self.progress_text = tk.Text(self.progress_frame, height=10, width=60, yscrollcommand=True)
self.progress_text_scrollbar = tk.Scrollbar(self.progress_frame, command=self.progress_text.yview)
self.progress_text.config(yscrollcommand=self.progress_text_scrollbar.set)
self.progress_text.pack(side="left", fill="x", expand=True)
self.progress_text_scrollbar.pack(side="right", fill="y")
self.button_frame = tk.Frame(self.left_frame)
self.button_frame.pack(fill="x", padx=5, pady=5)
self.start_button = tk.Button(self.button_frame, text="开始生成", command=self.start_generation)
self.start_button.pack(side="left", padx=5)
self.view_button = tk.Button(self.button_frame, text="点击查看文件", command=self.open_export_dir)
self.view_button.pack(side="left", padx=5)
log_path = os.path.join(current_directory, "static", "logo.gif")
self.logo_image = tk.PhotoImage(file=log_path)
# Create Label with image
self.logo_label = tk.Label(self.right_frame, image=self.logo_image)
self.logo_label.pack(pady=10)
# 右侧关于信息
self.about_label = tk.Label(self.right_frame, text="小米坡 word文档转视频生成工具")
self.about_label.pack(pady=10)
self.version_label = tk.Label(self.right_frame, text="用户版本 :v1")
self.version_label.pack(pady=10)
def choose_word_doc(self):
file_path = filedialog.askopenfilename(filetypes=[("Word 文档", "*.docx")])
self.word_doc_path.set(file_path)
def choose_export_dir(self):
dir_path = filedialog.askdirectory()
self.export_dir_path.set(dir_path)
def choose_background_audio(self):
file_path = filedialog.askopenfilename(filetypes=[("音频文件", "*.mp3;*.wav")])
self.background_audio_path.set(file_path)
def start_generation(self):
# if datetime.now() >= datetime(2024, 4, 8, 23, 59, 59):
# print("内测已结束。")
# return
thread = threading.Thread(target=self.generate_video_thread)
thread.start()
def generate_video_thread(self):
word_doc_file = self.word_doc_path.get()
export_dir = self.export_dir_path.get()
background_audio_volume = self.background_volume_scale.get() / 100
narration_volume = self.narration_volume_scale.get() / 100
enable_subtitles = self.enable_subtitles.get()
if not word_doc_file:
messagebox.showerror("错误", "请选择 Word 文档")
return
if not export_dir:
messagebox.showerror("错误", "请选择导出目录")
return
self.progress_text.insert(tk.END, "开始任务, 如果出现未知错误, 请关掉软件重新打开.\n")
self.progress_text.insert(tk.END, "如果你导出要覆盖的已存在视频, 确保那个视频不被其它进程占用.\n")
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
images, texts = self.get_images_and_texts(word_doc_file)
audio_files = []
print(len(texts))
for i, text in enumerate(texts):
print(i)
audio_file = os.path.join(export_dir, f"audio_{i}.mp3")
audio_files.append(audio_file)
self.generate_audio(text, audio_file)
self.progress_text.insert(tk.END, f"生成配音:{text}.\n")
self.progress_text.insert(tk.END, "完成配音音频文件生成.\n")
output_file = os.path.join(export_dir, "output_video.mp4")
background_audio_file = self.background_audio_path.get()
self.progress_text.insert(tk.END, "正在生成视频文件.\n")
self.generate_video(images, texts, output_file, audio_files, background_audio_file, background_audio_volume, narration_volume, enable_subtitles)
messagebox.showinfo("提示", "视频生成成功")
except Exception as e:
messagebox.showerror("错误", f"生成视频时出错:{e}")
finally:
loop.close()
def open_export_dir(self):
export_dir = self.export_dir_path.get()
if not export_dir:
messagebox.showerror("错误", "请选择导出目录")
return
os.startfile(export_dir)
def open_url(self, url):
import webbrowser
webbrowser.open(url)
def get_images_and_texts(self, docx_file):
images = []
texts = []
try:
doc = docx.Document(docx_file)
for paragraph in doc.paragraphs:
if paragraph.text.strip():
texts.append(paragraph.text.strip())
for rel in doc.part.rels.values():
if "image" in rel.target_ref:
image_bytes = rel.target_part.blob
images.append(image_bytes)
return images, texts
except Exception as e:
messagebox.showerror("错误", f"提取图片和文字内容时出错:{e}")
return [], []
def generate_audio(self, text, output_file):
try:
asyncio.run(tts.text_to_audio(text, '思思', 1.0, output_file))
except Exception as e:
messagebox.showerror("错误", f"生成音频时出错:{e}")
return None
def generate_video(self, images, texts, output_file, audio_durations, background_audio_file,
background_audio_volume, narration_volume, enable_subtitles):
try:
clips = []
# srt字幕文件内容
srt_content = ''
srt_time1 = 0
srt_time2 = 0
for idx, image_bytes in enumerate(images):
image_stream = BytesIO(image_bytes)
image_array = cv2.imdecode(np.frombuffer(image_stream.read(), np.uint8), -1)
image_path = f"temp_image_{idx}.jpg"
cv2.imwrite(image_path, image_array)
duration = 5
audio_path = ""
if idx < len(audio_durations):
audio_path = audio_durations[idx]
audio_duration = self.get_audio_duration(audio_path)
if audio_duration > duration:
duration = audio_duration
temp_str = str(idx) + '\n'
srt_time2 = srt_time1 + duration
# 添加字幕时间信息
temp_str = temp_str + f"{int(srt_time1 // 3600):02d}:{int((srt_time1 // 60) % 60):02d}:{int(srt_time1 % 60):02d},{int((srt_time1 - int(srt_time1)) * 1000):03d} --> {int(srt_time2 // 3600):02d}:{int((srt_time2 // 60) % 60):02d}:{int(srt_time2 % 60):02d},{int((srt_time2 - int(srt_time2)) * 1000):03d}\n"
temp_str = temp_str + texts[idx] + '\n'
# 更新时间起点
srt_time1 = srt_time2
srt_content = srt_content + temp_str
if audio_path != "":
audio_clip = AudioFileClip(audio_path).volumex(narration_volume)
clip = mp.ImageClip(image_path).set_duration(duration)
clip = clip.set_audio(audio_clip)
if idx > 0:
clip = clip.crossfadein(1)
previous_clip = clips[-1]
previous_clip = previous_clip.crossfadeout(1)
clips[-1] = previous_clip
clips.append(clip)
os.remove(image_path)
# 保存字幕文件
srt_file_path = output_file.replace(".mp4", ".srt")
with open(srt_file_path, "w", encoding="utf-8") as srt_file:
srt_file.write(srt_content)
final_clip = mp.concatenate_videoclips(clips, method="compose")
if os.path.exists(output_file):
os.remove(output_file)
final_clip.write_videofile(output_file, fps=24, codec="libx264", audio_codec="aac")
self.progress_text.insert(tk.END, "正在合并音频视频.\n")
video_clip = VideoFileClip(output_file)
original_audio_clip = video_clip.audio
new_audio_clip = AudioFileClip(background_audio_file).volumex(background_audio_volume)
final_audio_clip = CompositeAudioClip([original_audio_clip, new_audio_clip])
video_clip_with_new_audio = video_clip.set_audio(final_audio_clip)
file2 = output_file.replace(".mp4", "_with_audio.mp4")
if os.path.exists(file2):
os.remove(file2)
video_clip_with_new_audio.write_videofile(file2, fps=24, codec="libx264", audio_codec="aac")
if enable_subtitles:
self.progress_text.insert(tk.END, "正在嵌入字幕.\n")
file3 = output_file.replace(".mp4", "_with_audio_subtitles.mp4")
if os.path.exists(file3):
os.remove(file3)
# 拷贝一份到根目录
current_directory = os.path.dirname(os.path.abspath(__file__))
srt_file_name = "temp.srt"
subtitles_file = os.path.join(current_directory, srt_file_name)
shutil.copy(srt_file_path, subtitles_file)
cmd_line = f'ffmpeg -i {file2} -vf subtitles={srt_file_name} {file3}'
subprocess.call(cmd_line, shell=True)
os.remove(subtitles_file)
os.remove(output_file)
os.remove(file2)
self.progress_text.insert(tk.END, "Video successfully generated.\n")
except Exception as e:
self.progress_text.insert(tk.END, f"Error while generating video: {e}\n")
messagebox.showerror("错误", f"生成视频时出错:{e}")
def get_audio_duration(self, audio_file):
try:
audio = MP3(audio_file)
return audio.info.length
except Exception as e:
print(f"{audio_file} err {e}")
messagebox.showerror("错误", f"获取音频时长时出错:{e}")
return None
if __name__ == "__main__":
app = Application()
app.mainloop()
tts.py
import requests
import asyncio
import subprocess
def voice_list():
res = [
{"中文名":"思思","原始名称":"思思","支持情绪":"不设置"},
]
return res
def get_voice_config(voice_user):
if voice_user == "思思":
return {
"port": 9821,
"dl": "zh"
}
else:
raise ValueError(f"Unknown voice user: {voice_user}")
def save_audio_from_response(audio_data, outfile):
ffmpeg_path = 'ffmpeg.exe'
command = [
ffmpeg_path,
'-i', 'pipe:0', # 从标准输入读取
'-acodec', 'libmp3lame',
'-b:a', '192k',
'-f', 'mp3', # 强制输出格式为 MP3
outfile
]
process = subprocess.Popen(command, stdin=subprocess.PIPE, shell=True, stdout=subprocess.DEVNULL,
stderr=subprocess.STDOUT)
stdout, stderr = process.communicate(input=audio_data)
async def text_to_audio(text, voice_user,rate,outfile):
print(f"合成语音:{text},语速{rate}")
retry_delay = 1
max_retries = 5
retries = 0
config = get_voice_config(voice_user)
url = f"http://127.0.0.1:{config['port']}"
payload = {
"text": text,
"speed": rate,
"text_language": config['dl']
}
while retries < max_retries:
try:
response = requests.post(url, json=payload)
if response.status_code == 200:
save_audio_from_response(response.content, outfile)
print(f"保存音频:{outfile}")
break # 成功执行就跳出循环
else:
retries += 1
print(f"请求失败, 状态码: {response.status_code}, {retry_delay}秒后将自动重试(第{retries}次重试)。")
await asyncio.sleep(retry_delay)
except Exception as ex:
retries += 1
print(f"请求异常: {ex}, {retry_delay}秒后将自动重试(第{retries}次重试)。")
await asyncio.sleep(retry_delay)
else:
print(f"经过{max_retries}次重试仍然失败,放弃执行。")
您可能会遇到的问题
1、你本机需要有ffmpeg的环境变量设置,或你需要自己下载ffmpeg.exe放到代码同级目录下。
2、tts.py配音文件,会向本机127.0.0.1:9821请求文本转语音,您需要在本机自己搭建tts配音。建议使用开源的gpt-sovits来自己tts配音,记住下载windows整合包会方便一些。
3、你可以使用小米坡 GPT-SoVITS 批量API管理器来批量管理与运行 GPT-SoVITS,简化 GPT-SoVITS的操作难度。
github
声明
本人水平有限,不喜勿喷。如果您有好建议,欢迎留言建议。仅用于学习交流,请不要直接用于生产环境,使用本代码的任何问题由您自己负责。