一、整体思路
使用bilibili_api对带有CC字幕的视频进行字幕提取,再使用LLM对提取到的字幕进行总结,从而间接实现对视频内容的总结。
二、获取字幕
创建http请求的头。
# Request headers passed to the requests.get calls in this file.
# The Host header is pinned to api.bilibili.com.
_HEADERS = {
    "Accept": "application/json, text/plain, */*",
    "Content-Type": "application/json",
    "Host": "api.bilibili.com",
    # Desktop Chrome user-agent, split over several literals for readability;
    # adjacent string literals are concatenated into one value.
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/110.0.0.0 Safari/537.36"
    ),
}
由于B站(哔哩哔哩)对用户数据和资源访问进行了权限管理,获取视频的字幕数据时需要使用SESSDATA:请求字幕时,必须在请求的Cookie中携带SESSDATA。
登录B站后,可以在浏览器的Cookie管理中找到SESSDATA。
然后将SESSDATA写入环境变量。
从链接里解析BV号。
def _extract_bv_number(url):
    """
    Extract the BV id from a Bilibili video link.

    :param url: video link (or any text) that may contain a BV id
    :return: the first BV id found, including the "BV" prefix, or None when
        url is not a string or contains no BV id
    """
    # A missing link (None) or any other non-string input yields None, the
    # same "not found" result as a link without a BV id, instead of letting
    # re.search raise TypeError.
    if not isinstance(url, str):
        return None
    match = re.search(r'BV([A-Za-z0-9]+)', url)
    # group(0) is the whole match: "BV" plus the alphanumeric run after it.
    return match.group(0) if match else None
我们需要通过BV号来找到视频的cid。aid 或 bvid 对应的是一次投稿(合集视频的一个投稿下可能有多个分P),而 cid 对应的是具体的视频;要请求视频的字幕,首先要找到对应的视频。
def _get_cid(self):
    """
    Fetch the cid of the video part selected by ``self.p_num``.

    Requests the page list of ``self.bv_id`` from ``self._PAGE_LIST_URL`` and
    picks the entry at index ``self.p_num``; for a multi-part video this
    selects the wanted part.

    :return: cid of the selected part
    :raises requests.RequestException: on timeout, connection failure or a
        non-2xx HTTP status
    :raises IndexError: if ``self.p_num`` is outside the returned page list
    """
    response = requests.get(
        self._PAGE_LIST_URL,
        params={'bvid': self.bv_id},
        headers=self._HEADERS,
        # Without a timeout requests can wait forever on a stalled connection.
        timeout=10,
    )
    # Surface HTTP failures here instead of as an obscure JSON/key error below.
    response.raise_for_status()
    pages = response.json()['data']
    # Index the wanted page directly; there is no need to collect every cid.
    return pages[self.p_num]['cid']
使用已知的bv号和cid来获取字幕的请求url。
def _get_subtitle_url(self, cid: int):
    """
    Look up the subtitle tracks of a video part and fetch the first one.

    Queries ``self._SUBTITLE_URL`` with ``self.bv_id`` and ``cid`` (sending
    ``self.cookie``), takes the first entry of the returned subtitle list and
    hands its URL to ``self._request_subtitle_url_content``.

    :param cid: cid of the video part
    :return: the result of ``self._request_subtitle_url_content`` for the
        first subtitle track, or None when the response lists no subtitle
        track or the first track has no URL
    :raises requests.RequestException: on timeout, connection failure or a
        non-2xx HTTP status
    """
    params = (
        ('bvid', self.bv_id),
        ('cid', cid),
    )
    # Pause before the request -- presumably to avoid being rate limited.
    time.sleep(3)
    response = requests.get(
        self._SUBTITLE_URL,
        headers=self._HEADERS,
        params=params,
        cookies=self.cookie,
        # Without a timeout requests can wait forever on a stalled connection.
        timeout=10,
    )
    response.raise_for_status()
    # Parse the body once; the raw response is no longer dumped to stdout.
    payload = response.json()
    subtitles = payload['data']['subtitle']['subtitles']
    if not subtitles:
        # No subtitle track listed: report "nothing found" explicitly.
        return None
    first_url = subtitles[0]['subtitle_url']
    if not first_url:
        # An empty URL cannot be fetched; treat it like a missing track.
        return None
    # Only add a scheme when the URL does not already carry one.
    if not first_url.startswith(('http://', 'https://')):
        first_url = 'https:' + first_url
    return self._request_subtitle_url_content(first_url)
使用此url获取字幕。
def download_subtitle(self) -> str:
    """
    Download the subtitle of the selected video part as a single string.

    :return: the ``content`` field of every subtitle entry joined with ", ";
        an empty string when no subtitle entries are available
    """
    subtitle_list = self._get_subtitle_url(self._get_cid())
    # Guard against a missing result (None or an empty list) so that a video
    # without subtitles yields "" instead of a TypeError while iterating.
    if not subtitle_list:
        return ""
    return ", ".join(entry['content'] for entry in subtitle_list)
三、总结字幕
构建一个字幕总结链(chain),向其传入字幕和指定的摘要条数。
def __init__(self,
             summary_info: dict,
             summary_count: Optional[int] = None
             ) -> None:
    """
    Store the subtitle to summarize and build the LLM summary chain.

    :param summary_info: dict providing the "bv_id", "cid" and "subtitle" keys
    :param summary_count: number of summary items to request; 5 is used
        when None is given
    """
    bv_id = summary_info["bv_id"]
    cid = summary_info["cid"]
    # Combined identifier: BV id followed by the cid.
    self.b_cid = bv_id + str(cid)
    self.subtitle = summary_info["subtitle"]
    # Subtitles longer than this many characters are split before summarizing.
    self.seg_length = 3000
    if summary_count is None:
        summary_count = 5
    self.summary_count = summary_count
    self._llm = ChatTongyi(streaming=True, model_name="qwen-turbo")
    prompt = PromptTemplate(
        template=SUMMARY_TEMPLATE,
        input_variables=["summary_count", "subtitle"],
    )
    # prompt -> model -> plain string output
    self._summary_chain = prompt | self._llm | StrOutputParser()
将字幕进行分割,对每个chunk进行总结。
def _summary_seg_content(self) -> List[str]:
    """
    Return the subtitle as a list of chunks ready to be summarized.

    A subtitle no longer than ``self.seg_length`` is returned as a single
    chunk; a longer one is split with RecursiveCharacterTextSplitter using
    ``chunk_size=self.seg_length`` and ``chunk_overlap=200``.

    :return: list of subtitle chunks
    """
    text = self.subtitle
    limit = self.seg_length
    # Short enough: no splitting needed.
    if len(text) <= limit:
        return [text]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=limit,
        chunk_overlap=200,
    )
    return splitter.split_text(text)
def _get_summary(self, chunk) -> str:
    """
    Run the summary chain on one piece of text.

    :param chunk: text passed to the prompt as "subtitle"
    :return: the chain's output for that text
    """
    chain_input = {
        "summary_count": self.summary_count,
        "subtitle": chunk,
    }
    return self._summary_chain.invoke(chain_input)
拼接各个chunk返回的总结;如果字幕被分成了两段及以上,则再对拼接后的文本做一次总结,形成最终的总结。
def write_summary(self) -> str:
    """
    Summarize the whole subtitle.

    Every chunk from ``self._summary_seg_content`` is summarized on its own.
    With a single chunk that summary is the result; with two or more chunks
    the partial summaries are joined and summarized once more.

    :return: the final summary text
    """
    chunks = self._summary_seg_content()
    partial_summaries = [str(self._get_summary(chunk)) for chunk in chunks]
    # Join with a newline so consecutive partial summaries do not run into
    # each other when fed to the second pass. A single summary is unaffected.
    summary_ans = "\n".join(partial_summaries)
    if len(chunks) >= 2:
        # Condense the partial summaries into one overall summary.
        summary_ans = self._get_summary(summary_ans)
    return summary_ans
其中,形成总结的提示词如下。
# Prompt for the summary chain. It has exactly two placeholders:
# {summary_count} (item limit, used twice) and {subtitle} (text to summarize).
# The model is told to answer in Chinese, hence the Chinese section headings
# in the sample output format.
SUMMARY_TEMPLATE = """
Identification of the character you play:
1.As a professional video content editor and an educational Content creation, you are proficient in summarizing texts
2.You will help students summarize the essence of the video in Chinese.
You need to comply with the following requirements:
1. Please start by summarizing the whole video(there may be typos in the subtitles, please correct them)
2. Then, summarize the subtitles of the video in detail. The output format for this step is an unordered list.
Every unordered list is a coherent sentence, not a simple word or phrase
3. Please use Unicode encoded 'emoji' and replace the symbols in the unordered list with the appropriate 'emoji'
based on the summarized article.
4. Make sure not to exceed {summary_count} items and all sentences are concise, clear, and complete.
4.1 If you feel that the text is too long to summarize within the specified number of items,
please discard any content that you believe is not highly relevant to the task of summarizing the text
4.2 If 4.1 still cannot be achieved, you can appropriately exceed the specified number of items.
But please keep it as close to {summary_count} as possible
5. make sure not to repeat any sentences
6. Don't pay attention to the meaningless content in the subtitles, such as promotional advertisements, likes and
follow ups, subscriptions, and greetings, that summarize the main idea of the video.
Note: The output language should be Chinese and must have emoji
The output formats that can be referenced are as follows:
# 概述:
Summarize the entire text
# 亮点:
-
-
-
The following is the video subtitle content for summarizing the task:
```
{subtitle}
```
"""
四、返回原始字幕和字幕总结
def create_summary(link: str, p_input: str):
    """
    Download the subtitle of a video and summarize it.

    :param link: link of the video; passed to BiliSubtitleDownloader
    :param p_input: part (episode) number; passed to BiliSubtitleDownloader
    :return: tuple ``(subtitle, summary)``; the summary is "" when the
        downloaded subtitle is empty
    """
    # Download the subtitle.
    bilibili_subtitle_downloader = BiliSubtitleDownloader(link, p_input)
    bili_info = bilibili_subtitle_downloader.get_bili_info()
    bili_subtitle = bilibili_subtitle_downloader.download_subtitle()
    if not bili_subtitle:
        # Nothing to summarize: skip the LLM call instead of asking it to
        # summarize empty text.
        return bili_subtitle, ""
    # The summary writer reads the subtitle from the info dict.
    bili_info["subtitle"] = bili_subtitle
    bili_summary = SummaryWriter(bili_info).write_summary()
    return bili_subtitle, bili_summary