在搭建本地知识库的过程中,需要把 Confluence 空间的内容加载到本地知识库中。下面介绍如何加载并保存 Confluence 的内容。
from atlassian import Confluence
from langchain.schema import Document
import requests
class ConfluenceLoader:
    """Load Confluence pages as LangChain ``Document`` objects.

    Three sources are supported:

    * a whole space (``space_key``),
    * individual pages (``page_ids``),
    * directory pages together with all of their descendants
      (``directory_page_ids``).

    When ``space_key`` is given it takes precedence and only the space is
    loaded; otherwise ``page_ids`` and ``directory_page_ids`` are both loaded.
    """

    # Page size used for every paginated REST call.
    _PAGE_SIZE = 50

    def __init__(self, url, username, password, space_key=None, page_ids=None,
                 directory_page_ids=None, host_header="conf2.XXXXX.com"):
        """Store the connection settings and prepare the HTTP session.

        Args:
            url: Base URL of the Confluence server (domain or ``http://ip:port``).
            username: Login name used for basic authentication.
            password: Password used for basic authentication.
            space_key: Key of a space whose pages should all be loaded.
            page_ids: IDs of individual pages to load.
            directory_page_ids: IDs of parent pages whose whole subtree
                should be loaded.
            host_header: Value forced into the ``Host`` request header, for
                servers that are reached by IP but reject requests without
                the expected host name. Pass ``None`` (or an empty string)
                to leave the header untouched.
        """
        self.url = url
        self.username = username
        self.password = password
        self.space_key = space_key
        self.page_ids = page_ids if page_ids else []
        self.directory_page_ids = directory_page_ids if directory_page_ids else []
        # Custom session so the Host header can be forced on every request.
        self.session = requests.Session()
        if host_header:
            self.session.headers.update({"Host": host_header})
        self.session.auth = (self.username, self.password)

    def load(self):
        """Load the configured Confluence content.

        Returns:
            A list of ``Document`` objects. An empty list is returned when
            the client cannot be created or loading fails as a whole;
            failures of single pages are reported and skipped.
        """
        try:
            confluence = Confluence(
                url=self.url,
                username=self.username,
                password=self.password,
                session=self.session,  # inject the custom session
                cloud=False,  # set to True for Confluence Cloud
            )
            documents = []
            if self.space_key:
                self._load_space_content(confluence, documents)
            else:
                # Explicitly requested pages were previously accepted by the
                # constructor but never loaded.
                for page_id in self.page_ids:
                    self._process_page(confluence, page_id, documents)
                for directory_page_id in self.directory_page_ids:
                    print(f"*******Confluence加载目录******: {directory_page_id}")
                    self._load_directory_content(confluence, documents, directory_page_id)
            return documents
        except Exception as e:
            print(f"Confluence加载失败: {e}")
            return []

    def _load_space_content(self, confluence, documents):
        """Append a document for every page of ``self.space_key``.

        The listing is requested with ``expand='body.storage'``, so each
        returned page is converted directly instead of being fetched a
        second time by ID.
        """
        start = 0
        while True:
            pages = confluence.get_all_pages_from_space(
                self.space_key,
                start=start,
                limit=self._PAGE_SIZE,
                expand='body.storage'
            )
            if not pages:
                break
            for page in pages:
                self._append_document(page, page['id'], documents)
            # Advance by what was actually returned, so nothing is skipped
            # if the server hands back fewer items than requested.
            start += len(pages)

    def _load_directory_content(self, confluence, documents, parent_page_id):
        """Recursively load a page subtree, children first, then the parent.

        The original author notes that this order avoids permission
        inheritance problems.
        """
        try:
            # Descend into the children first.
            for child_id in self._get_child_page_ids(confluence, parent_page_id):
                self._load_directory_content(confluence, documents, child_id)
            # Finally handle the directory page itself.
            self._process_page(confluence, parent_page_id, documents)
        except Exception as e:
            print(f"加载目录失败: {e}")

    def _get_child_page_ids(self, confluence, parent_page_id):
        """Return the IDs of all direct child pages of ``parent_page_id``.

        Results are fetched batch by batch until an empty batch is returned,
        so parents with more children than one batch are not truncated. If a
        request fails, the IDs collected so far are returned (an empty list
        when the first request fails).
        """
        child_ids = []
        start = 0
        try:
            while True:
                batch = confluence.get_page_child_by_type(
                    parent_page_id,
                    type="page",
                    start=start,
                    limit=self._PAGE_SIZE
                )
                if not batch:
                    break
                child_ids.extend(child["id"] for child in batch)
                start += len(batch)
        except Exception as e:
            print(f"获取子页面失败: {e}")
        return child_ids

    def _process_page(self, confluence, page_id, documents):
        """Fetch one page by ID and append it to ``documents``.

        Errors are reported and swallowed so one bad page does not abort
        the whole load.
        """
        try:
            page = confluence.get_page_by_id(page_id, expand='body.storage')
        except Exception as e:
            print(f"加载页面 {page_id} 失败: {e}")
            return
        self._append_document(page, page_id, documents)

    def _append_document(self, page, page_id, documents):
        """Convert an already fetched page into a ``Document`` and append it.

        ``page`` must contain ``title`` and ``body.storage.value``; a page
        without them is reported and skipped.
        """
        try:
            content = page['body']['storage']['value']
            metadata = {
                "source": f"{self.url}/pages/viewpage.action?pageId={page_id}",
                "title": page['title'],
                "page_id": page_id
            }
            print(f"加载页面,id: {page_id}")
            documents.append(Document(page_content=content, metadata=metadata))
        except Exception as e:
            print(f"加载页面 {page_id} 失败: {e}")
# Usage example
if __name__ == "__main__":
    # Load every page under the given directory (parent) pages.
    loader = ConfluenceLoader(
        url="http://ip:port",
        username="xxx",
        password="xxx",
        # The constructor parameter is ``directory_page_ids`` (plural);
        # the singular spelling raises TypeError.
        directory_page_ids=["xxxx"],  # IDs of the directory pages
    )
    documents = loader.load()
注意的问题
- 如果 Confluence 可以直接通过域名访问(不需要使用 IP 加端口的方式),url 直接填 Confluence 的域名地址即可
- 只有当 Confluence 不允许直接通过 IP 访问时,才需要采用注入自定义 Session(强制添加 Host 头)的方式;否则直接用 IP 加端口访问即可
可以用下面的命令验证:
curl -v -H "Host: conf2.xxx.com" -u "user:pwd" "http://ip:port/pages/viewpage.action?pageId=xxx"
从域名查看ip和端口 可以用 :
curl -v 域名地址