语雀批量导出成md【保持文件夹结构】

最新推荐文章于 2025-03-06 11:11:13 发布

一定会去到彩虹海的麦当

最新推荐文章于 2025-03-06 11:11:13 发布

阅读量2k

点赞数 3

文章标签： python 前端 json

本文链接：https://blog.youkuaiyun.com/weixin_65349299/article/details/128766645

版权

在别人的基础上，加上保持原有文件夹结构的功能。如果是文档中有子文档的话，会使用该文档名作为文件夹名，然后将对应的文档放入该文件夹中。
原有参考链接：语雀批量导出MarkDown文件，指定知识库所有内容

需要更改 config.json 文件中的配置（注意：标准 JSON 不支持注释，下面每行 # 之后的说明仅用于解释，写入实际文件时需要删除，否则 json.load 会解析失败）：

{
    "TOKEN": "语雀token",                                # 用户token
    "USER_AGENT": "Rion",                               # 就是一个用户名称，可以随意写，但必须要有
    "BASE_URL": "https://customspace.yuque.com/api/v2", # 语雀基础api
    "DATA_PATH": "yuque"                                # 数据储存的文件夹名称（默认当前文件夹下创建）
}

如果需要更改文件最终生成地址的话，可以修改代码中 main 方法里的 dir_path = '.' 这一行：
（原文此处为一张截图，未能保留。）
全部代码：


import json
import sys
import os
import requests
import re
from datetime import datetime
import yaml
from treelib import Tree
import pickle

# Locate config.json: next to the executable when ``sys.frozen`` is set
# (bundled builds), otherwise relative to the current working directory.
if getattr(sys, 'frozen', False):
    APPLICATION_PATH = os.path.dirname(sys.executable)
else:
    # os.path.dirname('.') evaluates to '', so the join below yields the
    # bare relative path "config.json".
    APPLICATION_PATH = os.path.dirname('.')
# Use a context manager so the file handle is closed right after parsing.
with open(os.path.join(APPLICATION_PATH, "config.json"), encoding='utf-8') as _config_file:
    jsonConfig = json.load(_config_file)





class ExportYueQueDoc:
    """Export the Yuque knowledge bases of a user to local Markdown files.

    Connection settings (``BASE_URL``, ``TOKEN``, ``USER_AGENT``,
    ``DATA_PATH``) are read from ``config.json``. The table of contents of
    each knowledge base is turned into a tree so that the exported files keep
    the original folder structure.
    """

    # Seconds to wait for each HTTP request so a stalled connection cannot
    # hang the export forever.
    REQUEST_TIMEOUT = 60

    # Empty named anchors embedded in the exported Markdown. The name is
    # matched up to the closing quote only, so two anchors on one line do not
    # swallow the text between them.
    _ANCHOR_RE = re.compile(r'<a name="[^"]*"></a>')
    # "#averageHue=..." / "#clientId=..." URL fragments on image links. The
    # match stops at the first ")" so other links on the same line survive.
    _IMAGE_FRAGMENT_RE = re.compile(
        r'#(?:averageHue|clientId)=[a-z0-9\-&=%.][^)\n]*(?=\))'
    )
    # Characters that cannot be part of a single file/folder name.
    _UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|]' if os.name == 'nt' else r'/')

    def __init__(self):
        """Load ``config.json`` and prepare the request headers.

        Raises:
            ValueError: if the file is missing, is not valid JSON, or lacks
                one of the required keys.
        """
        if getattr(sys, 'frozen', False):
            application_path = os.path.dirname(sys.executable)
        else:
            # Evaluates to '', i.e. config.json is resolved against the
            # current working directory.
            application_path = os.path.dirname('.')
        config_path = os.path.join(application_path, "config.json")
        try:
            with open(config_path, encoding='utf-8') as fp:
                self.jsonConfig = json.load(fp)
            self.base_url = self.jsonConfig['BASE_URL']
            self.token = self.jsonConfig['TOKEN']
            self.headers = {
                "User-Agent": self.jsonConfig['USER_AGENT'],
                "X-Auth-Token": self.jsonConfig['TOKEN']
            }
            self.data_path = self.jsonConfig['DATA_PATH']
        except (OSError, ValueError, KeyError, TypeError) as err:
            # json.JSONDecodeError is a ValueError subclass.
            raise ValueError("config.json 有误") from err

    def _get(self, path):
        """Send an authenticated GET to ``base_url + path`` and return the response."""
        return requests.get(self.base_url + path, headers=self.headers,
                            timeout=self.REQUEST_TIMEOUT)

    @staticmethod
    def _safe_name(name):
        """Return ``name`` as text with path-breaking characters replaced by ``_``."""
        return ExportYueQueDoc._UNSAFE_NAME_RE.sub('_', str(name))

    @staticmethod
    def _clean_body(body):
        """Normalise a document body returned by the API into plain Markdown.

        Literal backslash-n sequences become newlines, empty ``<a name>``
        anchors are dropped, ``<br />`` becomes a newline and the
        ``#averageHue`` / ``#clientId`` fragments on image URLs are removed.
        A missing body (``None`` or empty) yields an empty string.
        """
        if not body:
            return ""
        text = body.replace('\\n', '\n')
        text = ExportYueQueDoc._ANCHOR_RE.sub('', text)
        text = text.replace('<br />', '\n')
        return ExportYueQueDoc._IMAGE_FRAGMENT_RE.sub('', text)

    def get_user_info(self):
        """Fetch the current user and store ``login_id``, ``uid`` and ``username``.

        Raises:
            ValueError: if the API does not answer with HTTP 200.
        """
        res_obj = self._get('/user')
        if res_obj.status_code != 200:
            raise ValueError("Token 信息错误")
        user = res_obj.json()['data']
        self.login_id = user['login']
        self.uid = user['id']
        self.username = user['name']
        print("=========== 用户信息初始化成功 ==========")

    def get_repos_data(self):
        """Return the user's knowledge bases as ``{"rid", "repos_name"}`` dicts."""
        repos_json = self._get('/users/' + self.login_id + '/repos').json()
        return [
            {"rid": item['id'], "repos_name": item['name']}
            for item in repos_json['data']
        ]

    def get_top(self, repos_list):
        """Return ``{repos_name: parsed toc list}`` for every repo that has a TOC.

        Repos whose ``toc_yml`` is missing, empty, or parses to nothing are
        left out so that later stages never iterate over ``None``.
        """
        repos_toc_dic = {}
        for repos in repos_list:
            toc = self._get('/repos/' + str(repos['rid'])).json()
            toc_yml = toc['data']['toc_yml']
            if not toc_yml:
                continue
            dic_toc = yaml.safe_load(toc_yml)
            if dic_toc:
                repos_toc_dic[repos['repos_name']] = dic_toc

        return repos_toc_dic

    def get_article_data(self, repos_list):
        """Download every document of every repo.

        Returns:
            ``{repos_name: {doc_id: {"title": ..., "content": ...}}}`` where
            ``content`` is the cleaned Markdown body.
        """
        all_doc_list = {}
        fetched = 0
        for repos in repos_list:
            docs_path = '/repos/' + str(repos['rid']) + '/docs'
            article_datas = self._get(docs_path).json()
            doc_dict = {}
            for item in article_datas['data']:
                per_article_data = self._get(docs_path + '/' + item['slug']).json()
                content = self._clean_body(per_article_data['data']['body'])
                doc_dict[item['id']] = {"title": item['title'], "content": content}
                # Progress indicator: print the running count every 10 docs.
                if fetched % 10 == 0:
                    print(fetched)
                fetched += 1
            all_doc_list[repos['repos_name']] = doc_dict
        return all_doc_list

    def save_article(self, result, repos_name, title):
        """Write ``result`` to ``<DATA_PATH>/<repos_name>/<title>.md``.

        The target directory is created when needed and an existing file is
        overwritten.
        """
        dir_path = f"{self.data_path}/{self._safe_name(repos_name)}"
        os.makedirs(dir_path, exist_ok=True)
        filepath = dir_path + f"/{self._safe_name(title)}.md"
        with open(filepath, 'w', encoding="utf-8") as fp:
            fp.write(result)

    def build_tree(self, repos_name, toc_list, doc_dict):
        """Build a ``treelib`` tree mirroring the repo's table of contents.

        The root node (identifier ``1``) is the repo itself. ``DOC`` entries
        carry their Markdown in ``data['content']``; every other entry is
        stored as a ``TITLE`` node with empty content. ``META`` entries, docs
        missing from ``doc_dict`` and entries whose parent is not in the tree
        are skipped.
        """
        tree = Tree()
        tree.create_node(tag=repos_name, identifier=1, data={'type': 'TITLE', 'content': ''})
        for toc in toc_list:
            if toc['type'] == 'META':
                continue
            if toc['type'] == 'DOC':
                if toc['id'] not in doc_dict:
                    print(f"{toc['title']} 没找到对应文章")
                    continue
                doc = {'type': 'DOC', 'content': doc_dict[toc['id']]['content']}
            else:
                doc = {'type': 'TITLE', 'content': ''}
            # An empty or null parent means the entry hangs off the root.
            parent_id = toc['parent_uuid'] or 1

            if not tree.contains(nid=parent_id):
                # The parent was skipped earlier (or is unknown); say so
                # instead of dropping the entry silently.
                print(f"{toc['title']} 的父节点不存在，已跳过")
                continue
            tree.create_node(tag=toc['title'], identifier=toc['uuid'], parent=parent_id, data=doc)

        return tree

    def write_md(self, dir_path, node):
        """Write ``node.data['content']`` to ``<dir_path>/<node.tag>.md``.

        An existing file is overwritten. Failures are reported on stdout and
        do not abort the export.
        """
        filepath = dir_path + f"/{self._safe_name(node.tag)}.md"
        try:
            content = node.data['content']
            with open(filepath, 'w', encoding="utf-8") as fp:
                fp.write(content)
        except Exception as e:
            # Deliberately best-effort: one bad document must not stop the
            # remaining ones from being written.
            print(f"{node.tag} 写入失败: {e}")

    def save_local(self, nid, dir_path, tree):
        """Recursively write the subtree rooted at ``nid`` below ``dir_path``.

        A node with children becomes a folder named after the node; if that
        node is itself a document it is also written inside its own folder.
        Nodes without children create nothing here (their file is written by
        the parent's loop).
        """
        node_root = tree[nid]
        node_list = tree.children(nid)
        if not node_list:
            return
        dir_path = f"{dir_path}/{self._safe_name(node_root.tag)}"
        os.makedirs(dir_path, exist_ok=True)

        if node_root.data['type'] == 'DOC':
            self.write_md(dir_path, node_root)

        for node in node_list:
            if node.data['type'] == 'DOC':
                self.write_md(dir_path, node)
            # Descend into the child's own subtree.
            self.save_local(node.identifier, dir_path, tree)

    def main(self):
        """Run the full export: fetch user, repos, TOCs and docs, then write files."""
        self.get_user_info()
        repos_list = self.get_repos_data()
        top_list_dict = self.get_top(repos_list)
        doc_list_dict = self.get_article_data(repos_list)

        # Output root. Note that the DATA_PATH setting is not consulted here;
        # change this value to export somewhere else.
        dir_path = '.'
        for repos in repos_list:
            repos_name = repos['repos_name']
            if repos_name not in top_list_dict:
                continue
            tree = self.build_tree(repos_name, top_list_dict[repos_name], doc_list_dict[repos_name])
            self.save_local(1, dir_path, tree)
        print("end")

# Script entry point: build the exporter from config.json and run it.
if __name__ == "__main__":
    ExportYueQueDoc().main()

git仓库地址：https://gitee.com/wu1233456/yuque-export