nodejs使用pdf-lib库合并pdf文件后生成目录
查阅了很多资料,发现关于在nodejs下生成pdf目录(书签/Outline)的参考内容大多不够准确。经过实际尝试,总结出以下经验及代码。
版本说明
pdf-lib版本:1.17.1
pdfjs-dist版本:2.6.347
nodejs版本:20.15.0
pdf生成源
pdf使用puppeteer通过html生成,puppeteer生成pdf时的配置项如下:
// Options object passed to puppeteer when rendering the HTML page to a PDF file.
{
path: fileName,
format: 'A4',
printBackground: true,
displayHeaderFooter: true,
outline: false, // Deliberately false: outline generation is an experimental feature in the puppeteer version used here
timeout: 60000000,
headerTemplate: `<span style="padding: 0 60px; font-size: 14px; color: #333;"></span>`,
footerTemplate: '<span style="padding: 0 60px; width: 100%; font-size: 10px; color: #333; text-align: right;"><span class="pageNumber"></span>/<span class="totalPages"></span></span>',
margin: {
top: '60px',
right: '60px',
bottom: '60px',
left: '60px'
}
}
编码思路
TOC
处理html时,通过识别h1、h2、…、h6标签生成toc数组,其中每个条目的格式如下(level为从0开始的数字层级,0表示顶层标题;text为标题文本):
// Shape of a single TOC entry.
{
level: "", // heading level; buildTree compares it against 0 to detect top-level entries
text: "", // heading text; it is normalized and searched for in the extracted page text
}
组件引用
const fs = require('fs-extra');
const { getDocument } = require('pdfjs-dist');
const { PDFDocument, PDFName, PDFArray, PDFNumber, PDFHexString } = require('pdf-lib');
创建TOC与页面的绑定关系
对生成的pdf文件,先用pdfjs-dist库(getDocument)逐页提取文本,创建页面索引数组pageDatas,再结合pdf-lib库读取文件得到的PDFDocument对象,确定TOC条目与页面的对应关系。示例代码如下(其中doc为getDocument(pdfBytes).promise返回的文档对象):
// Build one { index, content } record per page of the pdf.js document `doc`.
// pdf.js page numbers start at 1, while the stored index is 0-based so it can
// be handed to pdf-lib's getPage() later.
for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
  const page = await doc.getPage(pageNumber);
  const content = await page.getTextContent();
  const rawText = content.items.map((item) => item.str).join('');
  // Keep the normalized text as a plain string. Wrapping it with JSON.stringify
  // would add surrounding quotes and escape `"` and `\`, so a heading containing
  // those characters could never be found with includes().
  pageDatas.push({
    index: pageNumber - 1,
    content: convertText(rawText)
  });
}
遍历绑定后的TOC目录,创建书签对象
// Walk the TOC and create one bookmark (outline item) for every entry whose
// text can be located on a page.
let lastPageIndex = 0;
for (let i = 0; i < toc.length; i++) {
  const entry = toc[i];
  const text = convertText(entry.text);
  const pageIndex = getPageIndex(pageDatas, text, lastPageIndex);
  if (pageIndex === "notfound") {
    console.log(`❌️ ${i}-[${text}] of [${outputPath}] not found.`);
    hasError = true;
    missedKeys.push(text);
    // Drop any bookmark left over from an earlier run on the same toc array,
    // otherwise buildTree would pick up a reference into another document.
    delete entry.bookmark;
    delete entry.ref;
    continue;
  }
  // Later entries are searched from this page onwards.
  lastPageIndex = pageIndex;
  const pageRef = mergedPdf.getPage(pageIndex).ref;
  // Destination array: [page /Fit]
  const destArray = PDFArray.withContext(mergedPdf.context);
  destArray.push(pageRef);
  destArray.push(PDFName.of('Fit'));
  const bookmark = mergedPdf.context.obj({});
  bookmark.set(PDFName.of('Title'), PDFHexString.fromText(entry.text));
  bookmark.set(PDFName.of('Dest'), destArray);
  const ref = mergedPdf.context.register(bookmark);
  // Stored on the entry so buildTree / createOutline can link the items.
  entry.bookmark = bookmark;
  entry.ref = ref;
}
创建目录树
通过迭代TOC按level创建出目录树。
/**
 * Turns the flat TOC list into a nested tree based on each entry's `level`.
 *
 * Entries without a `bookmark` (not located in the PDF) are skipped. Each entry
 * is attached to the nearest preceding entry with a smaller level; when there is
 * none it becomes a top-level node. This keeps the result identical for
 * well-formed 0-based levels while no longer throwing when a level is skipped
 * (e.g. 0 followed by 2), when the list starts above 0, or when `level` is a
 * numeric string.
 *
 * @param {Array<object>} data - Flat TOC entries in document order.
 * @returns {Array<object>} Top-level nodes; every node is a shallow copy of its
 *   entry plus a `children` array.
 */
function buildTree(data) {
  const root = { children: [] };
  // Currently open ancestors, outermost first; depth strictly increases along the chain.
  const openAncestors = [];
  for (const item of data) {
    if (!item.bookmark) {
      continue;
    }
    const parsedLevel = Number(item.level);
    // An unusable level is treated as top-level instead of crashing.
    const currentLevel = Number.isFinite(parsedLevel) ? parsedLevel : 0;
    const newNode = {
      ...item,
      children: []
    };
    // Close every ancestor that is a sibling of, or deeper than, the new node.
    while (
      openAncestors.length > 0 &&
      openAncestors[openAncestors.length - 1].depth >= currentLevel
    ) {
      openAncestors.pop();
    }
    const parent = openAncestors.length > 0
      ? openAncestors[openAncestors.length - 1].node
      : root;
    parent.children.push(newNode);
    openAncestors.push({ depth: currentLevel, node: newNode });
  }
  return root.children;
}
创建pdf目录树(Outline)的Prev,Next,Parent
/**
 * Links outline items of one tree level (and, recursively, all levels below it)
 * by writing the Prev / Next / Parent / First / Last / Count entries.
 *
 * @param {Array<object>} nodes - Sibling nodes produced by buildTree; each has
 *   `bookmark` (the item dictionary), `ref` (its reference) and `children`.
 * @param {object|null} parent - Node owning `nodes`, or null/undefined for the
 *   top level, in which case a new `Outlines` root dictionary is created.
 * @param {object} mergedPdf - pdf-lib document whose context creates the root.
 * @returns {object} The dictionary that owns `nodes`: the new root dictionary
 *   or `parent.bookmark`. An empty root is returned as an empty Outlines
 *   dictionary (Count 0) rather than a plain array, so it can be registered.
 *
 * NOTE: top-level items receive no Parent entry here because the root has no
 * reference until the caller registers it; the caller has to add it afterwards.
 */
function createOutline(nodes, parent, mergedPdf) {
  const outline = parent
    ? parent.bookmark
    : mergedPdf.context.obj({ Type: 'Outlines', Count: 0 });
  if (!nodes || nodes.length === 0) {
    return outline;
  }

  // Links one sibling list and returns how many items are visible below the
  // container when everything is expanded (own items plus all descendants).
  const linkLevel = (siblings, container, containerRef) => {
    let visible = siblings.length;
    siblings.forEach((node, i) => {
      if (containerRef) {
        node.bookmark.set(PDFName.of('Parent'), containerRef);
      }
      if (i > 0) {
        node.bookmark.set(PDFName.of('Prev'), siblings[i - 1].ref);
      }
      if (i < siblings.length - 1) {
        node.bookmark.set(PDFName.of('Next'), siblings[i + 1].ref);
      }
      if (node.children && node.children.length > 0) {
        visible += linkLevel(node.children, node.bookmark, node.ref);
      }
    });
    container.set(PDFName.of('First'), siblings[0].ref);
    container.set(PDFName.of('Last'), siblings[siblings.length - 1].ref);
    // A positive Count marks the item as open and must cover all visible descendants.
    container.set(PDFName.of('Count'), PDFNumber.of(visible));
    return visible;
  };

  linkLevel(nodes, outline, parent ? parent.ref : undefined);
  return outline;
}
全量代码
const fs = require('fs-extra');
const { getDocument } = require('pdfjs-dist');
const { PDFDocument, PDFName, PDFArray, PDFNumber, PDFHexString } = require('pdf-lib');
// Emit console output as UTF-8. setDefaultEncoding is the Writable-side API;
// setEncoding is a Readable method and does not exist when stdout is
// redirected to a file, which made the script throw at load time.
process.stdout.setDefaultEncoding('utf8');
(async () => {
/**
 * Normalizes text so TOC titles and extracted page text can be compared.
 *
 * Applied in order: strip spaces, line breaks, a leading BOM, zero-width
 * characters, control characters and geometric-shape glyphs, parenthesized
 * numbers such as "(12)", and finally replace the ellipsis character with
 * three dots.
 *
 * @param {string} text - Raw text.
 * @returns {string} The normalized text.
 */
function convertText(text) {
  // Order matters: "(12)" is only removed after the characters that could
  // split it up have been stripped.
  const steps = [
    (value) => value.replaceAll(" ", ""),
    (value) => value.replace(/(\r\n|\n|\r)/g, ''),
    (value) => value.replace(/\r/g, ''),
    (value) => value.replace(/^\uFEFF/, ''),
    (value) => value.replace(/[\u200B-\u200D\uFEFF]/g, ''),
    (value) => value.replace(/[\u0000-\u001F\u25A0-\u25FF]/g, ''),
    (value) => value.replace(/\(\d+\)/g, ''),
    (value) => value.replaceAll("…","..."),
  ];
  return steps.reduce((current, applyStep) => applyStep(current), text);
}
/**
 * Turns the flat TOC list into a nested tree based on each entry's `level`.
 *
 * Entries without a `bookmark` (not located in the PDF) are skipped. Each entry
 * is attached to the nearest preceding entry with a smaller level; when there is
 * none it becomes a top-level node. This keeps the result identical for
 * well-formed 0-based levels while no longer throwing when a level is skipped
 * (e.g. 0 followed by 2), when the list starts above 0, or when `level` is a
 * numeric string.
 *
 * @param {Array<object>} data - Flat TOC entries in document order.
 * @returns {Array<object>} Top-level nodes; every node is a shallow copy of its
 *   entry plus a `children` array.
 */
function buildTree(data) {
  const root = { children: [] };
  // Currently open ancestors, outermost first; depth strictly increases along the chain.
  const openAncestors = [];
  for (const item of data) {
    if (!item.bookmark) {
      continue;
    }
    const parsedLevel = Number(item.level);
    // An unusable level is treated as top-level instead of crashing.
    const currentLevel = Number.isFinite(parsedLevel) ? parsedLevel : 0;
    const newNode = {
      ...item,
      children: []
    };
    // Close every ancestor that is a sibling of, or deeper than, the new node.
    while (
      openAncestors.length > 0 &&
      openAncestors[openAncestors.length - 1].depth >= currentLevel
    ) {
      openAncestors.pop();
    }
    const parent = openAncestors.length > 0
      ? openAncestors[openAncestors.length - 1].node
      : root;
    parent.children.push(newNode);
    openAncestors.push({ depth: currentLevel, node: newNode });
  }
  return root.children;
}
/**
 * Finds the page on which a TOC title appears, searching from `lastPageIndex`
 * onwards.
 *
 * A title is first looked up in each page's own text. If that fails, the text
 * of the previous page and the current page is joined (with "n/m" page-number
 * footers removed) to catch a title that is split by a page break; such a
 * match is reported on the later of the two pages.
 *
 * @param {Array<{index: number, content: string}>} pageDatas - Per-page text.
 * @param {string} text - Normalized title to look for.
 * @param {number} lastPageIndex - Pages before this position are skipped.
 * @returns {number|string} The matching page's `index`, or "notfound".
 */
function getPageIndex(pageDatas, text, lastPageIndex) {
  for (let j = 0; j < pageDatas.length; j++) {
    if (j < lastPageIndex) {
      continue;
    }
    const current = pageDatas[j];
    if (current.content.includes(text)) {
      return current.index;
    }
    if (j === 0) {
      continue;
    }
    const previous = pageDatas[j - 1];
    // A hit that lies entirely on the previous page is not a page-break match;
    // that page was either skipped on purpose or already checked.
    if (previous.content.includes(text)) {
      continue;
    }
    // The page text may be wrapped in JSON quotes by the caller; drop the
    // quotes at the junction so they do not split a title across the break.
    const joined = previous.content.replace(/"$/, '') + current.content.replace(/^"/, '');
    const combinedText = convertText(joined).replace(/\s*\d+\/\d+\s*/g, "");
    if (combinedText.includes(text)) {
      return current.index;
    }
  }
  return "notfound";
}
/**
 * Links outline items of one tree level (and, recursively, all levels below it)
 * by writing the Prev / Next / Parent / First / Last / Count entries.
 *
 * @param {Array<object>} nodes - Sibling nodes produced by buildTree; each has
 *   `bookmark` (the item dictionary), `ref` (its reference) and `children`.
 * @param {object|null} parent - Node owning `nodes`, or null/undefined for the
 *   top level, in which case a new `Outlines` root dictionary is created.
 * @param {object} mergedPdf - pdf-lib document whose context creates the root.
 * @returns {object} The dictionary that owns `nodes`: the new root dictionary
 *   or `parent.bookmark`. An empty root is returned as an empty Outlines
 *   dictionary (Count 0) rather than a plain array, so it can be registered.
 *
 * NOTE: top-level items receive no Parent entry here because the root has no
 * reference until the caller registers it; the caller has to add it afterwards.
 */
function createOutline(nodes, parent, mergedPdf) {
  const outline = parent
    ? parent.bookmark
    : mergedPdf.context.obj({ Type: 'Outlines', Count: 0 });
  if (!nodes || nodes.length === 0) {
    return outline;
  }

  // Links one sibling list and returns how many items are visible below the
  // container when everything is expanded (own items plus all descendants).
  const linkLevel = (siblings, container, containerRef) => {
    let visible = siblings.length;
    siblings.forEach((node, i) => {
      if (containerRef) {
        node.bookmark.set(PDFName.of('Parent'), containerRef);
      }
      if (i > 0) {
        node.bookmark.set(PDFName.of('Prev'), siblings[i - 1].ref);
      }
      if (i < siblings.length - 1) {
        node.bookmark.set(PDFName.of('Next'), siblings[i + 1].ref);
      }
      if (node.children && node.children.length > 0) {
        visible += linkLevel(node.children, node.bookmark, node.ref);
      }
    });
    container.set(PDFName.of('First'), siblings[0].ref);
    container.set(PDFName.of('Last'), siblings[siblings.length - 1].ref);
    // A positive Count marks the item as open and must cover all visible descendants.
    container.set(PDFName.of('Count'), PDFNumber.of(visible));
    return visible;
  };

  linkLevel(nodes, outline, parent ? parent.ref : undefined);
  return outline;
}
/**
 * Merges several PDF files into one document, generates its outline from
 * `toc`, writes the result to `outputPath` and, when every TOC entry was
 * located, deletes the input files.
 *
 * @param {string[]} inputPaths - PDF files to merge, in order.
 * @param {string} outputPath - Destination file for the merged PDF.
 * @param {Array<object>} toc - Flat TOC entries ({ level, text }).
 * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} Outcome of the
 *   outline generation, so callers can react to missed entries. When
 *   `hasError` is true the input files are kept.
 */
async function mergePdfFiles(inputPaths, outputPath, toc) {
  const mergedPdf = await PDFDocument.create();
  for (const inputPath of inputPaths) {
    const inputPdf = await PDFDocument.load(fs.readFileSync(inputPath));
    const copiedPages = await mergedPdf.copyPages(inputPdf, inputPdf.getPageIndices());
    for (const page of copiedPages) {
      mergedPdf.addPage(page);
    }
  }
  const { hasError, missedKeys } = await generateOutline(mergedPdf, outputPath, toc);
  if (!hasError) {
    for (const inputPath of inputPaths) {
      // Never delete the file that was just written. Only an identical path
      // string is detected; differently spelled paths to the same file are not.
      if (inputPath !== outputPath) {
        fs.unlinkSync(inputPath);
      }
    }
  }
  return { hasError, missedKeys };
}
/**
 * Loads an existing PDF file and writes it back to the same path with an
 * outline generated from `toc`.
 *
 * @param {string} filePath - PDF file to read and overwrite.
 * @param {Array<object>} toc - Flat TOC entries ({ level, text }).
 * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} Outcome of the
 *   outline generation; `missedKeys` lists the titles that were not found.
 */
async function loadAndGenerateOutline(filePath, toc) {
  const inputPdf = await PDFDocument.load(fs.readFileSync(filePath));
  const { hasError, missedKeys } = await generateOutline(inputPdf, filePath, toc);
  // Hand the outcome to the caller instead of silently dropping it.
  return { hasError, missedKeys };
}
/**
 * Adds a document outline (bookmarks) to `mergedPdf` based on `toc` and writes
 * the document to `outputPath`.
 *
 * The document is serialized and re-opened with pdf.js (getDocument) to read
 * each page's text; every TOC title is then located in that text and turned
 * into an outline item pointing at the page it was found on.
 *
 * Side effects: sets `bookmark` and `ref` on each located `toc` entry (and
 * removes them from entries that were not located), logs every missing entry,
 * and writes the output file.
 *
 * @param {object} mergedPdf - pdf-lib PDFDocument to extend.
 * @param {string} outputPath - File the final PDF is written to.
 * @param {Array<object>} toc - Flat TOC entries ({ level, text }).
 * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} `hasError` is
 *   true when at least one title was not found; `missedKeys` lists the
 *   normalized titles that are missing.
 */
async function generateOutline(mergedPdf, outputPath, toc) {
  const pdfBytes = await mergedPdf.save({ useObjectStreams: false });
  const doc = await getDocument(pdfBytes).promise;

  // Per-page normalized text used to look up the TOC titles.
  const pageDatas = [];
  for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
    const page = await doc.getPage(pageNumber);
    const content = await page.getTextContent();
    const rawText = content.items.map((item) => item.str).join('');
    // Stored as a plain string: JSON.stringify would add quotes and escape
    // `"` and `\`, so titles containing those characters could never match.
    pageDatas.push({
      index: pageNumber - 1,
      content: convertText(rawText)
    });
  }

  // Walk the TOC and create one bookmark object per located entry.
  let lastPageIndex = 0;
  let hasError = false;
  const missedKeys = [];
  for (let i = 0; i < toc.length; i++) {
    const entry = toc[i];
    const text = convertText(entry.text);
    const pageIndex = getPageIndex(pageDatas, text, lastPageIndex);
    if (pageIndex === "notfound") {
      console.log(`❌️ ${i}-[${text}] of [${outputPath}] not found.`);
      hasError = true;
      missedKeys.push(text);
      // Drop a bookmark left over from an earlier run on the same toc array;
      // it would reference an object of a different document.
      delete entry.bookmark;
      delete entry.ref;
      continue;
    }
    // Later titles are searched from this page onwards.
    lastPageIndex = pageIndex;
    const pageRef = mergedPdf.getPage(pageIndex).ref;
    // Destination array: [page /Fit]
    const destArray = PDFArray.withContext(mergedPdf.context);
    destArray.push(pageRef);
    destArray.push(PDFName.of('Fit'));
    const bookmark = mergedPdf.context.obj({});
    bookmark.set(PDFName.of('Title'), PDFHexString.fromText(entry.text));
    bookmark.set(PDFName.of('Dest'), destArray);
    const ref = mergedPdf.context.register(bookmark);
    entry.bookmark = bookmark;
    entry.ref = ref;
  }

  // Build the tree and attach the outline only when there is something to show;
  // with no located entries there is no valid root object to register.
  const tocTree = buildTree(toc);
  if (tocTree.length > 0) {
    const outlineRoot = createOutline(tocTree, null, mergedPdf);
    const outlineRef = mergedPdf.context.register(outlineRoot);
    // Top-level items can only point at the root once it has a reference.
    for (const node of tocTree) {
      node.bookmark.set(PDFName.of('Parent'), outlineRef);
    }
    mergedPdf.catalog.set(PDFName.of('Outlines'), outlineRef);
  }

  const mergedPdfBytes = await mergedPdf.save({ useObjectStreams: false });
  fs.writeFileSync(outputPath, mergedPdfBytes);
  return { hasError, missedKeys };
}
// Public API of this module. The assignment sits inside the async wrapper but
// still runs synchronously when the file is loaded, because nothing is awaited
// before this point.
module.exports = {
mergePdfFiles,
loadAndGenerateOutline
};
})();