nodejs使用pdf-lib库合并pdf文件后生成目录

查阅了很多资料,在nodejs下实现生成pdf目录的参考内容不准确,经实际尝试,总结以下经验及代码。

版本说明

pdf-lib版本:1.17.1
pdfjs-dist版本:2.6.347
nodejs版本:20.15.0

pdf生成源

pdf使用puppeteer通过html生成,puppeteer生成pdf时的配置项如下:

{
  path: fileName,
  format: 'A4',
  printBackground: true,
  displayHeaderFooter: true,
  outline: false, //注意,这里设置为false,outline生成功能在puppeteer当前版本中为实验功能
  timeout: 60000000,
  headerTemplate: `<span style="padding: 0 60px; font-size: 14px; color: #333;"></span>`,
  footerTemplate: '<span style="padding: 0 60px; width: 100%; font-size: 10px; color: #333; text-align: right;"><span class="pageNumber"></span>/<span class="totalPages"></span></span>',
  margin: {
    top: '60px',
    right: '60px',
    bottom: '60px',
    left: '60px'
  }
}

编码思路

TOC

html处理时通过对h1, h2,…, h6标签的识别,生成toc数组。数组中每一项的格式如下(level为从0开始的数字层级,0表示顶层标题;text为标题文本):

{
	level: "",
	text: "",
}

组件引用

const fs = require('fs-extra');
const { getDocument } = require('pdfjs-dist');
const { PDFDocument, PDFName, PDFArray, PDFNumber, PDFHexString } = require('pdf-lib');

创建TOC与页面的绑定关系

对生成的pdf文件,先用pdfjs-dist库逐页提取文本、创建页面索引数组,再结合pdf-lib库读取文件得到的PDFDocument对象,确定TOC条目与页面的对应关系。示例代码如下:

// Build the page lookup table: one entry per page holding its zero-based
// index and its normalised text. pdf.js page numbers are 1-based.
for (let i = 1; i <= doc.numPages; i++) {
  const page = await doc.getPage(i);
  const content = await page.getTextContent();
  const pageText = content.items.map((item) => item.str).join('');
  pageDatas.push({
    index: i - 1,
    // Store the plain normalised text. Wrapping it with JSON.stringify would
    // add surrounding quotes and escape `"` and `\`, so headings containing
    // those characters could never be found with String.prototype.includes.
    content: convertText(pageText),
  });
}

遍历绑定后的TOC目录,创建书签对象

// Walk the TOC and create one bookmark (outline item) dictionary per entry.
// lastPageIndex is passed to getPageIndex and updated after every hit, so the
// search for the next heading is given the page of the previous one.
let lastPageIndex = 0;
for (let i = 0; i < toc.length; i++) {
   // Normalise the heading the same way the page text was normalised.
   let text = convertText(toc[i].text);
   const pageIndex = getPageIndex(pageDatas, text, lastPageIndex);
   if (pageIndex == "notfound") {
     // Heading text was not located on any page: record it and leave this
     // entry without a bookmark (buildTree skips entries without one).
     console.log(`❌️ ${i}-[${text}] of [${outputPath}] not found.`);
     hasError = true;
     missedKeys.push(text);
     continue;
   }
   lastPageIndex = pageIndex;
   // Destination array: [page reference, /Fit].
   const pageRef = mergedPdf.getPage(pageIndex).ref;
   const destArray = PDFArray.withContext(mergedPdf.context);
   destArray.push(pageRef);
   destArray.push(PDFName.of('Fit'));
   // The bookmark dictionary carries the original (non-normalised) title,
   // hex-encoded via PDFHexString.fromText.
   const bookmark = mergedPdf.context.obj({});
   bookmark.set(PDFName.of('Title'), PDFHexString.fromText(toc[i].text));
   bookmark.set(PDFName.of('Dest'), destArray);
   // Register the dictionary to obtain the reference used for Prev/Next/First/Last links.
   const ref = mergedPdf.context.register(bookmark);

   // Attach both the dictionary and its reference to the TOC entry for the later tree-building step.
   toc[i].bookmark = bookmark;
   toc[i].ref = ref;
 }

创建目录树

通过迭代TOC按level创建出目录树。

/**
 * Turns the flat, document-ordered TOC list into a tree based on `level`.
 *
 * Entries without a `bookmark` are skipped. Each remaining entry is attached
 * to the most recent node with a smaller level; if no such node exists (the
 * parent heading was skipped, the list starts below level 0, or a level is
 * jumped over) it is attached to the nearest available ancestor or to the root
 * instead of throwing.
 *
 * @param {Array<{level: number|string, bookmark?: object}>} data Flat TOC entries.
 * @returns {Array<object>} Top-level nodes; each node is a shallow copy of its
 *   entry plus a `children` array.
 */
function buildTree(data) {
  const root = { children: [] };
  const lastNodes = []; // most recent node seen at each level

  for (const item of data) {
    if (!item.bookmark) {
      continue;
    }
    // Accept numeric strings as well; anything that is not a positive integer
    // is treated as the top level.
    const parsedLevel = Number(item.level);
    const currentLevel = Number.isInteger(parsedLevel) && parsedLevel > 0 ? parsedLevel : 0;
    const newNode = {
      ...item,
      children: []
    };

    // Walk upwards from the level directly above until an existing ancestor is
    // found; fall back to the root when there is none.
    let parent = root;
    for (let depth = Math.min(currentLevel, lastNodes.length) - 1; depth >= 0; depth--) {
      if (lastNodes[depth]) {
        parent = lastNodes[depth];
        break;
      }
    }
    parent.children.push(newNode);

    // Remember this node and forget every deeper level, which now belongs to
    // a finished branch.
    lastNodes[currentLevel] = newNode;
    lastNodes.length = currentLevel + 1;
  }

  return root.children;
}

创建pdf目录树(Outline)的Prev,Next,Parent

/**
 * Links one level of the bookmark tree (Prev / Next / Parent) and records
 * First / Last / Count on the dictionary that owns that level, recursing into
 * each node's children.
 *
 * @param {Array<{bookmark: object, ref: object, children?: Array}>} nodes
 *   Sibling nodes of one level, in display order.
 * @param {?{bookmark: object, ref: object}} parent Owning node, or null/undefined
 *   for the top level, in which case a new `Outlines` dictionary is created.
 * @param {object} mergedPdf pdf-lib document whose `context` creates the root dictionary.
 * @returns {object} The owning dictionary: the new outline root for the top
 *   level, otherwise `parent.bookmark`. For an empty `nodes` list the
 *   dictionary is returned without First/Last so the caller can still register it.
 */
function createOutline(nodes, parent, mergedPdf) {
  const outline = parent
    ? parent.bookmark
    : mergedPdf.context.obj({ Type: 'Outlines', Count: 0 });

  if (!nodes || nodes.length === 0) {
    return outline;
  }

  for (let i = 0; i < nodes.length; i++) {
    const node = nodes[i];
    if (parent && parent.ref) {
      // Nested items point back at their owner. The top-level items' Parent is
      // the outline root, whose reference only exists once the caller has
      // registered the returned dictionary.
      node.bookmark.set(PDFName.of('Parent'), parent.ref);
    }
    if (i > 0) {
      node.bookmark.set(PDFName.of('Prev'), nodes[i - 1].ref);
    }
    if (i < nodes.length - 1) {
      node.bookmark.set(PDFName.of('Next'), nodes[i + 1].ref);
    }
    if (node.children && node.children.length > 0) {
      createOutline(node.children, node, mergedPdf);
    }
  }

  outline.set(PDFName.of('First'), nodes[0].ref);
  outline.set(PDFName.of('Last'), nodes[nodes.length - 1].ref);
  outline.set(PDFName.of('Count'), PDFNumber.of(nodes.length));
  return outline;
}

全量代码

const fs = require('fs-extra');
const { getDocument } = require('pdfjs-dist');
const { PDFDocument, PDFName, PDFArray, PDFNumber, PDFHexString } = require('pdf-lib');
// Use the Writable API: setEncoding() belongs to Readable streams and is not
// available on stdout when it is a write-only stream (e.g. redirected to a file).
process.stdout.setDefaultEncoding('utf8');

(async () => {
  /**
   * Normalises text so that TOC headings and extracted page text can be
   * compared with a plain substring test.
   *
   * The replacements are applied in order: ASCII spaces, line breaks, a leading
   * BOM, zero-width characters, control characters and geometric-shape glyphs
   * (U+25A0–U+25FF), parenthesised numbers such as "(12)", and finally the
   * ellipsis character, which becomes three dots.
   *
   * @param {string} text Raw text.
   * @returns {string} Normalised text.
   */
  function convertText(text) {
    const replacements = [
      [" ", ""],
      [/(\r\n|\n|\r)/g, ''],
      [/\r/g, ''],
      [/^\uFEFF/, ''],
      [/[\u200B-\u200D\uFEFF]/g, ''],
      [/[\u0000-\u001F\u25A0-\u25FF]/g, ''],
      [/\(\d+\)/g, ''],
      ["…", "..."],
    ];
    return replacements.reduce(
      (current, [pattern, replacement]) => (
        // String patterns replace every occurrence; regex patterns keep their own flags.
        typeof pattern === 'string'
          ? current.replaceAll(pattern, replacement)
          : current.replace(pattern, replacement)
      ),
      text
    );
  }
  
  /**
   * Turns the flat, document-ordered TOC list into a tree based on `level`.
   *
   * Entries without a `bookmark` are skipped. Each remaining entry is attached
   * to the most recent node with a smaller level; if no such node exists (the
   * parent heading was skipped, the list starts below level 0, or a level is
   * jumped over) it is attached to the nearest available ancestor or to the
   * root instead of throwing.
   *
   * @param {Array<{level: number|string, bookmark?: object}>} data Flat TOC entries.
   * @returns {Array<object>} Top-level nodes; each node is a shallow copy of
   *   its entry plus a `children` array.
   */
  function buildTree(data) {
    const root = { children: [] };
    const lastNodes = []; // most recent node seen at each level

    for (const item of data) {
      if (!item.bookmark) {
        continue;
      }
      // Accept numeric strings as well; anything that is not a positive
      // integer is treated as the top level.
      const parsedLevel = Number(item.level);
      const currentLevel = Number.isInteger(parsedLevel) && parsedLevel > 0 ? parsedLevel : 0;
      const newNode = {
        ...item,
        children: []
      };

      // Walk upwards from the level directly above until an existing ancestor
      // is found; fall back to the root when there is none.
      let parent = root;
      for (let depth = Math.min(currentLevel, lastNodes.length) - 1; depth >= 0; depth--) {
        if (lastNodes[depth]) {
          parent = lastNodes[depth];
          break;
        }
      }
      parent.children.push(newNode);

      // Remember this node and forget every deeper level, which now belongs
      // to a finished branch.
      lastNodes[currentLevel] = newNode;
      lastNodes.length = currentLevel + 1;
    }

    return root.children;
  }

  /**
   * Finds the page on which a (normalised) heading appears, scanning from
   * `lastPageIndex` onwards.
   *
   * A page matches when its text contains `text`. Otherwise the text of the
   * previous page and the current page is joined, "n/m" page-number footers
   * are removed, and the joined text is tested, so a heading split by a page
   * break is still found; in that case the later of the two pages is reported.
   *
   * @param {Array<{index: number, content: string}>} pageDatas Per-page text.
   * @param {string} text Normalised heading text to look for.
   * @param {number} lastPageIndex Position in `pageDatas` to start from.
   * @returns {number|string} The matching entry's `index`, or the string
   *   "notfound" when no page matches.
   */
  function getPageIndex(pageDatas, text, lastPageIndex) {
    const start = lastPageIndex > 0 ? lastPageIndex : 0;
    for (let j = start; j < pageDatas.length; j++) {
      const current = pageDatas[j];
      if (current.content.includes(text)) {
        return current.index;
      }
      if (j === 0) {
        continue;
      }
      // Heading may straddle the page break: join the two pages' text.
      const previous = pageDatas[j - 1];
      const joinedCandidates = [
        previous.content + current.content,
        // Page text may have been stored JSON-quoted; drop the quote pair that
        // would otherwise sit in the middle of the joined text.
        previous.content.replace(/"$/, '') + current.content.replace(/^"/, ''),
      ];
      const spansPageBreak = joinedCandidates.some((joined) => (
        convertText(joined).replace(/\s*\d+\/\d+\s*/g, "").includes(text)
      ));
      if (spansPageBreak) {
        return current.index;
      }
    }
    return "notfound";
  }

  /**
   * Links one level of the bookmark tree (Prev / Next / Parent) and records
   * First / Last / Count on the dictionary that owns that level, recursing
   * into each node's children.
   *
   * @param {Array<{bookmark: object, ref: object, children?: Array}>} nodes
   *   Sibling nodes of one level, in display order.
   * @param {?{bookmark: object, ref: object}} parent Owning node, or
   *   null/undefined for the top level, in which case a new `Outlines`
   *   dictionary is created.
   * @param {object} mergedPdf pdf-lib document whose `context` creates the root dictionary.
   * @returns {object} The owning dictionary: the new outline root for the top
   *   level, otherwise `parent.bookmark`. For an empty `nodes` list the
   *   dictionary is returned without First/Last so the caller can still
   *   register it.
   */
  function createOutline(nodes, parent, mergedPdf) {
    const outline = parent
      ? parent.bookmark
      : mergedPdf.context.obj({ Type: 'Outlines', Count: 0 });

    if (!nodes || nodes.length === 0) {
      return outline;
    }

    for (let i = 0; i < nodes.length; i++) {
      const node = nodes[i];
      if (parent && parent.ref) {
        // Nested items point back at their owner. The top-level items' Parent
        // is the outline root, whose reference only exists once the caller
        // has registered the returned dictionary.
        node.bookmark.set(PDFName.of('Parent'), parent.ref);
      }
      if (i > 0) {
        node.bookmark.set(PDFName.of('Prev'), nodes[i - 1].ref);
      }
      if (i < nodes.length - 1) {
        node.bookmark.set(PDFName.of('Next'), nodes[i + 1].ref);
      }
      if (node.children && node.children.length > 0) {
        createOutline(node.children, node, mergedPdf);
      }
    }

    outline.set(PDFName.of('First'), nodes[0].ref);
    outline.set(PDFName.of('Last'), nodes[nodes.length - 1].ref);
    outline.set(PDFName.of('Count'), PDFNumber.of(nodes.length));
    return outline;
  }

  /**
   * Concatenates the given PDF files, adds an outline built from `toc`, and
   * writes the result to `outputPath` (the writing is done by generateOutline).
   *
   * The input files are deleted only when every TOC entry was located; on
   * failure they are kept so the merge can be inspected or retried.
   *
   * @param {string[]} inputPaths PDF files to merge, in order.
   * @param {string} outputPath Destination file.
   * @param {Array<{level: number, text: string}>} toc Flat TOC entries.
   * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} Outcome of
   *   the outline generation, so callers can react to missing headings.
   */
  async function mergePdfFiles(inputPaths, outputPath, toc) {
    const mergedPdf = await PDFDocument.create();
    for (const inputPath of inputPaths) {
      const inputPdf = await PDFDocument.load(fs.readFileSync(inputPath));
      const pageIndices = Array.from({ length: inputPdf.getPageCount() }, (_, pageIndex) => pageIndex);
      const copiedPages = await mergedPdf.copyPages(inputPdf, pageIndices);
      copiedPages.forEach((page) => mergedPdf.addPage(page));
    }

    const { hasError, missedKeys } = await generateOutline(mergedPdf, outputPath, toc);
    if (!hasError) {
      for (const inputPath of inputPaths) {
        // Never remove the file that was just written.
        if (inputPath !== outputPath) {
          fs.unlinkSync(inputPath);
        }
      }
    }
    return { hasError, missedKeys };
  }

  /**
   * Loads an existing PDF and rewrites it in place with an outline built from `toc`.
   *
   * @param {string} filePath PDF file to read and overwrite.
   * @param {Array<{level: number, text: string}>} toc Flat TOC entries.
   * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} Outcome of
   *   the outline generation, so callers can react to missing headings.
   */
  async function loadAndGenerateOutline(filePath, toc) {
    const inputBytes = fs.readFileSync(filePath);
    const inputPdf = await PDFDocument.load(inputBytes);
    const { hasError, missedKeys } = await generateOutline(inputPdf, filePath, toc);
    return { hasError, missedKeys };
  }

  /**
   * Adds a document outline (bookmarks) to `mergedPdf` and writes it to `outputPath`.
   *
   * Steps: serialise the document and extract each page's text with pdf.js,
   * locate every TOC heading in that text, create one bookmark dictionary per
   * located heading, link the bookmarks into a tree and attach the tree to the
   * document catalog. Each TOC entry that is located receives `bookmark` and
   * `ref` properties; entries that are not located are reported and left
   * without them.
   *
   * @param {object} mergedPdf pdf-lib PDFDocument to annotate.
   * @param {string} outputPath File the resulting PDF is written to.
   * @param {Array<{level: number, text: string}>} toc Flat TOC entries in document order.
   * @returns {Promise<{hasError: boolean, missedKeys: string[]}>} `hasError` is
   *   true when at least one heading was not found; `missedKeys` lists the
   *   normalised text of those headings.
   */
  async function generateOutline(mergedPdf, outputPath, toc) {
    // The page text is read with pdf.js from a serialised copy of the document.
    const pdfBytes = await mergedPdf.save({ useObjectStreams: false });

    // Build the page lookup table (zero-based index + normalised text).
    const pageDatas = [];
    const loadingTask = getDocument(pdfBytes);
    try {
      const doc = await loadingTask.promise;
      for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
        const page = await doc.getPage(pageNumber);
        const content = await page.getTextContent();
        const pageText = content.items.map((item) => item.str).join('');
        pageDatas.push({
          index: pageNumber - 1,
          // Plain text on purpose: JSON.stringify would add quotes and escape
          // `"` and `\`, which breaks substring matching of headings.
          content: convertText(pageText),
        });
      }
    } finally {
      // Release the pdf.js document and its worker resources.
      await loadingTask.destroy();
    }

    // Walk the TOC and create one bookmark dictionary per located heading.
    let lastPageIndex = 0;
    let hasError = false;
    const missedKeys = [];
    for (let i = 0; i < toc.length; i++) {
      // Drop bookmarks left over from an earlier run with the same TOC; they
      // would belong to a different document.
      delete toc[i].bookmark;
      delete toc[i].ref;

      const text = convertText(toc[i].text);
      const pageIndex = getPageIndex(pageDatas, text, lastPageIndex);
      if (pageIndex === "notfound") {
        console.log(`❌️ ${i}-[${text}] of [${outputPath}] not found.`);
        hasError = true;
        missedKeys.push(text);
        continue;
      }
      lastPageIndex = pageIndex;

      // Destination array: [page reference, /Fit].
      const pageRef = mergedPdf.getPage(pageIndex).ref;
      const destArray = PDFArray.withContext(mergedPdf.context);
      destArray.push(pageRef);
      destArray.push(PDFName.of('Fit'));

      const bookmark = mergedPdf.context.obj({});
      bookmark.set(PDFName.of('Title'), PDFHexString.fromText(toc[i].text));
      bookmark.set(PDFName.of('Dest'), destArray);
      const ref = mergedPdf.context.register(bookmark);

      toc[i].bookmark = bookmark;
      toc[i].ref = ref;
    }

    // Build the tree and attach it; with no located heading there is nothing
    // to attach and the document is written without an outline.
    const tocTree = buildTree(toc);
    if (tocTree.length > 0) {
      const outlineRoot = createOutline(tocTree, null, mergedPdf);
      const outlineRef = mergedPdf.context.register(outlineRoot);
      // Top-level items point back at the outline root, whose reference only
      // exists after registration.
      for (const node of tocTree) {
        node.bookmark.set(PDFName.of('Parent'), outlineRef);
      }
      mergedPdf.catalog.set(PDFName.of('Outlines'), outlineRef);
    }

    const mergedPdfBytes = await mergedPdf.save({ useObjectStreams: false });

    fs.writeFileSync(outputPath, mergedPdfBytes);
    return { hasError, missedKeys };
  }

  module.exports = {
    mergePdfFiles,
    loadAndGenerateOutline
  };
})();
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值