docx4j 的使用

潇锐killer

已于 2024-06-05 16:49:17 修改

阅读量1k

点赞数 2

CC 4.0 BY-SA版权

文章标签：服务器 linux 运维

于 2024-06-05 16:48:26 首次发布

本文链接：https://blog.csdn.net/wangqiaowq/article/details/139473101

docx4j解析word模板-CSDN博客

<!-- Maven dependencies for docx4j: three artifacts from group org.docx4j, all pinned to the same version (8.3.9). -->
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-core</artifactId>
<version>8.3.9</version>
</dependency>
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-export-fo</artifactId>
<version>8.3.9</version>
</dependency>
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
<version>8.3.9</version>
</dependency>

注意：

docx4j-core 8.3.9 对应 JDK 1.8（Java 8）

docx4j-core 11.4.9 对应 JDK 11

文本切分

try {
            // Load the .docx file into a docx4j package.
            File docxFile = new File("src/main/resources/data/doc/XXXX.docx");
            WordprocessingMLPackage docPackage = WordprocessingMLPackage.load(docxFile);

            // Collect the text of every paragraph that is a direct child of the document body.
            List<String> paragraphTexts = new ArrayList<>();
            for (Object bodyItem : docPackage.getMainDocumentPart().getContent()) {
                if (!(bodyItem instanceof P)) {
                    continue; // anything that is not a paragraph is ignored
                }

                StringBuilder buffer = new StringBuilder();
                for (Object paragraphChild : ((P) bodyItem).getContent()) {
                    if (!(paragraphChild instanceof org.docx4j.wml.R)) {
                        continue; // only runs that sit directly in the paragraph are read
                    }

                    org.docx4j.wml.R currentRun = (org.docx4j.wml.R) paragraphChild;
                    for (Object runChild : currentRun.getContent()) {
                        // A text node may appear either wrapped in a JAXBElement or as a bare Text object.
                        if (runChild instanceof javax.xml.bind.JAXBElement) {
                            javax.xml.bind.JAXBElement<?> wrapper = (javax.xml.bind.JAXBElement<?>) runChild;
                            if (wrapper.getDeclaredType() == org.docx4j.wml.Text.class) {
                                org.docx4j.wml.Text wrappedText = (org.docx4j.wml.Text) wrapper.getValue();
                                buffer.append(wrappedText.getValue());
                            }
                        } else if (runChild instanceof org.docx4j.wml.Text) {
                            org.docx4j.wml.Text bareText = (org.docx4j.wml.Text) runChild;
                            buffer.append(bareText.getValue());
                        }
                    }
                }

                // Remove every run of separator (\p{Z}) and "other"/control (\p{C}) characters,
                // plus U+00A0, anywhere in the paragraph; paragraphs left empty are dropped.
                String cleaned = buffer.toString().replaceAll("[\\p{Z}\\p{C}\\u00A0]+", "");
                if (cleaned.isEmpty()) {
                    continue;
                }
                paragraphTexts.add(cleaned);
            }

            // One paragraph per line.
            String contentStr = String.join("\n", paragraphTexts);
            System.out.println("content: " + contentStr);

            // Text segmentation (placeholder, left disabled)
            // List<String> sentences = segmentText(content);

            // Vectorization (placeholder, left disabled)
            // List<float[]> vectors = sentences.stream().map(TextToMilvus::getVector).collect(Collectors.toList());
        } catch (Exception ex) {
            ex.printStackTrace();
        }