<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-core</artifactId>
<version>8.3.9</version>
</dependency>
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-export-fo</artifactId>
<version>8.3.9</version>
</dependency>
<dependency>
<groupId>org.docx4j</groupId>
<artifactId>docx4j-JAXB-ReferenceImpl</artifactId>
<version>8.3.9</version>
</dependency>
注意：docx4j 的版本需要与所使用的 JDK 版本相匹配：
- docx4j-core 8.3.9 适用于 JDK 1.8
- docx4j-core 11.4.9 适用于 JDK 11
文本切分
try {
    // Load the .docx file into a docx4j WordprocessingMLPackage.
    File wordFile = new File("src/main/resources/data/doc/XXXX.docx");
    WordprocessingMLPackage wordMLPackage = WordprocessingMLPackage.load(wordFile);

    // Matches runs of Unicode separators (\p{Z}), "other" characters such as controls (\p{C})
    // and the no-break space. Compiled once here rather than once per paragraph, which is what
    // String.replaceAll would do inside the loop.
    java.util.regex.Pattern blankChars = java.util.regex.Pattern.compile("[\\p{Z}\\p{C}\\u00A0]+");

    // Collect the text of every non-empty paragraph found directly in the document body.
    List<String> paragraphs = new ArrayList<>();
    List<Object> bodyContent = wordMLPackage.getMainDocumentPart().getContent();
    for (Object content : bodyContent) {
        if (!(content instanceof P)) {
            continue; // only paragraphs at the top level of the body are read
        }
        P paragraph = (P) content;
        StringBuilder paragraphText = new StringBuilder();
        for (Object run : paragraph.getContent()) {
            if (!(run instanceof org.docx4j.wml.R)) {
                continue; // only runs that are direct children of the paragraph are read
            }
            for (Object obj : ((org.docx4j.wml.R) run).getContent()) {
                // A run's children can be wrapped in a JAXBElement or be bare objects;
                // unwrap first so both forms go through the same check.
                Object value = obj instanceof javax.xml.bind.JAXBElement
                        ? ((javax.xml.bind.JAXBElement<?>) obj).getValue()
                        : obj;
                if (value instanceof org.docx4j.wml.Text) {
                    String text = ((org.docx4j.wml.Text) value).getValue();
                    // Skip null so StringBuilder does not append the literal "null".
                    if (text != null) {
                        paragraphText.append(text);
                    }
                }
            }
        }
        // NOTE(review): this strips ALL matched characters, including spaces between words,
        // not just leading/trailing ones — confirm that is intended for non-CJK documents.
        String trimmedParagraph = blankChars.matcher(paragraphText).replaceAll("");
        if (!trimmedParagraph.isEmpty()) {
            paragraphs.add(trimmedParagraph);
        }
    }

    // One paragraph per line.
    String contentStr = String.join("\n", paragraphs);
    System.out.println("content: " + contentStr);

    // Follow-up steps, left disabled in this snippet:
    // Text segmentation
    // List<String> sentences = segmentText(contentStr);
    // Vectorization
    // List<float[]> vectors = sentences.stream().map(TextToMilvus::getVector).collect(Collectors.toList());
} catch (Exception e) {
    // Any failure while loading or reading the document is reported on stderr;
    // execution then continues after this block.
    e.printStackTrace();
}