本文介绍了一个简单的Java程序,该程序能够从指定路径的 .doc/.docx 文件中读取文本内容。通过使用Apache POI库中的WordExtractor类(用于 .doc)和XWPFWordExtractor类(用于 .docx),实现了对Microsoft Word文档的解析。
1、Maven Jar包
<!-- .docx -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
<!-- .doc -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.3</version>
</dependency>
2、Java代码
package org.example.utils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UncheckedIOException;
import java.util.Locale;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
/**
 * Extracts the plain text of Microsoft Word documents using Apache POI.
 *
 * <p>{@code .docx} files are parsed with {@link XWPFDocument}; legacy {@code .doc} files are
 * parsed with {@link HWPFDocument}.
 */
public class WordUtils {

    /**
     * Reads the text of a Word document, picking the parser from the file extension.
     *
     * <p>Failures are reported to the caller instead of being printed and turned into a
     * {@code null} return value. Only {@link IOException} is translated; any unchecked
     * exception raised while parsing propagates unchanged.
     *
     * @param path path of a {@code .doc} or {@code .docx} file; the extension is matched
     *     case-insensitively
     * @return the text extracted from the document
     * @throws IllegalArgumentException if {@code path} is {@code null} or ends with neither
     *     {@code .doc} nor {@code .docx}
     * @throws UncheckedIOException if an {@link IOException} occurs while opening or reading
     *     the file
     */
    public String read(String path) {
        if (path == null) {
            throw new IllegalArgumentException("path must not be null");
        }
        // Locale.ROOT keeps the extension check independent of the JVM's default locale.
        String lowerCasePath = path.toLowerCase(Locale.ROOT);
        try {
            // ".docx" must be tested before ".doc" only for readability; neither suffix
            // is a suffix of the other, so the order does not affect the result.
            if (lowerCasePath.endsWith(".docx")) {
                return readDocx(path);
            }
            if (lowerCasePath.endsWith(".doc")) {
                return readDoc(path);
            }
        } catch (IOException e) {
            // Keep the method free of checked exceptions while preserving the cause.
            throw new UncheckedIOException("Failed to read Word document: " + path, e);
        }
        throw new IllegalArgumentException("不支持的文件格式");
    }

    /**
     * Reads the text of a {@code .docx} (OOXML) document.
     *
     * @param path path of the file to open
     * @return the text produced by {@link XWPFWordExtractor#getText()}
     * @throws IOException if the file cannot be opened or read
     */
    public String readDocx(String path) throws IOException {
        // The stream and the document are both closed when the try block exits.
        try (InputStream in = new FileInputStream(path);
             XWPFDocument doc = new XWPFDocument(in)) {
            return new XWPFWordExtractor(doc).getText();
        }
    }

    /**
     * Reads the text of a legacy {@code .doc} (binary) document.
     *
     * @param path path of the file to open
     * @return the text produced by {@link WordExtractor#getText()}
     * @throws IOException if the file cannot be opened or read
     */
    public String readDoc(String path) throws IOException {
        // The stream and the document are both closed when the try block exits.
        try (InputStream in = new FileInputStream(path);
             HWPFDocument doc = new HWPFDocument(in)) {
            return new WordExtractor(doc).getText();
        }
    }

    /**
     * Prints the text of a Word document to standard output.
     *
     * @param args optional; {@code args[0]} is the path of the document to read. When no
     *     argument is given, a sample path is used.
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "/Users/work/Documents/数据分析报告.doc";
        WordUtils wordUtils = new WordUtils();
        // read() only throws unchecked exceptions, so no wrapping is needed here.
        String text = wordUtils.read(path);
        System.out.println(text);
    }
}