我的Word文档后缀名为.doc,但实际使用POI解析时却报错,提示该文件“实际上是xml文档”(即文件内容其实是XML,而不是二进制的.doc格式)。起初我手工将文档另存为真正的.doc格式,之后POI可以顺利解析;但是客户的.doc文档有数千份,逐个手动另存并不现实,因此最终改用dom4j直接解析这些后缀名为.doc的xml文档。
1.poi解析doc
1.1 依赖
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.17</version>
</dependency>
<!-- HWPF (including WordExtractor, used by the parsing method) is packaged in poi-scratchpad, not in the core poi artifact -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
1.2 解析方法
/**
 * Extracts the paragraph texts of a binary Word (.doc) document into an array.
 *
 * <p>The returned array holds exactly one entry per element of
 * {@code WordExtractor#getParagraphText()}, in the order the extractor returns them,
 * so it never overflows on long documents and carries no trailing {@code null} padding.
 *
 * @param fis stream supplying the .doc content
 * @return the paragraph texts; a paragraph ending in {@code \u0007} is passed through
 *         {@code StringUtils.replaceLastChar} first
 * @throws IOException if the document cannot be read or the extractor cannot be closed
 */
public static String[] getTable(InputStream fis) throws IOException {
    // HWPF's WordExtractor pulls plain text / paragraphs out of the Word document.
    WordExtractor wordExtractor = new WordExtractor(fis);
    try {
        String[] paragraphs = wordExtractor.getParagraphText();
        // Size the result from the actual paragraph count instead of a fixed 3000 slots.
        String[] s = new String[paragraphs.length];
        for (int index = 0; index < paragraphs.length; index++) {
            String words = paragraphs[index];
            // NOTE(review): \u0007 is presumably the end-of-cell mark HWPF emits for table
            // text, and replaceLastChar presumably strips it; confirm against StringUtils.
            if (words.endsWith("\u0007")) {
                words = StringUtils.replaceLastChar(words);
            }
            s[index] = words;
        }
        return s;
    } finally {
        // Release the extractor once the text has been copied out.
        wordExtractor.close();
    }
}
2.dom4j解析xml
2.1 依赖
<!-- dom4j: supplies the SAXReader, Document, Element and Node types used by the parsing method -->
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<!-- jaxen: XPath engine; presumably needed at runtime for the selectNodes("//w:t") call (confirm against the dom4j documentation) -->
<dependency>
<groupId>jaxen</groupId>
<artifactId>jaxen</artifactId>
<version>1.1-beta-7</version>
</dependency>
2.2 解析方法
/**
 * Collects the text of every {@code w:t} element of an XML-based Word document.
 *
 * <p>Each array entry is the text of one {@code w:t} node matched by the XPath
 * {@code //w:t}, in the order {@code selectNodes} returns them. Note that these are
 * text runs, so one visual paragraph may be spread over several entries.
 *
 * @param fis stream supplying the XML content (the file may still carry a .doc suffix)
 * @return the text of each matched node; an empty array when nothing matches
 * @throws IOException if the stream cannot be parsed as XML; the underlying
 *         {@code DocumentException} is attached as the cause
 */
public static String[] getTable(InputStream fis) throws IOException {
    // NOTE(review): the documents come from customers; consider disabling DTDs and
    // external entities on this reader to guard against XXE before parsing.
    SAXReader reader = new SAXReader();
    Document document;
    try {
        document = reader.read(fis);
    } catch (DocumentException e) {
        // Surface the failure instead of printing it and handing the caller a null array.
        throw new IOException("Failed to parse the input stream as an XML Word document", e);
    }
    Element root = document.getRootElement();
    System.out.println("根节点---》 " + root.getName());
    List<Node> nodes = root.selectNodes("//w:t");
    String[] s = new String[nodes.size()];
    int index = 0;
    for (Node node : nodes) {
        s[index++] = node.getText();
    }
    return s;
}