解析word文档内表格（poi解析doc、dom4j解析xml）_xml 类型的 doc。用 poi 怎么都无法解析-CSDN博客

本文链接：https://blog.csdn.net/liu066154/article/details/103982864

	我的 Word 文档后缀名为 .doc，但用 poi 解析时却报错，提示该文件“实际上是 xml 文档”。起初我手工将文档另存为 .doc 格式后，poi 可以顺利解析；但客户的 .doc 文档有数千份，逐个手动另存并不现实，因此后来改用 dom4j 直接解析这些后缀名为 .doc 的 xml 文档。

1.poi解析doc

1.1 依赖

<dependency>
		<groupId>org.apache.poi</groupId>
		<artifactId>poi</artifactId>
		<version>3.17</version>
</dependency>
<dependency>
		<groupId>org.apache.poi</groupId>
		<artifactId>poi-ooxml-schemas</artifactId>
		<version>3.17</version>
</dependency>
<dependency>
		<groupId>org.apache.poi</groupId>
		<artifactId>poi-ooxml</artifactId>
		<version>3.17</version>
</dependency>

1.2 解析方法

    /**
     * Extracts the paragraph texts of a Word document read through POI's
     * {@code WordExtractor} and returns them as an array.
     *
     * <p>Any paragraph whose text ends with the character U+0007 is passed through
     * {@code StringUtils.replaceLastChar} before it is stored.
     *
     * <p>The returned array has exactly one entry per paragraph reported by
     * {@code WordExtractor.getParagraphText()}; it is not padded with {@code null}s
     * and is not limited to a fixed number of paragraphs.
     *
     * @param fis input stream supplying the Word document
     * @return the paragraph texts in document order
     * @throws IOException if reading the document fails
     */
    public static String[] getTable(InputStream fis) throws IOException {
        // Use the HWPF WordExtractor to pull the paragraph text out of the document.
        // try-with-resources closes the extractor once the text has been copied out.
        try (WordExtractor wordExtractor = new WordExtractor(fis)) {
            String[] paragraphs = wordExtractor.getParagraphText();
            // Size the result from the document itself: a fixed-size buffer overflows on
            // long documents and leaves trailing nulls on short ones.
            String[] s = new String[paragraphs.length];
            for (int index = 0; index < paragraphs.length; index++) {
                String words = paragraphs[index];
                if (words.endsWith("\u0007")) {
                    // NOTE(review): replaceLastChar is a project helper not shown here;
                    // presumably it strips the trailing U+0007 marker - confirm.
                    words = StringUtils.replaceLastChar(words);
                }
                s[index] = words;
            }
            return s;
        }
    }

2.dom4j解析xml

2.1 依赖

<!-- dom4j -->
<dependency>
	<groupId>dom4j</groupId>
	<artifactId>dom4j</artifactId>
	<version>1.6.1</version>
</dependency>
<!-- jaxen -->
<dependency>
	<groupId>jaxen</groupId>
	<artifactId>jaxen</artifactId>
	<version>1.1-beta-7</version>
</dependency>

2.2 解析方法

 	/**
     * Parses an XML document with dom4j and returns the text of every node matched by
     * the XPath expression {@code //w:t}, in the order the nodes are returned.
     *
     * <p>Intended for Word documents that carry a {@code .doc} suffix but are actually
     * XML. DOCTYPE declarations are rejected so that externally supplied documents
     * cannot trigger external-entity resolution.
     *
     * @param fis input stream supplying the XML document
     * @return the text of each matched {@code w:t} node; empty if none match
     * @throws IOException if the document cannot be parsed or the parser cannot be
     *         configured; the underlying exception is attached as the cause
     */
    public static String[] getTable(InputStream fis) throws IOException {
        // Create the SAX-based reader.
        SAXReader reader = new SAXReader();
        try {
            // The documents come from outside the application: refuse DOCTYPE
            // declarations so external entities can never be resolved (XXE).
            reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);

            Document document = reader.read(fis);
            Element root = document.getRootElement();
            System.out.println("根节点---》 " + root.getName());

            List<Node> nodes = root.selectNodes("//w:t");
            String[] s = new String[nodes.size()];
            int index = 0;
            for (Node node : nodes) {
                s[index++] = node.getText();
            }
            return s;
        } catch (DocumentException | org.xml.sax.SAXException e) {
            // Report the failure to the caller instead of printing it and returning null.
            throw new IOException("Failed to parse Word XML document", e);
        }
    }