POI解析word文档

import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;

import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;

public class ParseWordUtil {
    private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
    // word整体样式
    private static CTStyles wordStyles = null;

    /**
     * Reads the document-wide style definitions of the given .docx file and
     * stores them in the static {@code wordStyles} field.
     *
     * <p>Failures are logged and not propagated; in that case
     * {@code wordStyles} keeps whatever value it had before the call.
     *
     * @param filepath path of the template document to read
     */
    public static void getWordStyle(String filepath) {
        InputStream in = null;
        try {
            // Open the template document.
            in = new FileInputStream(filepath);
            XWPFDocument template = new XWPFDocument(in);
            // Remember the template's overall style sheet.
            wordStyles = template.getStyle();
        } catch (FileNotFoundException e) {
            LOGGER.error("未找到文件",e);
        } catch (IOException e) {
            LOGGER.error("",e);
        } catch (XmlException e) {
            LOGGER.error("XML转换异常",e);
        } finally {
            // Always release the file handle; the original code never closed it.
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    LOGGER.error("",e);
                }
            }
        }
    }

    /**
     * Returns the titles (headings) of a Word document.
     *
     * <p>The handler is picked from the file extension: {@code .docx} files go
     * to the 2007 handler, everything else to the 2003 handler.
     *
     * @param filepath path of the Word document
     * @return the titles found in the document
     * @throws IOException if the document cannot be read
     */
    public static List<String> getWordTitles(String filepath) throws IOException {
        String extension = getWordVersion(filepath);
        // Compare case-insensitively so that e.g. "REPORT.DOCX" is not sent to the 2003 handler.
        if (".docx".equalsIgnoreCase(extension)) {
            return getWordTitles2007(filepath);
        }
        // Mode 1 = titles only (2 = content only, 3 = titles and content).
        return getWordTitlesAndContext2003(filepath, 1);
    }

    /**
     * Returns the text of a Word document.
     *
     * <p>The handler is picked from the file extension: {@code .docx} files go
     * to the 2007 paragraph reader, everything else to the 2003 handler in
     * mode 3 (titles and content).
     *
     * @param filepath path of the Word document
     * @return the text entries extracted from the document
     * @throws Exception if the underlying handler fails to read the document
     */
    public static List<String> getWordText(String filepath) throws Exception {
        String extension = getWordVersion(filepath);
        // Compare case-insensitively so that e.g. "REPORT.DOCX" is not sent to the 2003 handler.
        if (".docx".equalsIgnoreCase(extension)) {
            return getParagraphText2007(filepath);
        }
        // Mode 3 = titles and content (1 = titles only, 2 = content only).
        return getWordTitlesAndContext2003(filepath, 3);
    }

    /**
     * Returns the extension of the given file, including the leading dot
     * (for example {@code ".doc"} or {@code ".docx"}).
     *
     * <p>Only the last path component is inspected, so dots in directory
     * names are ignored. Word 97 is considered obsolete; callers only
     * distinguish the 2003 and 2007 formats.
     *
     * @param filepath path of the file
     * @return the extension starting at the last {@code '.'} of the file name,
     *         or an empty string when the file name contains no dot
     */
    public static String getWordVersion(String filepath) {
        String filename = new File(filepath).getName();
        int dot = filename.lastIndexOf('.');
        if (dot < 0) {
            // No extension: substring(-1, ...) would throw StringIndexOutOfBoundsException.
            return "";
        }
        return filename.substring(dot);
    }

    /**
     * 获取03版word文档标题和内容
     * @param path 文件路径
     * @param type 1:只获取标题;2:只获取内容;3:标题和内容都获取
     * @return list
     * @throws IOException
     */
    public static List<String> getWordTitlesAndContext2003(String path, Integer type) throws IOException {
        InputStream is = new FileInputStream(path);
        HWPFDocument doc = new HWPFDocument(is);
        Range r = doc.getRange();
        List<String> list = new ArrayList<String>();
        List<String> titles = new ArrayList<String>();
        List<String> context = new ArrayList<String>();
        for (int i = 0; i <
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值