用Apache POI提取Word文本

本文介绍如何使用Java POI库版本3.17从Word文档中提取文本和图片,包括处理.doc和.docx格式,去除表格、页眉和页脚,并保存第一张图片到指定路径。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

编程语言Java

POI版本为3.17,jar包可自行到Apache POI官网下载。

 

达到的最终效果是:去除Word中的表格、超文本、页眉和页脚,取出第一张图片存入硬盘并返回其路径;同时也处理了将doc文件直接改扩展名为docx、以及将docx文件直接改扩展名为doc这两种情况(通过捕获异常后按文件的实际格式重新解析)。

我们的项目需求是只要一张图片,你当然可以取任意张。但注意docx文件取出的图片不是顺序的,要想按顺序读出图片请参考:https://www.cnblogs.com/ct-csu/p/8178932.html

参考文献:

http://poi.apache.org/components/document/index.html

http://poi.apache.org/apidocs/dev/org/apache/poi/hwpf/

http://poi.apache.org/apidocs/dev/org/apache/poi/xwpf/

/**
 * Extracts the body text and the first embedded picture from a Word document.
 *
 * <p>The parser is chosen from the file extension, matched case-insensitively: {@code .doc} is
 * read with HWPF and {@code .docx} with XWPF. If the content turns out to be in the other format
 * (a file whose extension was simply renamed), the POI format exception is caught and the file
 * is re-read with the other parser.
 *
 * <p>On success the returned JSON object holds two entries:
 * <ul>
 *   <li>{@code txt} - the paragraph text with U+0007 cell marks, carriage returns and spaces
 *       removed, and with line breaks / paragraph boundaries written as {@code _|_};</li>
 *   <li>{@code picUrl} - the path the first picture was saved to, or {@code ""} when the
 *       document has no pictures.</li>
 * </ul>
 * The JSON object is empty when {@code path} is {@code null}, does not denote an existing
 * regular file, or does not have a Word extension. An I/O failure is reported on
 * {@code System.err} and whatever entries were collected before the failure are returned.
 *
 * @param path path of the Word file to read
 * @return the string form of the JSON object described above
 * @author lidw
 * @date 2018-12-25
 */
public String doWordExtract(String path) {
  JSONObject jsonObject = new JSONObject();
  if (path == null) {
    return jsonObject.toString();
  }
  File file = new File(path);
  if (!file.isFile()) {
    return jsonObject.toString();
  }
  try {
    if (endsWithIgnoreCase(path, ".doc")) {
      try {
        extractDocContent(file, jsonObject);
      } catch (OfficeXmlFileException e) {
        // A docx file whose extension was changed to doc: parse it as docx instead.
        extractDocxContent(file, jsonObject);
      }
    } else if (endsWithIgnoreCase(path, ".docx")) {
      try {
        extractDocxContent(file, jsonObject);
      } catch (OLE2NotOfficeXmlFileException e) {
        // A doc file whose extension was changed to docx: parse it as doc instead.
        extractDocContent(file, jsonObject);
      }
    } else {
      System.out.println("此文件不是word文件!");
    }
  } catch (IOException e) {
    // Best effort: keep returning what was collected, but do not hide the failure.
    System.err.println("Failed to extract Word content from " + path + ": " + e);
  }
  return jsonObject.toString();
}

/** Reads a binary (.doc) file and stores its cleaned text and first picture path in the JSON object. */
private void extractDocContent(File file, JSONObject jsonObject) throws IOException {
  try (InputStream in = new FileInputStream(file);
      HWPFDocument doc = new HWPFDocument(in)) {
    WordExtractor wordExtractor = new WordExtractor(doc);
    StringBuilder text = new StringBuilder();
    // Working paragraph by paragraph is what leaves headers and footers out.
    for (String paragraph : wordExtractor.getParagraphText()) {
      // Paragraphs carrying the U+0007 cell mark belong to tables and are skipped.
      if (paragraph.indexOf('\u0007') < 0) {
        text.append(WordExtractor.stripFields(paragraph));
      }
    }
    // "txt" is stored before the picture is written so it survives a picture I/O failure.
    jsonObject.put("txt", cleanExtractedText(text.toString()));

    List<Picture> pictures = doc.getPicturesTable().getAllPictures();
    if (pictures == null || pictures.isEmpty()) {
      jsonObject.put("picUrl", "");
      return;
    }
    Picture first = pictures.get(0);
    String picUrl = nextPicturePath(first.suggestFileExtension());
    try (OutputStream out = new FileOutputStream(new File(picUrl))) {
      first.writeImageContent(out);
    }
    jsonObject.put("picUrl", picUrl);
  }
}

/** Reads an OOXML (.docx) file and stores its cleaned text and first picture path in the JSON object. */
private void extractDocxContent(File file, JSONObject jsonObject) throws IOException {
  try (InputStream in = new FileInputStream(file);
      XWPFDocument docx = new XWPFDocument(in)) {
    StringBuilder text = new StringBuilder();
    // Working paragraph by paragraph is what leaves headers and footers out.
    for (XWPFParagraph paragraph : docx.getParagraphs()) {
      text.append(paragraph.getText()).append("_|_");
    }
    // "txt" is stored before the picture is written so it survives a picture I/O failure.
    jsonObject.put("txt", cleanExtractedText(text.toString()));

    List<XWPFPictureData> pictures = docx.getAllPictures();
    if (pictures == null || pictures.isEmpty()) {
      jsonObject.put("picUrl", "");
      return;
    }
    XWPFPictureData first = pictures.get(0);
    byte[] imageBytes = first.getData();
    String picUrl = nextPicturePath(first.suggestFileExtension());
    try (OutputStream out = new FileOutputStream(new File(picUrl))) {
      out.write(imageBytes);
    }
    jsonObject.put("picUrl", picUrl);
  }
}

/**
 * Removes U+0007 cell marks, carriage returns and spaces, turns each line feed into the
 * {@code _|_} separator, and collapses runs of consecutive separators into one.
 */
private String cleanExtractedText(String raw) {
  return raw.replaceAll("[\\u0007\\r ]", "")
      .replace("\n", "_|_")
      // Match the separator as a whole token so lone '_' or '|' in the text are left alone.
      .replaceAll("(?:_\\|_)+", "_|_");
}

/** Builds a unique target path for a picture with the given file extension. */
private String nextPicturePath(String extension) {
  // Linux paths use '/', Windows paths use '\\', e.g. /home/java_pic/ and D:\\Desktop\\doc\\
  return "/home/java_pic/" + UUID.randomUUID() + "." + extension;
}

/** Returns whether {@code value} ends with {@code suffix}, ignoring letter case. */
private boolean endsWithIgnoreCase(String value, String suffix) {
  int offset = value.length() - suffix.length();
  return offset >= 0 && value.regionMatches(true, offset, suffix, 0, suffix.length());
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值