package com.adj.readwordfile; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLTextExtractor; import org.apache.poi.openxml4j.exceptions.OpenXML4JException; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; import org.apache.xmlbeans.XmlException; import org.textmining.text.extraction.WordExtractor; public class ReadWord { public static void main(String[] args) { String wordFilePath03 = "D:/readWordFileTest.doc"; String wordFilePath07 = "D:/readWordFileTest.docx"; ReadWord word = new ReadWord(); // 读取 word 2003 版本文件 String fileContent03 = word.ReadWordFile2003(wordFilePath03); System.out.println(fileContent03); // 读取 word 2007 版本文件 String fileContent07 = word.ReadWordFile2007(wordFilePath07); System.out.println(fileContent07); } /** * 读取 word 2003 版本文件 .doc格式 * * 使用:tm-extractors-0.4.jar 文件,只可以读取 word2003 版的文件 如果你的word文档使用 wps程序 * 保存过,会出现以下错误,只需要再用 word程序 打开保存一下就可以解决 * org.textmining.text.extraction.FastSavedException: Fast-saved files are * unsupported at this time * * @param filePath * @return */ private String ReadWordFile2003(String filePath) { String text = ""; WordExtractor extractor = null; try { FileInputStream inputStream = new FileInputStream(filePath); extractor = new WordExtractor(); text = extractor.extractText(inputStream); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } return text; } /** * 读取 word 2007 以上版本文件 .docx格式 * * 上面的方法只可读取 word2003版本的文件,无法读取word2007及以上版本的文档, 要想读取word2007 * word2010版本的文档必须使用apache的poi开源项目包,下载地址: * http://www.apache.org/dyn/closer.cgi/poi/release/bin/poi-bin-3.9-20121203.tar.gz * * 使用到的 jar 包 * poi-3.9-20121203.jar poi-ooxml-3.9-20121203.jar * poi-ooxml-schemas-3.9-20121203.jar poi-scratchpad-3.9-20121203.jar * xmlbeans-2.3.0.jar dom4j-1.6.1.jar * * @param filePath * @return */ private String ReadWordFile2007(String filePath) { String text = ""; try { OPCPackage opcPackage = POIXMLDocument.openPackage(filePath); POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage); text = extractor.getText(); } catch (IOException e) { e.printStackTrace(); } catch (XmlException e) { e.printStackTrace(); } catch (OpenXML4JException e) { e.printStackTrace(); } return text; } }
Java读取操作word2003 word2007 word2010文档
最新推荐文章于 2023-01-07 20:56:37 发布