编程语言:Java
POI版本为3.17,jar包可自行到官网下载。
达到的最终效果是:去除Word中的表格、超文本、页眉、页脚、取出第一张图片存入硬盘并返回地址;同时也对doc直接修改扩展名为docx以及docx直接修改扩展名为doc这两种情况进行处理(通过捕获异常方式)。
我们的项目需求是只要一张图片,你当然可以取任意张。但注意docx文件取出的图片不是顺序的,要想按顺序读出图片请参考:https://www.cnblogs.com/ct-csu/p/8178932.html
参考文献:
http://poi.apache.org/components/document/index.html
http://poi.apache.org/apidocs/dev/org/apache/poi/hwpf/
http://poi.apache.org/apidocs/dev/org/apache/poi/xwpf/
/**
 * Extracts the body text and the first embedded picture from a Word file.
 *
 * <p>The parser is chosen from the file name: names ending in {@code .doc} are read with
 * HWPF, names ending in {@code docx} with XWPF. When the content turns out to be in the
 * other format (for example a file that was only renamed), POI reports it through
 * {@code OfficeXmlFileException} / {@code OLE2NotOfficeXmlFileException} and the file is
 * re-read with the other parser.
 *
 * <p>On success the JSON object holds two entries:
 * <ul>
 *   <li>{@code "txt"} - the paragraph text, with paragraphs separated by {@code _|_};</li>
 *   <li>{@code "picUrl"} - the path the first picture was written to, or an empty string
 *       when the document contains no picture.</li>
 * </ul>
 * The JSON object is left empty when {@code path} is not an existing regular file or does
 * not have a Word extension. I/O failures are reported on {@code System.err} and whatever
 * was collected up to that point is returned.
 *
 * @param path path of the Word file to read
 * @return the collected JSON object, serialized with {@code toString()}
 * @author lidw
 * @date 2018-12-25
 */
public String doWordExtract(String path) {
    JSONObject jsonObject = new JSONObject();
    File file = new File(path);
    if (!file.exists() || !file.isFile()) {
        return jsonObject.toString();
    }
    try {
        if (path.endsWith(".doc")) {
            try {
                extractFromDoc(file, jsonObject);
            } catch (OfficeXmlFileException e) {
                // A .docx file whose extension was changed to .doc: parse it as .docx.
                extractFromDocx(file, jsonObject);
            }
        } else if (path.endsWith("docx")) {
            try {
                extractFromDocx(file, jsonObject);
            } catch (OLE2NotOfficeXmlFileException e) {
                // A .doc file whose extension was changed to .docx: parse it as .doc.
                extractFromDoc(file, jsonObject);
            }
        } else {
            System.out.println("此文件不是word文件!");
        }
    } catch (IOException e) {
        // Previously swallowed silently; keep the best-effort result but make the failure visible.
        System.err.println("Failed to extract Word file " + path + ": " + e);
    }
    return jsonObject.toString();
}

/**
 * Character that ends a table cell in HWPF paragraph text. The original comparison
 * searched for an empty string (the control character was presumably lost), which can
 * never yield -1 and therefore discarded every paragraph.
 * NOTE(review): confirm U+0007 is the character the author intended.
 */
private static final char TABLE_CELL_MARK = '\u0007';

/** Delimiter placed between paragraphs in the "txt" value. */
private static final String PARAGRAPH_DELIMITER = "_|_";

/** Reads a binary .doc file and stores its text and first picture in {@code jsonObject}. */
private static void extractFromDoc(File file, JSONObject jsonObject) throws IOException {
    InputStream is = new FileInputStream(file);
    try {
        HWPFDocument doc = new HWPFDocument(is);
        try {
            WordExtractor wordExtractor = new WordExtractor(doc);
            StringBuilder text = new StringBuilder();
            // Per the original author's note, reading paragraph by paragraph leaves out
            // headers and footers.
            for (String paragraph : wordExtractor.getParagraphText()) {
                // Skip paragraphs that belong to a table.
                if (paragraph.indexOf(TABLE_CELL_MARK) == -1) {
                    text.append(WordExtractor.stripFields(paragraph));
                }
            }
            jsonObject.put("txt", normalizeText(text.toString()));

            List<Picture> pictures = doc.getPicturesTable().getAllPictures();
            if (pictures == null || pictures.isEmpty()) {
                jsonObject.put("picUrl", "");
                return;
            }
            Picture picture = pictures.get(0);
            String picUrl = newPicturePath(picture.suggestFileExtension());
            OutputStream out = new FileOutputStream(new File(picUrl));
            try {
                picture.writeImageContent(out);
            } finally {
                out.close();
            }
            jsonObject.put("picUrl", picUrl);
        } finally {
            doc.close();
        }
    } finally {
        // Also runs when the constructor rejects the format, so the stream no longer leaks
        // before the caller retries with the other parser.
        is.close();
    }
}

/** Reads a .docx file and stores its text and first picture in {@code jsonObject}. */
private static void extractFromDocx(File file, JSONObject jsonObject) throws IOException {
    InputStream is = new FileInputStream(file);
    try {
        XWPFDocument docx = new XWPFDocument(is);
        try {
            StringBuilder text = new StringBuilder();
            // Per the original author's note, reading paragraph by paragraph leaves out
            // headers and footers.
            for (XWPFParagraph paragraph : docx.getParagraphs()) {
                text.append(paragraph.getText()).append(PARAGRAPH_DELIMITER);
            }
            jsonObject.put("txt", normalizeText(text.toString()));

            // Pictures are not guaranteed to come back in document order for .docx files.
            List<XWPFPictureData> pictures = docx.getAllPictures();
            if (pictures == null || pictures.isEmpty()) {
                jsonObject.put("picUrl", "");
                return;
            }
            XWPFPictureData picture = pictures.get(0);
            String picUrl = newPicturePath(picture.suggestFileExtension());
            OutputStream out = new FileOutputStream(new File(picUrl));
            try {
                out.write(picture.getData());
            } finally {
                out.close();
            }
            jsonObject.put("picUrl", picUrl);
        } finally {
            docx.close();
        }
    } finally {
        is.close();
    }
}

/**
 * Strips table-cell marks, carriage returns and spaces, turns line feeds into the
 * paragraph delimiter and collapses consecutive delimiters into one.
 */
private static String normalizeText(String text) {
    // The original patterns were "|\r| " (the leading empty alternative always wins, so
    // nothing was ever removed) and "[_|_]+" (a character class that also rewrote any
    // single '_' or '|' occurring in the document text).
    return text.replaceAll("[\\u0007\\r ]", "")
            .replaceAll("\n", PARAGRAPH_DELIMITER)
            .replaceAll("(?:_\\|_)+", PARAGRAPH_DELIMITER);
}

/** Builds a unique target path for an extracted picture with the given file extension. */
private static String newPicturePath(String extension) {
    // Linux paths use '/', Windows paths use '\\', e.g. /home/java_pic/ or D:\\Desktop\\doc\\
    return "/home/java_pic/" + UUID.randomUUID() + "." + extension;
}