/**
 * Helpers that collect heading ("title") paragraphs from Word documents using Apache POI.
 *
 * <p>{@link #getWordTitles2003(String)} handles binary {@code .doc} files (HWPF) and, as a side
 * effect, writes every embedded picture to {@link #IMAGE_OUTPUT_DIR}.
 * {@link #getWordTitles2007(String)} handles {@code .docx} files (XWPF) and only reads text.
 */
public class WordUtil {

    /** Directory prefix (including the trailing separator) for pictures extracted from .doc files. */
    private static final String IMAGE_OUTPUT_DIR = "E:\\temp\\";

    /**
     * Reads a Word 97-2003 ({@code .doc}) file, returns the text of every paragraph whose style
     * name contains "标题", and writes each embedded picture to
     * {@code E:\temp\<file name without extension>_<n>.jpg} (n starts at 1).
     *
     * @param path path of the {@code .doc} file to read
     * @return paragraph texts of the matching paragraphs, in document order; never {@code null}
     * @throws IOException if the document cannot be read or a picture file cannot be written
     */
    public static List<String> getWordTitles2003(String path) throws IOException {
        String baseName = stripExtension(new File(path).getName());
        InputStream is = new FileInputStream(path);
        try {
            HWPFDocument doc = new HWPFDocument(is);
            Range range = doc.getRange();
            List<String> titles = collectHeadingTexts(doc, range);
            // TODO: charts are not the same thing as pictures and need separate handling.
            exportPictures(doc, range, baseName);
            return titles;
        } finally {
            // The original never closed this stream, leaking a file handle per call.
            is.close();
        }
    }

    /**
     * Returns {@code fileName} without its last extension; a name without any dot is returned
     * unchanged instead of triggering a {@link StringIndexOutOfBoundsException}.
     */
    private static String stripExtension(String fileName) {
        int dot = fileName.lastIndexOf('.');
        return dot < 0 ? fileName : fileName.substring(0, dot);
    }

    /** Collects the text of every paragraph in {@code range} whose style name contains "标题". */
    private static List<String> collectHeadingTexts(HWPFDocument doc, Range range) {
        // The style sheet does not change while iterating, so look it up once.
        StyleSheet styleSheet = doc.getStyleSheet();
        int numStyles = styleSheet.numStyles();
        int numParagraphs = range.numParagraphs();

        List<String> titles = new ArrayList<String>();
        for (int i = 0; i < numParagraphs; i++) {
            Paragraph paragraph = range.getParagraph(i);
            int styleIndex = paragraph.getStyleIndex();
            // Skip paragraphs whose style index points past the end of the style sheet.
            if (styleIndex >= numStyles) {
                continue;
            }
            StyleDescription style = styleSheet.getStyleDescription(styleIndex);
            // Guard against a style slot with no description before dereferencing it.
            if (style == null) {
                continue;
            }
            String styleName = style.getName();
            // "标题" is Chinese for "heading"/"title"; presumably this matches the heading style
            // names of Chinese-localized documents (e.g. "标题 1") - confirm against real inputs.
            if (styleName != null && styleName.contains("标题")) {
                titles.add(paragraph.text());
            }
        }
        return titles;
    }

    /**
     * Writes the picture attached to each character run of {@code range} to
     * {@code IMAGE_OUTPUT_DIR + baseName + "_" + n + ".jpg"}, numbering pictures from 1.
     */
    private static void exportPictures(HWPFDocument doc, Range range, String baseName)
            throws IOException {
        // PicturesTable answers whether a run carries a picture and extracts it from the
        // document's data stream.
        PicturesTable pictures = new PicturesTable(doc, doc.getDataStream(), null, null, null);
        // Number of character runs (not paragraphs) in the range.
        int numRuns = range.numCharacterRuns();
        int pictureNo = 1;
        for (int j = 0; j < numRuns; j++) {
            CharacterRun run = range.getCharacterRun(j);
            if (!pictures.hasPicture(run)) {
                continue;
            }
            Picture picture = pictures.extractPicture(run, true);
            // NOTE(review): the ".jpg" suffix is used regardless of the picture's real format.
            FileOutputStream out =
                    new FileOutputStream(IMAGE_OUTPUT_DIR + baseName + "_" + pictureNo + ".jpg");
            try {
                picture.writeImageContent(out);
            } finally {
                // Close every output stream so image files are released after writing.
                out.close();
            }
            pictureNo++;
        }
    }

    /**
     * Reads a Word 2007+ ({@code .docx}) file and returns the text of every paragraph whose
     * style ID is "1", "2" or "3".
     *
     * @param path path of the {@code .docx} file to read
     * @return paragraph texts of the matching paragraphs, in document order; never {@code null}
     * @throws IOException if the document cannot be read
     */
    public static List<String> getWordTitles2007(String path) throws IOException {
        InputStream is = new FileInputStream(path);
        try {
            XWPFDocument doc = new XWPFDocument(is);
            List<String> titles = new ArrayList<String>();
            for (XWPFParagraph paragraph : doc.getParagraphs()) {
                String style = paragraph.getStyle();
                // Presumably "1".."3" are the style IDs of heading levels 1-3 in the target
                // documents - TODO confirm; paragraphs with any other (or no) style are skipped.
                if ("1".equals(style) || "2".equals(style) || "3".equals(style)) {
                    titles.add(paragraph.getParagraphText());
                }
            }
            return titles;
        } finally {
            // The original never closed this stream, leaking a file handle per call.
            is.close();
        }
    }

    /**
     * Manual smoke test: extracts the titles of a fixed sample document and prints one per line.
     * The parser is chosen from the file extension; any other extension prints nothing.
     */
    public static void main(String[] args) throws IOException {
        String path = "E:/temp/poi_test.doc";
        List<String> list = new ArrayList<String>();
        if (path.endsWith(".doc")) {
            list = getWordTitles2003(path);
        } else if (path.endsWith(".docx")) {
            list = getWordTitles2007(path);
        }
        for (String title : list) {
            System.out.println(title);
        }
    }
}