import com.sinitek.sirm.web.plm.funddate.MatchingObject;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.*;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.*;
import java.io.*;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
public class ParseWordUtil {
private static final Logger LOGGER = Logger.getLogger(ParseWordUtil.class);
// Document-wide style definitions of the Word file; null until getWordStyle assigns it.
private static CTStyles wordStyles = null;
/**
 * Loads the .docx file at {@code filepath} and stores its document-wide
 * style definitions in the static {@code wordStyles} field.
 *
 * <p>Failures are logged and not propagated; in that case {@code wordStyles}
 * keeps whatever value it had before the call.
 *
 * @param filepath path of the template document to read
 */
public static void getWordStyle(String filepath) {
    InputStream in = null;
    try {
        // Read the template document.
        in = new FileInputStream(filepath);
        XWPFDocument template = new XWPFDocument(in);
        // Capture the template's document-wide styles.
        wordStyles = template.getStyle();
    } catch (FileNotFoundException e) {
        LOGGER.error("未找到文件", e);
    } catch (IOException e) {
        LOGGER.error("", e);
    } catch (XmlException e) {
        LOGGER.error("XML转换异常", e);
    } finally {
        // Always release the file handle, whether or not parsing succeeded.
        if (in != null) {
            try {
                in.close();
            } catch (IOException e) {
                LOGGER.warn("Failed to close input stream for " + filepath, e);
            }
        }
    }
}
/**
 * Returns the titles (headings) of a Word document.
 *
 * <p>Files whose extension is {@code .docx} (compared case-insensitively, so
 * {@code .DOCX} is handled too) go to the 2007 parser; everything else is
 * treated as a 2003 {@code .doc} file.
 *
 * @param filepath path of the Word document
 * @return the titles found in the document
 * @throws IOException if the document cannot be read
 */
public static List<String> getWordTitles(String filepath) throws IOException {
    String extension = getWordVersion(filepath);
    if (".docx".equalsIgnoreCase(extension)) {
        return getWordTitles2007(filepath);
    }
    // Mode 1 = titles only (2 = content only, 3 = titles and content).
    return getWordTitlesAndContext2003(filepath, 1);
}
/**
 * Returns the text of a Word document.
 *
 * <p>Files whose extension is {@code .docx} (compared case-insensitively, so
 * {@code .DOCX} is handled too) go to the 2007 paragraph reader; everything
 * else is treated as a 2003 {@code .doc} file.
 *
 * @param filepath path of the Word document
 * @return the extracted text entries
 * @throws Exception if the document cannot be read or parsed
 */
public static List<String> getWordText(String filepath) throws Exception {
    String extension = getWordVersion(filepath);
    if (".docx".equalsIgnoreCase(extension)) {
        return getParagraphText2007(filepath);
    }
    // Mode 3 = titles and content.
    return getWordTitlesAndContext2003(filepath, 3);
}
/**
 * Returns the extension of the file name in {@code filepath}, including the
 * leading dot (for example {@code ".docx"} or {@code ".doc"}).
 *
 * <p>Only the final path component is inspected, so dots in directory names
 * are ignored. Word 97 is not considered; callers distinguish only the 2003
 * and 2007 formats.
 *
 * @param filepath path of the file
 * @return the extension starting with {@code '.'}, or an empty string when
 *         the file name contains no dot
 */
public static String getWordVersion(String filepath) {
    String filename = new File(filepath).getName();
    int dot = filename.lastIndexOf('.');
    // A name without a dot has no extension; substring(-1) would throw.
    if (dot < 0) {
        return "";
    }
    return filename.substring(dot);
}
/**
* 获取03版word文档标题和内容
* @param path 文件路径
* @param type 1:只获取标题;2:只获取内容;3:标题和内容都获取
* @return list
* @throws IOException
*/
public static List<String> getWordTitlesAndContext2003(String path, Integer type) throws IOException {
InputStream is = new FileInputStream(path);
HWPFDocument doc = new HWPFDocument(is);
Range r = doc.getRange();
List<String> list = new ArrayList<String>();
List<String> titles = new ArrayList<String>();
List<String> context = new ArrayList<String>();
for (int i = 0; i <
POI解析word文档
最新推荐文章于 2025-04-18 01:45:29 发布