word内容读取为html
背景:需要将 Word 文档的内容导入到富文本编辑器中。下面的工具类同时支持读取 .doc 和 .docx 两种格式的文件。
工具类 WordToHtml.java:
import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
/**
 * Converts an uploaded Word document ({@code .doc} or {@code .docx}) into an HTML string.
 *
 * <p>Pictures embedded in the document are written to the directory configured by
 * {@code word.pic.save.path}; image references in the returned HTML are prefixed with
 * {@link #PIC_VIEW_PATH}.
 */
@Component
public class WordToHtml {

    /** URL prefix under which the saved pictures are served; prepended to image references. */
    private static final String PIC_VIEW_PATH = "http://127.0.0.1:8761/server/dietc/source/view/word/pic/";

    /** Directory in which pictures extracted from the document are saved. */
    @Value("${word.pic.save.path}")
    private String picPath;

    /**
     * Converts the given Word file to an HTML string.
     *
     * @param file the uploaded file to convert; may be {@code null}
     * @return the generated HTML; an empty string when {@code file} is {@code null};
     *         a user-facing message when the file name does not end in {@code doc}/{@code docx}
     *         (compared case-insensitively) or when the conversion fails
     */
    public String readeWordToHtml(MultipartFile file) {
        if (file == null) {
            return "";
        }
        // The original file name may be null; treat that like an unsupported extension
        // instead of failing with a NullPointerException.
        String fileName = file.getOriginalFilename();
        String suffix = fileName == null ? "" : fileName.substring(fileName.lastIndexOf('.') + 1);
        try {
            if ("doc".equalsIgnoreCase(suffix)) {
                return convertDoc(file);
            } else if ("docx".equalsIgnoreCase(suffix)) {
                return convertDocx(file);
            } else {
                return "请上传.doc或者.docx文件";
            }
        } catch (Exception e) {
            // Single error boundary: any conversion failure is reported to the caller as a message.
            e.printStackTrace();
            System.out.println("文件格式错误!");
            return "文件格式错误!";
        }
    }

    /**
     * Converts a binary {@code .doc} file to HTML and saves its embedded pictures.
     *
     * @param file the uploaded {@code .doc} file
     * @return the serialized HTML document
     * @throws Exception on any read, conversion or serialization failure; handled by the caller
     */
    private String convertDoc(MultipartFile file) throws Exception {
        // try-with-resources releases both the upload stream and the parsed document.
        try (InputStream in = file.getInputStream();
             HWPFDocument wordDocument = new HWPFDocument(in)) {
            WordToHtmlConverter converter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // The pictures themselves are written by savePictures(); here we only tell the
            // converter which URL to put into the generated <img> tags.
            converter.setPicturesManager(
                    (content, pictureType, suggestedName, widthInches, heightInches) ->
                            PIC_VIEW_PATH + suggestedName);
            converter.processDocument(wordDocument);

            savePictures(wordDocument);

            Document htmlDocument = converter.getDocument();
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
            // Decode with the same charset the serializer was told to use, rather than the
            // platform default, so non-ASCII text survives on any host.
            return new String(out.toByteArray(), StandardCharsets.UTF_8).replace("↵", "");
        }
    }

    /**
     * Writes every picture of the document into the configured picture directory.
     * A picture whose target file cannot be opened is skipped (best effort).
     *
     * @param wordDocument the parsed {@code .doc} document
     * @throws IOException if writing or closing a picture file fails
     */
    private void savePictures(HWPFDocument wordDocument) throws IOException {
        List<Picture> pictures = wordDocument.getPicturesTable().getAllPictures();
        if (pictures == null) {
            return;
        }
        for (Picture picture : pictures) {
            // File(parent, child) inserts the separator when picPath has no trailing one.
            File target = new File(picPath, picture.suggestFullFileName());
            try (OutputStream pictureOut = new FileOutputStream(target)) {
                picture.writeImageContent(pictureOut);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Converts a {@code .docx} file to an XHTML fragment; pictures are extracted by the
     * converter's {@link ImageManager} into the configured picture directory.
     *
     * @param file the uploaded {@code .docx} file
     * @return the XHTML fragment with every {@code <img src="} prefixed by {@link #PIC_VIEW_PATH}
     * @throws Exception on any read or conversion failure; handled by the caller
     */
    private String convertDocx(MultipartFile file) throws Exception {
        try (InputStream in = file.getInputStream();
             XWPFDocument document = new XWPFDocument(in)) {
            XHTMLOptions options = XHTMLOptions.create();
            options.setIgnoreStylesIfUnused(false);
            options.setFragment(true);
            options.setImageManager(new ImageManager(new File(picPath), ""));

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            XHTMLConverter.getInstance().convert(document, out, options);
            // NOTE(review): decoded with the platform default charset, as before; the charset
            // the converter uses when writing to the stream is not visible here - confirm
            // before switching this to an explicit charset.
            String html = new String(out.toByteArray());
            return html.replace("<img src=\"", "<img src=\"" + PIC_VIEW_PATH);
        }
    }
}
感谢阅读,有问题欢迎留言,看到第一时间回复!(*^_^*)