word预览

Word文档转HTML

最新推荐文章于 2025-07-16 13:22:51 发布

weixin_36531868

最新推荐文章于 2025-07-16 13:22:51 发布

阅读量593

点赞数

分类专栏： word预览　文章标签： word预览，图片base64处理；word转html，图片base64处理

word预览专栏收录该内容

1 篇文章

订阅专栏

本文介绍了一种将Word文档转换为HTML的方法，包括处理Word 2003和2007及以上版本。对于Word 2007及更高版本，通过读取InputStream，使用XHTMLConverter将.docx文件转换为HTML，并将图片转换为base64编码内嵌于HTML中。对于Word 2003，使用WordToHtmlConverter处理.doc文件，同样将图片作为base64编码插入HTML。

word转html，图片转为base64，预览

/**
 * {@link WordToHtmlConverter} variant that embeds pictures directly in the generated markup
 * as base64 {@code data:} URIs instead of referencing external image files.
 */
public class InlineImageWordToHtmlConverter extends WordToHtmlConverter {

/**
 * @param document the DOM document the converter writes the HTML output into
 */
public InlineImageWordToHtmlConverter(Document document) {
    super(document);
}

/**
 * Appends an {@code <img>} element to {@code currentBlock} whose {@code src} attribute is a
 * {@code data:<mime>;base64,<payload>} URI built from the picture's raw bytes.
 *
 * @param currentBlock element that receives the new {@code <img>} child
 * @param inlined      not used by this implementation
 * @param picture      picture supplying the MIME type and the raw content to embed
 */
@Override
protected void processImageWithoutPicturesManager(Element currentBlock,
                                                  boolean inlined, Picture picture)
{
    // The basic encoder is used on purpose: the MIME encoder inserts a CRLF every 76
    // characters, which would put line breaks inside the data URI / attribute value.
    String payload = Base64.getEncoder().encodeToString(picture.getRawContent());
    String dataUri = "data:" + picture.getMimeType() + ";base64," + payload;

    Element imgNode = currentBlock.getOwnerDocument().createElement("img");
    imgNode.setAttribute("src", dataUri);
    currentBlock.appendChild(imgNode);
}

}

/**
 * Converts a Word 2007+ ({@code .docx}) document to an HTML string in which every extracted
 * image is inlined as a base64 {@code data:} URI.
 *
 * <p>The converter first extracts images into a scratch directory under
 * {@code java.io.tmpdir}; each extracted file's path in the generated markup is then replaced
 * by the file's base64 payload, and the scratch directory is removed again.
 *
 * @param inputStream stream supplying the .docx content; it is closed before this method returns
 * @return the generated HTML, or an empty string if the conversion itself fails; failures are
 *         logged rather than thrown
 */
public static String word2007ToHtmlStr(InputStream inputStream){
    ByteArrayOutputStream baos = null;
    File firstImagePath = null;
    String content = "";
    try{
        XWPFDocument docxDocument = new XWPFDocument(inputStream);
        // Conversion options
        XHTMLOptions options = XHTMLOptions.create();
        // Scratch directory for extracted images. A random suffix is appended because the
        // timestamp alone collides when two conversions start within the same millisecond,
        // in which case one call could delete the directory while the other still reads it.
        String path = System.getProperty("java.io.tmpdir");
        String firstImagePathStr = path + "/" + System.currentTimeMillis()
                + "_" + java.util.UUID.randomUUID();
        firstImagePath = new File(firstImagePathStr);
        options.setExtractor(new FileImageExtractor(firstImagePath));
        options.URIResolver(new BasicURIResolver(firstImagePathStr));
        // Convert to HTML
        baos = new ByteArrayOutputStream();
        XHTMLConverter.getInstance().convert(docxDocument, baos, options);
        // NOTE(review): decodes with the platform default charset, as before; confirm which
        // encoding the converter writes before pinning an explicit one here.
        content = baos.toString();
        // Replace every extracted image path in the HTML with the image's base64 data URI
        String middleImageDirStr = "/word/media";
        String imageDirStr = firstImagePathStr + middleImageDirStr;
        File imageDir = new File(imageDirStr);
        String[] imageList = imageDir.list();
        if (imageList != null) {
            for (int i = 0; i < imageList.length; i++) {
                String oneImagePathStr = imageDirStr + "/" + imageList[i];
                File oneImageFile = new File(oneImagePathStr);
                // Derive the MIME type from the file name; fall back to the previous
                // hard-coded value when the extension is unknown or not an image type.
                String mimeType = java.net.URLConnection.guessContentTypeFromName(imageList[i]);
                if (mimeType == null || !mimeType.startsWith("image/")) {
                    mimeType = "image/png";
                }
                String imageBase64Str = new String(Base64.encodeBase64(FileUtils.readFileToByteArray(oneImageFile)), "UTF-8");
                content = content.replace(oneImagePathStr, "data:" + mimeType + ";base64," + imageBase64Str);
            }
        }
    } catch (Exception e){
        // Pass the throwable as well so the stack trace is not lost.
        log.error("wordutil.docxToHtml Exception:"+e.getMessage(), e);
    } finally {
        // Always remove the scratch directory and release the streams, including on failure.
        try{
            if (firstImagePath != null) {
                FileUtils.deleteDirectory(firstImagePath);
            }
        } catch (IOException e1) {
            log.error("wordutil.docxToHtml IOException:"+e1.getMessage(), e1);
        }
        try{
            if(baos!=null)baos.close();
            if(inputStream!=null)inputStream.close();
        } catch (IOException e1) {
            log.error("wordutil.docxToHtml IOException:"+e1.getMessage(), e1);
        }
    }
    return content;
}


/**
 * Converts a Word 2003 ({@code .doc}) document to an HTML string. Pictures are embedded by
 * {@code InlineImageWordToHtmlConverter}, so the result does not reference external files.
 *
 * @param inputStream stream supplying the .doc content; it is closed before this method returns
 * @return the generated HTML, or an empty string if the conversion fails; failures are logged
 *         rather than thrown
 */
private static String word2003ToHtmlStr(InputStream inputStream){
    ByteArrayOutputStream baos = null;
    String content = "";
    try{
        HWPFDocument wordDocument = new HWPFDocument(inputStream);
        WordToHtmlConverter wordToHtmlConverter = new InlineImageWordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());

        // Parse the Word document into the converter's DOM
        wordToHtmlConverter.processDocument(wordDocument);
        org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();

        // Serialize the DOM into an in-memory byte stream
        baos = new ByteArrayOutputStream();
        DOMSource domSource = new DOMSource(htmlDocument);
        StreamResult streamResult = new StreamResult(baos);

        TransformerFactory factory = TransformerFactory.newInstance();
        Transformer serializer = factory.newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(domSource, streamResult);

        // Decode with the same charset the serializer was told to write; relying on the
        // platform default corrupts non-ASCII text on JVMs whose default is not UTF-8.
        content = new String(baos.toByteArray(), java.nio.charset.StandardCharsets.UTF_8);
    } catch (Exception e) {
        // Pass the throwable as well so the stack trace is not lost.
        log.error("wordutil.word2003ToHtmlStr Exception:"+e.getMessage(), e);
    } finally {
        // Release the streams on both the success and the failure path.
        try{
            if(baos !=null)baos.close();
            if(inputStream !=null)inputStream.close();
        } catch (IOException e1) {
            log.error("wordutil.word2003ToHtmlStr IOException:"+e1.getMessage(), e1);
        }
    }
    return content;
}