java将Word文件转换为html文件

我这里是Maven项目,只需在配置文件中声明依赖,Maven会自动下载jar包

在pom.xml中配置

<!--word转html https://mvnrepository.com/artifact/org.apache.poi/poi-scratchpad -->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-scratchpad</artifactId>
                <version>3.17</version>
            </dependency>
                    
            <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
            <dependency>
                <groupId>org.apache.poi</groupId>
                <artifactId>poi-ooxml</artifactId>
                <version>3.17</version>
            </dependency>
                
            <!-- https://mvnrepository.com/artifact/fr.opensagres.xdocreport/fr.opensagres.xdocreport.converter.docx.xwpf -->
            <dependency>
                <groupId>fr.opensagres.xdocreport</groupId>
                <artifactId>fr.opensagres.xdocreport.converter.docx.xwpf</artifactId>
                <version>2.0.1</version>
            </dependency>

java代码

package com.lmt.service.file;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.util.IOUtils;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.web.multipart.MultipartFile;
import org.w3c.dom.Document;


import fr.opensagres.poi.xwpf.converter.core.ImageManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;

/**
 * Converts uploaded Word documents ({@code .doc} and {@code .docx}) to HTML.
 *
 * <p>The upload is first stored under a random name in the working directory; the
 * generated HTML file and any extracted images are written next to it.
 */
@Component
public class WordToHtml {
    private static final Logger logger = LoggerFactory.getLogger(WordToHtml.class);

    // TODO: the files should be stored in a proper (configurable) location
    private static final String WORK_DIRECTORY = "D:/test/";

    @Autowired
    private ParseFile parseFile;

    /**
     * Stores the uploaded Word file on disk and converts it to an HTML file.
     *
     * @param file the uploaded {@code .doc} or {@code .docx} file
     * @return the generated HTML file, or {@code null} when the format is not supported,
     *         the upload cannot be stored, or the conversion fails
     */
    public File convert(MultipartFile file) {
        String filename = (file == null) ? null : file.getOriginalFilename();

        // Validate the format before touching the disk so that unsupported uploads
        // are not left behind in the working directory.
        boolean isDocx = hasExtension(filename, ".docx");
        boolean isDoc = !isDocx && hasExtension(filename, ".doc");
        if (!isDocx && !isDoc) {
            logger.error("不支持的文件格式!");
            return null;
        }

        // A dot is guaranteed to exist here because one of the extensions matched.
        String suffix = filename.substring(filename.lastIndexOf('.'));
        String newName = UUID.randomUUID().toString();
        File convFile = new File(WORK_DIRECTORY + newName + suffix);

        File parentDir = convFile.getParentFile();
        if (parentDir != null) {
            // Best effort: if the directory cannot be created, opening the stream below
            // fails with an IOException that is logged.
            parentDir.mkdirs();
        }

        try (FileOutputStream fos = new FileOutputStream(convFile)) {
            fos.write(file.getBytes());
        } catch (IOException ex) {
            logger.error("上传文件出错!", ex);
            return null;
        }

        String parentDirectory = convFile.getParent();
        return isDocx
                ? docxConvert(parentDirectory, convFile.getAbsolutePath(), newName)
                : docConvert(parentDirectory, convFile.getAbsolutePath(), newName);
    }

    /**
     * Tells whether {@code filename} ends with {@code extension}, ignoring case.
     *
     * @param filename  the name to test, may be {@code null}
     * @param extension the extension including the leading dot, e.g. {@code ".docx"}
     * @return {@code true} if the name ends with the extension
     */
    private static boolean hasExtension(String filename, String extension) {
        if (filename == null || filename.length() < extension.length()) {
            return false;
        }
        int offset = filename.length() - extension.length();
        return filename.regionMatches(true, offset, extension, 0, extension.length());
    }

    /**
     * Converts a {@code .docx} file to HTML using the XDocReport XHTML converter.
     *
     * @param parentDirectory directory that receives the HTML file and extracted images
     * @param filename        absolute path of the stored {@code .docx} file
     * @param newName         base name (without extension) of the HTML file to create
     * @return the generated HTML file, or {@code null} if the conversion fails
     */
    private File docxConvert(String parentDirectory, String filename, String newName) {
        File htmlFile = new File(parentDirectory, newName + ".html");
        // try-with-resources closes the streams and the document even when conversion fails.
        try (InputStream in = new FileInputStream(filename);
             XWPFDocument document = new XWPFDocument(in);
             FileOutputStream out = new FileOutputStream(htmlFile)) {
            ImageManager imageManager =
                    new ImageManager(new File(parentDirectory), UUID.randomUUID().toString());
            XHTMLOptions options = XHTMLOptions.create().setImageManager(imageManager).indent(4);
            XHTMLConverter.getInstance().convert(document, out, options);
            return htmlFile;
        } catch (IOException ex) {
            logger.error("word转化出错!", ex);
            return null;
        }
    }

    /**
     * Converts a legacy {@code .doc} file to HTML using POI's {@link WordToHtmlConverter}.
     *
     * @param parentDirectory directory that receives the HTML file and extracted images
     * @param filename        absolute path of the stored {@code .doc} file
     * @param newName         base name (without extension) of the HTML file to create
     * @return the generated HTML file, or {@code null} if the conversion fails
     */
    private File docConvert(String parentDirectory, String filename, String newName) {
        File htmlFile = new File(parentDirectory, newName + ".html");
        try (InputStream in = new FileInputStream(filename);
             HWPFDocument document = new HWPFDocument(in)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());

            // The converter does not handle pictures by default: each picture is saved
            // next to the HTML file and referenced by its file name.
            wordToHtmlConverter.setPicturesManager(new PicturesManager() {
                @Override
                public String savePicture(byte[] content, PictureType pictureType,
                        String suggestedName, float widthInches, float heightInches) {
                    String identity = UUID.randomUUID().toString();
                    File imageFile = new File(parentDirectory, identity + suggestedName);
                    imageFile.getParentFile().mkdirs();
                    try (FileOutputStream imageOut = new FileOutputStream(imageFile)) {
                        imageOut.write(content);
                    } catch (IOException ex) {
                        // A picture that cannot be written must not abort the whole
                        // conversion; the reference is still emitted.
                        logger.error("word转化出错!", ex);
                    }
                    return imageFile.getName();
                }
            });

            wordToHtmlConverter.processDocument(document);
            Document htmlDocument = wordToHtmlConverter.getDocument();

            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");

            // Serialize straight into the file as bytes so the content really is UTF-8;
            // going through String/FileWriter would re-encode with the platform charset.
            try (FileOutputStream out = new FileOutputStream(htmlFile)) {
                serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));
            }
            return htmlFile;
        } catch (IOException | TransformerException | ParserConfigurationException ex) {
            logger.error("word转化出错!", ex);
            return null;
        }
    }

    /**
     * Converts the uploaded Word document to an HTML string.
     *
     * @param attachfile the uploaded Word file
     * @return the content returned by {@code parseFile.readHtml} for the generated HTML
     *         file, or an empty string when the conversion did not produce a file
     */
    public String convertToHtml(MultipartFile attachfile) {
        String wordContent = "";
        // Convert the Word file to an HTML file first
        File file = convert(attachfile);
        // Then read the generated HTML file
        if (file != null) {
            wordContent = parseFile.readHtml(file);
        }
        return wordContent;
    }

}