1. 程式人生 > >Java 將Word2003(doc)/Word2007(docx)轉Html格式檔案

Java 將Word2003(doc)/Word2007(docx)轉Html格式檔案

程式碼實現:

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.PicturesManager;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.hwpf.usermodel.PictureType;
import org.apache.poi.xwpf.converter.core.BasicURIResolver;
import org.apache.poi.xwpf.converter.core.FileImageExtractor;
import org.apache.poi.xwpf.converter.xhtml.XHTMLConverter;
import org.apache.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.w3c.dom.Document;

import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Converts Word documents to HTML and returns the generated markup as a string.
 *
 * <p>{@code .doc} files are handled by {@link #docToHtml(String, String)} and
 * {@code .docx} files by {@link #docxToHtml(String, String)}. Both methods write
 * the HTML to the given target file and then return that file's content decoded
 * as UTF-8.
 */
public class WordToString {
    /**
     * Demo entry point: converts a hard-coded sample document and discards the result.
     */
    public static void main(String[] args) throws Throwable {
        //final String path = "D:\\Test\\xxx.doc";
        final String filePath = "D:\\Test\\xxx.docx";
        readWordToString(filePath);
    }

    /**
     * Converts the Word file at {@code filePath} to HTML, choosing the converter by file extension.
     *
     * @param filePath path of the source Word document
     * @return the generated HTML, or an empty string when the extension is neither
     *         {@code .doc} nor {@code .docx}
     * @throws Exception if reading, converting or writing fails
     */
    public static String readWordToString(String filePath) throws Exception {
        // The two extension checks are mutually exclusive, so at most one branch runs.
        if (FileNameUtil.isWord2003(filePath)) {
            return docToHtml(filePath, "D:\\Test\\Word2003(doc).html");
        }
        if (FileNameUtil.isWord2007(filePath)) {
            return docxToHtml(filePath, "D:\\Test\\Word2007(docx).html");
        }
        return "";
    }

    /**
     * Converts a {@code .doc} file to HTML.
     *
     * <p>Embedded pictures are saved into an {@code image} directory next to the
     * target file. The HTML is serialized to {@code targetFileName} as UTF-8 and
     * its content is returned.
     *
     * @param docFilename    path of the source {@code .doc} file
     * @param targetFileName path of the HTML file to generate
     * @return the generated HTML
     * @throws Exception if the document cannot be read, converted or written
     */
    public static String docToHtml(String docFilename, String targetFileName) throws Exception {
        // Absolute path guarantees getParent() is non-null even for a bare file name.
        final Path targetPath = Paths.get(targetFileName).toAbsolutePath();
        final Path imagePath = targetPath.getParent().resolve("image");
        // Without this the picture callback below can never open its output file.
        Files.createDirectories(imagePath);

        Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(document);

        // Save each picture and return the URL that the generated HTML should reference.
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType, String name, float width, float height) {
                try (FileOutputStream out = new FileOutputStream(imagePath.resolve(name).toString())) {
                    out.write(content);
                } catch (IOException | RuntimeException e) {
                    // Best effort: a picture that cannot be saved must not abort the whole conversion.
                    e.printStackTrace();
                }
                // NOTE(review): this URL does not point at imagePath ("image" next to the
                // target file); presumably it matches the deployment layout - confirm.
                return "../tmp/image/" + name;
            }
        });

        try (InputStream in = new FileInputStream(docFilename)) {
            wordToHtmlConverter.processDocument(new HWPFDocument(in));
        }

        // Serialize the populated DOM; without this step the target file is never created.
        try (OutputStream out = new FileOutputStream(targetPath.toFile())) {
            Transformer serializer = TransformerFactory.newInstance().newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(new DOMSource(wordToHtmlConverter.getDocument()), new StreamResult(out));
        }

        return readUtf8(targetPath);
    }

    /**
     * Converts a {@code .docx} file to HTML.
     *
     * <p>The HTML is written to {@code targetFileName} as UTF-8 and its content is
     * returned. Pictures are extracted below {@code ../tmp/image/word/media}
     * relative to the target file's directory, and the HTML references them
     * through the same relative prefix.
     *
     * @param sourceFilePath path of the source {@code .docx} file
     * @param targetFileName path of the HTML file to generate
     * @return the generated HTML
     * @throws Exception if the document cannot be read, converted or written
     */
    public static String docxToHtml(String sourceFilePath, String targetFileName) throws Exception {
        final Path targetPath = Paths.get(targetFileName).toAbsolutePath();
        Files.createDirectories(targetPath.getParent());
        String imagePathStr = targetPath.getParent().resolve("../tmp/image/word/media").toString();

        XHTMLOptions options = XHTMLOptions.create();
        // Directory that receives the extracted pictures.
        options.setExtractor(new FileImageExtractor(new File(imagePathStr)));
        // Prefix used for picture URLs inside the generated HTML.
        options.URIResolver(new BasicURIResolver("../tmp/image/word/media"));

        try (InputStream in = new FileInputStream(sourceFilePath);
             Writer out = new OutputStreamWriter(new FileOutputStream(targetPath.toFile()), StandardCharsets.UTF_8)) {
            XWPFDocument document = new XWPFDocument(in);
            // Run the conversion; without this call the target file is never written.
            XHTMLConverter.getInstance().convert(document, out, options);
        }

        return readUtf8(targetPath);
    }

    /** Reads the whole file and decodes it as UTF-8. */
    private static String readUtf8(Path file) throws IOException {
        return new String(Files.readAllBytes(file), StandardCharsets.UTF_8);
    }
}
以上程式碼將 Word 轉為 HTML 格式的字串並返回給前臺;如果需要把該字串儲存成 txt 或 html 檔案,可以使用下面 FileNameUtil 提供的 StringToFile 和 txtToHtml 方法。
import java.io.*;
/**
 * Helpers for recognising document types by file extension and for saving text
 * to {@code .txt} / {@code .html} files.
 */
public class FileNameUtil {

    /**
     * @param filePath path or file name to test; may be {@code null}
     * @return {@code true} if the path ends with {@code .doc} (case-insensitive)
     */
    public static boolean isWord2003(String filePath) {
        return hasExtension(filePath, "doc");
    }

    /**
     * @param filePath path or file name to test; may be {@code null}
     * @return {@code true} if the path ends with {@code .docx} (case-insensitive)
     */
    public static boolean isWord2007(String filePath) {
        return hasExtension(filePath, "docx");
    }

    /**
     * @param filePath path or file name to test; may be {@code null}
     * @return {@code true} if the path ends with {@code .xls} (case-insensitive)
     */
    public static boolean isExcel2003(String filePath) {
        return hasExtension(filePath, "xls");
    }

    /**
     * @param filePath path or file name to test; may be {@code null}
     * @return {@code true} if the path ends with {@code .xlsx} (case-insensitive)
     */
    public static boolean isExcel2007(String filePath) {
        return hasExtension(filePath, "xlsx");
    }

    /**
     * @param filePath path or file name to test; may be {@code null}
     * @return {@code true} if the path ends with {@code .pdf} (case-insensitive)
     */
    public static boolean isPDF(String filePath) {
        return hasExtension(filePath, "pdf");
    }

    /**
     * Tests whether {@code filePath} has at least one character before a final
     * {@code .extension}, comparing the extension case-insensitively.
     * A {@code null} path never matches.
     */
    private static boolean hasExtension(String filePath, String extension) {
        return filePath != null && filePath.matches("^.+\\.(?i)(" + extension + ")$");
    }

    /**
     * Writes {@code str} to the file {@code filename}, replacing any existing content.
     *
     * <p>The text is encoded as UTF-8 so that {@link #txtToHtml(String, String)},
     * which reads UTF-8, can read the file back unchanged. An {@link IOException}
     * is printed to standard error and otherwise ignored.
     *
     * @param str      text to write
     * @param filename path of the file to create or overwrite
     */
    public static void StringToFile(String str, String filename) {
        try (Writer writer = new OutputStreamWriter(new FileOutputStream(new File(filename)), "UTF-8")) {
            writer.write(str);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies the UTF-8 text file {@code filePath} to {@code htmlPosition}, appending
     * {@code </br>} after every line and dropping the original line terminators.
     *
     * <p>The text is copied as-is, without HTML escaping. If the source is not an
     * existing regular file a message is printed and nothing is written. Any
     * failure is reported on standard output / standard error and not rethrown.
     *
     * @param filePath     path of the source text file
     * @param htmlPosition path of the HTML file to create or overwrite
     */
    public static void txtToHtml(String filePath, String htmlPosition) {
        final String encoding = "UTF-8";
        try {
            File file = new File(filePath);
            // isFile() is only true for an existing regular file.
            if (!file.isFile()) {
                System.out.println("找不到指定的檔案");
                return;
            }
            try (BufferedReader reader = new BufferedReader(
                         new InputStreamReader(new FileInputStream(file), encoding));
                 BufferedWriter writer = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(new File(htmlPosition)), encoding))) {
                String lineTxt;
                while ((lineTxt = reader.readLine()) != null) {
                    // NOTE(review): "</br>" is not a valid HTML tag ("<br/>" is); kept
                    // unchanged so the generated output stays byte-compatible.
                    writer.write(lineTxt + "</br>");
                }
            }
        } catch (Exception e) {
            System.out.println("讀取檔案內容出錯");
            e.printStackTrace();
        }
    }
}



需要說明的一點:我在匯入 POI 套件後執行 docx 檔案的轉換時會報錯,最後透過額外匯入 ooxml-schemas-1.1.jar 才得以解決。