1. 程式人生 > >java利用poi解析docx生成html

java利用poi解析docx生成html

公司業務需要把Word文件中編輯好的新聞(文字+圖片)錄入到CMS管理後臺,生成一篇新聞釋出。因為不能把圖片直接複製貼上到UEditor編輯器上,還要一個一個上傳太麻煩。所以這裡做了一個上傳docx檔案解析後,直接返回html正文放到前端編輯器繼續編輯。 功能要求: 1.圖片要下載到伺服器指定位置,並把前端請求圖片地址拼接到img標籤的src上。 2.圖片文字要按照順序排列。 3.過濾掉超連結、其他圖形等一般新聞不用的元素。 實現:

  1. maven最小依賴。3.17版本支援jdk1.6及以上;4.x版本則需要jdk1.8及以上。
		<dependency>
    	<groupId>org.apache.poi</groupId>
    		<artifactId>poi</artifactId>
    		<version>3.17</version>
		</dependency>
		<dependency>
    	<groupId>org.apache.poi</groupId>
    		<artifactId>poi-ooxml</artifactId>
   		 	<version>3.17</version>
		</dependency>
		<dependency>
    	<groupId>org.apache.poi</groupId>
    		<artifactId>poi-ooxml-schemas</artifactId>
    		<version>3.17</version>
		</dependency>

2.程式碼實現

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.List;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.xmlbeans.XmlCursor;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.drawingml.x2006.main.CTGraphicalObject;
import org.openxmlformats.schemas.drawingml.x2006.picture.CTPicture;
import org.openxmlformats.schemas.drawingml.x2006.wordprocessingDrawing.CTInline;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDrawing;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;

/**
 * Converts the body of a .docx file into a simple HTML fragment.
 *
 * <p>Paragraphs are emitted in document order; each paragraph is terminated by
 * {@code <br>}. Within a paragraph, plain text and inline pictures are emitted in
 * the order they occur in the run XML. Everything else (field instructions such as
 * hyperlink targets, deleted text, non-picture graphics) is ignored.
 */
public class AnalyzeDocx {

	/** Directory prefix that extracted pictures are written to (test value). */
	private static final String IMAGE_SAVE_DIR = "E:/";

	/**
	 * Placeholder tag returned for every picture. It is hard-coded for testing; a real
	 * deployment has to build the URL from the name of the file that was saved.
	 */
	private static final String PLACEHOLDER_IMG_TAG = "<img src='/rongmeitiapi/api/picture/find/image/20181107/d66ce5ffc18365a3dab1e46c484dfabb.jpeg'>";

	public static void main(String[] args) throws Exception {
		String content = analyzeDocx("e://abc.docx");
		System.out.println(content);
	}

	/**
	 * Parses the .docx file at {@code path} and returns its content as HTML.
	 *
	 * <p>Failures while reading or parsing are printed to stderr and whatever was
	 * converted up to that point is returned (an empty string if nothing was).
	 *
	 * @param path file system path of the .docx file
	 * @return the HTML fragment built from the document body, never {@code null}
	 * @throws Exception declared for backward compatibility with existing callers
	 */
	public static String analyzeDocx(String path) throws Exception {
		StringBuilder sb = new StringBuilder();
		try (InputStream in = new FileInputStream(path); XWPFDocument xwpfDocument = new XWPFDocument(in)) {
			List<XWPFParagraph> paragraphs = xwpfDocument.getParagraphs();
			for (XWPFParagraph xwpfParagraph : paragraphs) {
				for (XWPFRun xwpfRun : xwpfParagraph.getRuns()) {
					appendRun(sb, xwpfDocument, xwpfRun.getCTR());
				}
				sb.append("<br>");// paragraph break
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return sb.toString();
	}

	/** Appends the text and inline pictures of one run, in XML child order. */
	private static void appendRun(StringBuilder sb, XWPFDocument xwpfDocument, CTR ctr) {
		XmlCursor cursor = ctr.newCursor();
		try {
			cursor.selectPath("./*");
			while (cursor.toNextSelection()) {
				XmlObject child = cursor.getObject();
				if (child instanceof CTText) {
					// w:instrText (field codes, e.g. HYPERLINK targets) and w:delText share the
					// CTText type with w:t. Filter on the element name: xml:space="preserve" is
					// also set on ordinary text with leading/trailing blanks, so it cannot be
					// used to tell them apart.
					if (!"t".equals(cursor.getName().getLocalPart())) {
						continue;
					}
					String text = ((CTText) child).getStringValue();
					if (text != null && text.length() > 0) {
						sb.append(escapeHtml(text));
					}
				} else if (child instanceof CTDrawing) {
					appendDrawing(sb, xwpfDocument, (CTDrawing) child);
				}
			}
		} finally {
			cursor.dispose();
		}
	}

	/** Appends an {@code <img>} tag, wrapped in line breaks, for every inline picture of a drawing. */
	private static void appendDrawing(StringBuilder sb, XWPFDocument xwpfDocument, CTDrawing drawing) {
		for (CTInline ctInline : drawing.getInlineArray()) {
			CTGraphicalObject graphic = ctInline.getGraphic();
			if (graphic == null || graphic.getGraphicData() == null) {
				continue;
			}
			XmlCursor cursor = graphic.getGraphicData().newCursor();
			try {
				cursor.selectPath("./*");
				while (cursor.toNextSelection()) {
					XmlObject child = cursor.getObject();
					if (!(child instanceof CTPicture)) {
						continue;// charts, shapes, etc. are not supported
					}
					CTPicture picture = (CTPicture) child;
					if (picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null) {
						continue;
					}
					String html = imgHtml(xwpfDocument, picture.getBlipFill().getBlip().getEmbed());
					if (html.length() > 0) {
						sb.append("<br>").append(html).append("<br>");
					}
				}
			} finally {
				cursor.dispose();
			}
		}
	}

	/**
	 * Saves the embedded picture identified by {@code blipID} and returns its
	 * {@code <img>} tag.
	 *
	 * @param xwpfDocument document that owns the picture
	 * @param blipID relationship id of the embedded picture; may be {@code null} for
	 *        pictures that are not embedded
	 * @return the {@code <img>} tag, or an empty string when the document has no
	 *         embedded picture data for {@code blipID}
	 */
	private static String imgHtml(XWPFDocument xwpfDocument, String blipID) {
		if (blipID == null) {
			return "";
		}
		XWPFPictureData pictureData = xwpfDocument.getPictureDataByID(blipID);
		if (pictureData == null) {
			return "";
		}
		String newfilename = System.currentTimeMillis() + pictureData.getFileName();
		byte[] bytev = pictureData.getData();
		try (FileOutputStream fos = new FileOutputStream(IMAGE_SAVE_DIR + newfilename)) {
			// After saving, the picture should be exposed over HTTP and referenced from <img>.
			fos.write(bytev);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return PLACEHOLDER_IMG_TAG;
	}

	/** Escapes the characters that would otherwise be parsed as markup in HTML text content. */
	private static String escapeHtml(String text) {
		StringBuilder out = new StringBuilder(text.length() + 16);
		for (int i = 0; i < text.length(); i++) {
			char c = text.charAt(i);
			switch (c) {
			case '&':
				out.append("&amp;");
				break;
			case '<':
				out.append("&lt;");
				break;
			case '>':
				out.append("&gt;");
				break;
			default:
				out.append(c);
			}
		}
		return out.toString();
	}

}

imgHtml方法需要把圖片重新命名後,變成前端可訪問的連結,再去拼接img標籤。我這邊因為是測試,所以寫死了img標籤。 注意:這個只是處理正常的可檢視圖片;對於emf型別的圖片不做處理,因為新聞也用不到。 如果需要捕獲所有的,請參考https://www.jb51.net/article/132091.htm