1. 程式人生 > >php解析word文件

php解析word文件

一個簡單的word文件閱讀類,使用正則實現簡單的docx文件閱讀,下面是程式碼

<?php
class Lib {
	/**
	 * @param $file docx檔案路徑
	 * @return string 生成的html字串
	 * ---讀取docx文件轉換為html,僅保留段落,表格,文字框,不保留樣式
	 * by sdxjwkq
	 */
	public function docxToHtml($file) {
		$zip = new \ZipArchive();
		$zip->open($file);
		$xml = $zip->getFromName("word/document.xml");
		$xml = file_get_contents("a.xml");
		$table = array(); //快取表格
		$textbox = array(); //快取文字框
		//處理表格
		preg_match_all('/<w:tbl>([\s\S]*?)<\/w:tbl>/s', $xml, $tableHandel);
		for ($i = 0; $i < count($tableHandel[0]); $i++) {
			$table["@
[email protected]
" . $i] = $tableHandel[0][$i]; $xml = str_replace($tableHandel[0][$i], "@[email protected]" . $i, $xml); } //處理文字框 preg_match_all('/<w:pict>([\s\S]*?)<\/w:pict>/s', $xml, $textboxHandel); foreach ($textboxHandel[0] as $key => &$value) { $temp = $value; $temp2 = ""; preg_match_all('/<w:t>([\s\S]*?)<\/w:t>/s', $value, $div); foreach ($div[0] as $k => &$v) { $temp2 .= $v; } $xml = str_replace($temp, $temp2, $xml); } for ($i = 0; $i < count($textboxHandel[0]); $i++) { $textbox["@
[email protected]
" . $i] = $textboxHandel[0][$i]; $xml = str_replace($textboxHandel[0][$i], "@[email protected]" . $i, $xml); } preg_match_all('/<w:p([\s\S]*?)<\/w:p>|@[email protected]\d|@[email protected]\d/s', $xml, $content); foreach ($content[0] as $key => &$value) { if (strpos($value, "TABLECONTENT")) { $value = $table[$value]; } $value = str_replace("w:", "", $value); } $content = $content[0]; //把段落和表格解析出來 $docx = <<<HTML_ENTITIES <style> table{ background-color:#000; } table td{ padding:5px 5px 5px 5px; } table tr{ background-color:#fff; } </style> HTML_ENTITIES; foreach ($content as $a => &$b) { $b = json_decode( json_encode( simplexml_load_string($b) ), true ); if (isset($b['tr'])) { //表格 $docx .= "<table border='0' cellspacing='1' cellpadding='0'>"; foreach ($b['tr'] as $key => $value) { $docx .= "<tr>"; foreach ($value['tc'] as $k => $v) { if (isset($v['p']['r'][0])) { $docx .= "<td>"; foreach ($v['p']['r'] as $ke => $va) { $docx .= $va['t']; } $docx .= "</td>"; } else { $docx .= "<td>" . $v['p']['r']['t'] . "</td>"; } } $docx .= "</tr>"; } $docx .= "</table>"; } else { //段落 $docx .= "<p>"; if (isset($b['r'][0])) { foreach ($b['r'] as $key => &$value) { if (is_string($value['t'])) { $docx .= $value['t']; } } } else { if (is_string($b['r']['t'])) { $docx .= $b['r']['t']; } } $docx .= "</p>"; } } return $docx; } }