file_get_contents 採集網頁內容實現自定義函式過濾
阿新 • • 發佈:2019-02-01
以下兩個函式分別用來依起始與結束標記擷取任意一段字串,以及把內容的編碼自動轉換成 UTF-8,採集時不必再考慮來源網頁的編碼。
/**
 * Extract the text that lies between the first occurrence of $start and the
 * first occurrence of $end that follows it.
 *
 * Only the leading start marker is stripped; later occurrences of $start
 * inside the extracted text are kept as-is.
 *
 * @param string $str   Text to search in.
 * @param string $start Marker that precedes the wanted text.
 * @param string $end   Marker that follows the wanted text.
 *
 * @return string|false The text between the markers, or false when either
 *                      marker cannot be found.
 */
function strCutByStr($str, $start, $end)
{
    $str = (string) $str;
    $start = (string) $start;
    $end = (string) $end;

    $startPos = strpos($str, $start);
    if ($startPos === false) {
        return false;
    }

    // Skip past the start marker itself; byte offsets are fine here because
    // the markers are matched byte-wise as well.
    $rest = (string) substr($str, $startPos + strlen($start));

    $endPos = strpos($rest, $end);
    if ($endPos === false) {
        return false;
    }

    return (string) substr($rest, 0, $endPos);
}

/**
 * Convert a string, or every string key and value of a (nested) array, to the
 * requested encoding, detecting the source encoding automatically.
 *
 * Non-array values are cast to string before conversion. A string whose
 * encoding cannot be detected is returned unchanged.
 *
 * @param mixed  $data   String or (nested) array of strings to convert.
 * @param string $output Target encoding name understood by mbstring.
 *
 * @return mixed Converted string, or a new array with converted keys/values.
 */
function array_iconv($data, $output = 'utf-8')
{
    if (is_array($data)) {
        // Build a fresh array so that a key whose bytes change during
        // conversion does not leave its old entry behind.
        $converted = array();
        foreach ($data as $key => $val) {
            $newKey = is_string($key) ? array_iconv($key, $output) : $key;
            $converted[$newKey] = array_iconv($val, $output);
        }

        return $converted;
    }

    $text = (string) $data;
    $encode_arr = array('UTF-8', 'ASCII', 'GBK', 'GB2312', 'BIG5', 'JIS', 'eucjp-win', 'sjis-win', 'EUC-JP');
    $encoded = mb_detect_encoding($text, $encode_arr);
    if ($encoded === false) {
        // None of the candidate encodings matched; converting would fail.
        return $text;
    }

    return mb_convert_encoding($text, $output, $encoded);
}
呼叫方式:$fcontents = array_iconv(file_get_contents($url), 'utf-8');
// Scrape one page, extract title and body between the configured markers,
// strip unwanted markup, and store the result as a chapter row.
//
// Expects these names to be defined by the surrounding script:
//   $link                                           - open mysqli connection
//   $zw7428cn_titlestart / $zw7428cn_titleend       - markers around the title
//   $zw7428cn_contentstart / $zw7428cn_contentend   - markers around the body
//   $zw7428cn_mulu, $zw7428cn_bookname              - book id and book name
//   strCutByStr(), array_iconv()                    - helper functions

$str = file_get_contents('https://www.chumoshu.com');
$title = false;
$content = false;
if ($str !== false) {
    // Normalise to UTF-8 before searching so the markers match reliably.
    $str = array_iconv($str, 'utf-8');
    $title = strCutByStr($str, $zw7428cn_titlestart, $zw7428cn_titleend);
    $content = strCutByStr($str, $zw7428cn_contentstart, $zw7428cn_contentend);
}

if ($title === false || $content === false) {
    // Download failed or a marker was missing: do not insert an empty row.
    echo '<font color=red>採集失敗!</font><br>';
} else {
    $patterns = array(
        // Whole <script> elements, including their contents.
        '/<script[\s\S]*?<\/script>/i',
        // Opening and closing <a> / <div> tags only (\b keeps <abbr>, <article>, ...).
        '/<\/?(?:a|div)\b[^>]*>/i',
        // HTML comments, including ones that contain ">".
        '/<!--[\s\S]*?-->/',
        // Presentational attributes. The leading whitespace requirement stops
        // "id=" from matching inside values such as src="page.php?id=5".
        '/\s+(?:style|class|id|lang|width|height|border|face)\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>]+)/i',
    );
    $cleaned = preg_replace($patterns, '', trim($content));
    if ($cleaned !== null) {
        // preg_replace() yields null on a PCRE error; keep the raw text then.
        $content = $cleaned;
    }
    $content = trim($content);

    $zw7428cn_bookid = $zw7428cn_mulu;
    // Stored HTML-escaped, as before; SQL escaping is handled by the
    // prepared statement below, so addslashes() is no longer applied.
    $zw7428cn_articlename = htmlspecialchars($title);
    $zw7428cn_content = htmlspecialchars($content);
    // Four-digit year and 24-hour clock so the timestamp is unambiguous.
    $zw7428cn_date = date('Y-m-d H:i:s');

    // The title comes from a remote page, so print the escaped form.
    echo '<font color=red>成功採集檔案:</font>' . $zw7428cn_articlename;

    $sql = 'insert into zw7428cnchapter(zw7428cn_bookid,zw7428cn_bookname,zw7428cn_articlename,zw7428cn_content,zw7428cn_date) values(?,?,?,?,?)';
    $saved = false;
    // NOTE: if mysqli is set to throw exceptions, failures surface as
    // mysqli_sql_exception rather than a false return value.
    $stmt = mysqli_prepare($link, $sql);
    if ($stmt !== false) {
        mysqli_stmt_bind_param(
            $stmt,
            'sssss',
            $zw7428cn_bookid,
            $zw7428cn_bookname,
            $zw7428cn_articlename,
            $zw7428cn_content,
            $zw7428cn_date
        );
        $saved = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);
    }

    if ($saved) {
        echo '<font color=red>同時已成功入庫!</font><br>';
    } else {
        echo '<font color=red>入庫失敗!</font><br>';
    }
}
以上是採集觸控書城的主要實現程式碼部分以及如何入庫部分。