1. 程式人生 > >file_get_contents 採集網頁內容實現自定義函式過濾

file_get_contents 採集網頁內容實現自定義函式過濾

以下兩個函式分別實現「擷取任意兩個標記之間的字串」以及「自動偵測編碼並轉換為 UTF-8」,使用時不必再考慮來源網頁的編碼:
/**
 * Extract the text that lies between two delimiter strings.
 *
 * Locates the first occurrence of $start in $str, then the first occurrence
 * of $end after it, and returns everything in between (delimiters excluded).
 *
 * @param string $str   Text to search in.
 * @param string $start Opening delimiter.
 * @param string $end   Closing delimiter.
 * @return string|false The enclosed text, or false when either delimiter
 *                      cannot be found.
 */
function strCutByStr($str,$start,$end)
{
    $startPos = strpos($str, $start);
    if ($startPos === false) {
        return false;
    }

    // Skip past the leading delimiter only. Stripping every occurrence of
    // $start from the remainder would corrupt the extracted text whenever
    // the delimiter appears again before $end.
    $bodyPos = $startPos + strlen($start);

    $endPos = strpos($str, $end, $bodyPos);
    if ($endPos === false) {
        return false;
    }

    return substr($str, $bodyPos, $endPos - $bodyPos);
}

/**
 * Convert a string, or every string key and value of a (nested) array,
 * to the requested encoding.
 *
 * The source encoding of each string is guessed with mb_detect_encoding()
 * from a fixed candidate list. Values that are neither strings nor arrays
 * are returned unchanged, as are strings whose encoding cannot be detected.
 *
 * @param mixed  $data   String or (nested) array to convert.
 * @param string $output Target encoding name understood by mbstring.
 * @return mixed The converted string, or a new array with converted keys and values.
 */
function array_iconv($data, $output = 'utf-8') {
  if (is_array($data)) {
    // Build a fresh array so a key whose bytes change during conversion
    // does not leave its old, unconverted entry behind.
    $converted = array();
    foreach ($data as $key => $val) {
      $converted[array_iconv($key, $output)] = array_iconv($val, $output);
    }
    return $converted;
  }

  // Integer keys, null, booleans, etc. carry no text encoding.
  if (!is_string($data)) {
    return $data;
  }

  $encode_arr = array('UTF-8','ASCII','GBK','GB2312','BIG5','JIS','eucjp-win','sjis-win','EUC-JP');
  $encoded = mb_detect_encoding($data, $encode_arr);
  if ($encoded === false) {
    // No candidate matched; converting from an unknown encoding would fail.
    return $data;
  }

  return mb_convert_encoding($data, $output, $encoded);
}

呼叫方式:$fcontents = array_iconv(file_get_contents($url), 'utf-8');

// Scrape one page, extract its title and body, strip unwanted markup and
// store the result as a chapter row.
// Expects these to be defined by earlier code: $link (mysqli connection),
// $zw7428cn_titlestart / $zw7428cn_titleend, $zw7428cn_contentstart /
// $zw7428cn_contentend (delimiter strings), $zw7428cn_mulu and
// $zw7428cn_bookname.
$html = file_get_contents('https://www.chumoshu.com');
if ($html === false) {
    echo '<font color=red>採集失敗!</font><br>';
} else {
    $str = array_iconv($html, 'utf-8');

    // strCutByStr() yields false when a delimiter is missing; the casts
    // turn that into an empty string for the string functions below.
    $title = (string) strCutByStr($str, $zw7428cn_titlestart, $zw7428cn_titleend);
    $content = trim((string) strCutByStr($str, $zw7428cn_contentstart, $zw7428cn_contentend));

    // Patterns are applied in order. Whole <script> elements and comments
    // go first so the attribute patterns never run over their contents.
    $content = (string) preg_replace(
        array(
            "/<script[\s\S]*?<\/script>/i",
            "/<!--[\s\S]*?-->/",
            // \b keeps these from also matching <abbr>, <article>, ...
            "/<a\b[^>]*>/i",
            "/<\/a>/i",
            "/<div\b[^>]*>/i",
            "/<\/div>/i",
            // Presentational attributes. \b stops e.g. "id=" from matching
            // inside "valid="; the class ['\"] matches only quote characters.
            "/\bstyle=.+?['\"]/i",
            "/\bclass=.+?['\"]/i",
            "/\bid=.+?['\"]/i",
            "/\blang=.+?['\"]/i",
            "/\bwidth=.+?['\"]/i",
            "/\bheight=.+?['\"]/i",
            "/\bborder=.+?['\"]/i",
            "/\bface=.+?['\"]/i",
        ),
        '',
        $content
    );
    $content = trim($content);

    $zw7428cn_bookid = $zw7428cn_mulu;
    // Values are stored HTML-escaped, as before; SQL escaping is handled
    // by the bound parameters below, so addslashes() is not needed.
    $zw7428cn_articlename = htmlspecialchars($title);
    $zw7428cn_content = htmlspecialchars($content);
    // Four-digit year and 24-hour clock so the timestamp is unambiguous.
    $zw7428cn_date = date('Y-m-d H:i:s');

    // The title comes from a remote page, so print the escaped form.
    echo '<font color=red>成功採集檔案:</font>'.$zw7428cn_articlename;

    $sql = 'insert into zw7428cnchapter(zw7428cn_bookid,zw7428cn_bookname,zw7428cn_articlename,zw7428cn_content,zw7428cn_date) values(?,?,?,?,?)';
    $stored = false;
    $stmt = mysqli_prepare($link, $sql);
    if ($stmt !== false) {
        mysqli_stmt_bind_param(
            $stmt,
            'sssss',
            $zw7428cn_bookid,
            $zw7428cn_bookname,
            $zw7428cn_articlename,
            $zw7428cn_content,
            $zw7428cn_date
        );
        $stored = mysqli_stmt_execute($stmt);
        mysqli_stmt_close($stmt);
    }

    // Report success only when the row was actually written.
    if ($stored) {
        echo '<font color=red>同時已成功入庫!</font><br>';
    } else {
        echo '<font color=red>入庫失敗!</font><br>';
    }
}

以上是採集觸控書城的主要實現程式碼,以及將採集結果寫入資料庫的部分。