1. 程式人生 > php抓取網頁內容

php抓取網頁內容

/**
 * Fetch a URL with cURL and return the response body as a string.
 *
 * The transfer is limited to 5 seconds. A User-Agent and Referer header are
 * sent only when the constants _USERAGENT_ and _REFERER_ have been defined by
 * the caller; referencing them unconditionally raises an Error on PHP 8 (and
 * sends the literal constant name on older versions) when they are missing.
 *
 * @param string $durl URL to fetch.
 * @return string|false Response body, or false when the handle cannot be
 *                      created or the transfer fails.
 */
function curl_file_get_contents($durl){
  $ch = curl_init();
  if ($ch === false) {
    return false;
  }
  curl_setopt($ch, CURLOPT_URL, $durl);
  curl_setopt($ch, CURLOPT_TIMEOUT, 5);
  // Optional headers: only applied when the caller has configured them.
  if (defined('_USERAGENT_')) {
    curl_setopt($ch, CURLOPT_USERAGENT, constant('_USERAGENT_'));
  }
  if (defined('_REFERER_')) {
    curl_setopt($ch, CURLOPT_REFERER, constant('_REFERER_'));
  }
  // Return the body from curl_exec() instead of printing it directly.
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  $r = curl_exec($ch);
  curl_close($ch);
  return $r;
}

// Example 1: fetch the ju.taobao.com "today_items" page and list the image
// URLs and titles of (at most) the first 10 matched items.
$txt = file_get_contents('http://ju.taobao.com/tg/today_items.htm?spm=608.1000525.0.51&frontCatId=4000');
// The cURL helper curl_file_get_contents() can be used instead of
// file_get_contents() for the same URL.
if ($txt === false) {
	// Fetch failed: continue with an empty document so nothing is matched or printed.
	$txt = '';
}
// The page is treated as GBK and converted to UTF-8 before matching
// (the image pattern uses the /u modifier).
$txt = mb_convert_encoding($txt, "UTF-8", "GBK");

$tpic	= '/<img width=\"285\" data-ks-lazyload=\"([^<>]+)\"\/>/isu';
$ttitle = '/<h3><a target=\"_blank\" title=\"([^<>]+)\" href/s';

// $match1[0] receives the image matches, $match1[1] the title matches;
// index [1] of each holds the first capture group.
$match1 = [];
preg_match_all($tpic, $txt, $match1[]);
preg_match_all($ttitle, $txt, $match1[]);

// preg_match_all() may leave the capture list unset on a PCRE error
// (e.g. invalid UTF-8 with /u), so fall back to empty lists.
$pics   = isset($match1[0][1]) ? $match1[0][1] : [];
$titles = isset($match1[1][1]) ? $match1[1][1] : [];

// Print only as many rows as were actually matched, capped at 10, instead of
// always reading indices 0..9.
$rows = min(10, max(count($pics), count($titles)));
$escFlags = ENT_QUOTES | ENT_SUBSTITUTE;

for ($i = 0; $i < $rows; $i++) {
	// The scraped values come from a remote page: quote the attribute and
	// escape the text. double_encode is false because the captured attribute
	// text may already contain HTML entities.
	$pic   = htmlspecialchars(isset($pics[$i]) ? $pics[$i] : '', $escFlags, 'UTF-8', false);
	$title = htmlspecialchars(isset($titles[$i]) ? $titles[$i] : '', $escFlags, 'UTF-8', false);
	echo '圖片:<img src="'.$pic.'"><br>';
	echo '標題'.$title.'<br>';
}



// Example 2: fetch the video.baidu.com "top" page and render up to 10 rows
// of rank / title / URL / view count as an HTML table.

$contents = file_get_contents("http://video.baidu.com/top/");
if ($contents !== false) {
	// The page is treated as gb2312 and converted to UTF-8.
	$contents = iconv("gb2312", "utf-8", $contents);
}
if ($contents === false) {
	// Fetch or conversion failed: continue with an empty document so the
	// table is printed with a header and no rows.
	$contents = '';
}

$paiming = '/<span class=\"color-v6 sum\">(\d+)<\/span>/s';  // rank
$title   = '/<span class=\"matter\" title=\"([^<>]+)\">/s';    // title
$url	 = '/<a statisic=\"name\"  href=\'([^<>]+)\' class=\"block\" target=\"_blank\" >/s'; // link
$num     = '/<span class=\"color-v6 tr\">(\d+)<\/span>/s';       // view count

// $match[0..3] receive the rank, title, URL and view-count matches in that
// order; index [1] of each holds the first capture group.
$match = [];
preg_match_all($paiming, $contents, $match[]);
preg_match_all($title, $contents, $match[]);
preg_match_all($url, $contents, $match[]);
preg_match_all($num, $contents, $match[]);

$ranks  = isset($match[0][1]) ? $match[0][1] : [];
$names  = isset($match[1][1]) ? $match[1][1] : [];
$links  = isset($match[2][1]) ? $match[2][1] : [];
$counts = isset($match[3][1]) ? $match[3][1] : [];

// Print only as many rows as were actually matched, capped at 10, instead of
// always reading indices 0..9.
$rows = min(10, max(count($ranks), count($names), count($links), count($counts)));
$escFlags = ENT_QUOTES | ENT_SUBSTITUTE;

echo '<table><tr><td>排名</td><td>電影名稱</td><td>網址</td><td>點選量</td></tr>';
for ($i = 0; $i < $rows; $i++) {
	// Rank and view count are matched by \d+ and need no escaping. Title and
	// URL are remote text: escape them and quote the href attribute.
	// double_encode is false because the captured attribute text may already
	// contain HTML entities.
	$rank  = isset($ranks[$i]) ? $ranks[$i] : '';
	$name  = htmlspecialchars(isset($names[$i]) ? $names[$i] : '', $escFlags, 'UTF-8', false);
	$link  = htmlspecialchars(isset($links[$i]) ? $links[$i] : '', $escFlags, 'UTF-8', false);
	$views = isset($counts[$i]) ? $counts[$i] : '';
	echo '<tr><td>'.$rank.'</td>
			  <td>'.$name.'</td>
			  <td><a href="'.$link.'" target="_blank">'.$link.'</a></td>
			  <td>'.$views.'</td></tr>';
}
echo '</table>';