PHP多程序抓取百度搜索結果
阿新 • • 發佈:2019-01-03
<?php
/**
 * Multi-process spider for the natural (organic) results of Baidu mobile
 * result pages: title, abstract, image, link and source of each result.
 *
 * The master process forks `worker_process` children. Each child fetches a
 * contiguous slice of the requested pages, extracts the results with the
 * configured regular expressions, hands them to the optional `dataHandler`
 * callback and exits with status 100.
 *
 * Requires the posix, pcntl and curl extensions for execute(); the parsing
 * methods getHtmlContent() and getNaturalResult() only need PCRE and JSON.
 *
 * @since 2016-04-15
 */
class NaturalResultSpider
{
    /** Search query whose result pages are fetched. */
    private $_strQuery = null;

    /** Number of worker processes to fork. */
    public $worker_process = 4;

    /** PIDs of forked workers that have not yet been seen to finish, keyed by PID. */
    private $_arrPids = array();

    /** Number of result pages to fetch. */
    private $_intPageNum;

    /** Not written by this class; kept for backward compatibility. */
    public $arrAllResult = array();

    /** Hook: optional callback invoked in each worker with that worker's result array. */
    public $dataHandler = null;

    /** PID of the process that called execute(). */
    private $masterPid = null;

    /** Maximum number of fetch attempts per page (1 means no retry). */
    private $retry_times = 1;

    /** Pattern that cuts a result page into one HTML fragment per natural result. */
    private $strReg = '/<div\sclass="result\sc-result\sc-clk-recommend"(.*)?>(.*)?(<img\ssrc="(.*)?">)?(.*)?(<p\sclass="c-line-clamp3\sc-color">(.*)?)+<\/div>/Uis';

    /**
     * Field extractors applied to every result fragment. Each entry holds the
     * output field `name`, the `reg` pattern and the capture group `location`.
     */
    private static $_arrPattern = array(
        array('name'=>'nature_result', 'reg'=>'/data-log=\"(.*?)\"/', 'location'=>1),
        array('name'=>'title', 'reg'=>'/<h3(.*?)>(.*?)<\/h3>/', 'location'=>2),
        array('name'=>'abstract', 'reg'=>'/<p class=\"c-line-clamp3 c-color\">(.*?)<\/p>/', 'location'=>1),
        array('name'=>'source_url', 'reg'=>'/<div class=\"c-showurl c-line-clamp1\"><span>(.*?)<\/span>/', 'location'=>1),
        array('name'=>'url', 'reg'=>'/<div class=\"c-container\"><a(.*?)class=\"c-blocka\" href=\"(.*?)\">/', 'location'=>2),
        array('name'=>'img', 'reg'=>'/<div class=\"c-img c-img-s\"><img data-imagedelaysrc=\"(.*?)\"/', 'location'=>1),
    );

    /**
     * @param string $strQuery   Search query.
     * @param int    $intPageNum Number of result pages to fetch (default 76).
     */
    public function __construct($strQuery, $intPageNum=76)
    {
        $this->_strQuery = $strQuery;
        $this->_intPageNum = $intPageNum;
    }

    /**
     * Main entry point: records the master PID, forks the workers and waits
     * for them.
     */
    public function execute()
    {
        $this->setMasterPid();
        $this->forkWorker();
        $this->monitorWorker();
    }

    /** Remembers the PID of the calling process as the master. */
    private function setMasterPid()
    {
        $this->masterPid = posix_getpid();
    }

    /**
     * Sets the number of worker processes.
     *
     * @param int $intWorkerProcess Must be greater than zero.
     * @return bool False when the value is rejected, true otherwise.
     */
    public function setWorkerProcess($intWorkerProcess)
    {
        if ($intWorkerProcess <= 0) {
            return false;
        }
        $this->worker_process = $intWorkerProcess;
        return true;
    }

    /**
     * Sets the maximum number of fetch attempts per page.
     *
     * @param int $intTimes Must be greater than zero.
     * @return bool False when the value is rejected, true otherwise.
     */
    public function setRetryTimes($intTimes)
    {
        if ($intTimes <= 0) {
            return false;
        }
        $this->retry_times = $intTimes;
        return true;
    }

    /**
     * Replaces the pattern used to split a page into result fragments.
     *
     * @param string $strReg PCRE pattern including delimiters.
     * @return bool False when the pattern is empty, true otherwise.
     */
    public function setRegPattern($strReg)
    {
        if (empty($strReg)) {
            return false;
        }
        $this->strReg = $strReg;
        return true;
    }

    /**
     * Registers an additional field extractor (shared by all instances,
     * because the pattern list is static).
     *
     * @param array $arrPattern Array with the keys `name`, `reg` and `location`.
     * @return bool False when the argument is not a usable pattern, true otherwise.
     */
    public function setPattern($arrPattern)
    {
        if (!is_array($arrPattern) || empty($arrPattern)) {
            return false;
        }
        // A pattern without all three keys could never produce a field, so
        // reject it here instead of raising notices on every fragment later.
        if (!isset($arrPattern['name'], $arrPattern['reg'], $arrPattern['location'])) {
            return false;
        }
        self::$_arrPattern[] = $arrPattern;
        return true;
    }

    /**
     * Waits for every forked worker and forgets those that exited with the
     * success status 100. Does nothing outside the master process.
     */
    private function monitorWorker()
    {
        if ($this->masterPid !== posix_getpid()) {
            return;
        }
        foreach ($this->_arrPids as $intPid) {
            $intStatus = 0;
            pcntl_waitpid($intPid, $intStatus, WUNTRACED);
            // The exit status is only meaningful for a normally terminated
            // child; a stopped or signalled child stays in the list.
            if (pcntl_wifexited($intStatus) && pcntl_wexitstatus($intStatus) === 100) {
                unset($this->_arrPids[$intPid]);
            }
        }
    }

    /**
     * Forks the worker processes. Each child runs its share of the pages,
     * passes the result to `dataHandler` when one is set and exits with 100.
     */
    public function forkWorker()
    {
        for ($i = 0; $i < $this->worker_process; ++$i) {
            $pid = pcntl_fork();
            if ($pid === -1) {
                // Fork failed: give up, as the original implementation does.
                exit;
            }
            if ($pid > 0) {
                // Master: remember the child and continue forking.
                $this->_arrPids[$pid] = $pid;
                continue;
            }
            // Child.
            $arrResult = $this->run($i);
            if ($this->dataHandler) {
                call_user_func($this->dataHandler, $arrResult);
            }
            exit(100);
        }
    }

    /**
     * Fetches and parses the slice of pages assigned to one worker.
     *
     * @param int $intWorkerId Zero-based worker index.
     * @return array Parsed results keyed by page index; pages that yielded
     *               nothing after all attempts are absent. Empty array when
     *               no page produced a result.
     */
    private function run($intWorkerId)
    {
        // ceil() returns a float; cast so page indices are real integers
        // when used as array keys and in the URL.
        $intPage = (int) ceil($this->_intPageNum / $this->worker_process);
        $intBegin = $intWorkerId * $intPage;
        $intEnd = min(($intWorkerId + 1) * $intPage, (int) $this->_intPageNum);

        $arrResult = array();
        for ($i = $intBegin; $i < $intEnd; ++$i) {
            $strUrl = 'm.baidu.com/s?word=' . urlencode($this->_strQuery);
            if ($i > 0) {
                $strUrl .= '&pn=' . ($i * 10);
            }

            // Try the page up to retry_times times; stop at the first
            // attempt that yields at least one parsed result.
            for ($intAttempt = 0; $intAttempt < $this->retry_times; ++$intAttempt) {
                $strHtml = $this->curl($strUrl);
                $arrMatches = $this->getHtmlContent($strHtml);
                $arrNaturalResult = $this->getNaturalResult($arrMatches);
                if (!empty($arrNaturalResult)) {
                    $arrResult[$i] = $arrNaturalResult;
                    break;
                }
            }
        }
        return $arrResult;
    }

    /**
     * Downloads a URL.
     *
     * @param string $url
     * @return string|false Response body, or false when the transfer failed
     *                      so that the caller's retry loop can try again.
     */
    private function curl($url)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): the connect timeout (30) is larger than the total
        // timeout (10); confirm whether the two values were meant to be swapped.
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        $result = curl_exec($ch);
        if ($result === false || curl_errno($ch)) {
            return false;
        }
        return $result;
    }

    /**
     * Splits a result page into the HTML fragments of its natural results.
     *
     * @param string|false $strHtml Page HTML.
     * @return array|false List of fragments (possibly empty), or false when
     *                     the input is empty or the pattern could not be applied.
     */
    public function getHtmlContent($strHtml)
    {
        if (empty($strHtml)) {
            return false;
        }
        $arrMatches = array();
        $intCount = preg_match_all($this->strReg, $strHtml, $arrMatches);
        if ($intCount === false || !isset($arrMatches[0])) {
            return false;
        }
        return $arrMatches[0];
    }

    /**
     * Extracts the configured fields from each result fragment.
     *
     * The `nature_result` field is JSON-decoded (after turning single quotes
     * into double quotes); every other field has its HTML tags stripped.
     * Fields whose pattern does not match are omitted.
     *
     * @param array|false $arrMatches Fragments as returned by getHtmlContent().
     * @return array|false Extracted fields keyed by fragment key, or false
     *                     when the input is empty or not an array.
     */
    public function getNaturalResult($arrMatches)
    {
        if (empty($arrMatches) || !is_array($arrMatches)) {
            return false;
        }
        $arrNaturalResult = array();
        foreach ($arrMatches as $key => $div) {
            foreach (self::$_arrPattern as $val) {
                $intLocation = $val['location'];
                $matches = array();
                preg_match_all($val['reg'], $div, $matches);
                if (!isset($matches[$intLocation][0])) {
                    continue;
                }
                // Use a plain local instead of a variable variable, so that a
                // pattern named e.g. "div" or "key" cannot overwrite the
                // loop's own variables.
                $mixValue = $matches[$intLocation][0];
                if ($val['name'] === 'nature_result') {
                    $mixValue = json_decode(str_replace('\'', '"', $mixValue), true);
                } else {
                    $mixValue = strip_tags($mixValue);
                }
                $arrNaturalResult[$key][$val['name']] = $mixValue;
            }
        }
        return $arrNaturalResult;
    }
}
呼叫方法:
// Create a spider for the query in $strQuery that fetches $pageNo result pages.
$obj = new NaturalResultSpider($strQuery, $pageNo);
指定需要抓取什麼query的搜尋結果,和抓取的頁數(未指定時預設為76頁,最多76頁)
// Fork 4 worker processes; values <= 0 are rejected and the setter returns false.
$obj->setWorkerProcess(4);
指定4個程序進行抓取
// Allow up to 3 fetch attempts per page (the count includes the first attempt).
$obj->setRetryTimes(3);
每頁最多嘗試抓取的次數(含第一次抓取);抓取失敗時會重試,直到達到此次數
// Callback that each worker process calls with its own result array.
// 'printRes' is not defined in this listing; the caller must provide it.
$obj->dataHandler = 'printRes';
指定回撥方法進行資料處理(每個子程序抓取完成後,會以該程序抓到的結果陣列作為引數呼叫此方法)
// Fork the workers, then wait for each of them in the master process.
$obj->execute();
以上設定好之後開始執行