<?php
/**
 * Multi-process crawler for the organic ("natural") results of Baidu mobile
 * search result pages: title, abstract, image, link and source.
 *
 * The master process forks `worker_process` children. Each child fetches a
 * contiguous slice of the requested result pages, parses them, passes its
 * results to the optional `dataHandler` callback and then exits.
 *
 * @since 2016-04-15
 */
class NaturalResultSpider {
    /** Exit status a worker uses to report that it finished its slice. */
    const WORKER_EXIT_OK = 100;

    /** Search query whose result pages are crawled. */
    private $_strQuery = null;

    /** Number of worker processes to fork. */
    public $worker_process = 4;

    /** Pids of the forked workers that have not been reaped successfully yet (pid => pid). */
    private $_arrPids = array();

    /** Number of result pages to crawl. */
    private $_intPageNum;

    /** Public result holder; this class itself never writes to it. */
    public $arrAllResult = array();

    /** Hook: a callable invoked inside each worker with that worker's results. */
    public $dataHandler = null;

    /** Pid of the process that called execute(). */
    private $masterPid = null;

    /** Maximum number of fetch attempts per page. */
    private $retry_times = 1;

    /** Regex that cuts one result page into per-result HTML fragments. */
    private $strReg = '/<div\sclass="result\sc-result\sc-clk-recommend"(.*)?>(.*)?(<img\ssrc="(.*)?">)?(.*)?(<p\sclass="c-line-clamp3\sc-color">(.*)?)+<\/div>/Uis';

    /**
     * Field extractors applied to every fragment. `location` is the index of
     * the capture group that holds the value. The list is static, so patterns
     * added through setPattern() are shared by all instances.
     */
    private static $_arrPattern = array(
        array('name'=>'nature_result', 'reg'=>'/data-log=\"(.*?)\"/', 'location'=>1),
        array('name'=>'title', 'reg'=>'/<h3(.*?)>(.*?)<\/h3>/', 'location'=>2),
        array('name'=>'abstract', 'reg'=>'/<p class=\"c-line-clamp3 c-color\">(.*?)<\/p>/', 'location'=>1),
        array('name'=>'source_url', 'reg'=>'/<div class=\"c-showurl c-line-clamp1\"><span>(.*?)<\/span>/', 'location'=>1),
        array('name'=>'url', 'reg'=>'/<div class=\"c-container\"><a(.*?)class=\"c-blocka\" href=\"(.*?)\">/', 'location'=>2),
        array('name'=>'img', 'reg'=>'/<div class=\"c-img c-img-s\"><img data-imagedelaysrc=\"(.*?)\"/', 'location'=>1),
    );

    /**
     * @param string $strQuery   Search query to crawl.
     * @param int    $intPageNum Number of result pages to crawl (default 76).
     */
    public function __construct($strQuery, $intPageNum=76) {
        $this->_strQuery = $strQuery;
        $this->_intPageNum = $intPageNum;
    }

    /**
     * Runs the crawl: records the master pid, forks the workers and waits for them.
     */
    public function execute() {
        $this->setMasterPid();
        $this->forkWorker();
        $this->monitorWorker();
    }

    /** Remembers the pid of the current process as the master. */
    private function setMasterPid() {
        $this->masterPid = posix_getpid();
    }

    /**
     * Sets the number of worker processes.
     *
     * @param int $intWorkerProcess Must be greater than zero.
     * @return false|null false when the value is rejected, nothing otherwise.
     */
    public function setWorkerProcess($intWorkerProcess) {
        if ($intWorkerProcess <= 0) {
            return false;
        }
        $this->worker_process = $intWorkerProcess;
    }

    /**
     * Sets the maximum number of fetch attempts per page.
     *
     * @param int $intTimes Must be greater than zero.
     * @return false|null false when the value is rejected, nothing otherwise.
     */
    public function setRetryTimes($intTimes) {
        if ($intTimes <= 0) {
            return false;
        }
        $this->retry_times = $intTimes;
    }

    /**
     * Replaces the regex used to split a page into result fragments.
     *
     * @param string $strReg Non-empty PCRE pattern.
     * @return false|null false when the value is rejected, nothing otherwise.
     */
    public function setRegPattern($strReg) {
        if (empty($strReg)) {
            return false;
        }
        $this->strReg = $strReg;
    }

    /**
     * Registers an additional field extractor (shared by all instances).
     *
     * @param array $arrPattern array('name' => ..., 'reg' => ..., 'location' => ...).
     * @return false|null false when the pattern is rejected, nothing otherwise.
     */
    public function setPattern($arrPattern) {
        if (!is_array($arrPattern) || empty($arrPattern)) {
            return false;
        }
        // getNaturalResult() reads all three keys; reject incomplete patterns up front.
        if (!isset($arrPattern['name'], $arrPattern['reg'], $arrPattern['location'])) {
            return false;
        }
        self::$_arrPattern[] = $arrPattern;
    }

    /**
     * Waits for every forked worker and drops those that exited with
     * WORKER_EXIT_OK from the pid table. Does nothing outside the master.
     */
    private function monitorWorker() {
        if ($this->masterPid !== posix_getpid()) {
            return;
        }
        // Iterate over a snapshot because entries are removed while looping.
        foreach (array_values($this->_arrPids) as $intPid) {
            $status = 0;
            if (pcntl_waitpid($intPid, $status, WUNTRACED) <= 0) {
                continue;
            }
            // WUNTRACED also reports stopped children; only a real exit carries an exit status.
            if (pcntl_wifexited($status) && pcntl_wexitstatus($status) === self::WORKER_EXIT_OK) {
                unset($this->_arrPids[$intPid]);
            }
        }
    }

    /**
     * Main entry for spawning workers: forks `worker_process` children.
     * The parent records each child pid; each child crawls its slice, feeds
     * the result to `dataHandler` (if set) and exits with WORKER_EXIT_OK.
     */
    public function forkWorker() {
        for ($i=0; $i<$this->worker_process; ++$i) {
            $pid = pcntl_fork();
            if ($pid === -1) {
                exit;
            } elseif ($pid > 0) {
                $this->_arrPids[$pid] = $pid;
            } else {
                $arrResult = $this->run($i);
                if ($this->dataHandler) {
                    call_user_func($this->dataHandler, $arrResult);
                }
                exit(self::WORKER_EXIT_OK);
            }
        }
    }

    /**
     * Crawls the slice of pages assigned to one worker.
     *
     * @param int $intWorkerId Zero-based worker index.
     * @return array Parsed results keyed by page index; empty when the slice
     *               is empty or no page could be parsed.
     */
    private function run($intWorkerId) {
        $arrResult = array();
        $intPage = (int)ceil($this->_intPageNum / $this->worker_process);
        $intBegin = $intWorkerId * $intPage;
        $intEnd = ($intWorkerId + 1) * $intPage;
        if ($intEnd > $this->_intPageNum) {
            $intEnd = $this->_intPageNum;
        }
        for ($i=$intBegin; $i<$intEnd; ++$i) {
            $strUrl = 'm.baidu.com/s?word=' . urlencode($this->_strQuery);
            if ($i > 0) {
                $strUrl .= '&pn=' . $i*10;
            }
            // Retry on failure, up to retry_times attempts per page.
            for ($intAttempt=0; $intAttempt<$this->retry_times; ++$intAttempt) {
                $strHtml = $this->curl($strUrl);
                $arrMatches = $this->getHtmlContent($strHtml);
                $arrNaturalResult = $this->getNaturalResult($arrMatches);
                if (!empty($arrNaturalResult)) {
                    $arrResult[$i] = $arrNaturalResult;
                    break;
                }
            }
        }
        return $arrResult;
    }

    /**
     * Fetches a URL with cURL.
     *
     * @param string $url
     * @return string|false Response body, or false on a transfer error so the
     *                      caller's retry loop can try again.
     */
    private function curl($url) {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // NOTE(review): the connect timeout (30) exceeds the total timeout (10),
        // so the 10s cap governs; the two values may have been swapped - confirm.
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
        $result = curl_exec($ch);
        if ($result === false || curl_errno($ch)) {
            return false;
        }
        return $result;
    }

    /**
     * Splits a result page into per-result HTML fragments using the split regex.
     *
     * @param string $strHtml Page HTML.
     * @return array|false Full-match fragments (possibly empty); false when the
     *                     input is empty or the regex fails.
     */
    public function getHtmlContent($strHtml) {
        if (empty($strHtml)) {
            return false;
        }
        $arrMatches = array();
        if (preg_match_all($this->strReg, $strHtml, $arrMatches) === false) {
            return false;
        }
        return $arrMatches[0];
    }

    /**
     * Extracts the configured fields from each HTML fragment.
     *
     * The `nature_result` field is decoded as JSON after its single quotes are
     * replaced by double quotes; every other field has its HTML tags stripped.
     * Only fields whose pattern matched are present, and a fragment without any
     * match yields no entry.
     *
     * @param array $arrMatches Fragments as returned by getHtmlContent().
     * @return array|false Extracted fields keyed by fragment key, then by field
     *                     name; false when the input is empty or not an array.
     */
    public function getNaturalResult($arrMatches) {
        if (empty($arrMatches) || !is_array($arrMatches)) {
            return false;
        }
        $arrNaturalResult = array();
        foreach ($arrMatches as $key=>$div) {
            foreach (self::$_arrPattern as $val) {
                $matches = array();
                preg_match_all($val['reg'], $div, $matches);
                if (!isset($matches[$val['location']][0])) {
                    continue;
                }
                // A plain local is used on purpose: pattern names are user-supplied
                // and must never be able to overwrite this method's own variables.
                $mixValue = $matches[$val['location']][0];
                if ($val['name'] === 'nature_result') {
                    $mixValue = json_decode(str_replace('\'', '"', $mixValue), true);
                } else {
                    $mixValue = strip_tags($mixValue);
                }
                $arrNaturalResult[$key][$val['name']] = $mixValue;
            }
        }
        return $arrNaturalResult;
    }
}
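/*
 * Usage:
 *
 * $obj = new NaturalResultSpider($strQuery, $pageNo);
 *     Specify the query whose search results should be crawled and the number
 *     of pages to crawl (76 pages at most).
 * $obj->setWorkerProcess(4);
 *     Use 4 processes for crawling.
 * $obj->setRetryTimes(3);
 *     Number of retries when a fetch fails.
 * $obj->dataHandler = 'printRes';
 *     Specify the callback that processes the crawled data.
 * $obj->execute();
 *     Once everything above is configured, start the run.
 */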