centos7+TP5.1+selenium+chrome抓取搜狗微信推文
一、Composer 引入
"require": {
"php-webdriver/webdriver": "1.8.0",
"jaeger/querylist": "^4.1"
}
二、安装谷歌浏览器yum install google-chrome
三、安装chromedriver
所有版本的下载地址:https://sites.google.com/a/chromium.org/chromedriver/downloads
wget https://chromedriver.storage.googleapis.com/2.39/chromedriver_linux64.zip
unzip chromedriver_linux64.zip
四、代码操作
<?php
namespace app\index\controller;
use Facebook\WebDriver\Remote\DesiredCapabilities;
use Facebook\WebDriver\Remote\RemoteWebDriver;
use Facebook\WebDriver\WebDriverBy;
use QL\QueryList;
class Test{
protected $host ;
protected $driver ;
public function __construct() {
$this->host = 'http://localhost:9515' ;
$res = $this->query_process('chromedriver.exe');
if(!$res){
//启动chromedriver.exe
print_r("启动chromedriver.exe");
$out1 = "";
$a=exec ("C:\Users\Dongmei\Desktop\selenium\chromedriver.exe",$out1);
print_r($out1);
echo '<br/>';
print_r($a);
}
$this->driver = RemoteWebDriver::create( $this->host, DesiredCapabilities:: chrome());
}
public function index() {
$this->driver->get("https://weixin.sogou.com/");
$element = $this->driver->findElement(
WebDriverBy::cssSelector('.query')
);
$element->clear(); //清空
$element->sendKeys("一门式政务");
//点击搜索
$this->driver->findElement(WebDriverBy::cssSelector('.swz'))->click();
sleep(1);
$html = $this->driver->getPageSource();
$html = str_replace("<!--headTrap<body></body><head></head><html></html>-->", "", $html); //去除微信干扰元素!!!否则乱码
$rules=[
'html'=>array('.txt-box', 'html'),
];
$data = QueryList::html($html)->rules($rules)->query()->getData();
$res= $data->all();
$doc=\phpQuery::newDocumentHTML($res['html']);
$as = pq($doc)->find('a');
$hrefs=[];
foreach ($as as $a) {
$href= pq($a)->attr('href');
$host = parse_url($href);
if(!isset($host['host'])){
$hrefs[] = "https://weixin.sogou.com".$href;
}
}
foreach ($hrefs as $v) {
$this->getHtmlByUrl($v);
die;
}
$this->driver->close();
}
protected function getHtmlByUrl($url){
$this->driver->get($url);
echo '<span style="color:#ad3c27">延迟2S...</span>';
sleep('2');
echo '<span style="color:#3bad43">..开始抓取————</span>';
$html = $this->driver->getPageSource();
$html = str_replace("<!--headTrap<body></body><head></head><html></html>-->", "", $html); //去除微信干扰元素!!!否则乱码
$rules = array( //设置QueryList的解析规则
'content' => array('#js_content', 'html'), //文章内容
'title' => array('#activity-name', 'text'), //文章标题
'author'=> array('.rich_media_meta_text:eq(1)','text'), //作者
'account_name' => array('#js_profile_qrcode .profile_nickname','text'), //公众号
'account_en_name' => array('#js_profile_qrcode .profile_meta:eq(0) .profile_meta_value','text'), //公众号英文标识
);
$data = QueryList::html($html)->rules($rules)->query()->getData();
return $data;
}
在window测试挺正常的,但在centos就会有一些问题
有没有大神指导一下