<?php
namespace AdslCurl\AdslCurl;
/**
 * cURL page fetcher for a scraper that changes its public IP by redialling a
 * Windows dial-up/ADSL connection through the `rasdial` command.
 */
class AdslCurl
{
    /**
     * Pool of User-Agent values; one is picked at random for every request.
     * Entries must be bare values: the "User-Agent: " header name is added
     * when the request headers are built.
     *
     * @var string[]
     */
    public $aUserAgent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
        'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    ];

    /** @var int Value passed to CURLOPT_CONNECTTIMEOUT. */
    public $iConnectTimeOut;

    /** @var int Value passed to CURLOPT_TIMEOUT. */
    public $iTimeOut;

    /** @var string Cookie jar path; only referenced by the disabled method at the end of the class. */
    public $cookieFile;

    /**
     * Initialise the fetcher from an options array.
     *
     * Recognised keys: 'cookieFile' (default './ck.txt'), 'iConnectTimeOut'
     * (default 5) and 'iTimeOut' (default 3).
     * NOTE(review): the default total timeout (3) is shorter than the default
     * connect timeout (5), so the connect timeout can never be the limit that
     * fires - confirm this is intended.
     *
     * @param array $config
     */
    public function __construct($config = [])
    {
        $this->cookieFile = isset($config['cookieFile']) ? $config['cookieFile'] : './ck.txt';
        $this->iConnectTimeOut = isset($config['iConnectTimeOut']) ? $config['iConnectTimeOut'] : 5;
        $this->iTimeOut = isset($config['iTimeOut']) ? $config['iTimeOut'] : 3;
    }

    /**
     * Check whether a response contains the "blocked" marker and, if so, append
     * the URL to the error file.
     *
     * @param string       $forbiddenString marker text that identifies a block page
     * @param string|false $data            fetched body (false when the fetch failed)
     * @param string       $url             URL that was fetched
     * @param string       $file            path of the error file to append to
     * @return string|false the part of $data starting at the marker, or false when absent
     */
    public static function detectIsForbidden($forbiddenString, $data, $url, $file)
    {
        // A failed fetch yields a non-string body; there is nothing to search in it.
        $isForbidden = is_string($data) ? strstr($data, $forbiddenString) : false;
        if ($isForbidden) {
            file_put_contents($file, "'" . $url . "', // 被封了\r\n", FILE_APPEND);
        }
        return $isForbidden;
    }

    /**
     * Obtain a new IP by disconnecting and redialling a Windows dial-up entry.
     *
     * Note the parameter order: password comes before account.
     * Arguments are quoted with escapeshellarg() so entry names containing
     * spaces work; on Windows that function replaces '%', '!' and '"' with
     * spaces, so credentials containing those characters are not supported.
     *
     * @param string $networkName  name of the dial-up entry
     * @param string $adslPassword account password
     * @param string $adslAccount  account name
     * @return int always 0
     */
    public static function changeIP($networkName, $adslPassword, $adslAccount)
    {
        $entry = escapeshellarg($networkName);
        // NUL is the Windows null device; "> null" would create a file named "null".
        system('rasdial ' . $entry . ' /DISCONNECT > NUL');
        system('rasdial ' . $entry . ' ' . escapeshellarg($adslAccount) . ' ' . escapeshellarg($adslPassword) . ' > NUL');
        return 0;
    }

    /**
     * Fetch a URL with a random User-Agent and a fixed set of request headers.
     *
     * The Referer and Host headers are hard-coded and do not depend on $url.
     *
     * @param string $url    address to fetch
     * @param string $cookie complete "Cookie: ..." header line; skipped when empty
     * @return string|false response body, or false on failure
     */
    public function getCurlResultDirect($url, $cookie)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        $userAgent = $this->aUserAgent[array_rand($this->aUserAgent)];
        $headers = [
            'Host: www.***************.com',
            'Proxy-Connection: keep-alive',
            'User-Agent: ' . $userAgent,
            'Upgrade-Insecure-Requests: 1',
            'Accept-Language: zh-CN,zh;q=0.9',
        ];
        // An empty cookie would otherwise be sent as an empty header line.
        if (is_string($cookie) && $cookie !== '') {
            $headers[] = $cookie;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->iConnectTimeOut);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->iTimeOut);
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.***************.com');
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
        $res = curl_exec($ch);
        curl_close($ch);
        return $res;
    }

    /**
     * Extract the anchor tags of interest from a fetched page.
     *
     * Returns 0 when the page is a block page (logged by detectIsForbidden),
     * when it contains no anchors at all (not logged), or when none of its
     * anchors carry one of the expected markers (logged to $file).
     *
     * @param string       $forbiddenString marker text that identifies a block page
     * @param string|false $data            fetched body
     * @param string       $url             URL that was fetched (used for logging)
     * @param string       $file            path of the error file to append to
     * @return string[]|int matching "<a ...>...</a>" fragments, or 0 when there are none
     */
    public function getValidCurlResult($forbiddenString, $data, $url, $file)
    {
        if (self::detectIsForbidden($forbiddenString, $data, $url, $file)) {
            return 0;
        }
        if (!is_string($data) || !preg_match_all('/<a[\s\S]+?\/a>/', $data, $res) || !$res[0]) {
            return 0;
        }
        $wanted = [];
        foreach ($res[0] as $anchor) {
            if (strpos($anchor, 'class="shop-images"') !== false
                || strpos($anchor, 'class="pic"') !== false
                || strpos($anchor, 'data-src') !== false
            ) {
                $wanted[] = $anchor;
            }
        }
        if ($wanted) {
            return $wanted;
        }
        file_put_contents($file, "'" . $url . "', // 未获取到数据\r\n", FILE_APPEND);
        return 0;
    }

    // Disabled draft kept for reference: fetch through a caller-supplied handle
    // using the cookie jar and a root-URL referer.
    // public function getCurlResultFromTheRootUrl($ch, $rootUrl, $url)
    // {
    // curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookieFile);
    // curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookieFile);
    // curl_setopt($ch, CURLOPT_REFERER, $rootUrl);
    // curl_setopt($ch, CURLOPT_URL, $url);
    // return curl_exec($ch);
    // }
}
// Configuration start.
// Remove the execution time limit for this script.
set_time_limit(0);

// Database connection settings.
// NOTE(review): credentials are hard-coded; consider loading them from the environment.
$host = 'localhost';
$user = 'root';
$pwd = 'root';
$db = 'curl';
$con = mysqli_connect($host, $user, $pwd, $db);
if (!$con) {
    // Nothing below can work without a database connection.
    echo 'MySQL connection failed: ' . mysqli_connect_error() . "\n";
    exit(1);
}
// Sets the connection charset and keeps the client library aware of it,
// unlike a raw "SET NAMES" query.
mysqli_set_charset($con, 'utf8');

// Dial-up account used when redialling for a new IP.
$adslAccount = '***************';
$adslPassword = '***************';
$networkName = 'net';

$config = [
    'cookieFile' => './ck.txt',
];

// Text whose presence in a response marks the request as blocked.
$forbiddenString = "have permission to access the URL on this server.<hr/>Powered by Tengine</body";
// Blocked / empty URLs are appended to this file.
$errorFile = './curl_error.log';
// Complete Cookie header line sent with every request.
$cookie = 'Cookie: ***************';

// Expected to define $aUrls, the list of start URLs iterated by the main loop.
// require: a missing URL list is fatal rather than a warning followed by an
// undefined $aUrls.
require './url.php';
/**
 * Build one multi-row "INSERT IGNORE INTO smallbq" statement from anchor fragments.
 *
 * For each fragment the first href="..." value becomes the `url` column and the
 * first following src="..." value (this also matches data-src="...") becomes the
 * `logo` column. The "g<digits>" and "r<digits>" parts of $url fill the `small`
 * and `bq` columns; they are left empty when $url does not contain them.
 * Fragments without both attributes are skipped.
 *
 * Extracted values are escaped with addslashes() because they come from scraped
 * HTML. NOTE(review): that assumes MySQL's default backslash escaping; binding
 * the values through a prepared statement at the call site would be safer.
 *
 * @param array|mixed $data list of "<a ...>...</a>" fragments; any non-array yields ''
 * @param string      $url  page URL the fragments were taken from
 * @return string the INSERT statement, or '' when there is nothing to insert
 */
function curlContentToSql($data, $url)
{
    if (!is_array($data)) {
        return '';
    }

    $smallUrl = '';
    $bqUrl = '';
    if (preg_match('/(g\d+)(r\d+)/', (string) $url, $idParts)) {
        $smallUrl = $idParts[1];
        $bqUrl = $idParts[2];
    }

    $rows = [];
    foreach ($data as $anchor) {
        if (!is_string($anchor)
            || !preg_match('/href="([\s\S]+?)"[\s\S]+?src="([\s\S]+?)"/', $anchor, $attrs)
        ) {
            continue;
        }
        $fieldUrl = addslashes($attrs[1]);
        $fieldLogo = addslashes($attrs[2]);
        // Produces e.g. ('r2','g1','/logo.png','/shop/1' )
        $rows[] = "('$bqUrl','$smallUrl','{$fieldLogo}','{$fieldUrl}' )";
    }

    if (!$rows) {
        return '';
    }
    return 'INSERT IGNORE INTO smallbq (bq,small, logo,url)VALUES ' . implode(',', $rows);
}
/**
 * Build the follow-up page URLs (page 2 up to the last page) for a listing page.
 *
 * Page numbers are read from every "g<digits>r<digits>p<digits>" token in $data.
 * The second-to-last token is taken as the last page number - presumably the
 * final token belongs to a "next page" link; TODO confirm against the markup.
 *
 * @param string|false $data fetched body of the first listing page
 * @param string       $url  URL of that page; "p<N>" is appended to it
 * @return string[]|int URLs for pages 2..last, or 0 when there are no further pages
 */
function getPageFromCurlResult($data, $url)
{
    if (!is_string($data) || !preg_match_all('/g\d+r\d+p(\d+)/', $data, $matches)) {
        return 0;
    }

    $pageNumbers = $matches[1];
    $found = count($pageNumbers);
    // With fewer than two tokens there is no second-to-last entry to read.
    if ($found < 2) {
        return 0;
    }

    $lastPage = (int) $pageNumbers[$found - 2];
    if ($lastPage < 2) {
        return 0;
    }

    $pageLinks = [];
    for ($page = 2; $page <= $lastPage; $page++) {
        $pageLinks[] = $url . 'p' . $page;
    }
    return $pageLinks;
}
// Configuration end.

/**
 * Fetch a URL; when the first attempt returns nothing, redial for a new IP and
 * try exactly once more.
 *
 * @param AdslCurl $adslCurl     fetcher instance
 * @param string   $url          address to fetch
 * @param string   $cookie       complete Cookie header line
 * @param string   $networkName  dial-up entry name
 * @param string   $adslPassword dial-up password
 * @param string   $adslAccount  dial-up account
 * @return string|false body of the last attempt
 */
function fetchWithRedial($adslCurl, $url, $cookie, $networkName, $adslPassword, $adslAccount)
{
    $body = $adslCurl->getCurlResultDirect($url, $cookie);
    if (!$body) {
        AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
        $body = $adslCurl->getCurlResultDirect($url, $cookie);
    }
    return $body;
}

// Create the fetcher.
$adslCurl = new AdslCurl($config);

// Walk over every start URL.
$countUrl = count($aUrls);
$start = 0;
$end = $countUrl;
for ($i = $start; $i < $end; $i++) {
    // Progress counter.
    echo "$i\n";
    // usleep(500);

    // Redial on every third start URL (indexes 2, 5, 8, ...).
    if ($i % 3 == 2) {
        AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
    }

    $data = fetchWithRedial($adslCurl, $aUrls[$i], $cookie, $networkName, $adslPassword, $adslAccount);

    // Store the result when it is usable; otherwise move on to the next start URL.
    $tmp = $adslCurl->getValidCurlResult($forbiddenString, $data, $aUrls[$i], $errorFile);
    if (!$tmp) {
        continue;
    }
    $sql = curlContentToSql($tmp, $aUrls[$i]);
    if ($sql) {
        mysqli_query($con, $sql);
    }
    usleep(50); // 50 microseconds
    // echo $sql;exit;

    // Follow the pagination links found on the first page.
    $aPageUrls = getPageFromCurlResult($data, $aUrls[$i]);
    if (!$aPageUrls) {
        continue;
    }
    $countPageUrl = count($aPageUrls);
    for ($k = 0; $k < $countPageUrl; $k++) {
        // Redial on every third page (indexes 1, 4, 7, ...).
        if ($k % 3 == 1) {
            AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
        }

        $data = fetchWithRedial($adslCurl, $aPageUrls[$k], $cookie, $networkName, $adslPassword, $adslAccount);

        // getValidCurlResult() returns 0 for blocked or empty pages (and logs
        // them); only build and run an INSERT when there is something to store.
        $tmp = $adslCurl->getValidCurlResult($forbiddenString, $data, $aPageUrls[$k], $errorFile);
        if ($tmp) {
            $sql = curlContentToSql($tmp, $aPageUrls[$k]);
            if ($sql) {
                mysqli_query($con, $sql);
            }
        }
        usleep(50); // 50 microseconds
        // echo $sql;exit;
    }
}
// Tags: adsl, scraping, php, curl
// Latest recommended article published 2025-04-02 08:30:59