adsl 采集 php curl

<?php

namespace AdslCurl\AdslCurl;
/**
 * Scraping helper for a machine connected through a dial-up/ADSL entry.
 *
 * It fetches pages with cURL using a randomly chosen User-Agent, checks the
 * response for a "forbidden" marker, filters the anchors of interest out of
 * the HTML, and can redial the connection through the Windows `rasdial`
 * command.
 */
class AdslCurl
{
    /** @var string[] Pool of User-Agent values (value only, without the "User-Agent: " header name). */
    public $aUserAgent = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
        'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',
        // The header name is added in getCurlResultDirect(); it must not be repeated here.
        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
        'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
        'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)',
        'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)',
        'Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    ];

    /** @var int Value passed to CURLOPT_CONNECTTIMEOUT (seconds). */
    public $iConnectTimeOut;

    /** @var int Value passed to CURLOPT_TIMEOUT (seconds). */
    public $iTimeOut;

    /** @var string Cookie jar path; only referenced by the disabled method at the bottom of the class. */
    public $cookieFile;

    /**
     * Read optional settings, falling back to defaults.
     *
     * @param array $config Optional keys: 'cookieFile', 'iConnectTimeOut', 'iTimeOut'.
     */
    public function __construct($config)
    {
        $this->cookieFile = isset($config['cookieFile']) ? $config['cookieFile'] : './ck.txt';
        $this->iConnectTimeOut = isset($config['iConnectTimeOut']) ? $config['iConnectTimeOut'] : 5;
        // NOTE(review): the default total timeout (3) is lower than the default
        // connect timeout (5), so the total timeout is the effective limit.
        $this->iTimeOut = isset($config['iTimeOut']) ? $config['iTimeOut'] : 3;
    }

    /**
     * Check whether a response contains the "forbidden" marker and log the URL if so.
     *
     * @param string       $forbiddenString Marker text; an empty marker never matches.
     * @param string|false $data            Response body (false when the request failed).
     * @param string       $url             URL that produced the response.
     * @param string       $file            Log file the blocked URL is appended to.
     *
     * @return string|false Result of strstr(): truthy when the marker was found.
     */
    public static function detectIsForbidden($forbiddenString, $data, $url, $file)
    {
        // An empty needle would match every page on PHP 8 (and warn on PHP 7).
        if (!is_string($data) || !is_string($forbiddenString) || $forbiddenString === '') {
            return false;
        }

        $isForbidden = strstr($data, $forbiddenString);
        if ($isForbidden) {
            file_put_contents($file, "'" . $url . "',    // 被封了\r\n", FILE_APPEND);
        }

        return $isForbidden;
    }

    /**
     * Hang up and redial a connection entry via `rasdial`.
     *
     * Arguments are shell-quoted so names containing spaces work. Note that on
     * Windows escapeshellarg() replaces `%`, `!` and `"` with spaces, so
     * credentials containing those characters are not supported by this method.
     *
     * @param string $networkName  Name of the dial-up/ADSL entry.
     * @param string $adslPassword Account password.
     * @param string $adslAccount  Account name.
     *
     * @return int Always 0; the exit status of rasdial is not inspected.
     */
    public static function changeIP($networkName, $adslPassword, $adslAccount)
    {
        $entry = escapeshellarg($networkName);

        // NUL is the Windows null device; "> null" would create a file named "null".
        system('rasdial ' . $entry . ' /DISCONNECT > NUL');
        system(
            'rasdial ' . $entry . ' '
            . escapeshellarg($adslAccount) . ' '
            . escapeshellarg($adslPassword) . ' > NUL'
        );

        return 0;
    }

    /**
     * Fetch a URL with a random User-Agent and a fixed set of request headers.
     *
     * @param string $url    URL to request.
     * @param string $cookie Complete "Cookie: ..." header line; skipped when empty.
     *
     * @return string|false Response body, or false when curl_exec() fails.
     */
    public function getCurlResultDirect($url, $cookie)
    {
        $userAgent = $this->aUserAgent[array_rand($this->aUserAgent)];

        $headers = [
            'Host: www.***************.com',
            'Proxy-Connection: keep-alive',
            'User-Agent: ' . $userAgent,
            'Upgrade-Insecure-Requests: 1',
            'Accept-Language: zh-CN,zh;q=0.9',
        ];
        if (is_string($cookie) && $cookie !== '') {
            $headers[] = $cookie;
        }

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $this->iConnectTimeOut);
        curl_setopt($ch, CURLOPT_TIMEOUT, $this->iTimeOut);
        curl_setopt($ch, CURLOPT_REFERER, 'http://www.***************.com');
        curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

        $res = curl_exec($ch);
        curl_close($ch);

        return $res;
    }

    /**
     * Extract the anchors of interest from a response.
     *
     * An anchor is kept when it contains `class="shop-images"`, `class="pic"`
     * or `data-src`. When anchors exist but none qualifies, the URL is appended
     * to $file.
     *
     * @param string       $forbiddenString Marker passed to detectIsForbidden().
     * @param string|false $data            Response body.
     * @param string       $url             URL that produced the response.
     * @param string       $file            Log file for blocked / empty pages.
     *
     * @return string[]|int Matching `<a ...>...</a>` fragments, or 0 when the page
     *                      is blocked or nothing usable was found.
     */
    public function getValidCurlResult($forbiddenString, $data, $url, $file)
    {
        if (self::detectIsForbidden($forbiddenString, $data, $url, $file)) {
            return 0;
        }

        preg_match_all('/<a[\s\S]+?\/a>/', (string) $data, $res);
        if (!$res[0]) {
            return 0;
        }

        $kept = [];
        foreach ($res[0] as $anchor) {
            if (strpos($anchor, 'class="shop-images"') !== false
                || strpos($anchor, 'class="pic"') !== false
                || strpos($anchor, 'data-src') !== false
            ) {
                $kept[] = $anchor;
            }
        }

        if ($kept) {
            return $kept;
        }

        file_put_contents($file, "'" . $url . "',    // 未获取到数据\r\n", FILE_APPEND);
        return 0;
    }

    // Disabled variant that reused an existing handle with a cookie jar; kept for reference.
//
//    public function getCurlResultFromTheRootUrl($ch, $rootUrl, $url)
//    {
//        curl_setopt($ch, CURLOPT_COOKIEFILE, $this->cookieFile);
//        curl_setopt($ch, CURLOPT_COOKIEJAR, $this->cookieFile);
//        curl_setopt($ch, CURLOPT_REFERER, $rootUrl);
//        curl_setopt($ch, CURLOPT_URL, $url);
//        return curl_exec($ch);
//    }

}
// Configuration start
set_time_limit(0); // the crawl loop below may run for a long time
$host = 'localhost';
$user = 'root';
$pwd = 'root';
$db = 'curl';
$con = mysqli_connect($host, $user, $pwd, $db);
if (!$con) {
    // Without a connection every later mysqli_query() would fail; stop right away.
    die('mysqli_connect failed: ' . mysqli_connect_error() . "\n");
}
// mysqli_set_charset() also informs the client library of the charset, which
// mysqli_real_escape_string() relies on; a plain "SET NAMES" query does not.
mysqli_set_charset($con, 'utf8');

$adslAccount = '***************';
$adslPassword = '***************';
$networkName = 'net';
$config = [
    'cookieFile' => './ck.txt',
];
// Text whose presence in a response marks the request as blocked.
$forbiddenString = "have permission to access the URL on this server.<hr/>Powered by Tengine</body";
$errorFile = './curl_error.log';
// Complete request header line, passed to AdslCurl::getCurlResultDirect().
$cookie = 'Cookie: ***************';
// url.php is expected to define $aUrls, which the crawl loop iterates;
// require makes a missing file fail immediately instead of later.
require './url.php';
/**
 * Build one multi-row "INSERT IGNORE INTO smallbq" statement from scraped anchors.
 *
 * For every anchor the first href="..." and the following src="..." values are
 * extracted; anchors that do not contain both are skipped. The `small` and `bq`
 * columns come from the "g<digits>r<digits>" part of $url (empty strings when
 * the URL does not contain it).
 *
 * @param array       $data HTML fragments, one `<a ...>...</a>` per element.
 *                          Anything that is not a non-empty array yields ''.
 * @param string      $url  URL the fragments were scraped from.
 * @param mysqli|null $con  Optional connection used for escaping with
 *                          mysqli_real_escape_string(); addslashes() is the
 *                          fallback when it is omitted.
 *
 * @return string The SQL statement, or '' when there is nothing to insert.
 */
function curlContentToSql($data, $url, $con = null)
{
    if (!is_array($data) || !$data) {
        return '';
    }

    $smallUrl = '';
    $bqUrl = '';
    if (preg_match('/(g\d+)(r\d+)/', (string) $url, $urlParts)) {
        $smallUrl = $urlParts[1];
        $bqUrl = $urlParts[2];
    }

    // Scraped text is untrusted: escape every value before it is placed inside quotes.
    $escape = function ($value) use ($con) {
        return $con !== null
            ? mysqli_real_escape_string($con, $value)
            : addslashes($value);
    };

    $rows = [];
    foreach ($data as $anchor) {
        if (!preg_match('/href="([\s\S]+?)"[\s\S]+?src="([\s\S]+?)"/', (string) $anchor, $m)) {
            continue; // no link/image pair in this fragment
        }
        $rows[] = "('" . $escape($bqUrl) . "','" . $escape($smallUrl) . "','"
            . $escape($m[2]) . "','" . $escape($m[1]) . "' )";
    }

    if (!$rows) {
        return '';
    }

    return 'INSERT IGNORE INTO smallbq (bq,small, logo,url)VALUES ' . implode(',', $rows);
}
/**
 * Derive the URLs of pages 2..N of a listing from its first page.
 *
 * All "g<digits>r<digits>p<digits>" occurrences in $data are collected and the
 * second-to-last page number is used as N.
 * NOTE(review): presumably the last occurrence is a "next page" link, which is
 * why the one before it is treated as the highest page; confirm against the
 * target markup.
 *
 * @param string $data HTML of the first listing page.
 * @param string $url  URL of the first listing page; "p<n>" is appended to it.
 *
 * @return string[]|int Page URLs for pages 2..N, or 0 when there are none.
 */
function getPageFromCurlResult($data, $url)
{
    if (!preg_match_all('/g\d+r\d+p(\d+)/', (string) $data, $res)) {
        return 0;
    }

    $pageNumbers = $res[1];
    $found = count($pageNumbers);
    if ($found < 2) {
        // There is no second-to-last entry to read the page count from.
        return 0;
    }

    $lastPage = (int) $pageNumbers[$found - 2];
    $aPageLink = [];
    for ($page = 2; $page <= $lastPage; $page++) {
        $aPageLink[] = $url . 'p' . $page;
    }

    return $aPageLink ? $aPageLink : 0;
}
// Configuration end

// Create the scraping helper.
$adslCurl = new AdslCurl($config);

// The URL list must have been provided by url.php.
if (!isset($aUrls) || !is_array($aUrls)) {
    die("No URL list (\$aUrls) available; check url.php\n");
}

// Count the URLs and walk them; $start/$end allow resuming part-way through.
$countUrl = count($aUrls);
$start = 0;
$end = $countUrl;
for ($i = $start; $i < $end; $i++) {
    // Progress output.
    echo "$i\n";

    // Redial on every third URL to obtain a new IP.
    if ($i % 3 == 2) {
        AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
    }

    // Fetch the listing; on failure redial once and retry.
    $data = $adslCurl->getCurlResultDirect($aUrls[$i], $cookie);
    if (!$data) {
        AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
        $data = $adslCurl->getCurlResultDirect($aUrls[$i], $cookie);
    }

    // Only usable results are stored; otherwise move on to the next URL.
    $tmp = $adslCurl->getValidCurlResult($forbiddenString, $data, $aUrls[$i], $errorFile);
    if (!$tmp) {
        continue;
    }

    $sql = curlContentToSql($tmp, $aUrls[$i], $con);
    if ($sql) {
        mysqli_query($con, $sql);
    }
    usleep(50); // note: usleep() takes microseconds

    // Follow the pagination of this listing, if any.
    $aPageUrls = getPageFromCurlResult($data, $aUrls[$i]);
    if (!$aPageUrls) {
        continue;
    }

    $countPageUrl = count($aPageUrls);
    for ($k = 0; $k < $countPageUrl; $k++) {
        if ($k % 3 == 1) {
            AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
        }

        $data = $adslCurl->getCurlResultDirect($aPageUrls[$k], $cookie);
        if (!$data) {
            AdslCurl::changeIP($networkName, $adslPassword, $adslAccount);
            $data = $adslCurl->getCurlResultDirect($aPageUrls[$k], $cookie);
        }

        // A blocked or empty page returns 0 here; it must not be turned into SQL.
        $tmp = $adslCurl->getValidCurlResult($forbiddenString, $data, $aPageUrls[$k], $errorFile);
        if ($tmp) {
            $sql = curlContentToSql($tmp, $aPageUrls[$k], $con);
            if ($sql) {
                mysqli_query($con, $sql);
            }
        }
        usleep(50);
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值