php 采集页面

                                        php 采集页面

(示例:采集微博,淘宝)

首先,我们要确认目标页面是否需要 cookie 才能访问。

如果页面需要 cookie,那么采集时要么在请求中带上 cookie,要么先模拟登录再采集。

方法一:

使用cookie采集

1,

header('content-type:text/html;charset=utf-8');

/**
 * Perform an HTTP request with cURL and return the response body.
 *
 * Redirects are followed, a browser User-Agent is sent, and gzip/deflate
 * responses are decoded automatically.
 *
 * NOTE(review): TLS peer and host verification are disabled so that https
 * URLs work without a CA bundle. This exposes the request to
 * man-in-the-middle attacks; enable verification for anything sensitive.
 *
 * @param string       $url    Address to request.
 * @param array|string $data   Request body; only sent when $method is "POST".
 *                             An array is sent by cURL as multipart/form-data.
 * @param string       $method HTTP method, e.g. "GET" or "POST".
 *
 * @return string|bool The response body on success. On failure the cURL error
 *                     message is returned instead, so callers cannot tell an
 *                     error message apart from a page body by type alone.
 */
function curlPost($url,$data,$method){
    $ch = curl_init();   // 1. create the handle
    curl_setopt($ch, CURLOPT_URL, $url);                 // 2. target address
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);    // 3. HTTP method

    // 4. transport options
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);     // skip certificate check (https)
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);     // skip host name check (https)
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)'); // pose as a browser
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('Accept-Encoding: gzip, deflate')); // advertise compressed responses
    curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');  // let cURL decompress the body

    if ($method == "POST") {
        // 5. attach the request body for POST only
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);      // return the body instead of printing it

    $tmpInfo = curl_exec($ch);                           // 6. run the request

    // 7. on failure hand back the error text (read before the handle is closed)
    $result = curl_errno($ch) ? curl_error($ch) : $tmpInfo;

    // 8. always release the handle, including on the error path
    curl_close($ch);

    return $result;
}
// Example: fetch a page with GET. $data is only sent for POST requests,
// so it is ignored by this call.
$data = array('name' => '1234');
$url = "http://www.sohu.com/";

$method = "GET";
$file = curlPost($url, $data, $method);

// Re-encode the page from GBK to UTF-8 to match the charset declared in the
// response header. NOTE(review): this assumes the target page is served as
// GBK - confirm, otherwise the conversion will garble the text.
$file = mb_convert_encoding($file, 'UTF-8', 'GBK');
echo $file;

2,

 // Temporary file used to carry cookies from the login request to later requests.
 $cookie_file = tempnam('./temp','cookie');

    /**
     * Perform an HTTP request with cURL, either saving or replaying cookies.
     *
     * Call it once with $setcooke = true against the login URL so the session
     * cookies are stored in $cookie_file, then call it with $setcooke = false
     * to send those cookies with subsequent requests.
     *
     * NOTE(review): TLS peer and host verification are disabled; this is
     * unsafe when credentials are posted over https.
     *
     * @param string       $url         Address to request.
     * @param array|string $data        Request body; only sent when $method is "POST".
     * @param string       $method      HTTP method, e.g. "GET" or "POST".
     * @param bool         $setcooke    true: write received cookies to $cookie_file;
     *                                  false: read cookies from $cookie_file.
     * @param string|bool  $cookie_file Path of the cookie file.
     *
     * @return string|bool The response body on success; on failure the cURL
     *                     error message is returned instead.
     */
    function weixinPost($url,$data,$method,$setcooke=false,$cookie_file=false){
        $ch = curl_init();   // 1. create the handle
        curl_setopt($ch, CURLOPT_URL, $url);               // 2. target address
        curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);  // 3. HTTP method

        // 4. transport options
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_AUTOREFERER, 1);

        if ($method == "POST") {
            // 5. attach the request body for POST only
            curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        }

        if ($setcooke == true) {
            // Login step: store the cookies the server sets.
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
        } else {
            // Later steps: send the previously stored cookies.
            curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
        }

        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);    // return the body instead of printing it
        $tmpInfo = curl_exec($ch);                         // 6. run the request

        // 7. on failure hand back the error text (read before the handle is closed)
        $result = curl_errno($ch) ? curl_error($ch) : $tmpInfo;

        // 8. always release the handle, including on the error path
        curl_close($ch);

        return $result;
    }
    // Credentials posted to the login form.
    $data = array(
        'username' => '***',
        'password' => '***',
    );

    // Step 1: log in and store the session cookies in $cookie_file.
    $method = 'POST';
    $url = 'http://www.xinxinj.com/login.php';
    $file = weixinPost($url, $data, $method, true, $cookie_file);
    echo $file;

    // Step 2: request the protected page, replaying the stored cookies.
    // $data is passed along but is not sent, because the method is GET.
    $method = 'GET';
    $url = 'http://www.xinxinj.com/admin.php';
    $file = weixinPost($url, $data, $method, false, $cookie_file);
    echo $file;

以上两个示例可以直接拿来使用。需要注意的是:如果请求地址以 https 开头,就需要跳过 SSL 证书校验,即加入示例 1 中设置 CURLOPT_SSL_VERIFYPEER 和 CURLOPT_SSL_VERIFYHOST 为 FALSE 的那两行(原文中标红的两行)。

 

 

// Fetch a page while sending a cookie copied from the browser as a raw header.
$ch = curl_init();
$url = '*******';
$header = array(
'cookie:********'
);
// Attach the cookie header to the request.
curl_setopt($ch, CURLOPT_HTTPHEADER  , $header);
// CURLOPT_RETURNTRANSFER is intentionally left off: curl_exec() prints the
// response body directly and returns a boolean.
//curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0');
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
// Run the HTTP request.
curl_setopt($ch , CURLOPT_URL , $url);
$res = curl_exec($ch);
if ($res === false) {
    // Report why the transfer failed instead of only dumping bool(false).
    echo 'cURL error: ' . curl_error($ch);
}
curl_close($ch);
var_dump($res);

 

模拟登录采集微博

 

采集到的示例图如下

实现代码:

 

<?php
header('Content-type:text/html;charset=utf-8');
// Sina SSO login endpoint used by Weibo.
$loginUrl = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)&_=1403138799543';

// Cookie files, one per step. The "cookie" directory next to this file must
// exist and be writable.
$cookie_file1 = dirname(__FILE__).'/cookie/cookie1.txt';
$cookie_file2 = dirname(__FILE__).'/cookie/cookie2.txt';
$cookie_file3 = dirname(__FILE__).'/cookie/cookie3.txt';

$u = '微博用户名';
$p = '微博密码';

// The login form carries the user name base64-encoded and the password as-is.
$username = base64_encode($u);
$password = $p;

// Step 1: log in to the Sina passport service.
$loginData = array(
    'entry'      => 'sso',
    'gateway'    => '1',
    'from'       => 'null',
    'savestate'  => '30',
    'useticket'  => '0',
    'pagerefer'  => '',
    'vsnf'       => '1',
    'su'         => $username,
    'service'    => 'sso',
    'sp'         => $password,
    'sr'         => '1920*1080',
    'encoding'   => 'UTF-8',
    'cdult'      => '3',
    'domain'     => 'sina.com.cn',
    'prelt'      => '0',
    'returntype' => 'TEXT',
);

$login = json_decode(loginPost($loginUrl,$loginData),true);
// var_dump($login);

// Stop early when the login did not return the cross-domain URL that the next
// step requests; continuing would call cURL with an empty URL.
if (!is_array($login) || empty($login['crossDomainUrlList'][0])) {
    exit('Sina SSO login failed: the response did not contain crossDomainUrlList.');
}

// Step 2: visit the cross-domain URL to obtain the Weibo cookies.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $login['crossDomainUrlList'][0]);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_HEADER, 0);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file1);   // send the passport cookies
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file2);    // store the Weibo cookies
$return = curl_exec($ch);
curl_close($ch);

// Step 3: request the Weibo page with the cookies from step 2, following redirects.
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, "http://weibo.com/5589516034/profile?topnav=1&wvr=6&is_all=1");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file2);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file3);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$return = curl_exec($ch);
if ($return === false) {
    // Report the transport error instead of printing an empty page.
    $return = 'cURL error: ' . curl_error($ch);
}
//$info = curl_getinfo($ch);
curl_close($ch);

echo $return;


/**
 * POST form data to a login URL and store the cookies the server sets.
 *
 * NOTE(review): TLS peer and host verification are disabled although this
 * request carries credentials; enable verification where a CA bundle is
 * available.
 *
 * @param string      $url        Login endpoint.
 * @param array       $data       Form fields; sent URL-encoded via http_build_query().
 * @param string|null $cookieFile File the received cookies are written to.
 *                                When omitted, the global $cookie_file1 is used.
 *
 * @return string|bool The raw response body, or false when the request fails.
 */
function loginPost($url,$data,$cookieFile = null){
    if ($cookieFile === null) {
        // Fall back to the script-level cookie file for existing callers.
        global $cookie_file1;
        $cookieFile = $cookie_file1;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);        // return the body instead of printing it
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    curl_setopt($ch, CURLOPT_POST, 1);
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($data));
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);   // store the cookies set by the login response
    $return = curl_exec($ch);
    curl_close($ch);

    return $return;
}

?>

 

 

 

我们还需要在上述 PHP 文件的同一目录下创建 cookie 文件夹,并在其中创建三个 txt 文件:cookie1.txt、cookie2.txt、cookie3.txt。

 

 

 

 

 

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值