php 采集页面
(示例:采集微博,淘宝)
首先我们先验证页面是否是需要cookie的
如果页面存在cookie那么我们需要加入cookie或者模拟登陆采集
方法一:
使用cookie采集
1,
header('content-type:text/html;charset=utf-8');
function curlPost($url,$data,$method){
$ch = curl_init(); //1.初始化
curl_setopt($ch, CURLOPT_URL, $url); //2.请求地址
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);//3.请求方式
//4.参数如下
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);//https
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');//模拟浏览器
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
curl_setopt($ch, CURLOPT_HTTPHEADER,array('Accept-Encoding: gzip, deflate'));//gzip解压内容
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
if($method=="POST"){//5.post方式的时候添加数据
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
}
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$tmpInfo = curl_exec($ch);//6.执行
if (curl_errno($ch)) {//7.如果出错
return curl_error($ch);
}
curl_close($ch);//8.关闭
return $tmpInfo;
}
$data=array('name' => '1234');
$url="http://www.sohu.com/";
$method="GET";
$file=curlPost($url,$data,$method);
$file=mb_convert_encoding($file,'UTF-8','GBK');
echo $file;
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);//https
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');//模拟浏览器
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
curl_setopt($ch, CURLOPT_HTTPHEADER,array('Accept-Encoding: gzip, deflate'));//gzip解压内容
curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
if($method=="POST"){//5.post方式的时候添加数据
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
}
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$tmpInfo = curl_exec($ch);//6.执行
if (curl_errno($ch)) {//7.如果出错
return curl_error($ch);
}
curl_close($ch);//8.关闭
return $tmpInfo;
}
$data=array('name' => '1234');
$url="http://www.sohu.com/";
$method="GET";
$file=curlPost($url,$data,$method);
$file=mb_convert_encoding($file,'UTF-8','GBK');
echo $file;
2,
$cookie_file = tempnam('./temp','cookie');
function weixinPost($url,$data,$method,$setcooke=false,$cookie_file=false){
$ch = curl_init(); //1.初始化
curl_setopt($ch, CURLOPT_URL, $url); //2.请求地址
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);//3.请求方式
//4.参数如下
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; MSIE 5.01; Windows NT 5.0)');
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
if($method=="POST"){//5.post方式的时候添加数据
curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
}
if($setcooke==true){
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file);
}else{
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie_file);
}
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
$tmpInfo = curl_exec($ch);//6.执行
if (curl_errno($ch)) {//7.如果出错
return curl_error($ch);
}
curl_close($ch);//8.关闭
return $tmpInfo;
}
$data=array('username' => '***','password'=>'***');
$url="http://www.xinxinj.com/login.php";
$method="POST";
$file=weixinPost($url,$data,$method,true,$cookie_file);
echo $file;
$url="http://www.xinxinj.com/admin.php";
$method="GET";
$file=weixinPost($url,$data,$method,false,$cookie_file);
echo $file;
由前两个可以简单来运用(在此我们要注意请求地址前面是https那么需要绕过ssl保密协议 加入如1标红两行)
$ch = curl_init();
$url = '*******';
$header = array(
'cookie:********'
);
// 添加apikey到header
curl_setopt($ch, CURLOPT_HTTPHEADER , $header);
//curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:50.0) Gecko/20100101 Firefox/50.0');
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_AUTOREFERER, 1);
// 执行HTTP请求
curl_setopt($ch , CURLOPT_URL , $url);
$res = curl_exec($ch);
var_dump($res);
模拟登陆采集微博
采集到的示例图如下
实现代码:
<?php
header('Content-type:text/html;charset=utf-8');
//微博登录地址
$loginUrl = 'https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.15)&_=1403138799543';
//在当前文件下建立cookie目录 并且 创建对应的txt文件
$cookie_file1 = dirname(__FILE__).'/cookie/cookie1.txt';
$cookie_file2 = dirname(__FILE__).'/cookie/cookie2.txt';
$cookie_file3 = dirname(__FILE__).'/cookie/cookie3.txt';
$u = '微博用户名';
$p = '微博密码';
$username = base64_encode($u);
$password = $p;
//登录新浪通行证
$loginData['entry'] = 'sso';
$loginData['gateway'] = '1';
$loginData['from'] = 'null';
$loginData['savestate'] = '30';
$loginData['useticket'] = '0';
$loginData['pagerefer'] = '';
$loginData['vsnf'] = '1';
$loginData['su'] = base64_encode($u);
$loginData['service'] = 'sso';
$loginData['sp'] = $password;
$loginData['sr'] = '1920*1080';
$loginData['encoding'] = 'UTF-8';
$loginData['cdult'] = '3';
$loginData['domain'] = 'sina.com.cn';
$loginData['prelt'] = '0';
$loginData['returntype'] = 'TEXT';
$login = json_decode(loginPost($loginUrl,$loginData),true);
// var_dump($login);
//获取微博cookie
$ch = curl_init();
curl_setopt($ch,CURLOPT_URL,$login['crossDomainUrlList'][0]);
curl_setopt ( $ch, CURLOPT_SSL_VERIFYPEER, 0 );
curl_setopt ( $ch, CURLOPT_SSL_VERIFYHOST, 2 );
curl_setopt($ch,CURLOPT_HEADER,0);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch,CURLOPT_COOKIEFILE, $cookie_file1);
curl_setopt($ch,CURLOPT_COOKIEJAR,$cookie_file2);
$return = curl_exec($ch);
curl_close($ch);
//通过获取的cookie 登录微博, 自动跳转
$ch = curl_init();
curl_setopt($ch,CURLOPT_URL,"http://weibo.com/5589516034/profile?topnav=1&wvr=6&is_all=1");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch,CURLOPT_COOKIEFILE, $cookie_file2);
curl_setopt($ch,CURLOPT_COOKIEJAR,$cookie_file3);
curl_setopt ( $ch, CURLOPT_FOLLOWLOCATION, 1 );
$return = curl_exec($ch);
//$info = curl_getinfo($ch);
curl_close($ch);
echo $return;
function loginPost($url,$data){
global $cookie_file1 ;
//echo $cookie_file ;exit;
$ch = curl_init();
curl_setopt($ch,CURLOPT_URL,$url);
curl_setopt($ch,CURLOPT_RETURNTRANSFER,1);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
curl_setopt($ch,CURLOPT_POST,1);
curl_setopt($ch,CURLOPT_POSTFIELDS, http_build_query($data));
curl_setopt($ch,CURLOPT_COOKIEJAR,$cookie_file1);
$return = curl_exec($ch);
$info = curl_getinfo($ch);
curl_close($ch);
return $return;
}
?>
我们还要在上php文件同一目录下创建cookie文件夹在内创建txt格式文件cookie1.txt cookie2.txt cookie3.txt