Crawling WeChat Official Account Articles via Sogou Search (a mini program version is available to try)

It's worth noting that a Sogou keyword search returns as many as 100 pages of results, and scraping all of them is certainly feasible. My project's requirement is different, though: first, use Sogou's keyword-suggestion API to fetch every long-tail keyword for a given seed keyword and store them in the database. Each long-tail keyword then becomes an article title; for the body, search official account articles for that title, randomly pick one article out of the 100 pages of results, fetch its content, and store it as well. In effect, the project auto-generates articles from long-tail keywords.
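
As a rough outline of that pipeline, here is a minimal sketch in PHP. It is illustrative only: the Sogou suggestion endpoint and the way its JSONP response is parsed, the save_article() helper, and the seed keyword are all assumptions rather than the actual project code; http_post_json() is the crawler function listed further below.

// Pipeline sketch (assumptions: the suggestion endpoint's URL and response shape,
// plus a stand-in save_article() helper -- swap in your own DB layer)

// 1) Fetch long-tail keyword suggestions for a seed keyword. Sogou's web
//    suggestion endpoint returns JSONP; here we simply pull out the quoted strings.
function get_longtail_keywords($seed)
{
    $raw = file_get_contents('https://www.sogou.com/suggnew/ajajjson?type=web&key='.urlencode($seed));
    preg_match_all('/"([^"]+)"/u', $raw, $m);
    return $m[1];
}

// 2) Hypothetical persistence helper -- replace with a real INSERT into your table
function save_article($title, $content)
{
    file_put_contents('articles/'.md5($title).'.html', "<h1>$title</h1>\n".$content);
}

// 3) Each long-tail keyword becomes a title; its body is a randomly chosen
//    official account article found by searching for that title
foreach (get_longtail_keywords('your seed keyword') as $title) {
    $content = http_post_json($title); // the crawler function below
    if ($content != '') {
        save_article($title, $content);
    }
}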

# Fetch a WeChat official account article for a given keyword
function http_post_json($str)
{
    $url = 'https://weixin.sogou.com/weixin?type=2&query='.urlencode($str).'&ie=utf8&s_from=input&_sug_=y&_sug_type_=';
    $cookie_file = 'cookie.txt';
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie_file); // persist the cookies Sogou sets on the search page
    curl_setopt($ch, CURLOPT_ENCODING, "gzip");
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        "content-type: text/html; charset=utf-8",
        "accept-encoding: gzip, deflate, br",
        "accept-language: zh-CN,zh;q=0.9,en;q=0.8",
        "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    ));
    $response = curl_exec($ch);
    $response = mb_convert_encoding($response, 'utf-8', 'GBK,UTF-8,ASCII');
    curl_close($ch);
    // Cut the result list out of the page (between the 'news-list' and
    // 'pagebar_container' markers), then collect every article link in it
    $response = get_between($response, 'news-list', 'pagebar_container');
    preg_match_all('#href="([\s\S]*?)" uigs="article_image[\s\S]*?href="[\s\S]*?" id="[\s\S]*?href="[\s\S]*?" data-headimage#is', $response, $arr);
    if (count($arr[1]) > 0) {
        // Pick one article link at random from the result list
        $num = mt_rand(0, count($arr[1]) - 1);
        $url = 'https://weixin.sogou.com/'.$arr[1][$num];

        // Sogou's anti-crawler check: the /link redirect only resolves when the
        // request carries extra k and h parameters, where k is a random number
        // and h is the character sitting k positions past the 21st character
        // after 'url=' in the link itself
        $b = mt_rand(0, 100);
        $a = strpos($url, 'url=');
        $a = substr($url, $a + 4 + 21 + $b, 1);
        $url .= "&k=".$b."&h=".$a;
    	
        /* Acquire cookies: fetch a static Sogou asset with CURLOPT_HEADER and
           CURLOPT_NOBODY so the Set-Cookie response headers can be parsed out */
        $cookie_url = 'https://weixin.sogou.com/new/wap/images/app_spread.png?v=f23bdb0e';
        $cookie_ch = curl_init();
        curl_setopt($cookie_ch, CURLOPT_URL, $cookie_url);
        curl_setopt($cookie_ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($cookie_ch, CURLOPT_HEADER, 1);
        curl_setopt($cookie_ch, CURLOPT_NOBODY, 1); // headers only, skip the image body
        $cookie_response = curl_exec($cookie_ch);
        curl_close($cookie_ch);
        preg_match_all("/set\-cookie:([^\r\n]*); expires/i", $cookie_response, $matches);
        $cookie = '';
        foreach ($matches[1] as $value) {
            $cookie .= $value.';';
        }
        // Top up the cookie with the session values Sogou expects; fill these in
        // from a real browser session (e.g. SUV=1629900157643552)
        $cookie .= 'SUV=; weixinIndexVisited=1; Hm_lvt_cdce8cda34e84469b1c8015204129522=1629900161; SMYUV=; UM_distinctid=; SNUID=;';
        /* Cookie acquisition ends */
    	

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        // Follow the 302 redirect chain Sogou uses to hand off the real article page
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_ENCODING, "gzip");
        curl_setopt($ch, CURLOPT_HTTPHEADER, array(
            "referer: https://weixin.sogou.com",
            "content-type: text/html; charset=utf-8",
            "accept-encoding: gzip, deflate, br",
            "accept-language: zh-CN,zh;q=0.9,en;q=0.8",
            "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
        ));
        $response = curl_exec($ch);
        $response = mb_convert_encoding($response, 'utf-8', 'GBK,UTF-8,ASCII');
        curl_close($ch);
        // Sogou's /link page builds the real mp.weixin.qq.com URL in JavaScript,
        // appending it piece by piece with `url += '...';` -- re-join the pieces here
        $response = get_between($response, '<script>', '</script>');
        preg_match_all("#\+\= '(.*?)';#is", $response, $matches);
        $real_url = '';
        foreach ($matches[1] as $value) {
            $real_url .= $value; // reassemble the real article URL
        }
        $newstr = '';
        if ($real_url != '') {
            // Cut the article body out of the page, then strip inline scripts
            // and the style attribute that keeps the content hidden
            $result = get_between(file_get_contents($real_url), '<div class="rich_media_content " id="js_content" style="visibility: hidden;">', 'var first_sceen__time = (+new Date());');
            $newstr = preg_replace("/<script[\s\S]*?<\/script>/i", "", $result, 3);
            $newstr = preg_replace('/style="visibility: hidden;"/i', "", $newstr);
        }
        return $newstr;
    }
}
// Return the substring of $input between the markers $start and $end
// (the negative length makes substr() stop right where $end begins)
function get_between($input, $start, $end) {
    $substr = substr($input, strlen($start) + strpos($input, $start), (strlen($input) - strpos($input, $end)) * (-1));
    return $substr;
}
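
Calling the crawler is then a one-liner; the keyword below is just a placeholder:

// Search the keyword, follow a random result, and print the article body
$content = http_post_json('your long-tail keyword');
echo $content;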

Finally, the result in the mini program:

[screenshot: a crawled official account article displayed in the mini program]
