PHP获取url跳转后的真实地址 get_headers 500错误 模拟浏览器curl访问

本文介绍如何通过模拟浏览器信息,如设置User-Agent,使用curl获取并解析网页跳转后的URL,包括遇到301/302重定向的处理策略,以获取目标页面内容,适用于遇到服务器访客验证的情况。

一个二维码扫出来是一个网址,但访问这个网址会跳转到新的网址,那怎么抓取新的跳转后的网址的页面内容呢?
首先要获取它跳转后的目标 URL,原来使用的是 get_headers:

		// Resolve the URL that $url redirects to; returns $url itself when no
        // redirect target can be determined.
        $header = get_headers($url, 1); // 1 => associative array of headers
        // get_headers() returns false on failure. Entries 0 and 1 are the status
        // lines of the first and second responses; without a second response no
        // redirect was followed, so the original URL is returned.
        if ($header === false || empty($header[0]) || empty($header[1])) {
            return $url;
        }
        // Only treat the first response as a redirect when its status code is one
        // of the redirect codes (the code is matched as a whole token, not as a
        // substring anywhere in the status line).
        if (!preg_match('/^HTTP\/\S+\s+(301|302|303|307|308)\b/', $header[0])) {
            return $url;
        }
        // Header names are case-insensitive and get_headers() keeps the server's
        // spelling, so normalise the keys before looking up "Location".
        $lowered = array_change_key_case($header, CASE_LOWER);
        if (empty($lowered['location'])) {
            return $url;
        }
        $location = $lowered['location'];
        // Several Location headers (one per hop) arrive as an array; the last
        // entry is the final redirect target.
        return is_array($location) ? end($location) : $location;

但是最近用这种方法获取不到了,get_headers 返回的是 500 错误,而用浏览器实际访问是可以正常跳转并打开的。很可能是对方服务器对访客信息(例如 User-Agent)做了判断,于是改成下面的方法,使用 curl 并加入模拟浏览器的信息:

		// Resolve the URL that $url redirects to, presenting a browser-like
        // User-Agent. Returns $url itself when the request fails or the response
        // is not a redirect.
        $oCurl = curl_init();
        // Request headers: only needed when the target site requires them.
        // A fresh array is used so that an existing $header variable in the
        // surrounding scope is not appended to by accident.
        $requestHeaders = array("Content-type: application/x-www-form-urlencoded");
        $user_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.146 Safari/537.36";
        curl_setopt($oCurl, CURLOPT_URL, $url);
        curl_setopt($oCurl, CURLOPT_HTTPHEADER, $requestHeaders);
        // Keep the response body enabled so that a regular GET is sent.
        curl_setopt($oCurl, CURLOPT_NOBODY, false);
        // Use the User-Agent defined above.
        curl_setopt($oCurl, CURLOPT_USERAGENT, $user_agent);
        curl_setopt($oCurl, CURLOPT_RETURNTRANSFER, 1);
        // Not a POST request, i.e. request with GET.
        curl_setopt($oCurl, CURLOPT_POST, false);
        // Do not follow the redirect: CURLINFO_REDIRECT_URL is only filled in
        // when CURLOPT_FOLLOWLOCATION is disabled.
        curl_setopt($oCurl, CURLOPT_FOLLOWLOCATION, false);
        $sContent = curl_exec($oCurl);
        // Read the redirect target from curl's transfer info instead of cutting
        // it out of the raw header text, which depended on the server sending a
        // Content-Language header right after Location.
        $redirectUrl = '';
        if ($sContent !== false) {
            $redirectUrl = curl_getinfo($oCurl, CURLINFO_REDIRECT_URL);
        }
        curl_close($oCurl);
        if (empty($redirectUrl)) {
            return $url; // no redirect (or request failed): keep the original URL
        }
        return $redirectUrl; // redirect target URL

然后抓取目标url网页的内容:

$curl = curl_init();
        // Fetch the page at $url and return curl_exec()'s result (the body as a
        // string, or false on failure). All transfer options are applied in one
        // call, in the same order as before.
        curl_setopt_array($curl, array(
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_TIMEOUT        => 30,
            // Certificate and host-name verification are switched off.
            CURLOPT_SSL_VERIFYPEER => false,
            CURLOPT_SSL_VERIFYHOST => false,
            CURLOPT_URL            => $url,
            // Present a browser-like User-Agent.
            CURLOPT_USERAGENT      => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        ));
        $pageContent = curl_exec($curl);
        curl_close($curl);
        return $pageContent;

欢迎交流指点!我的微信:

/**
 * Send an HTTP(S) POST request with a JSON content type through libcurl.
 *
 * @param pUrl          Target URL; must not be NULL.
 * @param request       Request body handed to CURLOPT_POSTFIELDS; must not be NULL.
 * @param response      Passed as the user-data pointer to OnWriteData_Post; must
 *                      not be NULL. How the callback stores the body (and who
 *                      frees it) is defined by that callback, not here.
 * @param pHttpResInfo  Receives the HTTP status code in ->status_code once the
 *                      transfer has been attempted; must not be NULL.
 * @return CURLE_OK on success, otherwise the CURLcode of the step that failed
 *         (CURLE_FAILED_INIT for NULL arguments or a failed curl_easy_init,
 *         CURLE_OUT_OF_MEMORY when the header list cannot be built).
 *
 * NOTE(review): curl_global_init()/curl_global_cleanup() are called on every
 * invocation; confirm this function is never run concurrently with other
 * libcurl users in the process.
 */
int cloud_https_post(const char *pUrl, const char *request, char **response, st_http_resinfo *pHttpResInfo)
{
    CURLcode res;
    CURL *curl = NULL;
    struct curl_slist *headers = NULL;
#ifdef CLOUD_HTTPS_DEBUG
    char errbuf[CURL_ERROR_SIZE];
    memset(errbuf, '\0', CURL_ERROR_SIZE);
#endif

    if (NULL == pUrl || NULL == request || NULL == response || NULL == pHttpResInfo) {
        return CURLE_FAILED_INIT;
    }

    res = curl_global_init(CURL_GLOBAL_ALL);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl global init fail and ret %d", res);
        return res;
    }

    curl = curl_easy_init();
    if (NULL == curl) {
        res = CURLE_FAILED_INIT;
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl init fail");
        goto exit;
    }

    headers = curl_slist_append(headers, "Content-Type: application/json;charset=UTF-8");
    if (NULL == headers) {
        /* res still holds CURLE_OK from curl_global_init(); report the failure
         * instead of returning success. */
        res = CURLE_OUT_OF_MEMORY;
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl get header list fail");
        goto exit;
    }

#ifdef CLOUD_HTTPS_DEBUG
    /* provide a buffer to store errors in */
    res = curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, errbuf);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_ERRORBUFFER ret %d", res);
        goto exit;
    }
#endif

    /* Numeric options below are passed as long: curl_easy_setopt() is variadic
     * and reads them as long, so a plain int argument is not portable. */
    if (IS_SESSION_DEBUG_ON()) {
        HTTPS_LOG(LOG_LEVEL_DEBUG, "post sesstion debug on");
        res = curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_VERBOSE ret %d", res);
            goto exit;
        }
        res = curl_easy_setopt(curl, CURLOPT_DEBUGFUNCTION, OnDebug);
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_DEBUGFUNCTION ret %d", res);
            goto exit;
        }
    }

    res = curl_easy_setopt(curl, CURLOPT_URL, pUrl);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_URL ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_HTTPHEADER ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_POST, 1L);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_POST ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_POSTFIELDS, request);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_POSTFIELDS ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_READFUNCTION, NULL);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_READFUNCTION ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, OnWriteData_Post);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_WRITEFUNCTION ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_WRITEDATA, (void *)response);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_WRITEDATA ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_NOSIGNAL ret %d", res);
        goto exit;
    }

    if (IS_CA_PATH_NULL()) {
        /* No CA path configured: peer and host verification are disabled. */
        res = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_SSL_VERIFYPEER ret %d", res);
            goto exit;
        }
        res = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_SSL_VERIFYHOST ret %d", res);
            goto exit;
        }
    } else {
        res = curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 1L);
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_SSL_VERIFYPEER ret %d", res);
            goto exit;
        }
        res = curl_easy_setopt(curl, CURLOPT_CAINFO, GET_CA_PATH());
        if (CURLE_OK != res) {
            HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_CAINFO ret %d", res);
            goto exit;
        }
        /* When no type is configured the option is left at libcurl's default.
         * NOTE(review): CURLOPT_SSLCERTTYPE describes the client certificate,
         * and no CURLOPT_SSLCERT is set in this function; confirm this option
         * is the intended one. */
        if (GET_CA_TYPE()) {
            res = curl_easy_setopt(curl, CURLOPT_SSLCERTTYPE, GET_CA_TYPE());
            if (CURLE_OK != res) {
                HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_SSLCERTTYPE ret %d", res);
                goto exit;
            }
        }
    }

    res = curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, (long)GET_P_CONNECT_TIMEOUT());
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_CONNECTTIMEOUT ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_TIMEOUT, (long)GET_P_TRANSFER_TIMEOUT());
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_TIMEOUT ret %d", res);
        goto exit;
    }

    res = curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_ERROR, "curl set option CURLOPT_FOLLOWLOCATION ret %d", res);
        goto exit;
    }

    res = curl_easy_perform(curl);
#ifdef CLOUD_HTTPS_DEBUG
    if (CURLE_OK != res) {
        HTTPS_LOG(LOG_LEVEL_DEBUG, "curl post return error: %s", errbuf);
    }
#endif

    /* The status code is queried even when the transfer failed; the transfer
     * result itself is what gets returned. */
    curl_easy_getinfo(curl, CURLINFO_HTTP_CODE, &(pHttpResInfo->status_code));
    HTTPS_LOG(LOG_LEVEL_DEBUG, "cloud_https_post done. ret %d, http status code %ld", res, pHttpResInfo->status_code);

exit:
    if (headers) {
        curl_slist_free_all(headers);
    }
    if (curl) {
        curl_easy_cleanup(curl);
    }
    curl_global_cleanup();
    return res;
}
/* Page text that followed the snippet on the original line: "解析函数" */
最新发布
10-29
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值