Using response.Headers["location"] to find the page to crawl

Using two HTTP requests, the code below fetches page content matching a given keyword from a target site and handles the redirect in between: the first request POSTs the search form with auto-redirect disabled so the Location response header can be read, and the second issues a GET against that redirect target.



        // Requires System, System.IO, System.Net, and System.Text.
        private static string getPageContent(string taskKeyword)
        {
            string htmlcontent = "";
            HttpWebRequest request = null;
            HttpWebResponse response = null;
            string gethost = string.Empty;
            string location = string.Empty;
            CookieContainer cc = new CookieContainer();
            string cookiesStr = string.Empty;

            try
            {
                // First request: POST the search form.
                // formhash is a site-specific token taken from the live search form.
                string post = @"formhash=59f68caf&srchtxt={0}&searchsubmit=yes"; // simulated form data
                string postdata = string.Format(post, taskKeyword);
                string loginUrl = "http://bbs.hc360.com/search.php?mod=forum";
                request = (HttpWebRequest)WebRequest.Create(loginUrl);
                request.Method = "POST";
                // Simulated request headers
                request.ContentType = "application/x-www-form-urlencoded";
                byte[] postdatabytes = Encoding.GetEncoding("gbk").GetBytes(postdata);
                request.ContentLength = postdatabytes.Length;
                request.Referer = "http://bbs.hc360.com/search.php?mod=forum";
                request.AllowAutoRedirect = false; // keep the 302 so the Location header stays readable
                request.CookieContainer = cc;
                request.KeepAlive = true;

                // Send the request body.
                using (Stream stream = request.GetRequestStream())
                {
                    stream.Write(postdatabytes, 0, postdatabytes.Length);
                }
                // Receive the response.
                response = (HttpWebResponse)request.GetResponse();
                // Save the returned cookies.
                response.Cookies = request.CookieContainer.GetCookies(request.RequestUri);
                cookiesStr = request.CookieContainer.GetCookieHeader(request.RequestUri);
                // Read the redirect target of the first request before closing the response.
                location = response.Headers["Location"];
                response.Close();
            }
            catch (Exception)
            {
                // First POST failed; location stays empty, so the GET below is skipped.
            }
            try
            {
                if (string.IsNullOrEmpty(location)) return htmlcontent;

                // Second request: GET the page the first response redirected to.
                gethost = "http://bbs.hc360.com/" + location;
                request = (HttpWebRequest)WebRequest.Create(gethost);
                request.Method = "GET";
                request.KeepAlive = true;
                request.Headers.Add("Cookie:" + cookiesStr); // attach the saved cookie string
                request.CookieContainer = cc;                // reuse the session cookies from the POST
                request.AllowAutoRedirect = true;            // follow any further redirects
                response = (HttpWebResponse)request.GetResponse();
                cookiesStr = request.CookieContainer.GetCookieHeader(request.RequestUri);
                // Read the final page content (the forum serves gbk-encoded pages).
                StreamReader sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gbk"));
                htmlcontent = sr.ReadToEnd();
                sr.Close();
                response.Close();
            }
            catch (Exception)
            {
                // Second GET failed; htmlcontent stays empty.
            }
            return htmlcontent;
        }
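
For comparison, here is a minimal sketch of the same two-request flow in Python with the requests library. The URL, form fields (including the hard-coded formhash, which is session-specific in practice), and the gbk encoding are carried over from the C# method above; treat it as an illustration of the pattern, not a drop-in replacement.

import requests
from urllib.parse import urlencode

def get_page_content(keyword: str) -> str:
    """Two-request flow: POST with redirects disabled, read Location, then GET."""
    session = requests.Session()  # carries cookies across both requests
    search_url = 'http://bbs.hc360.com/search.php?mod=forum'
    form = {'formhash': '59f68caf', 'srchtxt': keyword, 'searchsubmit': 'yes'}
    body = urlencode(form, encoding='gbk')  # the forum expects gbk form data
    # First request: keep the 302 response so the Location header stays visible.
    resp = session.post(search_url, data=body, allow_redirects=False,
                        headers={'Referer': search_url,
                                 'Content-Type': 'application/x-www-form-urlencoded'})
    location = resp.headers.get('Location', '')
    if not location:
        return ''
    # Second request: GET the redirect target; the session resends the cookies.
    resp = session.get('http://bbs.hc360.com/' + location)
    resp.encoding = 'gbk'  # the forum serves gbk-encoded pages
    return resp.text

The Session object plays the role of the C# CookieContainer: cookies set by the POST are automatically resent on the follow-up GET.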

import requests
from lxml import etree
import csv
import time
import random
import re
import os

# Fixed request-header values (replace with real ones)
FIXED_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36 Edg/137.0.0.0'
FIXED_COOKIE = 'll="118161"; bid=l5ki4SOlbBM; dbcl2="244638424:dLHXPIU8S0M"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.24463; _pk_id.100001.4cf6=42b01014d5c31947.1748938983.; _vwo_uuid_v2=D9E78C6D9D4E71BBB6EC73B8583864961|9da3be87da4a6d3be6203809b085d4a9; __yadk_uid=2Zr6yzTnllQxMzDhrQB82h7doa8gM4Ku; ck=ILlj; ap_v=0,6.0; frodotk_db="dcae91cc1eae6af7960bb5645c0b40e5"; __utma=30149280.1697373246.1748938900.1750132184.1750207462.10; __utmc=30149280; __utmz=30149280.1750207462.10.7.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1750207469%2C%22https%3A%2F%2Fwww.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; __utma=223695111.238348316.1748938983.1750132184.1750207469.10; __utmb=223695111.0.10.1750207469; __utmc=223695111; __utmz=223695111.1750207469.10.8.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmt=1; __utmb=30149280.6.10.1750207462'  # copy from the browser

# Base URL and request headers
base_url = "https://movie.douban.com/subject/27181010/reviews"
headers = {
    'User-Agent': FIXED_USER_AGENT,
    'Cookie': FIXED_COOKIE,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Referer': 'https://movie.douban.com/subject/27181010/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
}

# Create the output directory
os.makedirs('douban_data', exist_ok=True)

# Create the CSV file and write the header row
csv_file = open('douban_data/douban_reviews_fixed(4).csv', 'w', newline='', encoding='utf-8-sig')
writer = csv.writer(csv_file)
writer.writerow(['Nickname', 'Star rating', 'Review time', 'Review text', 'Upvotes', 'Downvotes', 'Replies', 'Page position'])

def extract_content(element):
    """Extract the review text, handling the expanded/collapsed cases."""
    # Try the short (collapsed) review text first
    short_content = element.xpath('.//div[contains(@class, "short-content")]/text()')
    if short_content:
        return ''.join(short_content).strip()
    # Then the fully expanded review text
    full_content = element.xpath('.//div[@class="review-content clearfix"]/text()')
    if full_content:
        return ''.join(full_content).strip()
    # Then folded content
    folded_content = element.xpath('.//div[@class="folded"]/text()')
    if folded_content:
        return ''.join(folded_content).strip()
    return "No content"

# Crawl multiple pages
for page in range(0, 125):
    # Build the URL parameters
    params = {
        'start': page * 20,
        'sort': 'new_score',
        'status': 'P'
    }
    try:
        print(f"Crawling page {page+1}...")
        # Send the request (redirects disabled so verification can be detected)
        response = requests.get(
            url=base_url,
            params=params,
            headers=headers,
            timeout=15,
            allow_redirects=False  # disable redirects to detect verification
        )
        # Check the redirect status code (302 means verification is required)
        if response.status_code == 302:
            location = response.headers.get('Location', 'unknown location')
            print(f"⚠️ Page {page+1} triggered verification, redirected to: {location}")
            # Save the redirect page for analysis
            with open(f'douban_data/redirect_page_{page}.html', 'w', encoding='utf-8') as f:
                f.write(response.text)
            continue
        response.encoding = 'utf-8'
        if response.status_code == 200:
            # Parse the HTML
            html = etree.HTML(response.text)
            # Check for a captcha prompt
            captcha = html.xpath('//input[@name="captcha-id"]')
            if captcha:
                print(f"⚠️ Page {page+1} requires a captcha, skipping")
                # Save the captcha page for analysis
                with open(f'douban_data/captcha_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                continue
            # Check that the page contains the review container (flexible selectors)
            review_container = html.xpath('//div[@class="review-list"]')
            if not review_container:
                # Fallback selector
                review_container = html.xpath('//div[contains(@id, "content")]//div[contains(@class, "review")]')
            if not review_container:
                # Save the abnormal page for analysis
                with open(f'douban_data/error_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                print(f"❌ Page {page+1} has no review container; page saved for analysis")
                continue
            # Extract the review items (updated selector)
            comments = html.xpath('//div[contains(@class, "review-item")]')
            # Fallback selectors for the review items
            if not comments:
                comments = html.xpath('//div[contains(@class, "main") and contains(@class, "review-item")]')
            if not comments:
                comments = html.xpath('//div[@class="review-list"]/div[contains(@class, "review")]')
            if not comments:
                print(f"❌ Page {page+1} yielded 0 reviews; anti-crawling may have triggered")
                # Check for the anti-crawler notice (the Chinese text is matched against the live page)
                anti_spider = html.xpath('//div[contains(text(), "检测到异常请求")]')
                if anti_spider:
                    print("⚠️ Anti-crawler notice detected; change the Cookie or IP")
                # Save the page for analysis
                with open(f'douban_data/antispider_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(response.text)
                continue
            print(f"✅ Page {page+1}: found {len(comments)} reviews")
            for idx, comment in enumerate(comments):
                try:
                    # Extract the nickname
                    username = comment.xpath('.//a[contains(@class, "name")]/text()')
                    if not username:
                        username = comment.xpath('.//span[@class="author"]/a/text()')
                    username = username[0].strip() if username else "No nickname"
                    # Extract the star rating
                    rating = comment.xpath('.//span[contains(@class, "rating")]/@title')
                    if not rating:
                        rating = comment.xpath('.//span[contains(@class, "main-title-rating")]/@title')
                    rating = rating[0] if rating else "No rating"
                    # Extract the review time
                    comment_time = comment.xpath('.//span[contains(@class, "main-meta")]/text()')
                    # Extract the review text
                    content = extract_content(comment)
                    # Extract the upvote count
                    selector_list = comment.xpath('.//*[contains(@id, "useful_count")]/text()')
                    raw_text = selector_list[0] if selector_list else ""  # guard against empty results
                    match = re.search(r'\d+', raw_text)
                    useful_count = int(match.group()) if match else 0  # default value
                    # Extract the downvote count
                    selector_list = comment.xpath('.//*[contains(@id, "useless_count")]/text()')
                    raw_text = selector_list[0] if selector_list else ""  # guard against empty results
                    match = re.search(r'\d+', raw_text)
                    useless_count = int(match.group()) if match else 0  # default value
                    # Extract the reply count
                    reply_count = comment.xpath('.//*[contains(@id, "reply")]/text()')
                    # Write the row to the CSV
                    writer.writerow([
                        username, rating, comment_time, content,
                        useful_count, useless_count, reply_count,
                        f"page {page+1}, item {idx+1}"
                    ])
                except Exception as e:
                    print(f"⚠️ Error while processing a review: {e}")
                    continue
        else:
            print(f"❌ Request failed, status code: {response.status_code}")
    except Exception as e:
        print(f"❌ Request exception: {e}")
    # Random delay to avoid hammering the server
    delay = random.uniform(3, 8)
    print(f"⏳ Waiting {delay:.2f} seconds before continuing...")
    time.sleep(delay)

csv_file.close()
print("✅ Crawl finished! Data saved to douban_data/douban_reviews_fixed(4).csv")

When crawling the review text, this code cannot capture the content of reviews that are not expanded; please modify it so the corresponding complete review content is crawled. In addition, the reply-count extraction outputs [], and the element's XPath is //*[@id="16724545"]/div/div[3]/a[3]; please fix that as well, and provide the complete modified code.
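
A minimal sketch of the two fixes the question asks for, under loudly stated assumptions: that each review item exposes a numeric id attribute (the question's XPath starts at //*[@id="16724545"]), that the full text of a collapsed review is served as JSON from /j/review/<id>/full (a pattern seen in Douban scraping write-ups, not confirmed in the original post), and that the reply link sits at the relative position ./div/div[3]/a[3] derived from that same absolute XPath.

import re
import requests
from lxml import etree

def extract_full_content(comment, headers):
    """Fetch the complete text of a collapsed review.

    Assumes the review item's id attribute is the review id and that
    https://movie.douban.com/j/review/<id>/full returns JSON whose 'html'
    field holds the full review body; verify both against the live page."""
    review_id = comment.get('id', '')
    if review_id.isdigit():
        url = f'https://movie.douban.com/j/review/{review_id}/full'
        resp = requests.get(url, headers=headers, timeout=15)
        if resp.status_code == 200:
            body = resp.json().get('html', '')
            # Strip the embedded HTML tags and return plain text.
            return etree.HTML(f'<div>{body}</div>').xpath('string(.)').strip()
    # Fall back to the short text shown on the list page.
    short = comment.xpath('.//div[contains(@class, "short-content")]/text()')
    return ''.join(short).strip()

def extract_reply_count(comment):
    """Reply count via the question's XPath //*[@id="16724545"]/div/div[3]/a[3],
    rewritten relative to the review item."""
    texts = comment.xpath('./div/div[3]/a[3]/text()')
    match = re.search(r'\d+', texts[0]) if texts else None
    return int(match.group()) if match else 0

In the main loop these would replace the content = extract_content(comment) and reply_count = ... lines; the extra per-review request for the full text warrants a short time.sleep of its own to stay under the rate limit.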