chrome.cookies.getAll 获得的cookie为空值 谷歌浏览器插件开发

在开发谷歌浏览器插件时遇到一个问题,尝试获取淘宝网站的Cookie时,初始返回为空。问题根源在于缺少必要的权限设置。通过更新manifest.json文件,特别是添加'<all_urls>'权限,成功解决了获取Cookie为空的问题,确保了插件能正确访问淘宝网站的数据。

今天开发谷歌浏览器插件 需要获取淘宝网cookie
使用 chrome.cookies.getAll后显示为空

后来发现是没有在 manifest 中配置相应的权限规则
manifest.json 修改后如下

"permissions": [
        "contextMenus",
        "cookies",
        "<all_urls>"  //主要是这个  没写这个返回为空 
 ],
还是没解析到目标数据啊,什么原因呢?这是我的代码: #!/usr/bin/env python # -*- coding: utf-8 -*- import requests import json import time import random import re from datetime import datetime from urllib.parse import quote, urlparse, parse_qs import logging import browser_cookie3 # 配置日志 logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', handlers=[ logging.FileHandler(f"instagram_fixed_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"), logging.StreamHandler() ] ) logger = logging.getLogger('InstagramFixed') class InstagramSessionFixer: def __init__(self, target_url): self.target_url = target_url self.post_shortcode = self.extract_shortcode(target_url) self.session = requests.Session() self.user_agents = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0' ] # 更新会话cookies self.cookies = { 'ds_user_id': '61971702457', 'csrftoken': '4YzNalzWguviI-qNJs0ftx', 'ig_did': '2661FC9A-4FB7-4383-B1EB-714C7A6A54E8', 'dpr': '1.25', 'mid': 'aQRkcwALAAHwafiIiIA8Jae8jgv1', 'ps_l': '1', 'ps_n': '1', 'sessionid': '61971702457%3ATzxhAREIa44MuZ%3A8%3AAYiF0jwFfuAmow6DcWxvsuow1VMgOTvnFfnf-S89PE8', 'wd': '1536x319', 'rur': '"CLN\\05461971702457\\0541793762034:01fe97acad5ea153bccfffef275c80d52863f3d5533563761ada96005c16b0ba99a4a9b5"', } # 添加Instagram所需的其他关键cookie self.cookies['ig_nrcb'] = '1' # 更新关键参数(最新的query_hash) self.post_query_hash = "37479f2b8209594dde7facb0d904896a" self.session.cookies.update(self.cookies) self.update_headers() self.attempt_count = 0 def extract_shortcode(self, url): pattern = r"https?://(?:www\.)?instagram\.com/p/([^/?#]+)" match = re.search(pattern, url) return match.group(1) if match else None def update_headers(self): # https://www.instagram.com/p/DP1z9ZUDiAL/?igsh=ajBqN282aWtxaXZv 
self.headers = { 'User-Agent': random.choice(self.user_agents), 'X-IG-App-ID': '936619743392459', 'X-Requested-With': 'XMLHttpRequest', 'Referer': f'https://www.instagram.com/p/{self.post_shortcode}/', 'Accept': '*/*', 'Accept-Language': 'en-US,en;q=0.5', 'X-CSRFToken': self.cookies.get('csrftoken', ''), 'X-IG-WWW-Claim': 'hmac.AR2d7q7Yf4zE1HjzB8DlUcX0nqI9o9cVkQr1SQV7w8XqE3JxZf', 'Connection': 'keep-alive', 'Sec-Fetch-Dest': 'empty', 'Sec-Fetch-Mode': 'cors', 'Sec-Fetch-Site': 'same-origin', 'TE': 'trailers', 'priority': 'u=0, i', 'upgrade-insecure-requests': '1' } def refresh_csrf_token(self): csrf_url = "https://www.instagram.com/" response = self.session.get(csrf_url, headers=self.headers) if 'csrftoken' in response.cookies: self.cookies['csrftoken'] = response.cookies['csrftoken'] self.session.cookies.update(self.cookies) self.headers['X-CSRFToken'] = self.cookies['csrftoken'] logger.info("CSRF令牌已刷新") def validate_session(self): """验证当前会话是否有效""" test_urls = [ "https://www.instagram.com/api/v1/users/web_profile_info/?username=instagram", "https://www.instagram.com/api/v1/feed/reels_tray/" ] for url in test_urls: try: response = self.session.get(url, headers=self.headers, timeout=10) if response.status_code == 200 and response.json().get('status') == 'ok': logger.info("会话验证成功") return True except: continue logger.warning("所有会话验证请求均失败") return False def get_browser_cookies(self, browser_type='chrome'): try: if browser_type == 'chrome': cookies = browser_cookie3.chrome(domain_name='.instagram.com') elif browser_type == 'firefox': cookies = browser_cookie3.firefox(domain_name='.instagram.com') elif browser_type == 'edge': cookies = browser_cookie3.edge(domain_name='.instagram.com') else: cookies = browser_cookie3.load(domain_name='.instagram.com') # 将浏览器cookies添加到会话 for cookie in cookies: self.session.cookies.set(cookie.name, cookie.value) # 更新本地cookie字典 self.cookies[cookie.name] = cookie.value logger.info(f"成功从{browser_type}导入{len(cookies)}个cookies") # 刷新CSRF令牌 
self.refresh_csrf_token() return True except Exception as e: logger.error(f"导入浏览器cookies失败: {str(e)}") return False def safe_request(self, url): self.attempt_count += 1 # 每3次尝试刷新CSRF令牌 if self.attempt_count % 3 == 0: self.refresh_csrf_token() logger.info("已刷新请求头部") # 随机延迟 delay = random.uniform(1.5, 3.5) logger.info(f"请求前等待 {delay:.1f}秒...") time.sleep(delay) try: response = self.session.get(url, headers=self.headers, timeout=30) logger.debug(f"响应状态码: {response.status_code}") logger.debug(f"响应头: {response.headers}") if response.status_code == 200: logger.info("请求成功 (200 OK)") return response.json() elif response.status_code == 401: logger.error("会话已过期 (401 Unauthorized)") # 尝试自动修复会话 if self.get_browser_cookies(): return self.safe_request(url) # 重试 elif response.status_code == 403: logger.error("访问被拒绝 (403 Forbidden)") # 尝试刷新CSRF self.refresh_csrf_token() return self.safe_request(url) # 重试 elif response.status_code == 404: logger.error("资源不存在 (404 Not Found)") else: logger.error(f"请求失败: {response.status_code} {response.reason}") return None except requests.exceptions.RequestException as e: logger.error(f"请求异常: {str(e)}") return None def get_html_content(self): """直接获取页面HTML内容""" url = f"https://www.instagram.com/p/{self.post_shortcode}/" try: response = self.session.get(url, headers=self.headers) if response.status_code == 200: logger.info("成功获取HTML内容") return response.text else: logger.error(f"获取HTML失败: {response.status_code}") except Exception as e: logger.error(f"获取HTML失败: {str(e)}") return None def get_post_data(self): if not self.post_shortcode: logger.error("无效的Instagram URL") return None # 验证会话 if not self.validate_session(): logger.warning("会话无效,尝试从浏览器导入cookies...") self.get_browser_cookies() # 尝试REST API rest_api_url = f"https://www.instagram.com/p/{self.post_shortcode}/?__a=1&__d=dis" logger.info(f"尝试REST API端点: {rest_api_url}") rest_data = self.safe_request(rest_api_url) if rest_data and 'graphql' in rest_data: parsed_data = 
self.parse_rest_data(rest_data) if parsed_data: return parsed_data # 构造GraphQL请求 logger.warning("REST API失败,尝试GraphQL端点") variables = { "shortcode": self.post_shortcode, "child_comment_count": 3, "fetch_comment_count": 40, "parent_comment_count": 24, "has_threaded_comments": True } json_str = json.dumps(variables) encoded_str = quote(json_str) graphql_url = f"https://www.instagram.com/graphql/query/?query_hash={self.post_query_hash}&variables={encoded_str}" logger.info(f"请求GraphQL: {graphql_url[:100]}...") graphql_data = self.safe_request(graphql_url) if graphql_data and 'data' in graphql_data: parsed_data = self.parse_graphql_data(graphql_data) if parsed_data: return parsed_data else: logger.error("GraphQL请求失败") # 尝试直接访问页面作为最后手段 logger.warning("所有API失败,尝试直接解析HTML") html_data = self.get_html_content() if html_data: parsed_data = self.parse_html_data(html_data) if parsed_data: return parsed_data logger.error("所有方法均无法获取帖子数据") return None def parse_rest_data(self, json_data): """解析REST API返回的数据""" try: media = json_data.get('graphql', {}).get('shortcode_media') if not media: logger.error("REST响应中缺少shortcode_media") return None post_data = { 'id': media['id'], 'shortcode': media['shortcode'], 'likes_count': media['edge_media_preview_like']['count'], 'comments_count': media['edge_media_to_comment']['count'], 'timestamp': datetime.fromtimestamp(media['taken_at_timestamp']).strftime('%Y-%m-%d %H:%M:%S'), 'owner': media['owner']['username'], 'is_video': media['is_video'], 'caption': media['edge_media_to_caption']['edges'][0]['node']['text'] if media['edge_media_to_caption']['edges'] else '', 'shares_count': media.get('edge_web_media_to_related_media', {}).get('count', 0) } logger.info(f"解析成功 (REST): {post_data['shortcode']}") return post_data except KeyError as e: logger.error(f"解析失败 (REST): 缺少键 {e}") logger.debug(f"响应数据: {json.dumps(json_data, indent=2)}") return None def parse_graphql_data(self, json_data): """解析GraphQL返回的数据""" try: # 增强空值检查 if not json_data or 'data' 
not in json_data or not json_data['data']: logger.error(f"GraphQL返回无效数据: {json.dumps(json_data)[:200] if json_data else 'None'}") return None # 使用get方法安全访问嵌套属性 media = json_data.get('data', {}).get('shortcode_media') if not media: logger.error(f"GraphQL响应缺少shortcode_media: {json.dumps(json_data)[:200]}") return None # 安全访问其他属性 caption = "" if media.get('edge_media_to_caption', {}).get('edges'): caption = media['edge_media_to_caption']['edges'][0]['node']['text'] post_data = { 'id': media.get('id', ''), 'shortcode': media.get('shortcode', ''), 'likes_count': media.get('edge_media_preview_like', {}).get('count', 0), 'comments_count': media.get('edge_media_to_comment', {}).get('count', 0), 'timestamp': datetime.fromtimestamp(media.get('taken_at_timestamp', time.time())).strftime('%Y-%m-%d %H:%M:%S'), 'owner': media.get('owner', {}).get('username', ''), 'is_video': media.get('is_video', False), 'caption': caption, 'shares_count': media.get('edge_web_media_to_related_media', {}).get('count', 0) } logger.info(f"解析成功 (GraphQL): {post_data['shortcode']}") return post_data except Exception as e: logger.error(f"解析GraphQL数据时出错: {str(e)}") logger.debug(f"完整响应数据: {json.dumps(json_data, indent=2)[:500]}") return None def parse_html_data(self, html): """直接解析HTML页面内容获取数据""" try: # 方法1: 尝试从脚本标签提取结构化JSON数据 script_pattern = r'<script type="text/javascript">window\.__additionalDataLoaded\(\'[^\']+\',(.*?)\);</script>' script_match = re.search(script_pattern, html, re.DOTALL) if script_match: script_data = script_match.group(1).strip() if script_data.endswith(';'): script_data = script_data[:-1] try: json_data = json.loads(script_data) media = json_data.get('graphql', {}).get('shortcode_media') if media: # 复用已有的解析逻辑 post_data = { 'id': media.get('id', self.post_shortcode), 'shortcode': media.get('shortcode', self.post_shortcode), 'likes_count': media.get('edge_media_preview_like', {}).get('count', 0), 'comments_count': media.get('edge_media_to_comment', {}).get('count', 0), 
'timestamp': datetime.fromtimestamp(media['taken_at_timestamp']).strftime('%Y-%m-%d %H:%M:%S') if 'taken_at_timestamp' in media else '', 'owner': media.get('owner', {}).get('username', ''), 'is_video': media.get('is_video', False), 'caption': media.get('edge_media_to_caption', {}).get('edges', [{}])[0].get('node', {}).get('text', ''), 'shares_count': media.get('edge_web_media_to_related_media', {}).get('count', 0) } logger.info("从HTML脚本中成功解析结构化数据") return post_data except json.JSONDecodeError as e: logger.warning(f"HTML中的JSON解析失败: {str(e)}") except KeyError as e: logger.warning(f"HTML JSON结构缺少键: {str(e)}") # 方法2: 备用方法 - 直接正则提取关键字段 logger.info("尝试备用HTML解析方法") # 提取点赞数 likes_match = re.search(r'"likeCount":(\d+)', html) or re.search(r'"likes":\s*{\s*"count":\s*(\d+)', html) # 提取评论数 comments_match = re.search(r'"commentCount":(\d+)', html) or re.search(r'"comments":\s*{\s*"count":\s*(\d+)', html) # 提取分享数 shares_match = re.search(r'"shareCount":(\d+)', html) or re.search(r'"shares":\s*{\s*"count":\s*(\d+)', html) # 提取作者 owner_match = re.search(r'"owner":\s*{\s*"username":\s*"([^"]+)"', html) or re.search(r'"owner_username":\s*"([^"]+)"', html) # 提取时间戳 timestamp_match = re.search(r'"uploadDate":\s*"([^"]+)"', html) or re.search(r'"taken_at_timestamp":\s*(\d+)', html) # 提取描述 caption_match = re.search(r'"caption":\s*"((?:[^"\\]|\\.)*)"', html) if caption_match: # 处理转义字符 caption = caption_match.group(1).encode('utf-8').decode('unicode_escape') else: caption = '' # 提取视频标志 is_video = bool(re.search(r'"is_video":\s*true', html)) or bool(re.search(r'"video_url":\s*"', html)) # 处理时间戳格式 timestamp_str = '' if timestamp_match: if timestamp_match.group(1).isdigit(): timestamp = datetime.fromtimestamp(int(timestamp_match.group(1))) timestamp_str = timestamp.strftime('%Y-%m-%d %H:%M:%S') else: timestamp_str = timestamp_match.group(1) post_data = { 'id': self.post_shortcode, 'shortcode': self.post_shortcode, 'likes_count': int(likes_match.group(1)) if likes_match else 0, 
'comments_count': int(comments_match.group(1)) if comments_match else 0, 'shares_count': int(shares_match.group(1)) if shares_match else 0, 'timestamp': timestamp_str, 'owner': owner_match.group(1) if owner_match else 'unknown', 'is_video': is_video, 'caption': caption } logger.info("备用HTML解析方法成功") return post_data except Exception as e: logger.error(f"HTML解析失败: {str(e)}") logger.debug(f"HTML片段:\n{html[:1000]}") return None def print_results(self, post_data): if not post_data: logger.warning("没有数据可展示") return print("\n" + "=" * 50) print(f"Instagram 帖子分析结果: {self.target_url}") print("=" * 50) print(f"帖子ID: {post_data.get('id', '')}") print(f"发布时间: {post_data.get('timestamp', '')}") print(f"发布者: @{post_data.get('owner', '')}") print("-" * 50) print(f"点赞数: {post_data.get('likes_count', 0):,}") print(f"评论数: {post_data.get('comments_count', 0):,}") print(f"转发/分享数: {post_data.get('shares_count', 0):,}") print("=" * 50) print(f"内容描述: {post_data.get('caption', '')[:200]}...") print("=" * 50) def main(): # 目标帖子URL post_url = "https://www.instagram.com/p/DP1z9ZUDiAL/?igsh=ajBqN282aWtxaXZv" # 创建修复后的会话实例 fixer = InstagramSessionFixer(post_url) # 获取帖子数据 post_data = fixer.get_post_data() # 打印结果 if post_data: fixer.print_results(post_data) else: logger.error("无法获取帖子数据,请检查cookies和网络连接") if __name__ == '__main__': main()
11-05
import pickle from selenium import webdriver from selenium.webdriver.chrome.service import Service from webdriver_manager.chrome import ChromeDriverManager from selenium.webdriver.common.by import By from bs4 import BeautifulSoup import pandas as pd # 初始化 WebDriver driver = webdriver.Chrome(service=Service(ChromeDriverManager().install())) # 打开目标网站 driver.get("https://www.kugou.com/yy/rank/home/1-8888.html?from=rank") # 尝试加载已保存的 Cookies try: with open("cookies.pkl", "rb") as f: cookies = pickle.load(f) for cookie in cookies: driver.add_cookie("kg_mid=bf50351db5d9993d151316e209bf545d; kg_dfid=3ewdoL0WIMsM3q5DeB3zlxTc; kg_dfid_collect=d41d8cd98f00b204e9800998ecf8427e; Hm_lvt_aedee6983d4cfc62f509129360d6bb3d=1749977319,1749985793; HMACCOUNT=39955928486371AA; ACK_SERVER_10015=%7B%22list%22%3A%5B%5B%22bjlogin-user.kugou.com%22%5D%5D%7D; KuGoo=KugooID=2364568992&KugooPwd=373E535A78CBF0D3FB13821009F1F2DB&NickName=%u0032%u0033%u0036%u0034%u0035%u0036%u0038%u0039%u0039%u0032&Pic=http://imge.kugou.com/kugouicon/165/20100101/20100101192931478054.jpg&RegState=1&RegFrom=&t=9fb1f3141df1ac240c53f19a7db2515799860c3e4af8f6751bac018418504b9f&a_id=1014&ct=1749985835&UserName=%u0032%u0033%u0036%u0034%u0035%u0036%u0038%u0039%u0039%u0032&t1=; KugooID=2364568992; t=9fb1f3141df1ac240c53f19a7db2515799860c3e4af8f6751bac018418504b9f; a_id=1014; UserName=2364568992; mid=bf50351db5d9993d151316e209bf545d; dfid=3ewdoL0WIMsM3q5DeB3zlxTc; Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d=1749985850") # 添加 Cookies print("Cookies 加载成功") except FileNotFoundError: print("未找到 Cookies 文件,需要重新登录") # 刷新页面以应用 Cookies driver.refresh() # 如果需要手动登录,则保存新的 Cookies if not driver.current_url.startswith("https://www.kugou.com/yy/rank/home/1-8888.html?from=rank"): input("请手动完成登录后按 Enter 键继续...") with open("cookies.pkl", "wb") as f: pickle.dump(driver.get_cookies(), f) print("新的 Cookies 已保存") # 抓取动态内容 driver.get("https://www.kugou.com/yy/rank/home/1-8888.html?from=rank") # 替换为目标页面 URL html = driver.page_source soup = 
BeautifulSoup(html, 'html.parser') # 提取数据 titles = [item.text for item in soup.find_all('h2', class_='title')] links = [item['href'] for item in soup.find_all('a', class_='link')] # 存储为 DataFrame data = {'Title': titles, 'Link': links} df = pd.DataFrame(data) # 保存到 CSV 文件 df.to_csv('output.csv', index=False, encoding='utf-8') print("数据已保存到 output.csv") # 关闭浏览器 driver.quit() 为什么不能正常运行
06-16
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值