还是没解析到目标数据啊,什么原因呢?这是我的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import json
import time
import random
import re
from datetime import datetime
from urllib.parse import quote, urlparse, parse_qs
import logging
import browser_cookie3
# Configure logging: every record goes both to a timestamped log file and to
# the console. The file handler is opened explicitly as UTF-8 because the log
# messages in this script are Chinese; relying on the platform's default
# encoding can fail to write them on systems whose locale is not UTF-8.
log_file_handler = logging.FileHandler(
    f"instagram_fixed_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
    encoding='utf-8',
)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[log_file_handler, logging.StreamHandler()],
)
# Module-wide logger used by everything below.
logger = logging.getLogger('InstagramFixed')
class InstagramSessionFixer:
    """Fetch metadata for one Instagram post through a logged-in web session.

    get_post_data() tries three sources in order: the ``?__a=1&__d=dis`` JSON
    view of the post page, the GraphQL ``query_hash`` endpoint, and finally
    the post's HTML. Each parser returns the same flat dict (``id``,
    ``shortcode``, ``likes_count``, ``comments_count``, ``shares_count``,
    ``timestamp``, ``owner``, ``is_video``, ``caption``) or ``None``.
    """

    # Retry budget for safe_request(). Without a bound, a persistent 403
    # made the method call itself until the interpreter's recursion limit.
    MAX_RETRIES = 3
    # Seconds before a single HTTP call is abandoned.
    REQUEST_TIMEOUT = 30
    # Everything a malformed payload can raise while it is being parsed.
    PARSE_ERRORS = (
        AttributeError,
        IndexError,
        KeyError,
        TypeError,
        ValueError,
        OverflowError,
        OSError,
    )

    def __init__(self, target_url, cookies=None):
        """Prepare a requests session for *target_url*.

        Args:
            target_url: URL of the post to inspect.
            cookies: optional mapping of cookie name -> value that overrides
                or extends the built-in cookie set.
        """
        self.target_url = target_url
        self.post_shortcode = self.extract_shortcode(target_url)
        self.session = requests.Session()
        self.user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0',
        ]
        # NOTE(security): these are live session credentials committed in
        # source. Anyone who can read this file can act as the account.
        # Prefer passing `cookies=` (or importing them from the browser via
        # get_browser_cookies) and rotate the session that was exposed here.
        self.cookies = {
            'ds_user_id': '61971702457',
            'csrftoken': '4YzNalzWguviI-qNJs0ftx',
            'ig_did': '2661FC9A-4FB7-4383-B1EB-714C7A6A54E8',
            'dpr': '1.25',
            'mid': 'aQRkcwALAAHwafiIiIA8Jae8jgv1',
            'ps_l': '1',
            'ps_n': '1',
            'sessionid': '61971702457%3ATzxhAREIa44MuZ%3A8%3AAYiF0jwFfuAmow6DcWxvsuow1VMgOTvnFfnf-S89PE8',
            'wd': '1536x319',
            'rur': '"CLN\\05461971702457\\0541793762034:01fe97acad5ea153bccfffef275c80d52863f3d5533563761ada96005c16b0ba99a4a9b5"',
            # Additional cookie sent alongside the session cookies.
            'ig_nrcb': '1',
        }
        if cookies:
            self.cookies.update(cookies)
        # query_hash used for the GraphQL post query.
        self.post_query_hash = "37479f2b8209594dde7facb0d904896a"
        self.session.cookies.update(self.cookies)
        self.update_headers()
        self.attempt_count = 0

    def extract_shortcode(self, url):
        """Return the post shortcode embedded in *url*, or None.

        Accepts ``/p/<code>``, ``/reel/<code>`` and ``/tv/<code>`` links, with
        or without ``www.`` and with any query string or fragment.
        """
        if not url:
            return None
        match = re.search(
            r"https?://(?:www\.)?instagram\.com/(?:p|reel|tv)/([^/?#]+)", url
        )
        return match.group(1) if match else None

    def update_headers(self):
        """Rebuild self.headers with a randomly chosen User-Agent."""
        # Example target: https://www.instagram.com/p/DP1z9ZUDiAL/?igsh=ajBqN282aWtxaXZv
        self.headers = {
            'User-Agent': random.choice(self.user_agents),
            'X-IG-App-ID': '936619743392459',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': f'https://www.instagram.com/p/{self.post_shortcode}/',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'X-CSRFToken': self.cookies.get('csrftoken', ''),
            'X-IG-WWW-Claim': 'hmac.AR2d7q7Yf4zE1HjzB8DlUcX0nqI9o9cVkQr1SQV7w8XqE3JxZf',
            'Connection': 'keep-alive',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'TE': 'trailers',
            'priority': 'u=0, i',
            'upgrade-insecure-requests': '1',
        }

    def refresh_csrf_token(self):
        """Load the home page and adopt the csrftoken cookie it sets, if any.

        Network failures are logged and swallowed: this is a best-effort
        refresh and callers continue with the token they already have.
        """
        try:
            response = self.session.get(
                "https://www.instagram.com/",
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            logger.error(f"刷新CSRF令牌失败: {str(e)}")
            return
        if 'csrftoken' in response.cookies:
            self.cookies['csrftoken'] = response.cookies['csrftoken']
            self.session.cookies.update(self.cookies)
            self.headers['X-CSRFToken'] = self.cookies['csrftoken']
            logger.info("CSRF令牌已刷新")

    def validate_session(self):
        """Return True if any probe endpoint answers 200 with status 'ok'."""
        test_urls = [
            "https://www.instagram.com/api/v1/users/web_profile_info/?username=instagram",
            "https://www.instagram.com/api/v1/feed/reels_tray/",
        ]
        for url in test_urls:
            try:
                response = self.session.get(url, headers=self.headers, timeout=10)
                if response.status_code == 200 and response.json().get('status') == 'ok':
                    logger.info("会话验证成功")
                    return True
            except (requests.exceptions.RequestException, ValueError, AttributeError) as e:
                # Network error, non-JSON body, or JSON that is not an
                # object: this probe failed, try the next one.
                logger.debug(f"会话验证请求失败 {url}: {str(e)}")
                continue
        logger.warning("所有会话验证请求均失败")
        return False

    def get_browser_cookies(self, browser_type='chrome'):
        """Copy instagram.com cookies from a local browser into the session.

        Args:
            browser_type: 'chrome', 'firefox' or 'edge'; any other value lets
                browser_cookie3 pick among the installed browsers.

        Returns:
            True when the cookies were imported, False on any failure.
        """
        try:
            loaders = {
                'chrome': browser_cookie3.chrome,
                'firefox': browser_cookie3.firefox,
                'edge': browser_cookie3.edge,
            }
            load = loaders.get(browser_type, browser_cookie3.load)
            cookies = load(domain_name='.instagram.com')
            for cookie in cookies:
                # Keep the session jar and the local dict in step.
                self.session.cookies.set(cookie.name, cookie.value)
                self.cookies[cookie.name] = cookie.value
            logger.info(f"成功从{browser_type}导入{len(cookies)}个cookies")
            self.refresh_csrf_token()
            return True
        except Exception as e:
            # browser_cookie3 raises a variety of errors (missing profile,
            # locked database, decryption failure); all of them just mean
            # "no cookies available".
            logger.error(f"导入浏览器cookies失败: {str(e)}")
            return False

    def safe_request(self, url, max_retries=None):
        """GET *url* and return the decoded JSON body, or None on failure.

        A random 1.5-3.5 s pause precedes every request, and the CSRF token
        is refreshed on every third call. On 401 the browser cookies are
        re-imported and on 403 the CSRF token is refreshed; either way the
        request is retried, at most *max_retries* times in total.

        Args:
            url: address to fetch.
            max_retries: remaining automatic retries; defaults to MAX_RETRIES.
        """
        if max_retries is None:
            max_retries = self.MAX_RETRIES
        self.attempt_count += 1
        if self.attempt_count % 3 == 0:
            self.refresh_csrf_token()
            logger.info("已刷新请求头部")
        delay = random.uniform(1.5, 3.5)
        logger.info(f"请求前等待 {delay:.1f}秒...")
        time.sleep(delay)
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.exceptions.RequestException as e:
            logger.error(f"请求异常: {str(e)}")
            return None
        logger.debug(f"响应状态码: {response.status_code}")
        logger.debug(f"响应头: {response.headers}")

        status = response.status_code
        if status == 200:
            logger.info("请求成功 (200 OK)")
            try:
                return response.json()
            except ValueError as e:
                # A 200 with a non-JSON body (for example an HTML login
                # page) is a failed fetch, not a crash.
                logger.error(f"响应不是有效的JSON: {str(e)}")
                return None
        if status == 401:
            logger.error("会话已过期 (401 Unauthorized)")
            if max_retries > 0 and self.get_browser_cookies():
                return self.safe_request(url, max_retries - 1)
        elif status == 403:
            logger.error("访问被拒绝 (403 Forbidden)")
            if max_retries > 0:
                self.refresh_csrf_token()
                return self.safe_request(url, max_retries - 1)
        elif status == 404:
            logger.error("资源不存在 (404 Not Found)")
        else:
            logger.error(f"请求失败: {response.status_code} {response.reason}")
        return None

    def get_html_content(self):
        """Return the post page's HTML, or None if it cannot be fetched."""
        url = f"https://www.instagram.com/p/{self.post_shortcode}/"
        try:
            response = self.session.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.exceptions.RequestException as e:
            logger.error(f"获取HTML失败: {str(e)}")
            return None
        if response.status_code == 200:
            logger.info("成功获取HTML内容")
            return response.text
        logger.error(f"获取HTML失败: {response.status_code}")
        return None

    def get_post_data(self):
        """Return the post's metadata dict, or None if every source fails."""
        if not self.post_shortcode:
            logger.error("无效的Instagram URL")
            return None
        if not self.validate_session():
            logger.warning("会话无效,尝试从浏览器导入cookies...")
            self.get_browser_cookies()

        # Source 1: JSON view of the post page.
        rest_api_url = f"https://www.instagram.com/p/{self.post_shortcode}/?__a=1&__d=dis"
        logger.info(f"尝试REST API端点: {rest_api_url}")
        rest_data = self.safe_request(rest_api_url)
        if rest_data and 'graphql' in rest_data:
            parsed_data = self.parse_rest_data(rest_data)
            if parsed_data:
                return parsed_data

        # Source 2: GraphQL query addressed by query_hash.
        logger.warning("REST API失败,尝试GraphQL端点")
        variables = {
            "shortcode": self.post_shortcode,
            "child_comment_count": 3,
            "fetch_comment_count": 40,
            "parent_comment_count": 24,
            "has_threaded_comments": True,
        }
        encoded_str = quote(json.dumps(variables))
        graphql_url = f"https://www.instagram.com/graphql/query/?query_hash={self.post_query_hash}&variables={encoded_str}"
        logger.info(f"请求GraphQL: {graphql_url[:100]}...")
        graphql_data = self.safe_request(graphql_url)
        if graphql_data and 'data' in graphql_data:
            parsed_data = self.parse_graphql_data(graphql_data)
            if parsed_data:
                return parsed_data
        else:
            logger.error("GraphQL请求失败")

        # Source 3: scrape the rendered page as a last resort.
        logger.warning("所有API失败,尝试直接解析HTML")
        html_data = self.get_html_content()
        if html_data:
            parsed_data = self.parse_html_data(html_data)
            if parsed_data:
                return parsed_data
        logger.error("所有方法均无法获取帖子数据")
        return None

    @staticmethod
    def _decode_json_string(raw):
        """Decode the body of a JSON string literal (no surrounding quotes).

        Handles ``\\n``, ``\\uXXXX`` and surrogate pairs, and leaves
        characters that are already non-ASCII untouched. Falls back to *raw*
        when the text is not a valid JSON string body.
        """
        try:
            # strict=False tolerates literal control characters.
            return json.loads(f'"{raw}"', strict=False)
        except ValueError:
            return raw

    @staticmethod
    def _build_post_data(media, fallback_id=''):
        """Flatten a ``shortcode_media`` dict into the common record.

        Missing or null sub-objects yield neutral defaults (0, '', False)
        instead of raising; *fallback_id* stands in for a missing id or
        shortcode.
        """
        def count_of(edge_name):
            return (media.get(edge_name) or {}).get('count', 0)

        caption = ''
        caption_edges = (media.get('edge_media_to_caption') or {}).get('edges') or []
        if caption_edges:
            # A post without a caption has an empty edge list.
            caption = ((caption_edges[0] or {}).get('node') or {}).get('text') or ''

        taken_at = media.get('taken_at_timestamp')
        timestamp = ''
        if taken_at is not None:
            timestamp = datetime.fromtimestamp(taken_at).strftime('%Y-%m-%d %H:%M:%S')

        return {
            'id': media.get('id', fallback_id),
            'shortcode': media.get('shortcode', fallback_id),
            'likes_count': count_of('edge_media_preview_like'),
            'comments_count': count_of('edge_media_to_comment'),
            'timestamp': timestamp,
            'owner': (media.get('owner') or {}).get('username', ''),
            'is_video': media.get('is_video', False),
            'caption': caption,
            # NOTE(review): this edge's name suggests related media rather
            # than shares; confirm it is the intended source for the figure.
            'shares_count': count_of('edge_web_media_to_related_media'),
        }

    @staticmethod
    def _scrape_html_fields(html, shortcode):
        """Regex-scrape individual fields out of raw page HTML.

        Returns the common record, or None when none of the like, comment,
        share, owner, timestamp or caption patterns matched, so that an
        unrelated page (for example a login wall) is not reported as a
        successfully parsed post.
        """
        def first_match(*patterns):
            for pattern in patterns:
                found = re.search(pattern, html)
                if found:
                    return found
            return None

        likes = first_match(r'"likeCount":(\d+)', r'"likes":\s*{\s*"count":\s*(\d+)')
        comments = first_match(r'"commentCount":(\d+)', r'"comments":\s*{\s*"count":\s*(\d+)')
        shares = first_match(r'"shareCount":(\d+)', r'"shares":\s*{\s*"count":\s*(\d+)')
        owner = first_match(r'"owner":\s*{\s*"username":\s*"([^"]+)"', r'"owner_username":\s*"([^"]+)"')
        stamp = first_match(r'"uploadDate":\s*"([^"]+)"', r'"taken_at_timestamp":\s*(\d+)')
        caption = re.search(r'"caption":\s*"((?:[^"\\]|\\.)*)"', html)
        if not any((likes, comments, shares, owner, stamp, caption)):
            return None

        timestamp_str = ''
        if stamp:
            raw_stamp = stamp.group(1)
            if raw_stamp.isdigit():
                # Epoch seconds; otherwise keep the date string as found.
                timestamp_str = datetime.fromtimestamp(int(raw_stamp)).strftime('%Y-%m-%d %H:%M:%S')
            else:
                timestamp_str = raw_stamp

        is_video = bool(
            re.search(r'"is_video":\s*true', html)
            or re.search(r'"video_url":\s*"', html)
        )
        return {
            'id': shortcode,
            'shortcode': shortcode,
            'likes_count': int(likes.group(1)) if likes else 0,
            'comments_count': int(comments.group(1)) if comments else 0,
            'shares_count': int(shares.group(1)) if shares else 0,
            'timestamp': timestamp_str,
            'owner': owner.group(1) if owner else 'unknown',
            'is_video': is_video,
            'caption': (
                InstagramSessionFixer._decode_json_string(caption.group(1))
                if caption else ''
            ),
        }

    def parse_rest_data(self, json_data):
        """Parse the ``?__a=1`` response; return the record or None."""
        try:
            media = json_data.get('graphql', {}).get('shortcode_media')
            if not media:
                logger.error("REST响应中缺少shortcode_media")
                return None
            post_data = self._build_post_data(media)
            logger.info(f"解析成功 (REST): {post_data['shortcode']}")
            return post_data
        except self.PARSE_ERRORS as e:
            logger.error(f"解析失败 (REST): 缺少键 {e}")
            logger.debug(f"响应数据: {json.dumps(json_data, indent=2, default=str)}")
            return None

    def parse_graphql_data(self, json_data):
        """Parse the GraphQL response; return the record or None."""
        try:
            if not json_data or 'data' not in json_data or not json_data['data']:
                logger.error(f"GraphQL返回无效数据: {json.dumps(json_data)[:200] if json_data else 'None'}")
                return None
            media = json_data.get('data', {}).get('shortcode_media')
            if not media:
                logger.error(f"GraphQL响应缺少shortcode_media: {json.dumps(json_data)[:200]}")
                return None
            post_data = self._build_post_data(media)
            logger.info(f"解析成功 (GraphQL): {post_data['shortcode']}")
            return post_data
        except self.PARSE_ERRORS as e:
            logger.error(f"解析GraphQL数据时出错: {str(e)}")
            logger.debug(f"完整响应数据: {json.dumps(json_data, indent=2, default=str)[:500]}")
            return None

    def parse_html_data(self, html):
        """Extract the record from page HTML; return it or None.

        First looks for the JSON blob handed to
        ``window.__additionalDataLoaded``; if that is absent or unusable,
        falls back to scraping individual fields with regular expressions.
        """
        try:
            # Method 1: structured JSON embedded in a script tag.
            script_pattern = r'<script type="text/javascript">window\.__additionalDataLoaded\(\'[^\']+\',(.*?)\);</script>'
            script_match = re.search(script_pattern, html, re.DOTALL)
            if script_match:
                script_data = script_match.group(1).strip()
                if script_data.endswith(';'):
                    script_data = script_data[:-1]
                try:
                    json_data = json.loads(script_data)
                    media = None
                    if isinstance(json_data, dict):
                        media = (json_data.get('graphql') or {}).get('shortcode_media')
                    if media:
                        post_data = self._build_post_data(media, self.post_shortcode)
                        logger.info("从HTML脚本中成功解析结构化数据")
                        return post_data
                except json.JSONDecodeError as e:
                    logger.warning(f"HTML中的JSON解析失败: {str(e)}")
                except (AttributeError, IndexError, KeyError, TypeError) as e:
                    # Unexpected shape: fall through to the regex scrape.
                    logger.warning(f"HTML JSON结构缺少键: {str(e)}")

            # Method 2: field-by-field regex scrape.
            logger.info("尝试备用HTML解析方法")
            post_data = self._scrape_html_fields(html, self.post_shortcode)
            if post_data is None:
                logger.warning("备用HTML解析方法未匹配到任何字段")
                return None
            logger.info("备用HTML解析方法成功")
            return post_data
        except self.PARSE_ERRORS as e:
            logger.error(f"HTML解析失败: {str(e)}")
            logger.debug(f"HTML片段:\n{str(html)[:1000]}")
            return None

    def print_results(self, post_data):
        """Print a human-readable summary of *post_data* to stdout."""
        if not post_data:
            logger.warning("没有数据可展示")
            return
        print("\n" + "=" * 50)
        print(f"Instagram 帖子分析结果: {self.target_url}")
        print("=" * 50)
        print(f"帖子ID: {post_data.get('id', '')}")
        print(f"发布时间: {post_data.get('timestamp', '')}")
        print(f"发布者: @{post_data.get('owner', '')}")
        print("-" * 50)
        print(f"点赞数: {post_data.get('likes_count', 0):,}")
        print(f"评论数: {post_data.get('comments_count', 0):,}")
        print(f"转发/分享数: {post_data.get('shares_count', 0):,}")
        print("=" * 50)
        print(f"内容描述: {post_data.get('caption', '')[:200]}...")
        print("=" * 50)
def main():
    """Fetch the hard-coded sample post and print a summary of it."""
    # Post to analyse.
    target = "https://www.instagram.com/p/DP1z9ZUDiAL/?igsh=ajBqN282aWtxaXZv"
    client = InstagramSessionFixer(target)
    result = client.get_post_data()
    if not result:
        logger.error("无法获取帖子数据,请检查cookies和网络连接")
        return
    client.print_results(result)


if __name__ == '__main__':
    main()