#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
import pandas as pd
import time
import json
import random
import logging
from datetime import datetime
from urllib.parse import quote
# 配置日志
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(f"purina_followers_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
logging.StreamHandler()
]
)
logger = logging.getLogger('InstagramFollowerSpider')
class InstagramSpider:
def __init__(self, target_username="purina"):
self.session = requests.Session()
self.target_username = target_username
self.target_user_id = None
self.collected_followers = []
self.max_followers = 10000 # 目标粉丝数量设置为1万
# 用户代理池
self.user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:124.0) Gecko/20100101 Firefox/124.0'
]
# 更新为您的有效会话信息(从浏览器获取)
self.session_info = {
'sessionid': '61971702457%3ATzxhAREIa44MuZ%3A8%3AAYif5HvMvEBVkSU3zaOvBbECHuQlzwAl9bCFEhgEDw',
'csrftoken': '4YzNalzWguviI-qNJs0ftx',
'ds_user_id': '61971702457',
'rur': '"FRC\\05461971702457\\0541793525962:01fee321c68ce3d9469f80c0b0e109571e0fc50adbaa2c7741cd4e21588051ad8d43a565"',
'mid': 'aQRkcwALAAHwafiIiIA8Jae8jgv1',
'ig_did': '2661FC9A-4FB7-4383-B1EB-714C7A6A54E8',
}
self.update_headers()
self.update_session_cookies()
self.request_count = 0
self.last_request_time = 0
# GraphQL API 配置
self.graphql_url = "https://www.instagram.com/graphql/query/"
self.followers_query_hash = "37479f2b8209594dde7facb0d904896a"
self.max_retries = 3
self.batch_size = 50 # 每批请求的数量
def update_headers(self):
"""更新请求头"""
self.headers = {
'User-Agent': random.choice(self.user_agents),
'X-IG-App-ID': '936619743392459',
'X-ASBD-ID': '198387',
'X-Requested-With': 'XMLHttpRequest',
'Referer': f'https://www.instagram.com/{self.target_username}/followers/',
'Accept': 'application/json',
'Accept-Language': 'en-US,en;q=0.9',
'X-IG-WWW-Claim': '0',
}
def update_session_cookies(self):
"""更新会话cookies"""
self.session.cookies.update(self.session_info)
def rate_limit(self, min_delay=3.0):
"""请求速率控制"""
elapsed = time.time() - self.last_request_time
if elapsed < min_delay:
sleep_time = min_delay - elapsed
logger.info(f"等待 {sleep_time:.1f}秒...")
time.sleep(sleep_time)
self.last_request_time = time.time()
def safe_request(self, url, params=None, max_retries=3):
"""安全请求方法,带重试和错误处理"""
self.rate_limit()
self.request_count += 1
for attempt in range(max_retries):
try:
response = self.session.get(
url,
headers=self.headers,
params=params,
timeout=30
)
# 调试日志
logger.debug(f"请求 #{self.request_count}: {response.status_code} {url}")
# 检查状态码
if response.status_code == 200:
try:
return response.json(), response.status_code
except json.JSONDecodeError:
logger.error(f"JSON解析失败: {response.text[:200]}...")
# 尝试解析为文本
return response.text, response.status_code
# 处理403禁止访问
elif response.status_code == 403:
logger.error(f"403禁止访问: {url}")
if attempt == 0:
logger.info("尝试刷新会话信息...")
self.update_session_cookies()
continue
# 其他错误状态
else:
logger.error(f"请求失败: {response.status_code} {response.reason}")
# 添加指数退避
time.sleep(2 ** attempt)
continue
except requests.exceptions.RequestException as e:
logger.error(f"请求异常: {str(e)}")
time.sleep(2 ** attempt) # 指数退避延迟
logger.error(f"请求失败超过最大重试次数: {url}")
return None, 500
def get_target_user_id(self):
"""获取目标博主(purina)的用户ID"""
logger.info(f"获取目标博主ID: @{self.target_username}")
profile_url = f'https://www.instagram.com/api/v1/users/web_profile_info/?username={self.target_username}'
json_data, status_code = self.safe_request(profile_url)
if json_data and status_code == 200:
try:
self.target_user_id = json_data['data']['user']['id']
logger.info(f" 目标博主ID获取成功: {self.target_user_id}")
return self.target_user_id
except (KeyError, TypeError):
logger.error(f"解析用户ID失败: {json.dumps(json_data, indent=2)}")
return None
else:
logger.error(f"获取用户ID失败,状态码: {status_code}")
return None
def get_follower_list_graphql(self, count=50, after=None, counter=0):
"""
使用GraphQL API获取粉丝列表
:param count: 每次请求获取的粉丝数量
:param after: 分页游标
:param counter: 当前重试次数
:return: 粉丝数据字典
"""
variables = {
"id": self.target_user_id,
"include_reel": True,
"fetch_mutual": False,
"first": count
}
if after:
variables["after"] = after
# 将变量JSON编码并URL转义
json_str = json.dumps(variables)
encoded_str = quote(json_str)
# 构建URL
url = f"{self.graphql_url}?query_hash={self.followers_query_hash}&variables={encoded_str}"
# 发送请求
json_data, status_code = self.safe_request(url)
counter += 1
if not json_data:
logger.error("获取数据失败,响应为空")
if counter < self.max_retries:
logger.info(f"重试请求 ({counter}/{self.max_retries})")
return self.get_follower_list_graphql(count, after, counter)
return None
# 检查响应状态
if isinstance(json_data, dict) and json_data.get("status", "") != "ok":
logger.error(f"GraphQL响应状态错误: {json_data.get('message', '未知错误')}")
if counter < self.max_retries:
# 增加延迟
delay = random.randint(5, 15)
logger.info(f"等待 {delay} 秒后重试...")
time.sleep(delay)
return self.get_follower_list_graphql(count, after, counter)
return None
return json_data
def get_follower_list(self):
"""获取purina的粉丝列表"""
if not self.target_user_id:
logger.error("未获取到目标博主ID")
return False
logger.info(f"开始采集 {self.target_username} 的粉丝数据,目标: {self.max_followers}位粉丝")
collected = 0
after = None # 初始游标为空
while collected < self.max_followers:
logger.info(f"正在获取下一批粉丝 (已采集: {collected}/{self.max_followers})")
# 获取粉丝数据
json_data = self.get_follower_list_graphql(self.batch_size, after)
if not json_data:
logger.error("获取粉丝列表失败,跳过此批次")
break
try:
# 解析粉丝数据
edges = json_data['data']['user']['edge_followed_by']['edges']
# 如果没有粉丝数据,结束循环
if not edges:
logger.info("没有更多粉丝数据")
break
# 处理当前批次的粉丝
for edge in edges:
if collected >= self.max_followers:
break
node = edge['node']
logger.debug(f"处理粉丝 #{collected}: {node.get('username')}")
# 采集粉丝基本数据
follower_data = {
'博主名称': self.target_username,
'粉丝主页名称': node.get('username', ''),
'粉丝ID': node.get('id', ''),
'粉丝全名': node.get('full_name', ''),
'是否私密账号': node.get('is_private', False),
'是否已验证': node.get('is_verified', False),
'主页链接': f"https://www.instagram.com/{node.get('username', '')}/",
'详情获取状态': '未获取'
}
# 添加到结果集
self.collected_followers.append(follower_data)
collected += 1
# 获取分页游标
page_info = json_data['data']['user']['edge_followed_by']['page_info']
if page_info['has_next_page']:
after = page_info['end_cursor']
logger.info(f"获取到下一页游标: {after[:20]}...")
else:
logger.info("已到达粉丝列表末尾")
break
# 获取部分粉丝详情(每10个粉丝获取1个详情)
if collected % 10 == 0 and collected > 0:
idx = random.randint(0, len(self.collected_followers)-1)
self.get_follower_details(self.collected_followers[idx])
except KeyError as e:
logger.error(f"解析粉丝列表出错: 缺少键 {e}")
logger.debug(f"响应数据: {json.dumps(json_data, indent=2)}")
break
# 随机延迟防止封禁
delay = random.uniform(5.0, 10.0)
logger.info(f"完成批次采集,等待 {delay:.1f} 秒...")
time.sleep(delay)
logger.info(f"粉丝采集完成,共采集 {len(self.collected_followers)} 位粉丝")
# 获取所有粉丝的详细信息(可选)
logger.info("开始获取粉丝详细信息...")
for i, follower in enumerate(self.collected_followers):
if i % 10 == 0: # 每10个获取一次详情
self.get_follower_details(follower)
return True
def get_follower_details(self, follower_data):
"""获取粉丝详细信息"""
username = follower_data['粉丝主页名称']
if not username:
return
logger.info(f"获取粉丝详情: @{username}")
# 方法1: 用户详情端点
details_url = f'https://www.instagram.com/api/v1/users/web_profile_info/?username={username}'
json_data, status_code = self.safe_request(details_url)
if json_data and status_code == 200:
try:
user = json_data['data']['user']
# 提取详细信息
follower_data.update({
'bio信息': user.get('biography', ''),
'外部链接': user.get('external_url', ''),
'公开邮箱': user.get('public_email', ''),
'公开电话': user.get('public_phone_number', ''),
'商业联系方式': self.extract_business_contact(user),
'粉丝数': user.get('edge_followed_by', {}).get('count', 0),
'关注数': user.get('edge_follow', {}).get('count', 0),
'帖子数': user.get('edge_owner_to_timeline_media', {}).get('count', 0),
'是否商业账户': user.get('is_business', False),
'是否专业账户': user.get('is_professional_account', False),
'详情获取状态': '成功'
})
logger.info(f"详情获取成功: @{username} (粉丝:{follower_data['粉丝数']})")
return
except KeyError as e:
logger.warning(f"用户详情结构异常: 缺少字段 {str(e)}")
# 方法2: 备用方案 - 关注关系接口
logger.info(f"尝试备用方法获取详情: @{username}")
friendship_url = f"https://www.instagram.com/api/v1/friendships/show/{follower_data['粉丝ID']}/"
json_data, status_code = self.safe_request(friendship_url)
if json_data and status_code == 200:
try:
user = json_data['user']
follower_data.update({
'bio信息': user.get('biography', ''),
'外部链接': user.get('external_url', ''),
# 从该接口获取统计信息
'粉丝数': user.get('follower_count', 0),
'关注数': user.get('following_count', 0),
'帖子数': user.get('media_count', 0),
'详情获取状态': '成功(备用方法)'
})
logger.info(f"备用方法获取详情成功: @{username}")
except KeyError:
logger.warning(f"备用详情方法失败: @{username}")
follower_data['详情获取状态'] = '部分失败'
else:
logger.warning(f"无法获取粉丝详情: @{username}")
follower_data['详情获取状态'] = '失败'
def extract_business_contact(self, user_data):
"""提取商业联系方式"""
try:
# 尝试获取商业联系方式
business_info = user_data.get('business_contact_info', {})
return {
'邮箱': business_info.get('business_email', business_info.get('email', '')),
'电话': business_info.get('business_phone_number', business_info.get('phone_number', '')),
'地址': self.format_address(business_info)
}
except:
return "无商业联系方式"
def format_address(self, business_info):
"""格式化地址信息"""
address_parts = []
for field in ['address_street', 'city_name', 'zip', 'region_name']:
if business_info.get(field):
address_parts.append(str(business_info[field]))
return ", ".join(address_parts) if address_parts else "无地址信息"
def save_to_excel(self):
"""保存数据到Excel文件"""
if not self.collected_followers:
logger.warning("没有数据可保存")
return False
filename = f"purina_followers_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx"
try:
# 创建DataFrame
df = pd.DataFrame(self.collected_followers)
# 保存到Excel
df.to_excel(filename, index=False)
logger.info(f"数据已保存到 {filename}")
return True
except Exception as e:
logger.error(f"保存文件失败: {str(e)}")
return False
def main(self):
logger.info(f"开始采集 @{self.target_username} 的粉丝数据")
# 获取purina的用户ID
if not self.get_target_user_id():
logger.error(" 无法获取博主ID,程序终止")
return
# 获取粉丝列表和详情(使用GraphQL接口)
self.get_follower_list()
# 保存数据
self.save_to_excel()
logger.info(f"总请求次数: {self.request_count}")
logger.info("程序执行完成")
if __name__ == '__main__':
# 采集purina的所有粉丝(最多1万条)
spider = InstagramSpider(target_username="purina")
spider.main()
为啥代码跑的停不下来呢,抓1w条数据,有没有改进的地方呢?