request.env["HTTP_REFERER"]

This post describes how to use request.env["HTTP_REFERER"] in the Rails framework to obtain the URL of the previous request, which is useful for features such as redirecting users back to the page they came from.

Rails:

request.env["HTTP_REFERER"] returns the URL of the previous request.
