import redis
import requests
import time
import json
from datetime import datetime, timedelta
import os
import math
import threading
from concurrent.futures import ThreadPoolExecutor
import pytz
from queue import Queue
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Dict, List
# FastAPI application
app = FastAPI(title="JD Comment Crawler API",
              description="Multi-threaded crawler for JD comment data with query endpoints")
# In-memory store for crawled data
crawled_data: Dict[str, List[dict]] = {}
data_lock = threading.Lock()
# Configuration
BASE_URL = "https://pjsj.jddj.com/jd/sku-comment/query-page"
ACCOUNTS = [
{"name": "钟红军", "cookie_file": "cookie.json", "current_page": 1},
# {"name": "account2", "cookie_file": "cookie2.json", "current_page": 1}
]
THREAD_POOL_SIZE = len(ACCOUNTS)
task_queue = Queue()
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB = 0
PASSWORD = '123456'
redis_client = redis.StrictRedis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=PASSWORD)
# Data models
class CommentItem(BaseModel):
id: str
content: str
create_time: str
score: int
class CrawlResponse(BaseModel):
status: str
message: str
data_count: int
# Helper functions
def load_cookies_from_json(cookie_file):
"""从json文件加载cookies"""
if not os.path.exists(cookie_file):
print(f"警告: cookie文件 {cookie_file} 不存在")
return None
try:
with open(cookie_file, 'r', encoding='utf-8') as f:
return json.load(f)
except Exception as e:
print(f"读取cookie文件 {cookie_file} 失败: {e}")
return None
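# Note: the loaded cookies are passed straight to requests' cookies= parameter, so
# cookie.json is assumed to be a flat JSON object mapping cookie names to values
# (this format is an assumption, not documented in the original code).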
# def save_crawled_data(account_name: str, data: dict):
# """保存爬取数据"""
# with data_lock:
# if account_name not in crawled_data:
# crawled_data[account_name] = []
# crawled_data[account_name].extend(data['data']['resultList'])
# save_to_redis(crawled_data)
def make_request(account):
"""发送请求"""
with data_lock:
page = account['current_page']
now_time = time.strftime('%Y-%m-%d', time.localtime())
params = {
"pageNo": str(page),
"pageSize": "100",
# "storeId": "15506231", # 店铺ID
"startTime": now_time + " 00:00:00",
"endTime": now_time + " 23:59:59"
}
cookies = load_cookies_from_json(account['cookie_file'])
headers = {
"authority": "pjsj.jddj.com",
"accept": "application/json, text/javascript, */*; q=0.01",
"accept-language": "zh-CN,zh;q=0.9",
# "cookie": "3AB9D23F7A4B3C9B=IR5JY5HVIONIPRVZ2XT6IADLJNT7YVYDN7KSOUASGPRJ7NAJBFGHWLVPMEZ2I7HO7ZXYPRPK2XOFRADO3VURDNIBOE; __jda=157302241.1748326760578484526793.1748326760.1748326760.1748326760.1; __jdv=157302241%7Cdirect%7C-%7Cnone%7C-%7C1748326760578; __jdc=157302241; mba_muid=1748326760578484526793; thor=D6CAE49174A1C2B1BFC448D24F0B356D54E1C3A1A62EF708482E83C31DB30F7A0D6E35AF8C74BE68709840C619A79DC59520F022F1C537DAA53D977DF761A73AF6D0083C726CFB395271DAF4D7C482A5302D1A77A17CC9F1F92A7E8185D119635AEEE05CAEC366757DA9A1C06725F830BA2A0C7714B045132D6DD372E3E517D3AEE9FACFD309A6117551CCEB6637A265; light_key=AASBKE7rOxgWQziEhC_QY6ya7oMtArydscb7k_M0HOxtksgRmje1Wh6GnZii5TXmpvFkJWd90SeBVGnpH3MQd0yRl-dW4Q; pin=Z%E4%BA%BA%E6%B0%91%E5%85%AC%E7%A4%BE%E8%80%81%E9%95%BF%E6%B2%99%E6%B9%98%E8%8F%9C; unick=rws31804m68239; lsp-store1.jd.local=3LXOEL6XD5JIX2ARP727WV7XV3ZJJBFMFQM4QPQK4H3WA2NN3LHO25Y5VAEJWZNLF36EPE6YY4JPPMMIB2L4FZ2RA4DC3OQPSKMMG66XKJFZ2ZYYEXN5AIFEQPR53P6LNPKVJFYG2NELNYAVTSBL4E4AOEJO6XA7ZMRDGFJN5BWH6LYPLWP5TYSTLFGS22D2QHJR7UY5QARLLLYGPVZE3ZSOVVVGWJGOHBIA4FKICSWUIXP23IOYXPXML3VQVF37FSJNEJT2P2VSI55JHJZ3KDXSDDW6OVYC77QPQ7MZ4KG54G65X6VYENKNHCEIMBE7ZVIFPMJ57IIVHYH5SFY7PNDXLTNWSBZRMWYMPBUMMXN7PZJO6475BIUZOPGMLVVRQIT4G54OHGJRQ3KKZCJZSNV57774F5ON4BJLKTBOH3OOMGVYLKSHTIARPJSBRTFPC7PUEGQWZW3YUSHGPF7V2T4TIAWASZJKTM7NJMA; user_email=Z%E4%BA%BA%E6%B0%91%E5%85%AC%E7%A4%BE%E8%80%81%E9%95%BF%E6%B2%99%E6%B9%98%E8%8F%9C; __jdb=157302241.3.1748326760578484526793|1.1748326760; 3AB9D23F7A4B3CSS=jdd03IR5JY5HVIONIPRVZ2XT6IADLJNT7YVYDN7KSOUASGPRJ7NAJBFGHWLVPMEZ2I7HO7ZXYPRPK2XOFRADO3VURDNIBOEAAAAMXCBR6FEAAAAAADIUPN47E6AWWHQX; josl-privilege1.jddj.com=3LXOEL6XD5JIX2ARP727WV7XV3ZJJBFMFQM4QPQK4H3WA2NN3LHO25Y5VAEJWZNLF36EPE6YY4JPPMMIB2L4FZ2RA4DC3OQPSKMMG66XKJFZ2ZYYEXN5AIFEQPR53P6LNPKVJFYG2NELNYAVTSBL4E4AOEJO6XA7ZMRDGFJN5BWH6LYPLWP5TYSTLFGS22D2QHJR7UY5QARLLLYGPVZE3ZSOVVVGWJGOHBIA4FPL4IDBM5RFMZT3TMHHZDAW273UDNJ6QCVCBDWVWMXLIQZE34ONYTW6OVYC77QPQ7MZ4KG54G65X6VYENKNHCEIMBE7ZVIFPMJ57IIVHYH5SFY7PNDXLTNWSBZRMWYMPBUMMXN7PZJO6475BIUZOPGMLVVRQIT4G54OHGJRQ3KKZCJZSNV57774F5ON4BJLKTBOH3OOMGVYLKSHTIARPJSBRTFPC7PUEGQWZW3YVENLP3724YREZAYCRBOXRPBOSII; josl-privilege1.jd.local=3LXOEL6XD5JIX2ARP727WV7XV3ZJJBFMFQM4QPQK4H3WA2NN3LHO25Y5VAEJWZNLF36EPE6YY4JPPMMIB2L4FZ2RA4DC3OQPSKMMG66XKJFZ2ZYYEXN5AIFEQPR53P6LNPKVJFYG2NELNYAVTSBL4E4AOEJO6XA7ZMRDGFJN5BWH6LYPLWP5TYSTLFGS22D2QHJR7UY5QARLLLYGPVZE3ZSOVVVGWJGOHBIA4FPL4IDBM5RFMZT3TMHHZDAW273UDNJ6QCVCBDWVWMXLIQZE34ONYTW6OVYC77QPQ7MZ4KG54G65X6VYENKNHCEIMBE7ZVIFPMJ57IIVHYH5SFY7PNDXLTNWSBZRMWYMPBUMMXN7PZJO6475BIUZOPGMLVVRQIT4G54OHGJRQ3KKZCJZSNV57774F5ON4BJLKTBOH3OOMGVYLKSHTIARPJSBRTFPC7PUEGQWZW3YVENLP3724YREZAYCRBOXRPBOSII; mba_sid=17483267605791887488231.1; flash=3_H47SrMyh5_73S8ThAeBvyL_lWo0CFt7RdspWOBxRRgQRi1yN5nLjFba5-pkiQMOvtspoou3_yPDtW-t98169gqz6ZbTb6eU7Z9sYqp1zwI3zpENNrIPiyWxu_yKIkieYvjZvqYSn9FQcGPVL5j8OR9Lr4n8HWnZ5RUvFbdaQcsAzkV**; __jd_ref_cls=takeoutHome_Menu; shop.o2o.jd.com1=3LXOEL6XD5JIX2ARP727WV7XV3ZJJBFMFQM4QPQK4H3WA2NN3LHO25Y5VAEJWZNLF36EPE6YY4JPPMMIB2L4FZ2RA4DC3OQPSKMMG66XKJFZ2ZYYEXN5AIFEQPR53P6LNPKVJFYG2NELNYAVTSBL4E4AOEJO6XA7ZMRDGFJN5BWH6LYPLWP5TYSTLFGS22D2QHJR7UY5QARLLLYGPVZE3ZSOVVVGWJGOHBIA4FKICSWUIXP23IOYXPXML3VQVF37FSJNEJT2P2VSI55JHJZ3KDXSDDW6OVYC77QPQ7MZ4KG54G65X6V6JNKWU65W4AHPTE6S5COCIPJISHSUR6NLDORGUPSNJKM2ZNQMFOU2SFQ6F6WUNO5GXDJSVUELLBJT; 
lsp-store1.jddj.com=3LXOEL6XD5JIX2ARP727WV7XV3ZJJBFMFQM4QPQK4H3WA2NN3LHO25Y5VAEJWZNLF36EPE6YY4JPPMMIB2L4FZ2RA4DC3OQPSKMMG66XKJFZ2ZYYEXN5AIFEQPR53P6LNPKVJFYG2NELNYAVTSBL4E4AOEJO6XA7ZMRDGFJN5BWH6LYPLWP5TYSTLFGS22D2QHJR7UY5QARLLLYGPVZE3ZSOVVVGWJGOHBIA4FKICSWUIXP23IOYXPXML3VQVF37FSJNEJT2P2VSI55JHJZ3KDXSDDW6OVYC77QPQ7MZ4KG54G65X6V6JNKWU65W4AHPTE6S5COCIPJISHSUR6NLDORGUPSNJKM2ZNQMFOU2SFQ6F6WUNO5GXDJSVUELLBJT; user_key=2474644; vender_id=1512295; vender_name=\\u4EBA\\u6C11\\u516C\\u793E\\u00B7\\u8001\\u957F\\u6C99\\u6E58\\u83DC",
"referer": "https://pjsj.jddj.com/resource/web/html/jdComment.html",
"sec-ch-ua": "\"Chromium\";v=\"106\", \"Microsoft Edge\";v=\"106\", \"Not;A=Brand\";v=\"99\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"Windows\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.47",
"x-requested-with": "XMLHttpRequest"
}
try:
print(f"账号 {account['name']} 正在请求第 {page} 页...")
response = requests.get(
BASE_URL,
params=params,
headers=headers,
cookies=cookies,
timeout=10
)
response.raise_for_status()
data = response.json()
# save_crawled_data(account['name'], data)
# print(data)
return account, data
except Exception as e:
print(f"账号 {account['name']} 请求失败: {e}")
return account, None
def process_data(account, data):
"""处理数据"""
value = data['data']['totalCount']
print(f"账号 {account['name']} 当前有: {value}条数据")
page = math.ceil(value / 100)
page = page if page >= 0 else 0
print(f"账号 {account['name']} 需要翻: {page}页")
if value > 100:
with data_lock:
account['current_page'] += 1
if account['current_page'] <= page:
print(f"账号 {account['name']} 获取第{account['current_page']}页的数据")
return False # 需要重新请求
    return True  # processing complete
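# Note: save_to_redis below keys each record as crawler_data:<guid>; it assumes every
# item in resultList carries a 'guid' field (not verified here).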
def save_to_redis(data):
"""将数据保存到redis"""
redis_key = f"crawler_data:{data['guid']}"
try:
redis_client.set(redis_key, json.dumps(data))
print(f"数据已保存到redis: {redis_key}")
except Exception as e:
print(f"保存到redis失败: {e}")
def data_analysis(data):
"""数据分析"""
data_list = data['data']['resultList']
if not data_list:
        print('No comment data yet')
return data_list
def worker():
"""工作线程函数"""
while True:
account = task_queue.get()
try:
account, data = make_request(account)
if data is not None:
data_list = data_analysis(data)
                # store each record in Redis individually
for datas in data_list:
save_to_redis(datas)
if data is not None and not process_data(account, data):
                task_queue.put(account)  # needs another request
except Exception as e:
print(f"账号 {account['name']} 处理异常: {e}")
finally:
task_queue.task_done()
def is_in_special_period():
"""判断是否在特殊时段"""
tz = pytz.timezone('Asia/Shanghai')
hour = datetime.now(tz).hour
return (11 <= hour < 14) or (17 <= hour < 20)
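# Scheduling cadence (per calculate_next_run below): roughly every 30 minutes during the
# peak periods above (11:00-14:00 and 17:00-20:00 Beijing time), otherwise once per hour
# on the hour.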
def calculate_next_run():
"""计算下次运行时间"""
now = datetime.now()
if is_in_special_period():
next_run = (now + timedelta(minutes=30)).replace(second=0, microsecond=0)
if now.minute >= 30:
next_run = next_run.replace(minute=0) + timedelta(hours=1)
else:
next_run = (now + timedelta(hours=1)).replace(minute=0, second=0, microsecond=0)
return next_run
def wait_until_next_run(next_run):
"""等待到下次运行时间"""
wait_seconds = (next_run - datetime.now()).total_seconds()
if wait_seconds > 0:
print(f"等待 {wait_seconds:.1f} 秒直到下次运行...")
time.sleep(wait_seconds)
def start_scheduler():
"""启动调度器"""
with ThreadPoolExecutor(max_workers=THREAD_POOL_SIZE) as executor:
for _ in range(THREAD_POOL_SIZE):
executor.submit(worker)
while True:
next_run = calculate_next_run()
print(f"\n开始新一轮请求,时间: {datetime.now()}")
for account in ACCOUNTS:
task_queue.put(account.copy())
task_queue.join()
wait_until_next_run(next_run)
# API endpoints
@app.get("/comments/{account_name}", response_model=List[CommentItem])
async def get_comments(account_name: str, limit: int = 10):
"""获取指定账号的评论数据"""
with data_lock:
if account_name not in crawled_data:
raise HTTPException(status_code=404, detail="Account not found")
return crawled_data[account_name][:limit]
@app.get("/start_crawl", response_model=CrawlResponse)
async def start_crawl():
"""手动触发爬取任务"""
for account in ACCOUNTS:
task_queue.put(account.copy())
return {
"status": "success",
"message": "Crawl tasks added to queue",
"data_count": sum(len(v) for v in crawled_data.values())
}
@app.get("/stats")
async def get_stats():
"""获取爬取统计信息"""
with data_lock:
return {
"accounts": [a['name'] for a in ACCOUNTS],
"data_counts": {k: len(v) for k, v in crawled_data.items()},
"last_updated": datetime.now().isoformat()
}
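# Example requests once the service is running (localhost:8000 assumed from the
# uvicorn call below; <account_name> is the name configured in ACCOUNTS):
#   curl "http://localhost:8000/start_crawl"
#   curl "http://localhost:8000/stats"
#   curl "http://localhost:8000/comments/<account_name>?limit=10"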
# Start the crawler thread on application startup
@app.on_event("startup")
async def startup_event():
"""应用启动时初始化"""
# 检查账号配置
if not ACCOUNTS:
        raise RuntimeError("No accounts configured!")
for account in ACCOUNTS:
if not os.path.exists(account['cookie_file']):
print(f"警告: 账号 {account['name']} 的Cookie文件 {account['cookie_file']} 不存在")
print(f"启动多线程爬虫,线程数: {THREAD_POOL_SIZE}")
threading.Thread(target=start_scheduler, daemon=True).start()
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
For the code above, without changing the code logic or the collection frequency, optimize the code structure with a modular design so that it also has good readability and extensibility.
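As a starting point for that refactor, here is a minimal sketch of one possible module split. The package layout and module names (config.py, storage.py, crawler.py, scheduler.py, models.py, api.py, main.py) are illustrative assumptions rather than anything prescribed by the code above; each module would simply receive the existing definitions unchanged, so the logic and the collection frequency stay exactly as they are.

# Hypothetical layout (names are assumptions, not taken from the original code):
#
#   jd_comment_crawler/
#       config.py     -> BASE_URL, ACCOUNTS, THREAD_POOL_SIZE, Redis settings
#       storage.py    -> redis_client, save_to_redis, crawled_data, data_lock
#       crawler.py    -> load_cookies_from_json, make_request, process_data, data_analysis
#       scheduler.py  -> task_queue, worker, is_in_special_period, calculate_next_run,
#                        wait_until_next_run, start_scheduler
#       models.py     -> CommentItem, CrawlResponse
#       api.py        -> FastAPI app plus the /comments, /start_crawl and /stats endpoints
#       main.py       -> startup hook and uvicorn entry point (sketched below)

# main.py -- wiring sketch; the imports assume the hypothetical modules listed above
import threading

import uvicorn

from api import app                 # the existing FastAPI instance and endpoints
from config import THREAD_POOL_SIZE
from scheduler import start_scheduler


@app.on_event("startup")
async def startup_event():
    """Start the scheduler in a daemon thread (the original account/cookie checks would live here too)."""
    print(f"Starting multi-threaded crawler, thread count: {THREAD_POOL_SIZE}")
    threading.Thread(target=start_scheduler, daemon=True).start()


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)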