#!/usr/bin/python3
# -*- coding:utf-8 -*-
"""
@author:
@file: async_yuyi_comment.py
@time: 2025/8/8 11:36
@desc: asynchronously page Yuyi VOC comment-check results and store them in MySQL
"""
import os
import json
import logging
import asyncio
import aiohttp
import datetime
from configparser import ConfigParser
from sqlalchemy import create_engine, text
from math import ceil
config = ConfigParser()
current_path = os.path.dirname(os.path.abspath(__file__))
project_path = os.path.dirname(current_path)
config.read(os.path.join(project_path, 'config.ini'))
db_link1 = dict(config.items("link_info1"))
engine = create_engine(f'mysql+pymysql://{db_link1["user"]}:{db_link1["password"]}@{db_link1["host"]}:{db_link1["port"]}/{db_link1["database"]}?charset=utf8mb4')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class AsyncPager:
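    """Concurrently pages comment-check results from the Yuyi OpenAPI and stores them in MySQL."""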
def __init__(self, start_date, end_date, max_concurrency=5, max_retries=3):
self.queue = asyncio.Queue()
self.service_host = 'openapi.yuyidata.com'
self.APP_KEY = "111"
self.semaphore = asyncio.Semaphore(max_concurrency)
self.start_date = start_date
self.end_date = end_date
self.max_retries = max_retries
self.session = None
self.limit = 100
self.headers = {"Content-Type": "application/json;charset=UTF-8"}
async def fetch(self, offset, time_type=1):
async with self.semaphore:
for retry in range(self.max_retries):
                url = f'https://{self.service_host}/openapi/v4/comment/check/result'
data = {
'appKey': self.APP_KEY,
'startDate': self.start_date.strftime('%Y-%m-%d') + ' 00:00:00',
'endDate': self.end_date.strftime('%Y-%m-%d') + ' 23:59:59',
                    'timeType': time_type,  # 0 = query by comment time, 1 = query by last-modified time
'offset': offset * self.limit,
'limit': self.limit,
}
logging.info(data)
try:
                    async with self.session.post(url, json=data, timeout=aiohttp.ClientTimeout(total=10)) as response:
result_data = await response.json()
if response.status == 200 and result_data.get('code') == 200:
return result_data
                        await asyncio.sleep(2 ** retry)  # exponential backoff before retrying
                except (aiohttp.ClientError, asyncio.TimeoutError) as e:
                    logging.error(f"{url}: {str(e)}")
                    await asyncio.sleep(1)
                    logging.warning(f'retry {retry + 1}/{self.max_retries} for offset {offset}')
            logging.error(f'fetch exhausted {self.max_retries} retries for offset {offset}; returning None')
            return None
    @staticmethod
    def insert_database(comment_need_data, label_need_data):
        # REPLACE INTO upserts the comment rows; labels are refreshed per comment (delete, then re-insert).
        sql = 'replace into rpa_yuyi_voc_comment_text (open_id,comment_type,main_comment_id,time,source,shop_name,reviewer_name,source_sku_spec_brand,is_competing_product,source_sku_spec_class_name,source_sku_spec_spec,source_sku_spec_color,source_sku_name,source_product_url,source_product_id,source_sku_id,order_id,sub_order_id,is_effective_evaluation,comment_abstract,comment_length,reply_content,comment_accessory_type,pic_list,video_list,rating,yuyi_sku_name) values (:open_id,:comment_type,:main_comment_id,:time,:source,:shop_name,:reviewer_name,:source_sku_spec_brand,:is_competing_product,:source_sku_spec_class_name,:source_sku_spec_spec,:source_sku_spec_color,:source_sku_name,:source_product_url,:source_product_id,:source_sku_id,:order_id,:sub_order_id,:is_effective_evaluation,:comment_abstract,:comment_length,:reply_content,:comment_accessory_type,:pic_list,:video_list,:rating,:yuyi_sku_name)'
        label_sql = 'insert into rpa_yuyi_voc_comment_label_text (open_id,dimension,id,name,name_en,path,path_en) values (:open_id,:dimension,:id,:name,:name_en,:path,:path_en)'
        del_label_sql = 'delete from rpa_yuyi_voc_comment_label_text where open_id = :open_id'
        try:
            with engine.connect() as conn:
                if comment_need_data:  # skip empty batches; an empty executemany is not worth a round trip
                    conn.execute(text(sql), comment_need_data)
                for o in comment_need_data:
                    conn.execute(text(del_label_sql), {'open_id': o["open_id"]})
                if label_need_data:
                    conn.execute(text(label_sql), label_need_data)
                conn.commit()
        except Exception:
            # Log the full traceback and re-raise, so callers see the real database error.
            logging.exception('insert_database failed')
            raise
async def parse(self, page_num, result_data):
comment_need_data = []
label_need_data = []
for data in result_data.get("data", []):
source_sku_spec_brand = None
source_sku_spec_spec = None
source_sku_spec_color = None
source_sku_spec_class_name = None
is_competing_product = '否'
is_effective_evaluation = None
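            # sourceSkuSpec arrives as a JSON string whose keys are Chinese:
            # 品牌 = brand, 型号 = model, 商品品类 = product category, 颜色 = color.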
if data.get('sourceSkuSpec') is not None:
source_sku_spec = json.loads(data.get('sourceSkuSpec'))
source_sku_spec_brand = source_sku_spec.get('品牌')
source_sku_spec_spec = source_sku_spec.get('型号')
source_sku_spec_class_name = source_sku_spec.get('商品品类')
source_sku_spec_color = source_sku_spec.get('颜色')
if source_sku_spec_brand in ['usmile']:
is_competing_product = '是'
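            # labels is two-level: each entry carries a dimension plus its list of leaf labels.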
for label1 in data.get('labels', []):
for label2 in label1.get('labels', []):
if label1.get('dimension') == '是否有效':
is_effective_evaluation = label2.get('name')
label_need_data.append({
'open_id': data.get('openId'),
'dimension': label1.get('dimension'),
'id': label2.get('id'),
'name': label2.get('name'),
'name_en': label2.get('nameEn'),
'path': label2.get('path'),
'path_en': label2.get('pathEn'),
})
comment_need_data.append({
'open_id': data.get('openId'),
'comment_type': data.get('commentType'),
'main_comment_id': data.get('mainCommentId'),
'time': data.get('time'),
'source': data.get('source'),
'shop_name': data.get('shopName'),
'reviewer_name': data.get('reviewerName'),
'source_sku_spec_brand': source_sku_spec_brand,
'is_competing_product': is_competing_product,
'source_sku_spec_class_name': source_sku_spec_class_name,
'source_sku_spec_spec': source_sku_spec_spec,
'source_sku_spec_color': source_sku_spec_color,
'source_sku_name': data.get('sourceSkuName'),
'source_product_url': data.get('sourceProductUrl'),
'source_product_id': data.get('sourceProductId'),
'source_sku_id': data.get('sourceSkuId'),
'order_id': data.get('orderId'),
'sub_order_id': data.get('subOrderId'),
'is_effective_evaluation': is_effective_evaluation,
'comment_abstract': data.get('commentAbstract'),
'comment_length': data.get('commentLength'),
'reply_content': data.get('replyContent'),
'comment_accessory_type': data.get('commentAccessoryType'),
                'pic_list': ','.join(data.get('picList') or []),  # `or []` guards against explicit nulls in the payload
                'video_list': ','.join(data.get('videoList') or []),
'rating': data.get('rating'),
                'yuyi_sku_name': data.get('yuyiSkuName')
})
        if page_num == 0:
            total_pages = ceil(result_data.get('total', 0) / self.limit)
            logging.info(f'total pages: {total_pages}')
            # Page 0 is being handled right now; enqueue the remaining pages for the workers.
            for p in range(1, total_pages):
                await self.queue.put(p)
return comment_need_data, label_need_data
    async def worker(self, worker_id):
        while True:
            page_num = await self.queue.get()
            if page_num is None:  # sentinel: shut this worker down
                self.queue.task_done()
                break
            try:
                result_data = await self.fetch(page_num, 0)  # 0 = query by comment time
                if result_data is None:
                    logging.error(f'page {page_num}: fetch failed after all retries, skipping')
                    continue
                comment_need_data, label_need_data = await self.parse(page_num, result_data)
                self.insert_database(comment_need_data, label_need_data)
                logging.info(f'worker {worker_id}, page {page_num} -- done')
            except Exception:
                # Log and keep the worker alive: an uncaught exception would end this
                # coroutine silently, leaving queue.join() waiting forever on the pages
                # nobody is left to process.
                logging.exception(f'worker {worker_id}, page {page_num} failed')
            finally:
                self.queue.task_done()
async def run(self):
async with aiohttp.ClientSession(headers=self.headers) as session:
self.session = session
            await self.queue.put(0)  # seed task: page 0 also discovers the total page count
            # Ten workers drain the queue; the semaphore still caps concurrent HTTP requests.
            workers = [asyncio.create_task(self.worker(i)) for i in range(10)]
            await self.queue.join()
            # One sentinel per worker so every worker loop exits cleanly.
            for _ in range(len(workers)):
                await self.queue.put(None)
            await asyncio.gather(*workers)
if __name__ == "__main__":
    now = datetime.datetime.now()
    # Backfill one day per run for the past 221 days.
    for dd in range(1, 222):
        s_date = now - datetime.timedelta(days=dd)
        e_date = now - datetime.timedelta(days=dd)
        # s_date = datetime.datetime(2025, 8, 1)
        # e_date = datetime.datetime(2025, 8, 1)
        print('current date', s_date)
        crawler = AsyncPager(s_date, e_date)
        asyncio.run(crawler.run())
Question: why does conn.execute(text(sql), comment_need_data) in this script fail without any error message, and just hang instead?
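Answer: three things in the original version combined to hide the error.

1. The REPLACE statement was malformed: its column list ended in the bind marker :yuyi_sku_name instead of the column name yuyi_sku_name, so the execute raised on every call (corrected in the listing above).
2. insert_database caught the failure with a bare except and re-raised ValueError('111'), throwing away the real SQLAlchemy message.
3. worker used try/finally with no except clause. The finally still ran (task_done() fired for that one page), but the exception then ended the worker's while loop, and the worker task finished with the exception stored inside it. asyncio only surfaces a stored exception when the task is awaited, and run() only reaches await asyncio.gather(*workers) after await self.queue.join() returns. Once all ten workers had died with unprocessed pages still in the queue, join() could never return, so the stored exceptions were never raised anywhere. Net effect: no traceback, just a hang.

The listing above now logs inside the worker and re-raises the real database error, which removes the silent-hang path. If you additionally want run() to fail fast the moment a worker dies, one option is to race queue.join() against the worker tasks. A minimal sketch under that assumption; run_with_crash_detection is a hypothetical helper, not part of the Yuyi script:

import asyncio

async def run_with_crash_detection(queue, workers):
    # Wrap join() in a task so it can be raced against the worker tasks.
    join_task = asyncio.ensure_future(queue.join())
    # FIRST_COMPLETED wakes us when either all queued work is done or some
    # worker task finishes early; with a while-True worker, finishing early
    # means it crashed.
    done, _ = await asyncio.wait([join_task, *workers], return_when=asyncio.FIRST_COMPLETED)
    for task in done:
        if task is not join_task and task.exception() is not None:
            join_task.cancel()  # stop waiting on work that will never finish
            raise task.exception()  # surface the worker's stored exception now

Called in place of await self.queue.join() inside run(), this raises the first worker failure immediately instead of hanging.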