前提说明:
1. 有N套用 python 爬取客户不同网站上的评论的程序
2. 有一套用 python 将评论出成 Excel 报表的程序
新需求:
做一个总入口,可以异步并发地爬取不同网站的评论,或同时生成不同月份的报表,也可以让爬虫任务和报表任务同时运行。
# -*- encoding: utf-8 -*-
"""
@File : multiple_process_test.py
@Create Time : 2025/05/14 14:09
@Last Modify Time : 2025/05/14 14:09
@Version : v1.0
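@Description : Single entry point that runs the existing spider/report scripts concurrently as subprocesses.
Example commands it is meant to launch: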
/usr/bin/php /opt/pia-api-dl/run.php -o check_count -p all -t all -d 15
python /path/to/run_spider.py client_code1 platform1
python /path/to/run_spider.py client_code2 platform2
python /path/to/run_report.py 2024-05 client_code1 keyword1
python /path/to/run_report.py 2024-05 client_code2 keyword2
"""
import asyncio
from datetime import datetime
from dataclasses import dataclass
@dataclass
class AsyncCommandTask:
    """Description of one external command to run as an asynchronous subprocess.

    The command line is built from ``executor``, ``script`` and ``args`` in
    that order. ``task_id`` is only a label for log lines and may be omitted.
    """

    # Interpreter or binary to launch, e.g. "/usr/bin/php" or "python".
    executor: str
    # Script passed to the interpreter, e.g. "/opt/pia-api-dl/run.php"
    # or "/path/to/run_spider.py".
    script: str
    # Remaining command-line arguments, e.g. ["client_code1", "platform1"] or
    # ["-o", "check_count", "-p", "all", "-t", "all", "-d", "15"].
    args: list
    # Label used to tag log output; optional (derived from the command when empty).
    task_id: str = ""

    def __post_init__(self):
        # An empty label would make interleaved log lines from different
        # tasks indistinguishable, so fall back to the script plus its args.
        if not self.task_id:
            self.task_id = " ".join([self.script, *map(str, self.args)])
async def async_execute_command(task: "AsyncCommandTask", semaphore: asyncio.Semaphore):
    """Run one command as a subprocess, echo its output live, and report the outcome.

    Args:
        task: Command description; ``executor``, ``script``, ``args`` and
            ``task_id`` are read from it.
        semaphore: Shared limiter; the whole run (start to end log line)
            happens while holding it.

    Returns:
        A dict with the keys ``task_id``, ``command`` (the argv joined by
        spaces), ``status`` ("SUCCESS" when the exit code is 0, otherwise
        "FAILED") and ``return_code`` (``None`` when the process could not
        be started).
    """

    def timestamp():
        """Return the current local time formatted for log lines."""
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    async def log_stream(stream, prefix):
        """Echo *stream* line by line, tagged with the task id, until EOF."""
        while True:
            line = await stream.readline()
            if not line:
                break
            # Child output is not guaranteed to be UTF-8 (for example a
            # console code page on Windows); a strict decode would raise and
            # abort the task, so undecodable bytes are replaced instead.
            text = line.decode(errors="replace").strip()
            print(f"[{task.task_id}] {prefix}: {text}")

    async with semaphore:
        print(f"{timestamp()} [Start] Task {task.task_id}")

        # Full argv: interpreter, script, then the script's own arguments.
        cmd = [task.executor, task.script, *task.args]
        return_code = None

        try:
            proc = await asyncio.create_subprocess_exec(
                *cmd,
                stdout=asyncio.subprocess.PIPE,
                stderr=asyncio.subprocess.PIPE,
            )
        except OSError as exc:
            # Missing or non-executable program: report it as a failed task
            # instead of letting the error abort every other task.
            print(f"[{task.task_id}] ERROR: failed to start process: {exc}")
        else:
            try:
                # Drain stdout and stderr concurrently so neither pipe fills
                # up and blocks the child.
                await asyncio.gather(
                    log_stream(proc.stdout, "INFO"),
                    log_stream(proc.stderr, "ERROR"),
                )
                return_code = await proc.wait()
            finally:
                # Reached with a live child only when streaming was
                # interrupted (error or cancellation): do not leave it running.
                if proc.returncode is None:
                    try:
                        proc.kill()
                    except ProcessLookupError:
                        # The child exited between the check and the kill.
                        pass
                    await proc.wait()

        print(f"{timestamp()} [End] Task {task.task_id}")

        return {
            "task_id": task.task_id,
            "command": " ".join(cmd),
            "status": "SUCCESS" if return_code == 0 else "FAILED",
            "return_code": return_code,
        }
def parse_raw_command(raw_command: str, task_id: str) -> AsyncCommandTask:
    """Build an AsyncCommandTask from a whitespace-separated command line.

    The first token becomes the executor, the second the script, and every
    remaining token an argument. Tokens are split on whitespace only, so
    quoting is not interpreted.
    """
    # split() without a separator already ignores leading/trailing whitespace.
    tokens = raw_command.split()
    interpreter = tokens[0]
    target_script = tokens[1]
    extra_args = tokens[2:]
    return AsyncCommandTask(
        task_id=task_id,
        executor=interpreter,
        script=target_script,
        args=extra_args,
    )
async def main():
    """Run every configured command concurrently and print a summary report.

    At most two commands run at the same time. A task that raises is listed
    as FAILED in the summary instead of discarding the other tasks' results.
    """
    # Each entry is (raw command line, task id). PHP and Python commands can
    # be mixed, e.g.
    #   ('/usr/bin/php /opt/pia-api-dl/run.php -o check_count -p all -t all -d 15',
    #    "php_check_count")
    original_commands = [
        (
            'python run_report.py 2025-01 donki DonQuijote',
            "donki_DonQuijote",
        ),
        (
            'python run_report.py 2025-02 donki 唐吉诃德',
            "donki_唐吉诃德",
        ),
        (
            'python run_report.py 2025-03 tokkyu 日本东急百货店',
            "tokkyu_日本东急百货店",
        ),
    ]

    # Convert the raw command lines into AsyncCommandTask objects.
    # Tasks may also be constructed directly, e.g.
    #   AsyncCommandTask(executor="python", script="/path/to/run_spider.py",
    #                    args=["client_code1", "platform1"], task_id="spider-client1")
    tasks = [parse_raw_command(cmd, tid) for cmd, tid in original_commands]

    # Global concurrency cap (tune to the available system resources).
    semaphore = asyncio.Semaphore(2)

    # return_exceptions=True keeps one failing task from aborting the batch
    # and losing the results of the tasks that did finish.
    results = await asyncio.gather(
        *(async_execute_command(task, semaphore) for task in tasks),
        return_exceptions=True,
    )

    # Summary report, in the same order as the task list.
    print("\n===== 任务执行汇总 =====")
    for task, outcome in zip(tasks, results):
        if isinstance(outcome, BaseException):
            command = " ".join([task.executor, task.script, *task.args])
            print(
                f"任务ID: {task.task_id}\n"
                f"命令: {command}\n"
                f"状态: FAILED (Code: None)\n"
                f"错误: {outcome!r}\n"
                "----------------------"
            )
            continue
        print(
            f"任务ID: {outcome['task_id']}\n"
            f"命令: {outcome['command']}\n"
            f"状态: {outcome['status']} (Code: {outcome['return_code']})\n"
            "----------------------"
        )


# Script entry point: run the batch only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
D:\tool\python3.8.5\python.exe D:\www\pia_download_crawler\xhs_review_report\multiple_process_test.py
2025-05-14 15:53:50 [Start] Task donki_DonQuijote
2025-05-14 15:53:50 [Start] Task donki_唐吉诃德
[donki_DonQuijote] INFO: 2025-05-14 15:53:51,342 INFO : ========== Task Start!!! ==========
[donki_DonQuijote] INFO: 2025-05-14 15:53:51,343 INFO : month: 2025-01
[donki_DonQuijote] INFO: 2025-05-14 15:53:51,343 INFO : client: donki
[donki_DonQuijote] INFO: 2025-05-14 15:53:51,344 INFO : keywords: DonQuijote
[donki_DonQuijote] INFO: 2025-05-14 15:53:51,344 INFO :
[donki_DonQuijote] INFO: start_date: 2025-01-01
[donki_DonQuijote] INFO: end_date: 2025-01-31
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,346 INFO : ========== Task Start!!! ==========
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,346 INFO : month: 2025-02
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,346 INFO : client: donki
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,347 INFO : keywords: 唐吉诃德
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,347 INFO :
[donki_唐吉诃德] INFO: start_date: 2025-02-01
[donki_唐吉诃德] INFO: end_date: 2025-02-28
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,761 WARNING : DB warning, get_report_data no record returned
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,966 ERROR : object of type 'NoneType' has no len()
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,966 ERROR : Traceback (most recent call last):
[donki_唐吉诃德] INFO: File "D:\www\pia_download_crawler\xhs_review_report\spider\redbook.py", line 86, in crawl
[donki_唐吉诃德] INFO: row_cnt = len(report_data)
[donki_唐吉诃德] INFO: TypeError: object of type 'NoneType' has no len()
[donki_唐吉诃德] INFO:
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,967 INFO : ========== Task End!!! ==========
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,967 INFO : run(2025-02, donki, 唐吉诃德)
[donki_唐吉诃德] INFO: 2025-05-14 15:53:51,967 INFO : The program executed for 1.03 seconds.
2025-05-14 15:53:52 [End] Task donki_唐吉诃德
2025-05-14 15:53:52 [Start] Task tokkyu_日本东急百货店
[donki_DonQuijote] INFO: row_idx:9, prev_feed_comment_cnt:7, merge_start_row:2, merge_end_row:8
[donki_DonQuijote] INFO: prev_feed_id:66acbb56fd86c2919bcdc418107f36ea, current_row['article_id']:32bbe0ee925e63272d60e262391d5a02
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:10, prev_feed_comment_cnt:1, merge_start_row:9, merge_end_row:9
[donki_DonQuijote] INFO: prev_feed_id:32bbe0ee925e63272d60e262391d5a02, current_row['article_id']:6a69de3745e836c212251e490642f71b
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:21, prev_feed_comment_cnt:11, merge_start_row:10, merge_end_row:20
[donki_DonQuijote] INFO: prev_feed_id:6a69de3745e836c212251e490642f71b, current_row['article_id']:8d8444d863e8c09c99d10bac865531f2
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:22, prev_feed_comment_cnt:1, merge_start_row:21, merge_end_row:21
[donki_DonQuijote] INFO: prev_feed_id:8d8444d863e8c09c99d10bac865531f2, current_row['article_id']:6a69de3745e836c212251e490642f71b
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:31, prev_feed_comment_cnt:9, merge_start_row:22, merge_end_row:30
[donki_DonQuijote] INFO: prev_feed_id:6a69de3745e836c212251e490642f71b, current_row['article_id']:8d8444d863e8c09c99d10bac865531f2
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:32, prev_feed_comment_cnt:1, merge_start_row:31, merge_end_row:31
[donki_DonQuijote] INFO: prev_feed_id:8d8444d863e8c09c99d10bac865531f2, current_row['article_id']:6a69de3745e836c212251e490642f71b
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:45, prev_feed_comment_cnt:13, merge_start_row:32, merge_end_row:44
[donki_DonQuijote] INFO: prev_feed_id:6a69de3745e836c212251e490642f71b, current_row['article_id']:8d8444d863e8c09c99d10bac865531f2
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:46, prev_feed_comment_cnt:1, merge_start_row:45, merge_end_row:45
[donki_DonQuijote] INFO: prev_feed_id:8d8444d863e8c09c99d10bac865531f2, current_row['article_id']:6a69de3745e836c212251e490642f71b
[donki_DonQuijote] INFO: --------------------
[donki_DonQuijote] INFO: row_idx:51, prev_feed_comment_cnt:5, merge_start_row:46, merge_end_row:51
[donki_DonQuijote] INFO: prev_feed_id:6a69de3745e836c212251e490642f71b, current_row['article_id']:6a69de3745e836c212251e490642f71b
[donki_DonQuijote] INFO: --------------------
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,229 INFO : ========== Task Start!!! ==========
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,229 INFO : month: 2025-03
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,229 INFO : client: tokkyu
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,229 INFO : keywords: 日本东急百货店
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,229 INFO :
[tokkyu_日本东急百货店] INFO: start_date: 2025-03-01
[tokkyu_日本东急百货店] INFO: end_date: 2025-03-31
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,355 WARNING : DB warning, get_report_data no record returned
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,588 ERROR : object of type 'NoneType' has no len()
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,589 ERROR : Traceback (most recent call last):
[tokkyu_日本东急百货店] INFO: File "D:\www\pia_download_crawler\xhs_review_report\spider\redbook.py", line 86, in crawl
[tokkyu_日本东急百货店] INFO: row_cnt = len(report_data)
[tokkyu_日本东急百货店] INFO: TypeError: object of type 'NoneType' has no len()
[tokkyu_日本东急百货店] INFO:
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,589 INFO : ========== Task End!!! ==========
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,589 INFO : run(2025-03, tokkyu, 日本东急百货店)
[tokkyu_日本东急百货店] INFO: 2025-05-14 15:53:53,589 INFO : The program executed for 0.78 seconds.
2025-05-14 15:53:53 [End] Task tokkyu_日本东急百货店
[donki_DonQuijote] INFO: 2025-05-14 15:53:55,324 INFO : The file has been created: D:/www/pia_download_crawler/xhs_review_report/report/output/【donki】月次レポート202501.xlsx
[donki_DonQuijote] INFO: 2025-05-14 15:53:55,492 INFO : ========== Task End!!! ==========
[donki_DonQuijote] INFO: 2025-05-14 15:53:55,492 INFO : run(2025-01, donki, DonQuijote)
[donki_DonQuijote] INFO: 2025-05-14 15:53:55,492 INFO : The program executed for 4.56 seconds.
2025-05-14 15:53:55 [End] Task donki_DonQuijote
===== 任务执行汇总 =====
任务ID: donki_DonQuijote
命令: python run_report.py 2025-01 donki DonQuijote
状态: SUCCESS (Code: 0)
----------------------
任务ID: donki_唐吉诃德
命令: python run_report.py 2025-02 donki 唐吉诃德
状态: SUCCESS (Code: 0)
----------------------
任务ID: tokkyu_日本东急百货店
命令: python run_report.py 2025-03 tokkyu 日本东急百货店
状态: SUCCESS (Code: 0)
----------------------
Process finished with exit code 0