Common Python Built-in Modules: collections (Day 18)

This article takes a closer look at Python's collections module, introducing useful container classes such as namedtuple, deque, OrderedDict, and Counter, and showing how they make everyday programming more efficient and keep data structures ordered.

Preface

It takes only 21 days of persistence to build a good habit. This is Day 18.

collections

collections is a built-in Python module that provides a number of useful container classes.

namedtuple

namedtuple is a factory function that creates a custom tuple subclass. It fixes the number of fields and lets you access each element by attribute name instead of by index.
The two snippets below represent a point by its coordinates, and a circle by its coordinates and radius:

# Example 1: a point given by its coordinates
from collections import namedtuple
Point = namedtuple('Point', ['x', 'y'])
p = Point(1, 2)
print(p.x)
# 1

# Example 2: a circle given by its coordinates and radius
from collections import namedtuple
Circle = namedtuple('Circle', ['x', 'y', 'r'])
c = Circle(1, 2, 3)
print(c.r)
# 3

isinstance(p, Point)
# True: p is an instance of Point
isinstance(p, tuple)
# True: p is also a tuple, because Point is a subclass of tuple
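
Every class generated by namedtuple also inherits a few helper methods, such as _asdict() and _replace(). A minimal sketch reusing the Point class above (the values are only illustrative):

from collections import namedtuple
Point = namedtuple('Point', ['x', 'y'])
p = Point(1, 2)
p._asdict()          # convert the named tuple to a dict keyed by field name
# {'x': 1, 'y': 2}   (an OrderedDict on Python versions before 3.8)
p._replace(x=10)     # tuples are immutable, so _replace() returns a new Point
# Point(x=10, y=2)
x, y = p             # unpacking works exactly like a plain tuple
# x == 1, y == 2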
deque

A list is stored as a contiguous sequence, so inserting or deleting elements near the front becomes slow when the list is large. deque is a double-ended queue that supports efficient insertion and deletion at both ends, which makes it well suited for implementing queues and stacks. Besides list's append() and pop(), deque also supports appendleft() and popleft().

from collections import deque
q = deque(['a', 'b', 'c'])
q.append('x')        # add to the right end
q.appendleft('y')    # add to the left end
q
# deque(['y', 'a', 'b', 'c', 'x'])
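
A short sketch of the queue and stack usage mentioned above, continuing from the deque just built; the bounded deque at the end (variable name recent is just an example) shows that a deque with maxlen silently discards items from the opposite end:

from collections import deque
q = deque(['y', 'a', 'b', 'c', 'x'])
q.popleft()               # 'y'  -- remove from the left end: FIFO, queue behaviour
q.pop()                   # 'x'  -- remove from the right end: LIFO, stack behaviour
q
# deque(['a', 'b', 'c'])
recent = deque(maxlen=3)  # a bounded deque keeps only the last 3 items
for i in range(5):
    recent.append(i)
recent
# deque([2, 3, 4], maxlen=3)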
OrderedDict

When defining and using an ordinary dict, the keys were traditionally unordered (this is true before Python 3.7; from 3.7 on, a regular dict also preserves insertion order). To guarantee that keys stay in insertion order, you can use OrderedDict.

from collections import OrderedDict
d = dict([('a', 1), ('b', 2), ('c', 3)])
d
# {'a': 1, 'b': 2, 'c': 3}  (key order is not guaranteed before Python 3.7)
od = OrderedDict([('a', 1), ('b', 2), ('c', 3)])
od
# OrderedDict([('a', 1), ('b', 2), ('c', 3)])  (keys stay in insertion order)

OrderedDict keeps its keys in insertion order; it does not sort by the keys themselves:

od = OrderedDict()
od['z'] = 1
od['y'] = 2
od['x'] = 3
list(od.keys())  # returned in the order the keys were inserted
# ['z', 'y', 'x']
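
Because it remembers insertion order, OrderedDict also offers order-aware operations such as move_to_end() and popitem(last=False). A minimal sketch reusing the od above:

from collections import OrderedDict
od = OrderedDict([('z', 1), ('y', 2), ('x', 3)])
od.move_to_end('z')      # move 'z' to the rightmost (most recently inserted) position
list(od.keys())
# ['y', 'x', 'z']
od.popitem(last=False)   # pop from the left end, i.e. the oldest entry (FIFO)
# ('y', 2)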
Counter

Counter is a simple counter. For example, to count how many times each character appears in a string:

from collections import Counter
c = Counter()
for ch in 'programming':
    c[ch] = c[ch] + 1    # increment the count for this character
c
#Counter({'g': 2, 'm': 2, 'r': 2, 'a': 1, 'i': 1, 'o': 1, 'n': 1, 'p': 1})
c.update('hello')
c
#Counter({'r': 2, 'o': 2, 'g': 2, 'm': 2, 'l': 2, 'p': 1, 'a': 1, 'i': 1, 'n': 1, 'h': 1, 'e': 1})

As the output shows, Counter is in fact a subclass of dict, and the result above gives the number of times each character appears.
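
Counter also provides counting-specific helpers such as most_common(), and counters can be combined arithmetically. A short sketch continuing the example above (the exact ordering of equal counts can differ between Python versions):

from collections import Counter
c = Counter('programming')
c.most_common(2)                # the most frequent characters first
# e.g. [('r', 2), ('g', 2)]
total = c + Counter('hello')    # counts are summed element-wise
total['o']
# 2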
