import os
import math
from datetime import datetime, date
from openpyxl import load_workbook, Workbook
from openpyxl.drawing.image import Image as OpenpyxlImage
from openpyxl.styles import Font
# ================== 配置路径 ==================
folder_path = r'D:\Class\能管比对\导出文件_月'
folder_path2 = r'D:\Class\能管比对\导出文件合并'
output_result = r'D:\Class\能管比对\每日结果\高变异列_CV大于0.3_整列分析.xlsx'
# 创建输出目录
os.makedirs(os.path.dirname(output_result), exist_ok=True)
# 获取今天零点时间用于过滤
today = datetime.combine(date.today(), datetime.min.time())
# ========== 第一阶段:清洗数据并保存到 folder_path2 ==========
processed_count = 0
for filename in os.listdir(folder_path):
if not filename.endswith('.xlsx') or filename.startswith('~$'):
continue
src_file = os.path.join(folder_path, filename)
dst_file = os.path.join(folder_path2, filename)
try:
wb = load_workbook(src_file)
ws = wb.active
data = [list(row) for row in ws.iter_rows(values_only=True)]
if len(data) <= 1:
continue
header, rows = data[0], data[1:]
cleaned_rows = [header] # 保留表头
for row in rows:
if not row:
continue
cell_val = row[0]
try:
# 统一解析为 datetime 类型
if isinstance(cell_val, datetime):
dt = cell_val
else:
# 处理字符串格式如 '2025/4/1' 或 '2025-04-01'
str_val = str(cell_val).strip()
dt = datetime.fromisoformat(str_val.replace('/', '-'))
if dt >= today:
continue # 排除今天及以后的数据
except Exception:
continue # 解析失败则跳过该行
cleaned_rows.append(row)
# 保存清洗后的数据
new_wb = Workbook()
new_ws = new_wb.active
for r in cleaned_rows:
new_ws.append(r)
new_wb.save(dst_file)
processed_count += 1
except Exception as e:
print(f"⚠️ 处理 {filename} 时出错:{e}")
print(f"✅ 共清洗并保存了 {processed_count} 个文件")
# ========== 第二阶段:分析 CV > 0.3 的列,并记录图像来源 ==========
results = []
image_sources = []
for filename in os.listdir(folder_path2):
if not filename.endswith('.xlsx') or filename.startswith('~$'):
continue
file_path = os.path.join(folder_path2, filename)
filename_no_ext = os.path.splitext(filename)[0]
png_path = os.path.join(folder_path2, f"{filename_no_ext}.png")
try:
wb = load_workbook(file_path)
data = [list(row) for row in wb.active.iter_rows(values_only=True)]
if len(data) < 2:
continue
header, rows = data[0], data[1:]
# 提取各列数值数据
col_data = [[] for _ in header]
for row in rows:
for i, v in enumerate(row):
try:
col_data[i].append(float(v))
except (TypeError, ValueError):
pass # 非数值跳过
# 分析第2列开始(索引1起)的变异系数
for idx, col_name in enumerate(header[1:], start=1):
vals = col_data[idx]
n = len(vals)
if n < 2:
continue
mean_val = sum(vals) / n
if abs(mean_val) < 1e-8: # 避免除以0
continue
variance = sum((x - mean_val) ** 2 for x in vals) / n
std_val = math.sqrt(variance)
cv = std_val / mean_val
if cv > 0.3:
max_val = max(vals)
min_val = min(vals)
# 构建待添加项
item = {
'来源文件': filename_no_ext,
'列名': col_name,
'有效数据点数': n,
'均值': round(mean_val, 6),
'标准差': round(std_val, 6),
'变异系数(CV)': round(cv, 6),
'最大值': round(max_val, 6),
'最小值': round(min_val, 6)
}
# === 特殊列白名单过滤逻辑 ===
# 替换原来的 if 判断
col_name_clean = str(col_name).strip() # 确保转字符串并去空格
if (
('1st DI耗用量' in col_name_clean and max_val <= 1500)
# ('CCoolingTower用水量' in col_name_clean and max_val <= 15000) or
# ('低温热水系统' in col_name_clean and max_val <= 300000)
):
continue # 跳过这些“伪异常”
results.append(item)
image_sources.append({
'source_file': filename,
'png_path': png_path,
'exists': os.path.exists(png_path)
})
except Exception as e:
print(f"⚠️ 分析 {filename} 时出错:{e}")
# ========== 第三阶段:生成最终结果 Excel ==========
if results:
result_wb = Workbook()
sheet1 = result_wb.active
sheet1.title = "高变异列数据"
sheet1.append(list(results[0].keys()))
for r in results:
sheet1.append(list(r.values()))
# 第二个工作表:展示图表
sheet2 = result_wb.create_sheet("对应图表展示")
sheet2.cell(row=1, column=1, value="来源文件")
sheet2.cell(row=1, column=2, value="图表展示")
current_row = 2
inserted_images_count = 0
seen_files = set() # 去重,避免重复插入同一文件图表
for item in image_sources:
source_file = item['source_file']
if source_file in seen_files:
continue
seen_files.add(source_file)
# 写入来源文件名(A列)
sheet2.cell(row=current_row, column=1, value=os.path.splitext(source_file)[0])
# 插入图片或错误提示(B列)
if item['exists']:
try:
img = OpenpyxlImage(item['png_path'])
img.width *= 0.5 # 缩放图片防止过大
img.height *= 0.5
sheet2.add_image(img, f'B{current_row}')
inserted_images_count += 1
height_in_rows = int(img.height / 15) + 3 # 动态调整行高
except Exception as e:
sheet2.cell(row=current_row, column=2, value=f"❌ 加载失败: {e}")
sheet2.cell(row=current_row, column=2).font = Font(color="FF0000", size=9)
height_in_rows = 5
else:
sheet2.cell(row=current_row, column=2, value="❌ 图像未找到")
sheet2.cell(row=current_row, column=2).font = Font(italic=True, color="888888", size=9)
height_in_rows = 5
# 设置当前区域行高
for i in range(current_row, current_row + height_in_rows):
sheet2.row_dimensions[i].height = 15
current_row += height_in_rows
# 设置列宽
sheet2.column_dimensions['A'].width = 50
sheet2.column_dimensions['B'].width = 80
# 保存最终结果
result_wb.save(output_result)
print(f"\n✅ 分析完成!共 {len(results)} 个高变异列已保存至:\n {output_result}")
print(f"📊 共尝试插入 {len(seen_files)} 张图像,成功插入 {inserted_images_count} 张。")
else:
print("\n🟡 未发现变异系数大于 0.3 的列。")
# ========== 第四阶段:后处理 - 按关键词+最大值条件过滤并删除对应行与图表 ==========
import os
from openpyxl.styles import Font
# 输出路径
if not os.path.exists(output_result):
print(f"❌ 后处理失败:未找到输出文件 {output_result}")
else:
from openpyxl.utils import coordinate_from_string, column_index_from_string
wb = load_workbook(output_result)
sheet1 = wb["高变异列数据"]
sheet2 = wb["对应图表展示"]
# 定义关键词与阈值映射关系:(keyword -> threshold)
keyword_threshold_map = {
'纯水曲线看板-CCoolingTower用水量(中水+自来水': 1500,
'空调-6-冷热水-冷却水系统': 3500,
# 可继续扩展:'其他关键词': 1000
}
# 获取表头以定位列索引
headers = [cell.value for cell in sheet1[1]]
try:
max_col_idx = headers.index('最大值') + 1 # openpyxl 列从1开始
source_col_idx = headers.index('来源文件') + 1
except ValueError as e:
print(f"❌ 表头缺失: {e}")
raise
# 收集需要删除的“来源文件”名称(用于匹配 sheet2)
files_to_remove = set()
# 第一步:扫描 sheet1,标记要删除的行
rows_to_delete = [] # 存储行号(逆序删除)
for row_idx, row in enumerate(sheet1.iter_rows(min_row=2, max_row=sheet1.max_row), start=2):
source_cell = row[source_col_idx - 1]
max_cell = row[max_col_idx - 1]
source_value = str(source_cell.value) if source_cell.value else ""
try:
max_value = float(max_cell.value)
except (TypeError, ValueError):
max_value = float('inf') # 非数字视为不满足条件
# 检查是否匹配任一关键词 + 数值条件
for keyword, threshold in keyword_threshold_map.items():
if keyword in source_value and max_value < threshold:
rows_to_delete.append(row_idx)
files_to_remove.add(os.path.splitext(str(source_value))[0]) # 去掉扩展名
break # 匹配一个即可
# 第二步:从下往上删除 sheet1 的行(避免索引错乱)
for row_idx in sorted(rows_to_delete, reverse=True):
sheet1.delete_rows(row_idx)
print(f"🗑️ 已删除 {len(rows_to_delete)} 行不符合条件的数据")
# 第三步:清理 sheet2 中对应的图表块
current_row = 2
deleted_image_blocks = 0
while current_row <= sheet2.max_row:
cell_a = sheet2.cell(row=current_row, column=1)
if cell_a.value and str(cell_a.value) in files_to_remove:
# 找到该区块的高度(查找下一个非空A列之前有多少行)
block_height = 1
next_check_row = current_row + 1
while next_check_row <= sheet2.max_row:
next_cell_a = sheet2.cell(row=next_check_row, column=1)
if next_cell_a.value is not None:
break
block_height += 1
# 删除整个 block 占据的行
sheet2.delete_rows(current_row, block_height)
deleted_image_blocks += 1
# 注意:current_row 不变,因为删除后新内容已上移
else:
current_row += 1
print(f"🖼️ 已删除 {deleted_image_blocks} 个对应的图表区块")
# 保存最终文件
wb.save(output_result)
print(f"✅ 清理完成:符合条件的数据及图表已删除 → {output_result}")
——"D:\Program Files\Python39\python.exe" C:\Users\E918928\PycharmProjects\pythonProject\LZRR\代码1、\能管检查\数据检测.py
✅ 共清洗并保存了 30 个文件
✅ 分析完成!共 16 个高变异列已保存至:
D:\Class\能管比对\每日结果\高变异列_CV大于0.3_整列分析.xlsx
📊 共尝试插入 13 张图像,成功插入 13 张。
Traceback (most recent call last):
File "C:\Users\E918928\PycharmProjects\pythonProject\LZRR\代码1、\能管检查\数据检测.py", line 427, in <module>
from openpyxl.utils import coordinate_from_string, column_index_from_string
ImportError: cannot import name 'coordinate_from_string' from 'openpyxl.utils' (D:\Program Files\Python39\lib\site-packages\openpyxl\utils\__init__.py)
Process finished with exit code 1
最新发布