方法主要函数介绍
openpyxl.load_workbook() 是 Python 中用于处理 Excel 文件的核心函数之一,属于 openpyxl 库(专门用于读写 Excel 2010+ xlsx/xlsm/xltx/xltm 文件的库)。
函数原型
openpyxl.load_workbook(
filename,
read_only=False,
keep_vba=False,
data_only=False,
keep_links=True,
rich_text=False
)
参数详解
| 参数 | 类型 | 默认值 | 说明 |
|---|---|---|---|
filename | str | 必填 | Excel 文件路径(支持相对/绝对路径) |
read_only | bool | False | 是否以只读模式打开(大文件优化) ✅ 优点:内存占用低 ❌ 缺点:不能修改/保存文件 |
keep_vba | bool | False | 是否保留 VBA 宏代码 (保存时需用 .xlsm 扩展名) |
data_only | bool | False | 是否读取单元格的计算结果(而非公式) 例: A1=SUM(B1:C1) → 若 data_only=True 则返回计算结果 |
keep_links | bool | True | 是否保留外部链接(如数据库连接) |
rich_text | bool | False | 是否保留富文本格式(如加粗/颜色) |
返回值
Workbook对象:代表整个 Excel 工作簿- 关键属性:
wb.sheetnames # 所有工作表名称列表 wb.active # 当前活动工作表 wb.properties # 文档元数据(作者/标题等) wb.worksheets # 所有工作表对象的列表
- 关键属性:
使用示例
基础用法
from openpyxl import load_workbook
# 加载工作簿
wb = load_workbook("sales_data.xlsx")
# 获取工作表
ws = wb["Q1_Report"] # 按名称获取
# 或
ws = wb.active # 获取活动工作表
# 读取单元格值
cell_value = ws["B2"].value
print(cell_value) # 输出: 15000
# 遍历行
for row in ws.iter_rows(min_row=2, values_only=True):
print(row) # 输出每行数据元组
# 关闭工作簿(释放内存)
wb.close()
高级场景
1. 只读模式处理大文件
wb = load_workbook("big_data.xlsx", read_only=True)
ws = wb.active
# 流式读取(避免内存溢出)
for row in ws.iter_rows(values_only=True):
process(row) # 自定义处理函数
wb.close() # 必须显式关闭!
2. 获取公式计算结果
wb = load_workbook("financial_model.xlsx", data_only=True)
ws = wb["Calculations"]
# 读取 A1 单元格的计算结果(非公式)
result = ws["A1"].value
print(result) # 输出: 285000(假设公式已计算)
3. 保留 VBA 宏
wb = load_workbook("macro_enabled.xlsm", keep_vba=True)
# 修改数据(不影响宏)
ws = wb["Dashboard"]
ws["C3"] = "Updated"
# 保存时必须用 .xlsm 扩展名
wb.save("modified_macro.xlsm")
关键特性
-
延迟加载 (Lazy Loading)
默认只加载工作簿结构,实际数据在访问时按需读取 -
内存优化
只读模式下使用磁盘缓存,适合处理 100MB+ 的大文件 -
格式保留
完整保留单元格样式、数字格式、条件格式等 -
公式支持
可读取公式文本(data_only=False)或计算结果(data_only=True)
注意事项
-
文件关闭
务必在操作完成后调用wb.close()释放资源(尤其只读模式) -
公式计算限制
data_only=True只能获取 Excel 已计算的结果(若文件未保存过可能返回None) -
密码保护
不支持加密文件(需先用其他工具解密) -
性能陷阱
避免在循环中重复访问同一单元格:# 低效写法 ❌ for i in range(1000): val = ws.cell(row=i, column=1).value # 高效写法 ✅ data = list(ws.iter_rows(values_only=True))
替代方案对比
| 场景 | 推荐工具 | 原因 |
|---|---|---|
| 简单读写 | pandas.read_excel() | 更简洁的 API |
| 超大文件 (>500MB) | openpyxl.read_only | 内存效率最高 |
| 旧版 .xls 文件 | xlrd | 兼容老格式 |
| 高性能需求 | libreoffice --convert-to | 命令行批量转换 |
最佳实践建议
-
上下文管理器(自动关闭文件)
from openpyxl import load_workbook with load_workbook("data.xlsx") as wb: ws = wb.active print(ws["A1"].value) # 退出 with 块自动关闭 -
错误重试机制
from openpyxl.utils.exceptions import InvalidFileException try: wb = load_workbook("corrupted.xlsx") except InvalidFileException: print("文件损坏或格式不支持") -
类型转换辅助
from openpyxl.cell.cell import TYPE_FORMULA, TYPE_NUMERIC cell = ws["D5"] if cell.data_type == TYPE_FORMULA: print("公式:", cell.value) elif cell.data_type == TYPE_NUMERIC: print("数值:", cell.value)
完整案例
第一层:Sheet关系确认
输入:两个Excel文件
输出:DataFrame包含:
- 匹配的sheet对
- 仅存在于文件1的sheet
- 仅存在于文件2的sheet
- 状态标记(一致/不一致)
第二层:内容区域检测
输入:匹配的sheet对
处理:动态检测有效数据区域,查找最早有数据和最晚有数据的单元格
输出:
- 每个sheet的数据边界(起始/结束单元格)
- 边界变化报告
- 共同比较区域
第三层:精细化比对
输入:共同比较区域内的数据
比较方法:所有单元格都字符串处理除前后空格后比较是否一致
输出:
- 差异位置明细(sheet名、行列坐标、原值、新值)
- 差异统计汇总
V 1.0
import pandas as pd
import numpy as np
from openpyxl import load_workbook
from openpyxl.utils import get_column_letter
import os
class ExcelComparator:
def __init__(self, file1_path, file2_path):
self.file1_path = file1_path
self.file2_path = file2_path
self.sheet_comparison_result = None
self.boundary_comparison_result = None
self.cell_comparison_result = None
def compare_sheets_info(self):
"""第一层:Sheet关系确认"""
try:
# 读取两个Excel文件的所有sheet名称
wb1 = load_workbook(self.file1_path, read_only=True, data_only=True)
wb2 = load_workbook(self.file2_path, read_only=True, data_only=True)
sheets1 = set(wb1.sheetnames)
sheets2 = set(wb2.sheetnames)
# 找出共同的、独有的sheet
common_sheets = sheets1.intersection(sheets2)
only_in_file1 = sheets1 - sheets2
only_in_file2 = sheets2 - sheets1
# 构建结果DataFrame
results = []
# 共同的sheet
for sheet in sorted(common_sheets):
results.append({
'file1_sheet': sheet,
'file2_sheet': sheet,
'status': '匹配',
'备注': ''
})
# 仅在文件1中的sheet
for sheet in sorted(only_in_file1):
results.append({
'file1_sheet': sheet,
'file2_sheet': '',
'status': '仅文件1有',
'备注': f'文件2中缺少此sheet'
})
# 仅在文件2中的sheet
for sheet in sorted(only_in_file2):
results.append({
'file1_sheet': '',
'file2_sheet': sheet,
'status': '仅文件2有',
'备注': f'文件1中缺少此sheet'
})
self.sheet_comparison_result = pd.DataFrame(results)
wb1.close()
wb2.close()
return self.sheet_comparison_result
except Exception as e:
print(f"比较sheet信息时出错: {e}")
return None
def find_data_boundaries(self, worksheet):
"""查找工作表中的数据边界"""
min_row, max_row, min_col, max_col = None, None, None, None
for row in worksheet.iter_rows():
for cell in row:
if cell.value is not None and str(cell.value).strip() != '':
if min_row is None or cell.row < min_row:
min_row = cell.row
if max_row is None or cell.row > max_row:
max_row = cell.row
if min_col is None or cell.column < min_col:
min_col = cell.column
if max_col is None or cell.column > max_col:
max_col = cell.column
return min_row, max_row, min_col, max_col
def compare_sheet_boundaries(self, sheet_pairs=None):
"""第二层:内容区域检测"""
try:
wb1 = load_workbook(self.file1_path, read_only=True, data_only=True)
wb2 = load_workbook(self.file2_path, read_only=True, data_only=True)
results = []
# 如果没有提供sheet对应关系,使用名称相同的sheet
if sheet_pairs is None:
common_sheets = set(wb1.sheetnames).intersection(set(wb2.sheetnames))
sheet_pairs = [(sheet, sheet) for sheet in common_sheets]
for sheet1_name, sheet2_name in sheet_pairs:
if sheet1_name not in wb1.sheetnames:
results.append({
'sheet1_name': sheet1_name,
'sheet2_name': sheet2_name,
'status': '错误',
'sheet1_boundary': 'N/A',
'sheet2_boundary': 'N/A',
'comparison_area': 'N/A',
'备注': f'文件1中找不到sheet: {sheet1_name}'
})
continue
if sheet2_name not in wb2.sheetnames:
results.append({
'sheet1_name': sheet1_name,
'sheet2_name': sheet2_name,
'status': '错误',
'sheet1_boundary': 'N/A',
'sheet2_boundary': 'N/A',
'comparison_area': 'N/A',
'备注': f'文件2中找不到sheet: {sheet2_name}'
})
continue
ws1 = wb1[sheet1_name]
ws2 = wb2[sheet2_name]
# 查找数据边界
min_row1, max_row1, min_col1, max_col1 = self.find_data_boundaries(ws1)
min_row2, max_row2, min_col2, max_col2 = self.find_data_boundaries(ws2)
# 处理空sheet的情况
if min_row1 is None:
sheet1_boundary = "空sheet"
else:
sheet1_boundary = f"{get_column_letter(min_col1)}{min_row1}:{get_column_letter(max_col1)}{max_row1}"
if min_row2 is None:
sheet2_boundary = "空sheet"
else:
sheet2_boundary = f"{get_column_letter(min_col2)}{min_row2}:{get_column_letter(max_col2)}{max_row2}"
# 确定比较区域
if min_row1 is None and min_row2 is None:
status = "一致"
comparison_area = "两个sheet都为空"
note = ""
elif min_row1 is None or min_row2 is None:
status = "不一致"
comparison_area = "N/A"
note = "一个sheet有数据,另一个为空"
else:
# 计算共同比较区域
comp_min_row = min(min_row1, min_row2)
comp_max_row = max(max_row1, max_row2)
comp_min_col = min(min_col1, min_col2)
comp_max_col = max(max_col1, max_col2)
comparison_area = f"{get_column_letter(comp_min_col)}{comp_min_row}:{get_column_letter(comp_max_col)}{comp_max_row}"
if min_row1 == min_row2 and max_row1 == max_row2 and min_col1 == min_col2 and max_col1 == max_col2:
status = "一致"
note = "数据边界完全一致"
else:
status = "边界不一致"
note = "数据边界发生变化"
results.append({
'sheet1_name': sheet1_name,
'sheet2_name': sheet2_name,
'status': status,
'sheet1_boundary': sheet1_boundary,
'sheet2_boundary': sheet2_boundary,
'comparison_area': comparison_area,
'备注': note
})
self.boundary_comparison_result = pd.DataFrame(results)
wb1.close()
wb2.close()
return self.boundary_comparison_result
except Exception as e:
print(f"比较数据边界时出错: {e}")
return None
def compare_cells_in_sheets(self, sheet_pairs=None):
"""第三层:精细化单元格比对"""
try:
wb1 = load_workbook(self.file1_path, read_only=True, data_only=True)
wb2 = load_workbook(self.file2_path, read_only=True, data_only=True)
all_differences = []
summary = []
# 如果没有提供sheet对应关系,使用名称相同的sheet
if sheet_pairs is None:
common_sheets = set(wb1.sheetnames).intersection(set(wb2.sheetnames))
sheet_pairs = [(sheet, sheet) for sheet in common_sheets]
for sheet1_name, sheet2_name in sheet_pairs:
if sheet1_name not in wb1.sheetnames or sheet2_name not in wb2.sheetnames:
continue
ws1 = wb1[sheet1_name]
ws2 = wb2[sheet2_name]
# 查找数据边界
min_row1, max_row1, min_col1, max_col1 = self.find_data_boundaries(ws1)
min_row2, max_row2, min_col2, max_col2 = self.find_data_boundaries(ws2)
# 跳过空sheet
if min_row1 is None and min_row2 is None:
summary.append({
'sheet_name': sheet1_name,
'total_cells': 0,
'differences': 0,
'difference_rate': '0%'
})
continue
elif min_row1 is None or min_row2 is None:
summary.append({
'sheet_name': sheet1_name,
'total_cells': 'N/A',
'differences': 'N/A',
'difference_rate': '边界不一致'
})
continue
# 确定比较区域
comp_min_row = min(min_row1, min_row2)
comp_max_row = max(max_row1, max_row2)
comp_min_col = min(min_col1, min_col2)
comp_max_col = max(max_col1, max_col2)
differences_count = 0
total_cells = 0
# 遍历比较区域内的所有单元格
for row in range(comp_min_row, comp_max_row + 1):
for col in range(comp_min_col, comp_max_col + 1):
total_cells += 1
# 获取单元格值
try:
val1 = ws1.cell(row=row, column=col).value
val2 = ws2.cell(row=row, column=col).value
except:
val1 = None
val2 = None
# 转换为字符串并去除前后空格
str_val1 = str(val1).strip() if val1 is not None else ""
str_val2 = str(val2).strip() if val2 is not None else ""
# 比较
if str_val1 != str_val2:
differences_count += 1
all_differences.append({
'sheet_name': sheet1_name,
'cell_address': f"{get_column_letter(col)}{row}",
'file1_value': str_val1 if str_val1 else '(空)',
'file2_value': str_val2 if str_val2 else '(空)'
})
# 计算差异率
if total_cells > 0:
diff_rate = f"{(differences_count / total_cells * 100):.2f}%"
else:
diff_rate = "0%"
summary.append({
'sheet_name': sheet1_name,
'total_cells': total_cells,
'differences': differences_count,
'difference_rate': diff_rate
})
self.cell_comparison_details = pd.DataFrame(all_differences)
self.cell_comparison_summary = pd.DataFrame(summary)
wb1.close()
wb2.close()
return self.cell_comparison_details, self.cell_comparison_summary
except Exception as e:
print(f"比较单元格时出错: {e}")
return None, None
def run_complete_comparison(self, custom_sheet_pairs=None):
"""执行完整的三层比较"""
print("=== Excel文件比较报告 ===")
print(f"文件1: {os.path.basename(self.file1_path)}")
print(f"文件2: {os.path.basename(self.file2_path)}")
print()
# 第一层:Sheet关系确认
print("第一层:Sheet关系确认")
sheet_info = self.compare_sheets_info()
if sheet_info is not None:
print(sheet_info)
print()
# 第二层:内容区域检测
print("第二层:内容区域检测")
boundary_info = self.compare_sheet_boundaries(custom_sheet_pairs)
if boundary_info is not None:
print(boundary_info)
print()
# 第三层:精细化比对
print("第三层:精细化单元格比对")
details, summary = self.compare_cells_in_sheets(custom_sheet_pairs)
if summary is not None:
print("差异统计汇总:")
print(summary)
print()
if details is not None and not details.empty:
print("差异详情 (前20条):")
print(details.head(20))
if len(details) > 20:
print(f"... 还有 {len(details) - 20} 条差异未显示")
else:
print("未发现单元格级别差异")
return sheet_info, boundary_info, details, summary
# 使用示例
if __name__ == "__main__":
# 初始化比较器
comparator = ExcelComparator("file1.xlsx", "file2.xlsx")
# 执行完整比较
sheet_info, boundary_info, cell_details, cell_summary = comparator.run_complete_comparison()
# 如果需要自定义sheet对应关系
# custom_pairs = [("Sheet1", "Data"), ("Summary", "Report")]
# sheet_info, boundary_info, cell_details, cell_summary = comparator.run_complete_comparison(custom_pairs)
# 保存结果到Excel(可选)
# with pd.ExcelWriter('comparison_result.xlsx') as writer:
# sheet_info.to_excel(writer, sheet_name='Sheet关系', index=False)
# boundary_info.to_excel(writer, sheet_name='数据边界', index=False)
# cell_summary.to_excel(writer, sheet_name='差异统计', index=False)
# cell_details.to_excel(writer, sheet_name='差异详情', index=False)
使用方法:
# 基本用法
comparator = ExcelComparator("file1.xlsx", "file2.xlsx")
results = comparator.run_complete_comparison()
# 自定义sheet对应关系
custom_pairs = [("Sheet1", "Data"), ("Summary", "Report")]
results = comparator.run_complete_comparison(custom_pairs)
3万+

被折叠的 条评论
为什么被折叠?



