import os
import glob
import zipfile
from openpyxl import load_workbook, Workbook
from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import AnchorMarker, TwoCellAnchor, OneCellAnchor
from openpyxl.styles import PatternFill, Border, Side, Alignment, Font, colors
from openpyxl.utils import get_column_letter
import copy
import re
def is_valid_excel_file(file_path):
"""检查是否为有效的Excel文件"""
try:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
required_files = ['xl/workbook.xml', 'xl/styles.xml', 'xl/worksheets/']
return all(name in zip_ref.namelist() for name in required_files[:-1])
except zipfile.BadZipFile:
return False
except Exception:
return False
def copy_cell_style(src_cell, dest_cell):
"""复制单元格样式"""
if src_cell.font:
dest_cell.font = copy.copy(src_cell.font)
if src_cell.fill:
dest_cell.fill = copy.copy(src_cell.fill)
if src_cell.border:
dest_cell.border = copy.copy(src_cell.border)
if src.cell.alignment:
dest_cell.alignment = copy.copy(src_cell.alignment)
if src_cell.number_format:
dest_cell.number_format = src_cell.number_format
def find_passport_column(ws):
"""
查找包含'passport'关键字的列(不区分大小写)
返回列字母和列索引(从1开始)
"""
# 检查第一行(通常是标题行)
for cell in ws[1]:
if cell.value and re.search(r'passport', str(cell.value), re.IGNORECASE):
return cell.column_letter, cell.column
# 如果第一行没找到,检查所有单元格
for row in ws.iter_rows(min_row=1, max_row=min(10, ws.max_row)):
for cell in row:
if cell.value and re.search(r'passport', str(cell.value), re.IGNORECASE):
return cell.column_letter, cell.column
# 仍未找到,默认第一列
return 'A', 1
def create_header_style():
"""创建第一列的标题样式"""
return (
Font(bold=True, color=colors.WHITE),
PatternFill(start_color="4F81BD", end_color="4F81BD", fill_type="solid"),
Alignment(horizontal='center', vertical='center')
)
def merge_excel_with_deduplication(input_dir, output_file):
"""
合并Excel文件并基于Passport字段去重
保留格式和图片,第一列添加文件名和工作表名
"""
# 创建新工作簿
wb_output = Workbook()
ws_output = wb_output.active
ws_output.title = "Merged Deduplicated"
# 创建第一列标题的样式
header_font, header_fill, header_alignment = create_header_style()
ws_output.column_dimensions['A'].width = 40
# 存储所有已处理的Passport值
seen_passports = set()
# 存储行数据和图片信息:(passport_value, row_data, images_list, has_image)
all_rows = []
# 存储列名映射关系
column_mapping = {}
# 第一阶段:收集所有数据并进行预处理
try:
for excel_file in glob.glob(os.path.join(input_dir, "*.xlsx")):
file_name = os.path.basename(excel_file)
if not is_valid_excel_file(excel_file):
print(f"跳过无效文件: {file_name}")
continue
print(f"处理文件: {file_name}")
try:
wb_source = load_workbook(excel_file, keep_vba=False, data_only=True)
except Exception as e:
print(f" 加载失败: {str(e)}")
continue
for ws_name in wb_source.sheetnames:
print(f" 处理工作表: {ws_name}")
ws_source = wb_source[ws_name]
# 1. 查找Passport列
passport_col_letter, passport_col_idx = find_passport_column(ws_source)
print(f" 检测到Passport列: {passport_col_letter}")
# 2. 收集图片信息
image_rows = {}
if hasattr(ws_source, '_images'):
for img in ws_source._images:
# 获取图片所在行
if isinstance(img.anchor, (TwoCellAnchor, OneCellAnchor)):
anchor = img.anchor
img_row = anchor._from.row if hasattr(anchor, '_from') else anchor.row
if img_row not in image_rows:
image_rows[img_row] = []
image_rows[img_row].append(img)
# 3. 处理每一行数据
for row_idx, row in enumerate(ws_source.iter_rows(), 1):
passport_cell = ws_source.cell(row=row_idx, column=passport_col_idx)
passport_value = str(passport_cell.value).strip() if passport_cell.value else ""
# 跳过空Passport行
if not passport_value:
continue
has_image = row_idx in image_rows
# 收集行数据
row_data = []
for cell in row:
cell_data = {
'value': cell.value,
'font': copy.copy(cell.font),
'fill': copy.copy(cell.fill),
'border': copy.copy(cell.border),
'alignment': copy.copy(cell.alignment),
'number_format': cell.number_format
}
row_data.append(cell_data)
# 记录列名映射(仅第一行)
if row_idx == 1:
for col_idx, cell in enumerate(row, 1):
col_name = cell.value if cell.value else f"Column{col_idx}"
column_mapping[col_idx] = col_name
# 添加文件信息
source_info = f"{file_name}-{ws_name}"
# 添加到所有行集合
all_rows.append({
'passport': passport_value,
'source_info': source_info,
'row_data': row_data,
'images': image_rows.get(row_idx, []),
'has_image': has_image,
'row_idx': row_idx,
'file': file_name,
'sheet': ws_name
})
except Exception as e:
print(f"收集数据时出错: {str(e)}")
raise
# 第二阶段:基于Passport去重
print("\n开始去重处理...")
passport_groups = {}
for row in all_rows:
passport = row['passport']
if passport not in passport_groups:
passport_groups[passport] = []
passport_groups[passport].append(row)
# 按规则选择保留的行
dedup_rows = []
for passport, rows in passport_groups.items():
# 查找有图片的行
rows_with_images = [r for r in rows if r['has_image']]
if rows_with_images:
# 保留第一个有图片的行
dedup_rows.append(rows_with_images[0])
if len(rows_with_images) > 1:
print(f" Passport {passport}: 保留有图片的第一行(共有 {len(rows_with_images)} 行含图片)")
else:
# 保留第一行
dedup_rows.append(rows[0])
print(f" Passport {passport}: 保留第一行(无图片行)")
print(f"去重完成: 原始 {len(all_rows)} 行 -> 去重后 {len(dedup_rows)} 行")
# 第三阶段:写入输出文件
current_row = 1
image_offset = 0 # 图片行偏移量
# 写入标题行
ws_output.cell(row=current_row, column=1, value="Source").font = header_font
ws_output.cell(row=current_row, column=2, value="Passport").font = header_font
for col_idx, col_name in column_mapping.items():
ws_output.cell(row=current_row, column=col_idx+2, value=col_name).font = header_font
current_row += 1
# 写入数据行
for row_data in dedup_rows:
# 添加源信息列
source_cell = ws_output.cell(row=current_row, column=1)
source_cell.value = row_data['source_info']
source_cell.font = header_font
source_cell.fill = header_fill
source_cell.alignment = header_alignment
# 添加Passport列
passport_cell = ws_output.cell(row=current_row, column=2)
passport_cell.value = row_data['passport']
# 添加其他列数据
for col_idx, cell_data in enumerate(row_data['row_data'], 1):
dest_cell = ws_output.cell(row=current_row, column=col_idx+2)
dest_cell.value = cell_data['value']
# 应用样式
if cell_data['font']:
dest_cell.font = cell_data['font']
if cell_data['fill']:
dest_cell.fill = cell_data['fill']
if cell_data['border']:
dest_cell.border = cell_data['border']
if cell_data['alignment']:
dest_cell.alignment = cell_data['alignment']
if cell_data.get('number_format'):
dest_cell.number_format = cell_data['number_format']
# 处理图片
if row_data['has_image']:
for img in row_data['images']:
img_copy = copy.deepcopy(img)
anchor = img_copy.anchor
# 调整锚点位置
if isinstance(anchor, TwoCellAnchor):
new_anchor = TwoCellAnchor(
_from=AnchorMarker(
col=anchor._from.col + 2, # 新增了两列
row=anchor._from.row + current_row - row_data['row_idx'] - 1,
colOff=anchor._from.colOff,
rowOff=anchor._from.rowOff
),
to=AnchorMarker(
col=anchor.to.col + 2, # 新增了两列
row=anchor.to.row + current_row - row_data['row_idx'] - 1,
colOff=anchor.to.colOff,
rowOff=anchor.to.rowOff
)
)
img_copy.anchor = new_anchor
elif isinstance(anchor, OneCellAnchor):
new_anchor = OneCellAnchor(
_from=AnchorMarker(
col=anchor._from.col + 2, # 新增了两列
row=anchor._from.row + current_row - row_data['row_idx'] - 1,
colOff=anchor._from.colOff,
rowOff=anchor._from.rowOff
)
)
img_copy.anchor = new_anchor
# 添加图片
ws_output.add_image(img_copy)
current_row += 1
# 保存结果
try:
wb_output.save(output_file)
print(f"\n合并完成! 输出文件: {output_file}")
print(f"原始行数: {len(all_rows)}, 去重后行数: {len(dedup_rows)}")
except Exception as e:
print(f"保存文件时出错: {str(e)}")
raise
if __name__ == "__main__":
input_directory = "./input_excels" # Excel文件所在目录
output_filename = "merged_deduplicated.xlsx"
merge_excel_with_deduplication(input_directory, output_filename)
这个代码图片图片会错位和缺失,修正下
最新发布