"""
@File : excel.py
@Author : 董根虎
@Date : 2024-10-29
@Desc : 文档对象模型的excel格式的实现
"""
import logging
import os
import traceback
from base64 import b64encode
from copy import deepcopy
from io import BytesIO
from typing import List, Dict, Tuple, Any
import webcolors
import re
import pythoncom
from openpyxl.cell import Cell
from openpyxl.cell.rich_text import CellRichText, TextBlock
from openpyxl.cell.text import InlineFont
from openpyxl.utils import coordinate_to_tuple
from openpyxl.workbook import Workbook
from kotei_omp.common.base import WIN32_LOCK, get_cell_value, get_start_end_of_table, get_cell_info
from kotei_omp.common.excel import get_excel_shapes_win32com, \
handle_table_cell, calculate_overlap_percentage, handle_shape_in_merged_cell, \
ensure_merge_content, get_end_desc
from kotei_omp.data import PictureObject, TextObject, GraphicObject, LayoutObject, RunObject
from kotei_omp.data.position import Position
from kotei_omp.data.style import FontStyleObject, StyleObject
# 执行patch脚本, 修复三方库openpyxl的问题
from kotei_omp.utils.python_patch import fix_openpyxl_descriptors_base_bug
fix_openpyxl_descriptors_base_bug()
from openpyxl.styles.colors import COLOR_INDEX, WHITE, BLACK
from kotei_omp.common.theme_color import ThemeColor
from kotei_omp.data.block import DocumentBlockObject, CHANGE_RESUME
from kotei_omp.data.border import CellBorderObject, BorderObject
from kotei_omp.data.table import TableObject, RowObject, CellObject
from kotei_omp.interface_parser import ParserInterface
from zipfile import ZipFile
from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
from openpyxl.xml.functions import fromstring
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.packaging.relationship import get_rels_path, get_dependents, RelationshipList
import base64
from PIL import Image, ImageGrab
import imagehash
# Excel 索引颜色到十六进制颜色值的映射表
INDEXED_COLORS = {
0: "#000000", # 黑色
1: "#FFFFFF", # 白色
2: "#FF0000", # 红色
3: "#00FF00", # 绿色
4: "#0000FF", # 蓝色
5: "#FFFF00", # 黄色
6: "#FF00FF", # 洋红色
7: "#00FFFF", # 青色
8: "#800000", # 深红色
9: "#008000", # 深绿色
10: "#000080", # 深蓝色
11: "#808000", # 深黄色
12: "#800080", # 深洋红色
13: "#008080", # 深青色
14: "#C0C0C0", # 浅灰色
15: "#808080", # 深灰色
64: "#FFFF00", # 黄色(索引 64 是 Excel 的特殊值)
}
def parse_sheet_images(file_path: str, sheet: Worksheet):
"""
从 Excel 工作表中解析嵌入的图片对象。
该函数读取指定 Excel 文件中的特定工作表,解析其中的图片对象,
提取图片的元数据和二进制数据,并返回以图片名称和ID为键的字典。
Args:
file_path (str): Excel 文件的路径。
sheet (Worksheet): openpyxl 的工作表对象,需要从中提取图片。
Returns:
dict: 包含所有图片信息的字典,键为 "图片名###图片ID" 格式,值为包含以下字段的字典:
- "type": 图片类型(固定为 "picture_shape")
- "sheet_name": 所在工作表名称
- "id": 图片ID
- "name": 图片名称
- "data": 图片的base64编码数据
- "hash": 图片的哈希值
异常处理:
如果在解析过程中发生异常,会记录警告和错误日志,并返回空字典。
"""
archive = ZipFile(file_path, 'r')
pictures = {}
try:
drawings = sheet._rels.find(SpreadsheetDrawing._rel_type)
for rel in drawings:
path = rel.target
src = archive.read(path)
tree = fromstring(src)
drawing_rel_path = get_rels_path(rel.target)
# 检查drawing_rel_path是否存在
if drawing_rel_path not in archive.namelist():
logging.warning(f"Drawing relationship file {drawing_rel_path} not found in archive")
continue
drawing_rels = get_dependents(archive, drawing_rel_path)
# 定义命名空间
ns = {
'xdr': 'http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing',
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
}
images = {}
for item in drawing_rels:
images[item.Id] = item.Target
# 遍历所有子元素
for anchor in tree:
# 处理 twoCellAnchor, oneCellAnchor 等锚点元素
if anchor.tag.endswith('}twoCellAnchor') or anchor.tag.endswith(
'}oneCellAnchor') or anchor.tag.endswith('}absoluteAnchor'):
# 使用命名空间查找元素
picture = {
"type": "picture_shape",
"sheet_name": sheet.title
}
# 查找图片元素
pic_elem = anchor.find('xdr:pic', ns)
if pic_elem is not None:
# 获取图片的 id 和 name
cNvPr_elem = pic_elem.find('xdr:nvPicPr/xdr:cNvPr', ns)
if cNvPr_elem is not None:
pic_id = cNvPr_elem.get('id')
pic_name = cNvPr_elem.get('name')
picture["id"] = pic_id
picture["name"] = pic_name
# 获取 r:embed 属性值
blip_elem = pic_elem.find('xdr:blipFill/a:blip', ns)
if blip_elem is not None:
r_embed = blip_elem.get(
'{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
image_path = images[r_embed]
image_res = read_image_info(archive, image_path)
if image_res is None:
continue
picture["data"] = image_res["data"]
picture["hash"] = image_res["hash"]
pictures[pic_name + '###' + pic_id] = picture
return pictures
except Exception as e:
logging.warning(traceback.format_exc())
logging.error(f"Error reading drawing relationships for sheet {sheet.title}: {e}")
def get_image_hash(img):
"""
计算图片的哈希值,用于图片去重和比较。
使用 imagehash 库的 average_hash 算法计算图片的感知哈希值,
可用于比较两张图片是否相似或相同。
Args:
img (PIL.Image): PIL 图片对象。
Returns:
tuple: 包含两个元素的元组:
- str: 图片哈希值的字符串表示
- list: 图片哈希值的二维数组表示(用于详细比较)
"""
img_hash = imagehash.average_hash(img)
return str(img_hash), img_hash.hash.tolist()
def read_image_info(archive, image_path):
"""
从 Excel 文件中读取图片并返回详细信息
:param archive: ZipFile 对象
:param image_path: 图片在 Excel 中的路径
:return: 包含 base64、哈希值等信息的字典
"""
try:
# 从 archive 中读取图片数据
image_data = archive.read(image_path)
# 通过文件头进一步验证格式
if len(image_data) >= 4:
header = image_data[:10]
is_jpg = header.startswith(b'\xff\xd8')
is_png = header.startswith(b'\x89PNG\r\n\x1a\n')
if not (is_jpg or is_png):
logging.info(f"根据文件头检测跳过非 JPG/PNG 格式图片: {image_path}")
return None
# 转换为 base64
image_base64 = base64.b64encode(image_data).decode('utf-8')
# 使用 PIL 打开图片获取尺寸信息
image = Image.open(BytesIO(image_data))
# 最终验证格式
if image.format not in ['JPEG', 'PNG']:
logging.info(f"图片实际格式不是 JPEG/PNG: {image_path}, 实际格式: {image.format}")
return None
image_hash = get_image_hash(image)
width, height = image.size
return {
"data": image_base64,
"hash": image_hash[0],
"width": width,
"height": height,
"size": len(image_data)
}
except Exception as e:
logging.error(f"读取图片失败: {image_path}, 错误: {e}")
return None
class ExcelStandardParser(ParserInterface):
"""excel文档解析器"""
def __init__(self):
self._WHITE = "#" + WHITE[2:]
self._BLACK = "#" + BLACK[2:]
self._THEME_COLORS = []
self._HEAD_BORDER_STYLE = ['medium', 'double'] # 粗线,双横线
self._file_path = ""
self._cell_bg_colors = {}
self._merged_cells_cache: Dict[Tuple[int, int], CellObject] = {}
self._openpyxl_wb: Workbook = None
self._win32com_wb = None
self._excel_app = None
# cell 原始的border信息,没有修改
self._cell_border: Dict[Tuple[int, int], CellBorderObject] = {}
def _read_document(self, file_path):
"""读取文档,将文件数据加载到内存。构建对象模型的接口函数
:param file_path: 文档的路径
:return: 返回通用文档“句柄”self
"""
# 判断文件是否存在
self._THEME_COLORS = ThemeColor().get_theme_colors(self._openpyxl_wb)
self._file_path = file_path
def parse_document(self, file_path) -> (list[DocumentBlockObject]):
""" 解析文档 """
workbook = self._openpyxl_wb
self._file_path = file_path
sheet_pictures = {}
for name in workbook.sheetnames:
pictures = parse_sheet_images(self._file_path, workbook[name])
if pictures:
sheet_pictures[name] = pictures
logging.info(f"sheet {name} has {len(pictures)} pictures")
# 使用win32com解析图形+图片
shapes = self._parse_shapes(self._file_path, exclude_sheet_pictures=sheet_pictures)
# 多线程异步处理多个sheet
# with ThreadPoolExecutor() as executor:
# # 遍历excel中所有的sheet
# futures = [executor.submit(self._parse_sheet, workbook[n], shapes) for n in workbook.sheetnames
# if (not demo_config_sheet or n in demo_config_sheet)]
# blocks = [future.result() for future in futures if future.result()]
blocks = []
for name in workbook.sheetnames:
if name in CHANGE_RESUME:
continue
# Sheet 隐藏或者特别隐藏
if workbook[name].sheet_state in ['hidden', 'veryHidden']:
continue
sheet = workbook[name]
# 跳过空的sheet
if sheet.max_row == 1 and sheet.max_column == 1:
block = DocumentBlockObject()
block._name = sheet.title
block.file_name = os.path.basename(self._file_path)
blocks.append(block)
continue
# TODO 找到 shape 里面和 cell 重叠的部分
sheet_shapes = [shape for shape in shapes if shape["sheet_name"] == name]
block = self._parse_sheet(sheet, sheet_shapes)
# handle_positions(block, total_width, total_height)
blocks.append(block)
# print(block)
# print()
# print(blocks)
# print()
return blocks
def _get_table_cell_info(self, sheet, row_index, col_index, sheet_max_row, sheet_max_col, ws_with_pywin32) -> CellObject:
"""
构建 单元格对象
:param sheet: 当前解析的Worksheet对象
:param row_index: excel单元格行坐标
:param col_index: excel单元格列坐标
:return:
"""
def get_border(cell: Cell) -> CellBorderObject:
cell_border = deepcopy(self.get_cell_border(cell_))
if not cell_border.border_top.border_style and row_index > 1:
new_border = self.get_cell_border(sheet.cell(row_index - 1, col_index))
cell_border.border_top.border_style = new_border.border_bottom.border_style
if not cell_border.border_bottom.border_style and row_index < sheet_max_row:
new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index))
cell_border.border_bottom.border_style = new_border.border_top.border_style
if not cell_border.border_left and col_index > 1:
new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index))
cell_border.border_left.border_style = new_border.border_right.border_style
if not cell_border.border_right and col_index < sheet_max_col:
new_border = self.get_cell_border(sheet.cell(row_index + 1, col_index))
cell_border.border_right.border_style = new_border.border_left.border_style
return cell_border
cell_ = sheet.cell(row_index, col_index)
is_merged = False
if (row_index, col_index) in self._merged_cells_cache:
is_merged = True
# 获取单元格的信息
v = get_cell_info(cell_, sheet, in_table=False, is_merged=is_merged)
# 单元格边框对象
b = get_border(cell_)
# 通用模型对象
c = CellObject()
c.text = v.get("content", "")
c.comment.text = v.get("comment", "")
if hasattr(cell_, "number_format") and '0' in cell_.number_format:
c.number_format = cell_.number_format
c._border = b
c.coordinate.desc = cell_.coordinate
current_text = TextObject()
cell_index = (row_index, col_index)
# 合并单元格的border取的是合并范围后的边框,不是单个单元格的边框
if cell_index in self._merged_cells_cache:
cache_cell = self._merged_cells_cache[cell_index]
# 开始的cell存储merge的border信息
if cache_cell._is_merged_start:
cache_cell.border = deepcopy(b)
c.style = cache_cell.style
c.merged_ranges = cache_cell.merged_ranges
run_objs = cache_cell.content[0]
current_text._style = run_objs[0].style if run_objs else None
current_text._runs = run_objs
else:
c.style = self._get_text_style(cell_.font)
bg_color = self._get_cell_bg_color(cell_, ws_with_pywin32)
if bg_color:
c.style.background_color = bg_color
if cell_.fill.patternType:
c.style.background_style = cell_.fill.patternType
else:
c.style.background_style = 'solid'
# 构建一个文本对象, 放在单元格对象中
run_objs = self._get_text_cell(cell_, ws_with_pywin32)
current_text._style = run_objs[0].style if run_objs else None
current_text._runs = run_objs
current_text._text = c.text
current_text.coordinate.desc = cell_.coordinate
c.content.append(current_text)
# print(f"Old Text: {c.text} Old style: {c.style}")
return c
# 合并单元格的边框
def _get_merged_table_cell_info(self, sheet, ws_with_pywin32):
merged_cells_dict = {}
for merged_range in sheet.merged_cells.ranges:
# 获取合并范围的边界
start_col, start_row, end_col, end_row = list(merged_range.bounds)
# 获取起始单元格对象
start_cell = sheet.cell(row=start_row, column=start_col)
# 构造起始单元格的 CellObject
start_cell_obj = CellObject()
start_cell_obj.merged_ranges = [start_row, start_col, end_row, end_col]
run_objs = self._get_text_cell(start_cell, ws_with_pywin32)
# 暂时将runs存到 content 中
start_cell_obj.content.append(run_objs)
# 提取样式信息
start_cell_obj.style = self._get_text_style(start_cell.font)
bg_color = self._get_cell_bg_color(start_cell, ws_with_pywin32)
if bg_color:
start_cell_obj.style.background_color = bg_color
start_cell_obj.style.background_style = start_cell.fill.patternType or 'solid'
start_cell_obj._is_merged = True
start_cell_obj._is_merged_start = True
start_cell_border = self.get_cell_border(start_cell)
# 将起始单元格的 CellObject 放入缓存
merged_cells_dict[(start_row, start_col)] = start_cell_obj
# 将范围内其他单元格也存入缓存,直接使用起始单元格的样式
for row in range(start_row, end_row + 1):
for col in range(start_col, end_col + 1):
top = deepcopy(start_cell_border.border_top)
bottom = deepcopy(start_cell_border.border_bottom)
left = deepcopy(start_cell_border.border_left)
right = deepcopy(start_cell_border.border_right)
empty_border = BorderObject()
if (row, col) == (start_row, start_col):
start_cell_obj.border.border_top = empty_border
start_cell_obj.border.border_right = empty_border
start_cell_obj.border.border_left = empty_border
start_cell_obj.border.border_bottom = empty_border
if row == start_row:
start_cell_obj.border.border_top = top
if row == end_row:
start_cell_obj.border.border_bottom = bottom
if col == start_col:
start_cell_obj.border.border_left = left
if col == end_col:
start_cell_obj.border.border_right = right
self._cell_border[(row, col)] = start_cell_obj.border
continue # 跳过起始单元格
cell_obj = CellObject()
cell_obj._is_merged = True
cell_obj._is_merged_start = False
cell_obj.style = start_cell_obj.style # 使用起始单元格的样式
cell_obj.content = start_cell_obj.content
cell_obj.merged_ranges = start_cell_obj.merged_ranges
merged_cells_dict[(row, col)] = cell_obj
# print(merged_cells_dict)
self._merged_cells_cache = merged_cells_dict # 更新类的缓存
def _parse_table(self, sheet, block, table_range_list, ws_with_pywin32, shapes) -> (List[TableObject]):
"""
解析文档对象的表格,组织成通用结构的对象结构。
:param sheet: Worksheet对象
:param block: DocumentBlockObject
:param table_range_list: 表格的对角坐标范围 [((3, 2), (15, 13))]
:return:
"""
# 统计合并列,只需要保留每行的第一列
# col_merge_dic: Dict[int, List[int]] = {}
# for merged_range in sheet.merged_cells.ranges:
# min_col = merged_range.min_col
# max_col = merged_range.max_col
# min_row = merged_range.min_row
# max_row = merged_range.max_row
# if max_col > min_col:
# for row in range(min_row, max_row + 1):
# if row not in col_merge_dic:
# col_merge_dic[row] = []
# ignore_col = list(range(min_col+1, max_col+1))
# col_merge_dic[row].extend(ignore_col)
# print(f"old: {table_range_list}")
tables = []
sheet_max_row = sheet.max_row
sheet_max_col = sheet.max_column
self._get_merged_table_cell_info(sheet, ws_with_pywin32)
for i in range(len(table_range_list)):
logging.info(f"parsing table index: {i}")
table_range = table_range_list[i]
# 提取一个表格的坐标范围
(min_row, min_col), (max_row, max_col) = table_range
# 构建表格对象
t = TableObject()
for row_index in range(min_row, max_row + 1):
cells_ = []
# 构建表格行对象
r = RowObject()
r.data_id = row_index
# real_col_index = 0
for col_index in range(min_col, max_col + 1):
# ignore_col = col_merge_dic.get(row_index, [])
# if col_index in ignore_col:
# continue
# 构建表格的单元格对象
c = self._get_table_cell_info(sheet, row_index, col_index, sheet_max_row, sheet_max_col, ws_with_pywin32)
# 这里图片会放到合并单元格左上角cell的content中
for shape in shapes:
row = shape["from_row"]
col = shape["from_col"]
if row == row_index and col == col_index and shape["sheet_name"] == sheet.title and shape["in_cell"]:
obj = PictureObject()
self._shape_to_object(shape, obj)
shape["in_table"] = True
c.content.append(obj)
c.layout = self._get_layout(r)
# 修正单元格的坐标为当前表格的行、列索引
c.row_index = row_index - min_row
c.col_index = col_index - min_col
c.data_id = row_index * 10 + col_index
if hasattr(c, "merged_ranges") and c.merged_ranges:
c.merged_ranges = [c.merged_ranges[0] - min_row,
c.merged_ranges[1] - min_col,
c.merged_ranges[2] - min_row,
c.merged_ranges[3] - min_col,
]
# logging.info(
# f"merged_ranges1 {c.coordinate.desc} ranges: {c.merged_ranges} {c.relative_merged_ranges} ")
cells_.append(c)
r._layout = self._get_layout(t)
r.row_index = row_index - min_row
r.cells = cells_
t.rows.append(r)
logging.debug(f"parse_table parse_row sheet_name: {sheet.title} row_index: {row_index}")
t.layout = self._get_layout(block)
if t.rows:
t.coordinate.desc = t.rows[0].cells[0].coordinate.desc
# 表格的最后一个单元格坐标
last_row = t.rows[len(t.rows) - 1]
t._last_coordinate = last_row.cells[len(last_row.cells) - 1].coordinate.desc
logging.debug(f"parse_table end sheet_name: {sheet.title} table_index: {i}")
t = handle_table_cell(t)
t = ensure_merge_content(t)
tables.append(t)
# 处理表格的表头
# with ThreadPoolExecutor() as executor:
# futures = [executor.submit(parse_table_head, t) for t in tables]
# concurrent.futures.wait(futures)
# print(tables)
return tables
@staticmethod
def _get_layout(parent_ref):
"""
构建 layout对象
:param parent_ref:
:return:
"""
layout = LayoutObject()
layout._parent_ref = parent_ref
return layout
def _parse_picture(self, sheet_name, block, shapes) -> (List[PictureObject]):
"""
解析文档对象的图片,组织成通用结构的对象结构
:param sheet_name: sheet名称
:param block:
:param shapes: 图形列表
:return: 图片对象
"""
results = []
for shape in shapes:
if shape["sheet_name"] == sheet_name and shape["shape_type"] == 13 and not 'in_table' in shape:
if shape["position"].width < 1 or shape["position"].height < 1:
continue
# 构建图形对象
g = PictureObject()
self._shape_to_object(shape, g)
g._layout = self._get_layout(block)
results.append(g)
return results
def _parse_graphic(self, sheet_name, block, shapes) -> (List[GraphicObject]):
"""
解析文档对象的图形,组织成通用结构的对象结构
:param sheet_name: sheet名称
:param block:
:param shapes: 图形列表
:return: 图形对象
"""
results = []
for shape in shapes:
if (shape["sheet_name"] == sheet_name and not 'in_table' in shape
# 图片对象
and shape["shape_type"] != 13
# 文本框对象
# and shape["shape_type"] != 17
):
if shape["position"].width < 1 or shape["position"].height < 1:
continue
# 构建图形对象
g = GraphicObject()
self._shape_to_object(shape, g)
g._layout = self._get_layout(block)
# 赋值shape_type
g.shape_type = int(shape["shape_type"])
# 填充色
fill_color = shape.get("fillcolor").split()[0] if shape.get("fillcolor") else None
if fill_color is not None and re.match(r'^#[0-9A-Fa-f]{6}$', fill_color):
g.style.background_color = str(fill_color)
elif fill_color is not None:
try:
fill_color = webcolors.name_to_hex(fill_color)
g.style.background_color = str(fill_color)
except Exception as e:
logging.error(f"webcolors name to hex error-{fill_color}, {e}")
g._text = shape.get("content", "")
g._graphic_type = shape["type"]
results.append(g)
return results
def _parse_text(self, sheet, block, table_range_list, out_table_cells, shapes, ws_with_pywin32) -> (List[TextObject]):
"""
解析文档对象的文本,连续单元格作为一个TextObject对象
:param sheet:
:param block:
:param table_range_list: 表格对象的范围
:param out_table_cells: 表格外对象,跟表格的范围可能存在重复,需要先进行清晰
:param shapes: shapes集合, 从shapes中查找文本框对象
:return: 文本对象
"""
# return self._parse_text_single_cell(sheet, table_range_list, out_table_cells)
# 按照excel的自然行列排序
sorted_out_table_cells = sorted(out_table_cells, key=lambda cell: (cell[0], cell[1]))
texts: List[TextObject] = []
visited = set()
for text_cell in sorted_out_table_cells:
row, col = text_cell
if text_cell in visited:
continue
visited.add(text_cell)
# 当前单元格在表格范围内,则跳过
if self._cell_in_table(*text_cell, table_range_list):
continue
cell_ = sheet.cell(*text_cell)
# 获取单元格中的内容
v = get_cell_value(cell_, sheet, False)
# 单元格内容为空, 则跳过
if v is None or (isinstance(v, str) and v.strip() == ""):
continue
# 从单元格中解析出RunObject
run_objs = self._get_text_cell(cell_, ws_with_pywin32)
# 创建一个新的 TextObject
current_text = TextObject()
current_text._layout = self._get_layout(block)
current_text._text = str(v)
current_text._style = run_objs[0].style if run_objs else None
current_text._runs = run_objs
current_text.coordinate.desc = cell_.coordinate
neighbors = self._get_neighbors(text_cell, sorted_out_table_cells, table_range_list)
queue = neighbors
while len(queue) > 0:
current_index = queue.pop(0)
if current_index in visited:
continue
visited.add(current_index)
cell_ = sheet.cell(*current_index)
v = get_cell_value(cell_, sheet, False)
if v is None or (isinstance(v, str) and v.strip() == ""):
continue
run_objs = self._get_text_cell(cell_, ws_with_pywin32)
current_text.text += "\n" + str(v)
current_text.runs.extend(run_objs)
neighbors = self._get_neighbors(current_index, sorted_out_table_cells, table_range_list)
queue.extend(neighbors)
texts.append(current_text)
return texts
def _get_neighbors(self, cell_index, sorted_out_table_cells, table_range_list):
current_row, current_col = cell_index
directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]
neighbors = []
for dr, dc in directions:
neighbor_row = current_row + dr
neighbor_col = current_col + dc
neighbor_cell = (neighbor_row, neighbor_col)
if neighbor_cell in table_range_list or neighbor_cell not in sorted_out_table_cells:
continue
neighbors.append(neighbor_cell)
return neighbors
def _parse_text_single_cell(self, sheet, block, table_range_list, out_table_cells, ws_with_pywin32) -> (List[TextObject]):
"""
解析文档对象的文本,以单元格为单位, 一个单元格一个文本对象
:param table_range_list: 表格对象的范围
:param out_table_cells: 表格外对象,跟表格的范围可能存在重复,需要先进行清晰
:return: 文本对象
"""
texts = []
sorted_out_table_cells = sorted(out_table_cells, key=lambda cell: (cell[0], cell[1]))
for text_cell in sorted_out_table_cells:
(row, col) = text_cell
# 当前单元格在表格范围内,则跳过
if self._cell_in_table(row, col, table_range_list):
continue
cell_ = sheet.cell(row, col)
v = get_cell_value(cell_, sheet, False)
# 单元格内容为空, 则跳过
if not v:
continue
# excel中的单元格文本组成
run_objs = self._get_text_cell(cell_, ws_with_pywin32)
# 构建文本对象
t = TextObject()
t._text = v
t._style = run_objs[0].style if run_objs else None
t._runs = run_objs
t.coordinate.desc = cell_.coordinate
t.layout.parent_ref = block
texts.append(t)
return texts
def _cell_in_table(self, row, col, table_range_list) -> bool:
"""
判断单元格是否在表格范围内
:param row: 当前单元格坐标
:param col:
:param table_range_list: 表格范围
:return: True: 在表格范围内, False: 不在表格范围内
"""
for table_range in table_range_list:
# 提取一个表格的坐标范围
(min_row, min_col), (max_row, max_col) = table_range
if min_row <= row <= max_row and min_col <= col <= max_col:
return True
return False
def _parse_shapes(self, file_path, exclude_type_list=[4], exclude_sheet_pictures={}):
"""
使用win32com解析图形+图片
:param file_path:
:return:
"""
with WIN32_LOCK:
shapes_json = get_excel_shapes_win32com(self._win32com_wb, exclude_type_list, exclude_sheet_pictures)
return shapes_json
def _get_text_cell(self, cell_, ws_with_pywin32) -> (List[RunObject]):
"""
解析excel中一个单元格的文本,解析为RunObject
:param cell_: openpyxl.cell.cell.Cell
:return: 返回 RunObject 列表
"""
# excel中cell_.value 类型常见取值: str TextBlock CellRichText
# 获取字体信息
run_objs = []
font = cell_.font
value = cell_.value
bg_color = self._get_cell_bg_color(cell_, ws_with_pywin32)
if isinstance(value, TextBlock):
style_obj = self._get_text_style(cell_.value.font)
style_obj.background_color = bg_color
r = RunObject()
r.coordinate.desc = cell_.coordinate
if cell_.value.text:
r._text = str(cell_.value.text)
r._style = style_obj
run_objs.append(r)
elif isinstance(value, CellRichText):
# CellRichText 文本片段循环处理
for item in value:
item_font = font if isinstance(item, str) else item.font
style_obj = self._get_text_style(item_font)
style_obj.background_color = bg_color
r = RunObject()
r.coordinate.desc = cell_.coordinate
if item and isinstance(item, str):
r._text = item
elif item.text:
r._text = str(item.text)
r._style = style_obj
run_objs.append(r)
else:
style_obj = self._get_text_style(font)
style_obj.background_color = bg_color
r = RunObject()
if value or isinstance(value, int):
r._text = str(value)
r.coordinate.desc = cell_.coordinate
r._style = style_obj
run_objs.append(r)
return run_objs
def _get_text_style(self, font):
"""
解析文本样式
:param font:
:return:
"""
style_obj = StyleObject()
style_obj._font_size = font.size
if font.color:
style_obj._font_color = self._get_rgb_color(font.color)
elif font.color is None:
style_obj._font_color = "#000000"
style_obj._background_color = ''
# 字体样式,如粗体、斜体、下划线、删除线
font_style_ = FontStyleObject()
if font.b:
font_style_.bold = True # 粗体
if font.i:
font_style_.italic = True # 斜体
if font.u:
font_style_.underline = True # 下划线
if font.strike:
font_style_.strikeout = True # 删除线
style_obj.font_style = font_style_
# excel中font行内文本样式: InlineFont
if isinstance(font, InlineFont):
style_obj._font_family = font.rFont
else:
style_obj._font_family = font.name
return style_obj
@staticmethod
def _find_text_object(current_cell, texts):
"""检查当前单元格是否连续, 并返回TextObject对象 """
if not texts:
return None
(curr_row, curr_col) = current_cell
return next(
(text_ for text_ in texts
for prev_row, prev_col in text_._cell_list
if (curr_row == prev_row and curr_col == prev_col + 1)
or (curr_row == prev_row + 1 and curr_col == prev_col)),
None
)
@staticmethod
def _shape_to_object(shape, obj):
"""
shape json对象转换为对象模型
:param shape:
:param obj:
:return:
"""
obj._id = shape["id"]
obj._name = shape["name"]
obj._width = shape["width"]
obj._height = shape["height"]
obj.px_width = obj._width
obj.px_height = obj._height
obj._in_cell = shape.get("in_cell", True)
obj._data = shape.get("data", "")
obj._digest = shape.get("hash", "")
obj.coordinate.desc = shape["index"]
obj.to_coordinate.desc = shape["to_index"]
obj.coordinate.top = str(shape["top"]) # 坐标:距离顶边距离
obj.coordinate.left = str(shape["left"]) # 坐标:距离左边距离
obj._position = shape["position"]
@staticmethod
def index_color_to_hex(index: int) -> str:
""" 索引颜色值转为十六进制颜色值 """
return INDEXED_COLORS.get(index, "#FFFFFF")
@staticmethod
def int_color_to_hex(val: int) -> str:
""" 整形颜色值转为十六进制 """
assert type(val) is int, "function 'int_color_to_hex' need int param."
r = val & 0xFF
g = (val >> 8) & 0xFF
b = (val >> 16) & 0xFF
return "#{:02X}{:02X}{:02X}".format(r, g, b)
def _get_cell_bg_color(self, cell_: Cell, ws_with_pywin32):
"""
获取单元格背景色
https://openpyxl.readthedocs.io/en/latest/_modules/openpyxl/styles/colors.html
:param cell_: openpyxl.cell.cell.Cell对象
:return: 十六进制的颜色值
"""
index = ws_with_pywin32.Name + cell_.coordinate
if index in self._cell_bg_colors:
return self._cell_bg_colors[index]
cell = ws_with_pywin32.Range(cell_.coordinate)
color = self.int_color_to_hex(int(cell.Interior.Color))
self._cell_bg_colors[index] = color
return color
# 判定是否有显式设置背景色
# if cell_.fill.patternType is not None:
# # 有背景色
# fg_color_idx = cell_.fill.fgColor.index
# if isinstance(fg_color_idx, int) and fg_color_idx == 0:
# # 定制白色背景色
# return "#FFFFFF"
# elif isinstance(fg_color_idx, int):
# return self.index_color_to_hex(fg_color_idx)
# elif isinstance(fg_color_idx, str):
# return "#" + fg_color_idx[2:]
#
# # 未显式设置背景色
# return "#FFFFFF"
# color_ = self._get_rgb_color(cell_.fill.bgColor)
# if color_ in ("", self._BLACK, self._WHITE):
# # 如果为默认颜色,则再从cell_.fill.bgColor读取一次
# color_ = self._get_rgb_color(cell_.fill.fgColor)
# return color_
def _get_rgb_color(self, color):
"""
openpyxl的颜色对象解析对于的rgb颜色值
:param color:
:return:
"""
if not color:
return ""
type_ = color.type
value_ = color.value
if type_ == 'rgb':
return "#" + value_[2:] if len(value_) == 8 else "#" + value_
if type_ == 'indexed':
# 索引颜色
return self._get_color_by_index(value_)
if type_ == 'theme':
# 主题颜色
if color.theme >= len(self._THEME_COLORS):
return ""
return "#" + ThemeColor().theme_and_tint_to_rgb(self._THEME_COLORS, color.theme, color.tint)
return ""
def _get_color_by_index(self, value):
"""
拼接颜色属性
:param value:
:return:
"""
return self._BLACK if value >= len(COLOR_INDEX) else "#" + COLOR_INDEX[value][2:]
def _get_openpyxl_cell_bg_color(self, cell: Cell):
index = cell.coordinate
if index in self._cell_bg_colors:
return self._cell_bg_colors[index]
# 无填充
color_str = "#FFFFFF"
if cell.fill.patternType is not None:
color = cell.fill.bgColor
# 64 是系统保留的前景色
if color.type == "indexed" and color.value == 64:
color_str = self._get_rgb_color(cell.fill.fgColor)
else:
# openpyxl 不支持 lightUp 的解析方式,背景色是黑色,但是实际应该是白色,所以单独处理这种情况
if cell.fill.patternType=="lightUp" and color.type == "rgb" and color.value == "00000000":
color_str = "#FFFFFF"
else:
color_str = self._get_rgb_color(color)
# 这里背景颜色可能错误的提取为黑色 000000,导致背景颜色提取出错,如果提取为黑色,则尝试使用 fgCorlor
# 如果后面还是出现背景颜色提取错误的问题,建议直接使用 _get_cell_bg_color 方法,这个方法准确度很高,确定是会牺牲性能
if color_str == self._BLACK:
color_str = self._get_rgb_color(cell.fill.fgColor)
self._cell_bg_colors[index] = color_str
return color_str
def _assign_ref(self, block):
"""
为每个sheet 元素prev_ref, next_ref 属性赋值
:param block:
:return:
"""
# 按照excel的自然行列排序
sorted_elements = sorted(block.elements, key=lambda ele_: self._get_cell_index(ele_, False))
# 排序完成后, 按照自然顺序为对象的prev_ref, next_ref 属性赋值
for i in range(len(sorted_elements)):
ele = sorted_elements[i]
if i > 0:
ele.layout.prev_ref = sorted_elements[i - 1]
if ele._type != 'table' and i < len(sorted_elements) - 1:
# table的next_ref 需要单独计算
ele.layout.next_ref = sorted_elements[i + 1]
# table类型元素的next_ref需要按照最后一个单元格坐标进行匹配
sorted_elements_2 = sorted(block.elements, key=lambda ele_: self._get_cell_index(ele_, True))
for i in range(len(sorted_elements_2)):
ele = sorted_elements_2[i]
if ele._type == 'table' and i < len(sorted_elements_2) - 1:
ele.layout.next_ref = sorted_elements[i + 1]
@staticmethod
def _get_cell_index(ele, table_last_coordinate=True):
"""
获取单元格行列数字坐标
:param ele:
:return: (1,2)
"""
coordinate = ele.coordinate.desc
if table_last_coordinate:
coordinate = ele._last_coordinate if hasattr(ele, "_last_coordinate") else ele.coordinate.desc
return coordinate_to_tuple(coordinate)
@staticmethod
def _get_start_end_of_table(sheet):
return get_start_end_of_table(sheet)
@staticmethod
def remove_verbose_border(img, border_color=(0, 0, 0, 255)):
""" 处理图片去除边框像素 """
# 转为RGBA格式
if img.mode != "RGBA":
img = img.convert("RGBA")
img_data = img.load()
width, height = img.size
# 去除上下黑色边框
for r in [0, height - 1]:
for c in range(width):
if img_data[c, r] == border_color:
# 设置为透明
img_data[c, r] = (0, 0, 0, 0)
# 去除左右的黑色边框
for c in [0, width - 1]:
for r in range(height):
if img_data[c, r] == border_color:
img_data[c, r] = (0, 0, 0, 0)
# 去除其他的横线边框
for r in range(height):
# 边框占用了1px,已改为透明,所以从第二个像素开始
cur_pixel = img_data[1, r]
if cur_pixel != (0, 0, 0, 0):
for c in range(width):
# 暂时简化处理
if img_data[c, r] == cur_pixel:
img_data[c, r] = (0, 0, 0, 0)
return img
def _get_merged_graphic(self, ws_with_pywin32, start_pos, end_pos, graphic_obj):
""" 从当前worksheet中对应的pos坐标位置截取图片,图片数据存入graphic_obj中 """
try:
# 隐藏excel单元格的网格
ws_with_pywin32.Application.ActiveWindow.DisplayGridlines = False
# 获取当前范围的单元格
cell_range = ws_with_pywin32.Range(f"{start_pos}:{end_pos}")
cell_num = cell_range.Cells.Count
temp_list = []
for i in range(cell_num):
cur_cell = cell_range.Cells(i+1)
# 记录单元格内容
temp_list.append((cur_cell, cur_cell.Value))
cur_cell.Value = None
# ws_with_pywin32.Range(f"{start_pos}:{end_pos}").CopyPicture()
cell_range.CopyPicture()
img = ImageGrab.grabclipboard()
# 单元格内容恢复
for c, v in temp_list:
c.Value = v
except Exception:
img = None
if img is not None:
if start_pos != end_pos:
# 去除图片中可能存在的边框
img = self.remove_verbose_border(img)
bio = BytesIO()
img.save(bio, format="png")
graphic_obj.data = b64encode(bio.getvalue()).decode()
@staticmethod
def is_virtual_cell_in(vcell, vlist):
""" 判定当前虚拟单元格是否在虚拟列表中
:param vcell: 虚拟的单元格对象VirtualCell
:param vlist: 虚拟的单元格对象的列表
:return: -1未找到 or 列表正向索引
"""
for idx, cell in enumerate(vlist):
if vcell == cell:
return idx
return -1
# def _merge_graphic_pic_cross_multi_cell(self, block, ws_with_pywin32, ws_with_openpyxl):
# """ 跨越多个单元格的图片、图形的合并,即虚拟单元格内图片、图形的合并 """
# # 存储虚拟单元格对象
# virtual_cell_list = []
# cur_idx = -1
# # 虚拟单元格在列表的索引为key, 同一个虚拟单元格内的图形、图片放入列表,以该列表为value
# virtual_cell_dict = defaultdict(list)
#
# # 统计虚拟单元格内的图形
# for graphic in block.graphics:
# coord = graphic.coordinate.desc
# if not coord:
# continue
# # 判定该图形是否在虚拟单元格内
# vcell = has_virtual_cell(coord, ws_with_openpyxl)
# if vcell is None: # 不在虚拟单元格内则跳过
# continue
# # 虚拟单元格不在列表中则存入
# r = self.is_virtual_cell_in(vcell, virtual_cell_list)
# if r == -1:
# cur_idx += 1
# virtual_cell_list.append(vcell)
# # 记录当前虚拟单元格中的图形对象
# virtual_cell_dict[cur_idx].append(graphic)
# else:
# # 当虚拟单元格已在列表中,则仅记录图形对象
# virtual_cell_dict[r].append(graphic)
#
# # 统计虚拟单元格内的图片
# for pic in block.pictures:
# coord = pic.coordinate.desc
# if not coord:
# continue
# # 判定该图片是否在虚拟单元格内
# vcell = has_virtual_cell(coord, ws_with_openpyxl)
# if vcell is None: # 不在虚拟单元格内则跳过
# continue
# # 虚拟单元格不在列表中则存入
# r = self.is_virtual_cell_in(vcell, virtual_cell_list)
# if r == -1:
# cur_idx += 1
# virtual_cell_list.append(vcell)
# # 记录当前虚拟单元格中的图形对象
# virtual_cell_dict[cur_idx].append(pic)
# else:
# # 当虚拟单元格已在列表中,则仅记录图形对象
# virtual_cell_dict[r].append(pic)
#
# # 逆序处理每个虚拟单元格的图片、图形合并
# vnum = len(virtual_cell_list)
# vidx = vnum - 1
# while vidx >= 0:
# g_list = virtual_cell_dict.get(vidx) # 获取当前虚拟单元格中的图形、图片列表
# # 以图片对象作为基准
# leader = g_list[0]
# for cur_g in g_list:
# if isinstance(cur_g, PictureObject):
# leader = cur_g
# break
# leader_label = "graphics" if isinstance(leader, GraphicObject) else "pictures"
# try:
# # 记录elements的插入位置
# ele_insert_idx = block.elements.index(leader)
# # 记录graphics/pictures的插入位置
# gp_insert_idx = getattr(block, leader_label, []).index(leader)
# # 开始处理合并
# for g in g_list:
# # 从列表中删除待合并的图形/图片对象
# block.elements.remove(g)
# if isinstance(g, GraphicObject):
# block.graphics.remove(g)
# else:
# block.pictures.remove(g)
# # 获取合并的图形数据
# current_vcell = virtual_cell_list[vidx]
# start_pos = current_vcell.start_cell_coord
# end_pos = current_vcell.end_cell_coord
# self._get_merged_graphic(ws_with_pywin32, start_pos, end_pos, leader)
# # 修正大图片的坐标
# leader.coordinate.desc = start_pos
# # 在指定的位置插入合并的图片
# block.elements.insert(ele_insert_idx, leader)
# if isinstance(leader, GraphicObject):
# block.graphics.insert(gp_insert_idx, leader)
# else:
# block.pictures.insert(gp_insert_idx, leader)
# vidx -= 1
# except (ValueError, IndexError, pythoncom.com_error):
# logging.error("merge graphics&pictures inside virtual cell failed.")
# vidx -= 1
def _merge_graphics(self, block, ws_with_pywin32):
""" 合并同一个单元格内的图形 """
pos_dict = {}
pos_list = [] # 保持位置有序性
# 统计每个位置的图形个数
for graphic in block.graphics:
pos = graphic.coordinate.desc
if not pos:
continue
if pos not in pos_dict:
pos_dict[pos] = [graphic]
pos_list.append(pos)
else:
pos_dict[pos].append(graphic)
# 逆序处理(便于删除、插入)有图形的坐标位置
for pos in reversed(pos_list):
g_list = pos_dict.get(pos) # 获取当前位置的图形
if len(g_list) == 1:
# 当前位置只有一个图形,不做处理
continue
try:
# 当前位置有多个图形,则进行图形的合并
# 记录elements的插入位置
ele_insert_idx = block.elements.index(g_list[0])
# 记录graphics的插入位置
graphic_insert_idx = block.graphics.index(g_list[0])
# 开始处理
merged_graphic = g_list[0]
for g in g_list:
# 从列表中删除待合并的图形对象
block.elements.remove(g)
block.graphics.remove(g)
# 获取合并的图形数据
self._get_merged_graphic(ws_with_pywin32, pos, pos, merged_graphic)
# 在指定的位置插入合并的图形
block.elements.insert(ele_insert_idx, merged_graphic)
block.graphics.insert(graphic_insert_idx, merged_graphic)
except (ValueError, IndexError, pythoncom.com_error):
logging.error("merge graphics inside same cell failed.")
def _merge_pictures(self, block, ws_with_pywin32):
""" 合并同一个单元格内的图片 """
pos_dict = {}
pos_list = [] # 保持位置有序性
# 统计每个pos位置的图片个数
for pic in block.pictures:
pos = pic.coordinate.desc
if not pos:
continue
if pos not in pos_dict:
pos_dict[pos] = [pic]
pos_list.append(pos)
else:
pos_dict[pos].append(pic)
# 逆序处理(便于删除、插入)有图片的坐标位置
for pos in reversed(pos_list):
pic_list = pos_dict.get(pos) # 获取当前位置的图片列表
if len(pic_list) == 1:
# 当前位置只有一个图片,不做处理
continue
try:
# 当前位置有多个图片,则进行图片的合并
# 记录elements的插入位置
ele_insert_idx = block.elements.index(pic_list[0])
# 记录pictures的插入位置
picture_insert_idx = block.pictures.index(pic_list[0])
# 开始处理
merged_pic = pic_list[0]
for cur_pic in pic_list:
# 从列表中删除待合并的图片对象
block.elements.remove(cur_pic)
block.pictures.remove(cur_pic)
# 获取合并的图片数据
self._get_merged_graphic(ws_with_pywin32, pos, pos, merged_pic)
# 在指定的位置插入合并的图形
block.elements.insert(ele_insert_idx, merged_pic)
block.pictures.insert(picture_insert_idx, merged_pic)
except (ValueError, IndexError, pythoncom.com_error):
logging.error("merge pictures inside same cell failed.")
def _merge_graphics_and_pictures(self, block, ws_with_pywin32):
""" 合并同一个单元格的图形 + 图片 """
pos_dict = {}
pos_list = [] # 保持位置有序性
# 统计每个pos位置的图片个数
for pic in block.pictures:
pos = pic.coordinate.desc
if not pos:
continue
if pos not in pos_dict:
pos_dict[pos] = [pic]
pos_list.append(pos)
else:
pos_dict[pos].append(pic)
# 统计每个pos位置的图形个数
for graphic in block.graphics:
pos = graphic.coordinate.desc
if not pos or pos not in pos_dict:
continue
else:
# 同一个cell中有图片 + 图形
pos_dict[pos].append(graphic)
# 逆序处理(便于删除、插入)有图片的坐标位置
for pos in reversed(pos_list):
pic_list = pos_dict.get(pos) # 获取当前位置的图片列表
if len(pic_list) == 1:
# 当前位置只有一个图片,不做处理
continue
try:
end_pos = get_end_desc(pic_list)
# 当前位置有图片+图形,则进行图片+图形的合并
# 记录elements的插入位置
ele_insert_idx = block.elements.index(pic_list[0])
# 记录pictures的插入位置
picture_insert_idx = block.pictures.index(pic_list[0])
# 开始处理
merged_pic = pic_list[0]
for cur_pic in pic_list:
# 从列表中删除待合并的图片、图形对象
block.elements.remove(cur_pic)
if isinstance(cur_pic, PictureObject):
block.pictures.remove(cur_pic)
elif isinstance(cur_pic, GraphicObject):
block.graphics.remove(cur_pic)
# 获取合并的图片数据
self._get_merged_graphic(ws_with_pywin32, pos, end_pos, merged_pic)
# 在指定的位置插入合并的图形
block.elements.insert(ele_insert_idx, merged_pic)
block.pictures.insert(picture_insert_idx, merged_pic)
except (ValueError, IndexError, pythoncom.com_error):
logging.error("merge pictures inside same cell failed.")
def get_cell_border(self, cell: Cell) -> CellBorderObject:
index = (cell.row, cell.column)
if index in self._cell_border:
return self._cell_border[index]
cell_border = CellBorderObject()
border_style = getattr(cell.border, "top").style
border_color = self._get_rgb_color(getattr(cell.border, "top").color)
cell_border.border_top = BorderObject(border_color, border_style)
border_style = getattr(cell.border, "left").style
border_color = self._get_rgb_color(getattr(cell.border, "left").color)
cell_border.border_left = BorderObject(border_color, border_style)
border_style = getattr(cell.border, "bottom").style
border_color = self._get_rgb_color(getattr(cell.border, "bottom").color)
cell_border.border_bottom = BorderObject(border_color, border_style)
border_style = getattr(cell.border, "right").style
border_color = self._get_rgb_color(getattr(cell.border, "right").color)
cell_border.border_right = BorderObject(border_color, border_style)
self._cell_border[index] = cell_border
return cell_border
def _parse_sheet(self, sheet, shapes):
""" 解析一个worksheet中的数据
:params sheet: Worksheet对象
:params shapes: shape字典列表
"""
wb_with_pywin32 = self._excel_app
ws_with_pywin32 = wb_with_pywin32.Sheets[sheet.title]
sheet_name = sheet.title
# 重置sheet背景颜色的缓存
self._cell_bg_colors = {}
self._cell_border = {}
self._merged_cells_cache = {}
logging.debug(f"parse_excel start sheet_name: {sheet_name}")
# 表格的范围, 表格外的内容的范围
table_range_list, out_table_cells = self._get_start_end_of_table(sheet)
table_range_list = sorted(table_range_list) # sort the table range list
logging.debug(f"parse_excel sheet_name: {sheet_name} get_table_range_list {table_range_list}")
# 实例化一个sheet对象
block = DocumentBlockObject()
block.file_name = os.path.basename(self._file_path)
shapes = handle_shape_in_merged_cell(sheet, shapes)
# 解析当前sheet的表格信息
tables = self._parse_table(sheet, block, table_range_list, ws_with_pywin32, shapes)
logging.info(f"parse_excel sheet_name: {sheet_name} _parse_table {len(tables)}")
# 解析当前sheet的图片信息
pictures = self._parse_picture(sheet_name, block, shapes)
logging.info(f"parse_excel sheet_name: {sheet_name} _parse_picture {len(pictures)}")
# 解析当前sheet的图形信息
graphics = self._parse_graphic(sheet_name, block, shapes)
logging.info(f"parse_excel sheet_name: {sheet_name} _parse_graphic {len(graphics)}")
# 解析当前sheet的表格外的对象信息
texts = self._parse_text(sheet, block, table_range_list, out_table_cells, shapes, ws_with_pywin32)
logging.info(f"parse_excel sheet_name: {sheet_name} _parse_text {len(texts)}")
block.name = sheet_name
block.add_table(tables)
block.add_text(texts)
block.add_graphic(graphics)
block.add_picture(pictures)
self._assign_ref(block)
logging.debug(f"parse_excel end sheet_name: {sheet_name}")
# 合并虚拟单元格中的图形、图片
# self._merge_graphic_pic_cross_multi_cell(block, ws_with_pywin32, sheet)
# 合并同一个单元格中的图形
self._merge_graphics(block, ws_with_pywin32)
# 合并同一个单元格图片
self._merge_pictures(block, ws_with_pywin32)
# 合并同一个单元格中的图形 + 图片
self._merge_graphics_and_pictures(block, ws_with_pywin32)
# 关闭
return block
# 这个方法暂时没有启用,由于位置信息有问题,无法计算覆盖率
def _handle_shapes(self, shapes: List[Dict[str, Any]], total_width, total_height) -> List[Dict[str, Any]]:
"""通过覆盖比例判断shape是否属某个的单元格"""
for shape in shapes:
from_row = shape["from_row"]
from_col = shape["from_col"]
to_row = shape["to_row"]
to_col = shape["to_col"]
# 已经属于某个单元格,则跳过
if shape["in_cell"]:
continue
# 如果图片的大小超过6个单元格,则认为当前图片不属于某个单元格
if (to_col - from_col + 1) * (to_row - from_row + 1) > 6:
continue
found = False
for row in range(from_row, to_row + 1):
for col in range(from_col, to_col + 1):
cell_pos = self._cell_positions[(row, col)]
## cell_pos 是 百分比,需要转换
cell_pos = Position(cell_pos.x * total_width, cell_pos.y * total_height, cell_pos.width * total_width, cell_pos.height * total_height)
shape_pos = shape["position"]
overlap_percentage = calculate_overlap_percentage(cell_pos, shape_pos)
if hasattr(shape, 'content') and hasattr(shape, 'index') and shape["content"] == "△X":
print(f"{shape['index']}: {overlap_percentage}")
print(f"{shape['content']}: {overlap_percentage}")
if overlap_percentage > 0.8:
shape["in_cell"] = True
shape["cell_row"] = row
shape["cell_col"] = col
found = True
break
if found:
break
return shapes
详细说一下解析的逻辑
最新发布