"""
@Author : 王定雄
@Date : 2024-10-29
@Desc : 文本对象的数据结构的定义
"""
import logging
from kotei_omp.data.base_object import BaseObject
from kotei_omp.data.layout import LayoutObject
from kotei_omp.data.position import Position
from kotei_omp.data.style import StyleObject
from kotei_omp.data.coordinate import CoordinateObject
from kotei_i18n.api import _
class CharObject(BaseObject):
    """A single character together with the style object attached to it."""

    def __init__(self, char, style):
        self.char = char
        self.style = style

    def is_same(self, char_obj) -> bool:
        """Report whether ``char_obj`` has the same character and the same style.

        The styles are only compared once the characters are known to match.
        """
        chars_match = self.char == char_obj.char
        if not chars_match:
            return chars_match
        return self.style == char_obj.style
class TextObject(BaseObject):
    """Text object: the full text, its runs, and style/layout/coordinate/position data.

    State is kept in private attributes and exposed through properties whose
    setters assert the type of the assigned value.
    """

    def __init__(self):
        self._type = "text"
        self._text = ''  # full text content
        self._style = StyleObject()  # text style
        self._layout = LayoutObject()  # text layout object
        self._coordinate = CoordinateObject()  # coordinate of the text
        # Run (text fragment) objects; their texts joined together form the full content.
        self._runs = []
        self._data_id = None  # unique identifier
        self._position = Position()

    def to_dict(self):
        """Convert this TextObject into a plain dict.

        Nested style, layout, coordinate, position and run objects are
        converted through their own ``to_dict`` methods.
        """
        return {
            "type": self._type,
            "text": self._text,
            "style": self._style.to_dict(),
            "layout": self._layout.to_dict(),
            "coordinate": self._coordinate.to_dict(),
            "runs": [run.to_dict() for run in self._runs],
            "data_id": self._data_id,
            "position": self._position.to_dict(),
        }

    @classmethod
    def from_dict(cls, data):
        """Create a TextObject from a dict shaped like the output of ``to_dict``.

        Missing keys fall back to the same defaults ``__init__`` uses. Each
        entry of ``data["runs"]`` is restored through ``RunObject.from_dict``.
        """
        obj = cls()
        obj._type = data.get("type", "text")
        obj._text = data.get("text", '')
        obj._style = StyleObject.from_dict(data.get("style", {}))
        obj._layout = LayoutObject.from_dict(data.get("layout", {}))
        obj._coordinate = CoordinateObject.from_dict(data.get("coordinate", {}))
        obj._runs = [RunObject.from_dict(run) for run in data.get("runs", [])]
        obj._data_id = data.get("data_id", None)
        obj._position = Position.from_dict(data.get("position", {}))
        return obj

    def __repr__(self):
        return f'{self.__class__.__name__}()[TEXT="{self._text}"]'

    @property
    def data_id(self):
        """Unique identifier of this object (``None`` until assigned)."""
        return self._data_id

    @data_id.setter
    def data_id(self, new_value):
        assert type(new_value) is int
        self._data_id = new_value

    @property
    def text(self):
        """Full text content."""
        return self._text

    @text.setter
    def text(self, new_value):
        assert type(new_value) is str
        self._text = new_value

    @property
    def runs(self):
        """List of run objects that make up the text."""
        return self._runs

    @runs.setter
    def runs(self, new_value):
        assert type(new_value) is list
        self._runs = new_value

    @property
    def coordinate(self):
        """Coordinate object of the text."""
        return self._coordinate

    @coordinate.setter
    def coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        self._coordinate = new_value

    @property
    def layout(self):
        """Layout object of the text."""
        return self._layout

    @layout.setter
    def layout(self, new_value):
        assert isinstance(new_value, LayoutObject)
        self._layout = new_value

    @property
    def style(self):
        """Style object of the text."""
        return self._style

    @style.setter
    def style(self, new_value):
        assert isinstance(new_value, StyleObject)
        self._style = new_value

    @property
    def position(self):
        """Position object of the text."""
        return self._position

    @position.setter
    def position(self, new_value):
        assert isinstance(new_value, Position)
        self._position = new_value

    @staticmethod
    def lstrip_special_char(title):
        """Remove the special delimiter characters from the left of ``title``.

        All delimiters are stripped as one character set, so any mix and order
        of leading delimiters is removed (stripping them one at a time in a
        fixed order would leave e.g. a delimiter that follows a space).
        """
        return title.lstrip("、,. ")

    def get_data(self):
        """Return the simplified output dict for this text.

        The dict always carries ``type``, ``content``, ``parent_content``,
        ``chapter``, ``title``, ``desc``, ``data_id`` and ``index``, plus
        either ``coord`` (when the coordinate has a description) or
        ``page_num`` (the layout's page id).

        For a chapter title whose chapter id occurs in the normalised text,
        ``chapter``, ``title`` and ``desc`` are filled in; if the chapter id is
        empty or does not occur, only a log message is written and those
        fields stay empty.
        """
        data = {
            'type': 'text',
            'content': self.text,
            'parent_content': self.layout.parent_content,
            "chapter": "",
            "title": "",
            "desc": "",
            "data_id": self.data_id,
            "index": -1
        }
        if hasattr(self.layout, "is_chapter_title"):
            if self.layout.is_chapter_title:
                chapter = self.layout.chapter_id
                # Normalise once: ideographic space -> space, '-' -> '.'.
                # NOTE(review): replace(".", ".") maps '.' onto itself as written;
                # possibly a full-width period was intended — confirm.
                normalized = self.text.replace("\u3000", " ").replace(".", ".").replace('-', '.')
                # An empty chapter id is "contained" in every string and would make
                # split() raise ValueError, so it is treated as a mismatch.
                if not chapter or chapter not in normalized:
                    logging.info(f'start to translate text 0: {_("章节号和标题不匹配,请检查!")}')
                else:
                    # Title = text after the first occurrence of the chapter id.
                    title = normalized.split(chapter, maxsplit=1)[1].strip(getattr(self, "customized_sep", " "))
                    title = self.lstrip_special_char(title)
                    data["chapter"] = chapter
                    data["title"] = title
                    logging.info(f'start to translate text 1: {_("这段文本是章节标题,章节号:")}')
                    data["desc"] = _("这段文本是章节标题,章节号:") + chapter + "," + _("标题为:") + title
            else:
                logging.info(f'start to translate text 2: {_("这段文本是纯内容,不是章节标题,没有章节号。")}')
                data["desc"] = _("这段文本是纯内容,不是章节标题,没有章节号。")
                # NOTE(review): the original indentation of this assignment was lost;
                # it is placed in the "not a chapter title" branch — confirm it should
                # not also apply to chapter titles.
                data["index"] = getattr(self, "index", -1)
        else:
            logging.info(f'start to translate text 3: {_("这段文本是纯内容,不是章节标题,没有章节号。")}')
            # Plain text.
            data["desc"] = _("这段文本是纯内容,不是章节标题,没有章节号。")
        if self.coordinate.desc:
            data["coord"] = self.coordinate.desc
        else:
            data["page_num"] = self.layout.page_id
        return data

    def get_chars(self):
        """Return one ``CharObject`` per character of the text, with its run's style.

        Leading and trailing whitespace of the whole line is dropped; inner
        whitespace and its style are kept.
        """
        full_text = ''.join(str(run.text) for run in self.runs).strip()
        total = len(full_text)
        chars = []
        matched = 0  # position in full_text of the next character to match
        for run in self.runs:
            for char in str(run.text):
                # Everything past the stripped text is trailing whitespace.
                if matched >= total:
                    break
                # Leading whitespace never equals the first stripped character,
                # so it is skipped until the real text starts.
                if char == full_text[matched]:
                    chars.append(CharObject(char, run.style))
                    matched += 1
        return chars

    def get_html_text(self):
        """Return the runs as a string of styled HTML ``<span>`` elements.

        NOTE(review): run text and style values are inserted without HTML
        escaping — escape them if the text can come from untrusted documents.
        """
        parts = []
        for run in self.runs:
            run_style = run.style
            font_style = run_style.font_style
            bg_color = run_style.background_color  # may be ""
            font_color = run_style.font_color  # may be ""
            font_family = run_style.font_family  # may be ""
            font_size = run_style.font_size.strip("pt")
            if font_size:
                # Scale the numeric part by 1.3 and emit it as px below.
                font_size = int(float(font_size) * 1.3)

            css = []
            if not font_style.normal:
                # Strikeout takes precedence over underline.
                if font_style.strikeout:
                    css.append("text-decoration: line-through;")
                elif font_style.underline:
                    css.append("text-decoration: underline;")
                else:
                    css.append("text-decoration: none;")
                if font_style.bold:
                    css.append("font-weight: bold;")
                if font_style.italic:
                    css.append("font-style: italic;")
            if font_color:
                css.append("color: %s;" % font_color)
            if font_family:
                css.append("font-family: %s;" % font_family)
            if font_size:
                css.append("font-size: %dpx;" % font_size)
            if bg_color:
                css.append("background-color: %s;" % bg_color)

            parts.append("<span style='" + "".join(css) + "'>" + run.text + "</span>")
        return "".join(parts)

    def get_chapter_id(self):
        """Return the layout's chapter id string, or "" when the layout has none."""
        return getattr(self.layout, "chapter_id", "")
class RunObject(BaseObject):
    """Text run (fragment) object: a piece of text with its own style."""

    def __init__(self):
        self._text = ''  # text content of the run
        self._style = StyleObject()  # style object of the run
        self._layout = LayoutObject()  # layout object of the run
        self._coordinate = CoordinateObject()  # coordinate of the text
        self._type = ''  # run type: text (plain text); br (page-break marker)

    @property
    def text(self):
        """Text content of the run."""
        return self._text

    @text.setter
    def text(self, new_value):
        assert type(new_value) is str
        self._text = new_value

    @property
    def type(self):
        """Run type string."""
        return self._type

    @type.setter
    def type(self, new_value):
        # The builtin ``type`` is still what is called here: inside the method
        # body the name resolves to the builtin, not to this property.
        assert type(new_value) is str
        self._type = new_value

    @property
    def coordinate(self):
        """Coordinate object of the run."""
        return self._coordinate

    @coordinate.setter
    def coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        self._coordinate = new_value

    @property
    def layout(self):
        """Layout object of the run."""
        return self._layout

    @layout.setter
    def layout(self, new_value):
        assert isinstance(new_value, LayoutObject)
        self._layout = new_value

    @property
    def style(self):
        """Style object of the run."""
        return self._style

    @style.setter
    def style(self, new_value):
        assert isinstance(new_value, StyleObject)
        self._style = new_value

    def to_dict(self):
        """Convert the run into a dict holding its text and its style dict."""
        return {"text": self._text, "style": self._style.to_dict()}

    @classmethod
    def from_dict(cls, data):
        """Create a RunObject from a dict shaped like the output of ``to_dict``.

        ``TextObject.from_dict`` calls this for every serialized run. ``text``
        and ``style`` fall back to their defaults when missing; ``type`` is not
        written by ``to_dict`` but is restored when the dict carries it.
        """
        obj = cls()
        obj._text = data.get("text", '')
        obj._style = StyleObject.from_dict(data.get("style", {}))
        obj._type = data.get("type", '')
        return obj
"""
@Author : 王定雄
@Date : 2024-10-29
@Desc : 图片对象的数据结构的定义
"""
from kotei_omp.data.base_object import BaseObject
from kotei_omp.data.layout import LayoutObject
from kotei_omp.data.position import Position
from kotei_omp.data.style import StyleObject
from kotei_omp.data.coordinate import CoordinateObject
class PictureObject(BaseObject):
    """Picture object: image metadata, encoded data and style/layout/coordinate data.

    State is kept in private attributes and exposed through properties whose
    setters assert the type of the assigned value.
    """

    def __init__(self):
        self._type = "picture"
        self._id = ''  # picture ID
        self._name = ''  # picture name
        self._width = ''  # width
        self._height = ''  # height
        self._digest = ''  # hash of the picture data
        self._data = ''  # base64 encoding of the picture's binary data
        self._layout = LayoutObject()  # layout object
        self._style = StyleObject()  # style
        self._coordinate = CoordinateObject()  # coordinate
        # Second coordinate; not serialized by to_dict/from_dict.
        # NOTE(review): presumably the end/target coordinate — confirm.
        self._to_coordinate = CoordinateObject()
        self._data_id = None  # unique identifier
        self._position = Position()

    def to_dict(self):
        """Convert this PictureObject into a plain dict.

        Nested layout, style, coordinate and position objects are converted
        through their own ``to_dict`` methods.
        """
        return {
            "type": self._type,
            "id": self._id,
            "name": self._name,
            "width": self._width,
            "height": self._height,
            "digest": self._digest,
            "data": self._data,
            "layout": self._layout.to_dict(),
            "style": self._style.to_dict(),
            "coordinate": self._coordinate.to_dict(),
            "data_id": self._data_id,
            "position": self._position.to_dict(),
        }

    @classmethod
    def from_dict(cls, data):
        """Create a PictureObject from a dict shaped like the output of ``to_dict``.

        Missing keys fall back to the same defaults ``__init__`` uses.
        """
        obj = cls()
        obj._type = data.get("type", "picture")
        obj._id = data.get("id", '')
        obj._name = data.get("name", '')
        obj._width = data.get("width", '')
        obj._height = data.get("height", '')
        obj._digest = data.get("digest", '')
        obj._data = data.get("data", '')
        obj._layout = LayoutObject.from_dict(data.get("layout", {}))
        obj._style = StyleObject.from_dict(data.get("style", {}))
        obj._coordinate = CoordinateObject.from_dict(data.get("coordinate", {}))
        obj._data_id = data.get("data_id", None)
        obj._position = Position.from_dict(data.get("position", {}))
        return obj

    def __repr__(self):
        return f'{self.__class__.__name__}()[{str(self)}]'

    def __str__(self):
        # The coordinate description line is only appended when one is set.
        coordinate_str = f'\nCoordinate: {self._coordinate.desc}' if self._coordinate.desc else ''
        return f'Image [{self._name}]: Width:{self._width}, Height:{self._height}. \nType:{self._type}. \nOffset: Top:{self._coordinate.top or 0},Left:{self._coordinate.left or 0}.{coordinate_str}'

    @property
    def data_id(self):
        """Unique identifier of this object (``None`` until assigned)."""
        return self._data_id

    @data_id.setter
    def data_id(self, new_value):
        assert type(new_value) is int
        self._data_id = new_value

    @property
    def id(self):
        """Picture ID."""
        return self._id

    @id.setter
    def id(self, new_value):
        assert type(new_value) is str
        self._id = new_value

    @property
    def name(self):
        """Picture name."""
        return self._name

    @name.setter
    def name(self, new_value):
        assert type(new_value) is str
        self._name = new_value

    @property
    def width(self):
        """Width, stored as a string."""
        return self._width

    @width.setter
    def width(self, new_value):
        assert type(new_value) is str
        self._width = new_value

    @property
    def height(self):
        """Height, stored as a string."""
        return self._height

    @height.setter
    def height(self, new_value):
        assert type(new_value) is str
        self._height = new_value

    @property
    def digest(self):
        """Hash of the picture data."""
        return self._digest

    @digest.setter
    def digest(self, new_value):
        assert type(new_value) is str
        self._digest = new_value

    @property
    def data(self):
        """Base64 encoding of the picture's binary data."""
        return self._data

    @data.setter
    def data(self, new_value):
        assert type(new_value) is str
        self._data = new_value

    @property
    def layout(self):
        """Layout object of the picture."""
        return self._layout

    @layout.setter
    def layout(self, new_value):
        assert isinstance(new_value, LayoutObject)
        self._layout = new_value

    @property
    def style(self):
        """Style object of the picture."""
        return self._style

    @style.setter
    def style(self, new_value):
        assert isinstance(new_value, StyleObject)
        self._style = new_value

    @property
    def coordinate(self):
        """Coordinate object of the picture."""
        return self._coordinate

    @coordinate.setter
    def coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        self._coordinate = new_value

    @property
    def to_coordinate(self):
        """Second coordinate object of the picture."""
        return self._to_coordinate

    @to_coordinate.setter
    def to_coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        # Write the backing field: assigning to ``self.to_coordinate`` here
        # would re-enter this setter and recurse until RecursionError.
        self._to_coordinate = new_value

    @property
    def position(self):
        """Position object of the picture."""
        return self._position

    @position.setter
    def position(self, new_value):
        assert isinstance(new_value, Position)
        self._position = new_value

    def get_data(self):
        """Return the simplified output dict for this picture.

        The dict carries ``data_id``, ``type``, ``parent_content``, a nested
        ``content`` dict and ``index`` (0 when the object has no ``index``
        attribute), plus either ``coord`` (when the coordinate has a
        description) or ``page_num`` (the layout's page id).
        """
        data = {
            "data_id": self.data_id,
            'type': 'picture',
            "parent_content": self.layout.parent_content,
            'content': {
                'id': self.id,
                'name': self.name,
                'width': self.width,
                'height': self.height,
                'data': self.data,
                'digest': self.digest,
            },
            "index": getattr(self, "index", 0)
        }
        if self.coordinate.desc:
            data["coord"] = self.coordinate.desc
        else:
            data["page_num"] = self.layout.page_id
        return data
"""
@Author : 王定雄
@Date : 2024-10-29
@Desc : 图形对象的数据结构的定义
"""
from kotei_omp.data.base_object import BaseObject
from kotei_omp.data.layout import LayoutObject
from kotei_omp.data.position import Position
from kotei_omp.data.style import StyleObject
from kotei_omp.data.coordinate import CoordinateObject
class GraphicObject(BaseObject):
    """Graphic (shape) object: metadata, encoded data, text-box text and style data.

    State is kept in private attributes and exposed through properties whose
    setters assert the type of the assigned value.
    """

    def __init__(self):
        self._type = "graphic"
        self._id = ''  # ID
        self._name = ''  # name
        self._width = ''  # width
        self._height = ''  # height
        self._digest = ''  # hash of the image data
        self._data = ''  # base64 encoding of the image's binary data
        self._text = ''  # content of the graphic's text box
        self._graphic_type = ''  # kind of graphic, e.g. rect, line, embedded object
        self._layout = LayoutObject()  # layout object
        self._style = StyleObject()  # style
        self._coordinate = CoordinateObject()  # coordinate position
        # Second coordinate; not serialized by to_dict/from_dict.
        # NOTE(review): presumably the end/target coordinate — confirm.
        self._to_coordinate = CoordinateObject()
        self._data_id = None  # unique identifier
        self._position = Position()

    def to_dict(self):
        """Convert this GraphicObject into a plain dict.

        Nested layout, style, coordinate and position objects are converted
        through their own ``to_dict`` methods.
        """
        return {
            "type": self._type,
            "id": self._id,
            "name": self._name,
            "width": self._width,
            "height": self._height,
            "digest": self._digest,
            "data": self._data,
            "text": self._text,
            "graphic_type": self._graphic_type,
            "layout": self._layout.to_dict(),
            "style": self._style.to_dict(),
            "coordinate": self._coordinate.to_dict(),
            "data_id": self._data_id,
            "position": self._position.to_dict(),
        }

    @classmethod
    def from_dict(cls, data):
        """Create a GraphicObject from a dict shaped like the output of ``to_dict``.

        Missing keys fall back to the same defaults ``__init__`` uses.
        """
        obj = cls()
        obj._type = data.get("type", "graphic")
        obj._id = data.get("id", '')
        obj._name = data.get("name", '')
        obj._width = data.get("width", '')
        obj._height = data.get("height", '')
        obj._digest = data.get("digest", '')
        obj._data = data.get("data", '')
        obj._text = data.get("text", '')
        obj._graphic_type = data.get("graphic_type", '')
        obj._layout = LayoutObject.from_dict(data.get("layout", {}))
        obj._style = StyleObject.from_dict(data.get("style", {}))
        obj._coordinate = CoordinateObject.from_dict(data.get("coordinate", {}))
        obj._data_id = data.get("data_id", None)
        obj._position = Position.from_dict(data.get("position", {}))
        return obj

    def __repr__(self):
        return f'{self.__class__.__name__}()[{str(self)}]'

    def __str__(self):
        # The coordinate description line is only appended when one is set.
        coordinate_str = f'\nCoordinate: {self._coordinate.desc}' if self._coordinate.desc else ''
        return f'Image [{self._name}]: Width:{self._width}, Height:{self._height}. \nType:{self._type}. \nOffset: Top:{self._coordinate.top or 0},Left:{self._coordinate.left or 0}.{coordinate_str}'

    @property
    def position(self):
        """Position object of the graphic."""
        return self._position

    @position.setter
    def position(self, new_value):
        assert isinstance(new_value, Position)
        self._position = new_value

    @property
    def data_id(self):
        """Unique identifier of this object (``None`` until assigned)."""
        return self._data_id

    @data_id.setter
    def data_id(self, new_value):
        assert type(new_value) is int
        self._data_id = new_value

    @property
    def id(self):
        """Graphic ID."""
        return self._id

    @id.setter
    def id(self, new_value):
        assert type(new_value) is str
        self._id = new_value

    @property
    def name(self):
        """Graphic name."""
        return self._name

    @name.setter
    def name(self, new_value):
        assert type(new_value) is str
        self._name = new_value

    @property
    def width(self):
        """Width, stored as a string."""
        return self._width

    @width.setter
    def width(self, new_value):
        assert type(new_value) is str
        self._width = new_value

    @property
    def height(self):
        """Height, stored as a string."""
        return self._height

    @height.setter
    def height(self, new_value):
        assert type(new_value) is str
        self._height = new_value

    @property
    def digest(self):
        """Hash of the image data."""
        return self._digest

    @digest.setter
    def digest(self, new_value):
        assert type(new_value) is str
        self._digest = new_value

    @property
    def data(self):
        """Base64 encoding of the image's binary data."""
        return self._data

    @data.setter
    def data(self, new_value):
        assert type(new_value) is str
        self._data = new_value

    @property
    def text(self):
        """Content of the graphic's text box."""
        return self._text

    @text.setter
    def text(self, new_value):
        assert type(new_value) is str
        self._text = new_value

    @property
    def graphic_type(self):
        """Kind of graphic (e.g. rect, line, embedded object)."""
        return self._graphic_type

    @graphic_type.setter
    def graphic_type(self, new_value):
        assert type(new_value) is str
        self._graphic_type = new_value

    @property
    def layout(self):
        """Layout object of the graphic."""
        return self._layout

    @layout.setter
    def layout(self, new_value):
        assert isinstance(new_value, LayoutObject)
        self._layout = new_value

    @property
    def style(self):
        """Style object of the graphic."""
        return self._style

    @style.setter
    def style(self, new_value):
        assert isinstance(new_value, StyleObject)
        self._style = new_value

    @property
    def coordinate(self):
        """Coordinate object of the graphic."""
        return self._coordinate

    @coordinate.setter
    def coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        self._coordinate = new_value

    @property
    def to_coordinate(self):
        """Second coordinate object of the graphic."""
        return self._to_coordinate

    @to_coordinate.setter
    def to_coordinate(self, new_value):
        assert isinstance(new_value, CoordinateObject)
        # Write the backing field: assigning to ``self.to_coordinate`` here
        # would re-enter this setter and recurse until RecursionError.
        self._to_coordinate = new_value

    def get_data(self):
        """Return the simplified output dict for this graphic.

        The dict carries ``data_id``, ``type``, ``parent_content``, a nested
        ``content`` dict and ``index`` (0 when the object has no ``index``
        attribute), plus either ``coord`` (when the coordinate has a
        description) or ``page_num`` (the layout's page id). When the object
        has a ``text_obj`` attribute and run styles are collected for it, they
        are added under ``runs_style``.
        """
        data = {
            "data_id": self.data_id,
            'type': 'graphic',
            "parent_content": self.layout.parent_content,
            'content': {
                'id': self.id,
                'name': self.name,
                'width': self.width,
                'height': self.height,
                'data': self.data,
                'digest': self.digest,
                'text': self.text,
            },
            "index": getattr(self, "index", 0)
        }
        if self.coordinate.desc:
            data["coord"] = self.coordinate.desc
        else:
            data["page_num"] = self.layout.page_id
        if hasattr(self, 'text_obj'):
            runs_style_obj = []
            # Imported here, not at module top.
            # NOTE(review): presumably to avoid a circular import with the table module — confirm.
            from kotei_omp.data.table import get_text_obj_runs_style
            # The helper fills ``runs_style_obj`` in place.
            get_text_obj_runs_style(self.text_obj, runs_style_obj)
            if runs_style_obj:
                data['runs_style'] = runs_style_obj
        return data
# TODO: complete the class diagram for these three classes
# ┌──────────────────────────────┐
# │ TextObject                   │
# ├──────────────────────────────┤
# │ - text: str                  │ ← text content
# │ - runs: List[RunObject]      │ ← list of text runs
# │ - style: StyleObject         │ ← text style
# ├──────────────────────────────┤
# │ + to_dict(): dict            │ ← convert to a dict
# │ + from_dict(data: dict): cls │ ← create an instance from a dict
# └──────────────────────────────┘