import os
from docx import Document
from openpyxl import Workbook
from openpyxl.styles import Alignment
from openpyxl.utils import get_column_letter
_W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
_W_VAL = '{%s}val' % _W_NS


def _grid_span(tc):
    """Return how many grid columns a ``w:tc`` element occupies (1 if unmerged)."""
    spans = tc.xpath('./w:tcPr/w:gridSpan')
    return int(spans[0].get(_W_VAL, 1)) if spans else 1


def _v_merge_state(tc):
    """Return the ``w:vMerge`` value of a ``w:tc``, or None when it has none."""
    marks = tc.xpath('./w:tcPr/w:vMerge')
    if not marks:
        return None
    # In WordprocessingML an omitted w:val means "continue"; Word normally
    # writes continuation cells as a bare <w:vMerge/>.
    return marks[0].get(_W_VAL) or 'continue'


def get_merged_regions(table):
    """Collect the merged regions of a python-docx table.

    Args:
        table: python-docx table whose underlying ``w:tbl`` element is read.

    Returns:
        A list of tuples in document order (row by row, cell by cell):

        * ``('h', row_idx, col_idx, colspan)`` for every cell carrying a
          ``w:gridSpan`` element;
        * ``('v', row_idx, col_idx, rowspan)`` for every cell that starts a
          vertical merge.

        ``row_idx`` and ``col_idx`` are 0-based; ``col_idx`` is a grid column
        index, i.e. the spans of the cells to its left are counted.
    """
    # Lay every row out on the column grid: {grid column -> w:tc}.  Only direct
    # children are visited so that nested tables do not leak into the result.
    rows = []
    for tr in table._tbl.xpath('./w:tr'):
        cells_by_col = {}
        grid_col = 0
        for tc in tr.xpath('./w:tc'):
            cells_by_col[grid_col] = tc
            grid_col += _grid_span(tc)
        rows.append(cells_by_col)

    merged = []
    for row_idx, cells_by_col in enumerate(rows):
        for col_idx, tc in cells_by_col.items():
            # Horizontal merge: reported whenever the element is present.
            if tc.xpath('./w:tcPr/w:gridSpan'):
                merged.append(('h', row_idx, col_idx, _grid_span(tc)))

            # Vertical merge: only the cell that starts the merge is reported.
            state = _v_merge_state(tc)
            if state is None or state == 'continue':
                continue
            # Count the continuation cells directly below.  The XML is read
            # instead of table.cell() because python-docx resolves a
            # continuation cell to the cell it is merged into, which hides the
            # w:vMerge marker being looked for.
            rowspan = 1
            next_row = row_idx + 1
            while next_row < len(rows):
                below = rows[next_row].get(col_idx)
                if below is None or _v_merge_state(below) != 'continue':
                    break
                rowspan += 1
                next_row += 1
            merged.append(('v', row_idx, col_idx, rowspan))
    return merged
def get_table_data_range(table, start_text, end_text):
    """Find the rows of *table* that delimit a section by marker text.

    Args:
        table: Object exposing ``rows``; each row exposes ``cells`` and each
            cell exposes ``text``.
        start_text: Substring identifying the first row of the section.
        end_text: Substring identifying the row that closes the section.

    Returns:
        ``(start_row, end_row)`` as 0-based row indices.  ``start_row`` is the
        first row containing *start_text* (``None`` if there is none).
        ``end_row`` is the first row at or after ``start_row`` containing
        *end_text* (``None`` if there is none).
    """
    start_row = None
    for row_idx, row in enumerate(table.rows):
        texts = [cell.text for cell in row.cells]
        if start_row is None:
            if not any(start_text in text for text in texts):
                # An end marker above the start marker cannot close the
                # section; accepting it would give end_row < start_row.
                continue
            start_row = row_idx
        if any(end_text in text for text in texts):
            return start_row, row_idx
    return start_row, None
def convert_table_to_excel(table, ws, start_row=None, end_row=None):
    """Copy rows of a Word table into an Excel worksheet, keeping merged cells.

    Text is written starting at cell A1, centred horizontally and vertically.
    Merged regions reported by ``get_merged_regions`` are re-created in the
    sheet, and each column is widened to fit its longest text.

    Args:
        table: python-docx table to read.
        ws: openpyxl worksheet to write into.
        start_row: 0-based index of the first table row to export
            (default: the first row).
        end_row: 0-based index one past the last row to export
            (default: through the last row).

    Nothing is written when the selected row window is empty.
    """
    if start_row is None:
        start_row = 0
    if end_row is None:
        end_row = len(table.rows)

    merged_regions = get_merged_regions(table)

    # Build the text matrix.  Rows inside a vertical merge repeat the text of
    # the row that starts the merge, tracked per column in last_values.
    data_matrix = []
    last_values = {}
    for offset, row in enumerate(table.rows[start_row:end_row]):
        original_row_idx = start_row + offset
        row_data = []
        for col_idx, cell in enumerate(row.cells):
            anchor_row = None
            for merge_type, m_row, m_col, m_span in merged_regions:
                if (merge_type == 'v' and m_col == col_idx
                        and m_row <= original_row_idx < m_row + m_span):
                    anchor_row = m_row
                    break
            if anchor_row is None or anchor_row == original_row_idx:
                value = cell.text.strip()
                last_values[col_idx] = value
            else:
                value = last_values.get(col_idx, "")
            row_data.append(value)
        data_matrix.append(row_data)

    # An empty window (e.g. start_row >= end_row) leaves nothing to write.
    if not data_matrix:
        return
    row_count = len(data_matrix)

    centered = Alignment(horizontal='center', vertical='center')
    for sheet_row, row_data in enumerate(data_matrix, start=1):
        for sheet_col, value in enumerate(row_data, start=1):
            ws.cell(row=sheet_row, column=sheet_col, value=value).alignment = centered

    # Keep the regions that start inside the window, re-based to matrix rows.
    window = [
        (merge_type, row - start_row, col, span)
        for merge_type, row, col, span in merged_regions
        if 0 <= row - start_row < row_count
    ]

    # Turn the regions into rectangles (top, left, bottom, right), 0-based.
    # A cell merged in both directions becomes one rectangle, and vertical
    # merges are clipped to the rows that were actually written.
    h_spans = {(row, col): span for merge_type, row, col, span in window
               if merge_type == 'h'}
    rectangles = []
    for merge_type, row, col, span in window:
        if merge_type == 'v':
            bottom = min(row + span, row_count) - 1
            right = col + h_spans.get((row, col), 1) - 1
            rectangles.append((row, col, bottom, right))
    rectangles.extend(
        (row, col, row, col + span - 1)
        for merge_type, row, col, span in window if merge_type == 'h'
    )

    # Excel does not allow merged ranges to overlap, so a rectangle touching
    # an already merged one is skipped (typically the horizontal span of a
    # row that already belongs to a two-dimensional merge).
    applied = []
    for top, left, bottom, right in rectangles:
        if top == bottom and left == right:
            continue  # a single cell needs no merge
        overlaps = any(
            top <= a_bottom and a_top <= bottom
            and left <= a_right and a_left <= right
            for a_top, a_left, a_bottom, a_right in applied
        )
        if overlaps:
            continue
        ws.merge_cells(start_row=top + 1, start_column=left + 1,
                       end_row=bottom + 1, end_column=right + 1)
        applied.append((top, left, bottom, right))

    # Size every column to its longest text; rows may differ in length.
    col_count = max(len(row_data) for row_data in data_matrix)
    for col_idx in range(col_count):
        max_length = max(
            (len(row_data[col_idx]) for row_data in data_matrix
             if col_idx < len(row_data)),
            default=0,
        )
        adjusted_width = (max_length + 2) * 1.2
        ws.column_dimensions[get_column_letter(col_idx + 1)].width = adjusted_width
# Example usage: export the duty table of every Word file found under `path`
# into its own workbook inside the "EXCEL" directory.
path = './岗位说明书'
os.makedirs("EXCEL", exist_ok=True)
for file in os.listdir(path):
    if not file.endswith(('.docx', '.doc')):
        continue
    file_path = os.path.join(path, file)
    try:
        doc = Document(file_path)
    except Exception as e:
        print(f"无法处理文件 {file}: {e}")
        continue

    # Start from a workbook without the default sheet.
    wb = Workbook()
    wb.remove(wb.active)

    # Use the first table that contains the start marker.
    target_table = None
    start_row = None
    end_row = None
    for table in doc.tables:
        found = get_table_data_range(table, "主要职责及评价标准:", "岗位法律及规范风险点:")
        if found[0] is not None:
            target_table = table
            start_row, end_row = found
            break

    if not target_table or start_row is None:
        print(f"未找到'主要职责及评价标准'表格或'岗位法律及规范风险点'标记: {file}")
        continue

    ws = wb.create_sheet("岗位职责")
    # end_row is None when the end marker is missing; the converter then
    # exports from the start marker through the end of the table.
    convert_table_to_excel(target_table, ws, start_row, end_row)
    excel_name = os.path.splitext(file)[0] + "_转换结果.xlsx"
    wb.save(os.path.join("EXCEL", excel_name))
    print(f"已成功转换: {excel_name}")
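# TODO (outstanding requirements, not yet implemented by this script):
# - In the column "工作职责及目的(描述该岗位主要活动及要达到的结果,每一应负责任请依其重要性排列)",
#   the item numbers must be kept as part of the cell content.
# - In the "重要性" and "工作领域" columns, cells that repeat the same content
#   should be merged.
# (Trailing page text from the original paste: "最新发布")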