import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import skimage.io as io
from collections import Counter
from PIL import Image
import pandas as pd
from scipy.ndimage import gaussian_filter1d, convolve
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings
# Suppress warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 1. Load the fragment images and preprocess them
data_dir = './附件4'
path = data_dir + '/*.bmp'
coll = io.ImageCollection(path)
img_num = len(coll)
# Stack into an array and binarize (adaptive threshold)
img = np.asarray(coll)
for i in range(len(coll)):
    img[i] = cv2.adaptiveThreshold(
        src=img[i],
        maxValue=1,
        adaptiveMethod=cv2.ADAPTIVE_THRESH_MEAN_C,
        thresholdType=cv2.THRESH_BINARY,
        blockSize=13,
        C=2
    )
print("Image array shape:", img.shape)
# 2. Compute the left and right blank margins of every fragment
def calculate_margins(image):
    # Left margin: blank columns before the first column containing ink
    left = 0
    for y in range(image.shape[1]):
        if np.any(image[:, y] == 0):
            break
        left += 1
    # Right margin: blank columns after the last column containing ink
    right = 0
    for y in range(image.shape[1] - 1, -1, -1):
        if np.any(image[:, y] == 0):
            break
        right += 1
    return left, right
left_margins = []
right_margins = []
for i in range(img.shape[0]):
    left, right = calculate_margins(img[i])
    left_margins.append(left)
    right_margins.append(right)
# 3. Improved feature extraction: a fixed-length feature vector per fragment
def extract_enhanced_features(image, index):
    # Fixed-length feature vector
    feature_length = 100  # fixed size
    features = np.zeros(feature_length)
    # Margin features
    features[0] = left_margins[index]
    features[1] = right_margins[index]
    # Global statistics
    features[2] = np.mean(image)
    features[3] = np.std(image) if np.std(image) > 0 else 0.001
    features[4] = np.sum(image == 0) / image.size
    # Horizontal projection (ink per row)
    horizontal_proj = np.sum(1 - image, axis=1)
    smoothed_proj = gaussian_filter1d(horizontal_proj, sigma=1.5)
    # Detect text-line regions
    line_regions = []
    in_text = False
    start = 0
    threshold = 0.1 * np.max(smoothed_proj) if np.max(smoothed_proj) > 0 else 0.1
    for i, val in enumerate(smoothed_proj):
        if val > threshold and not in_text:
            in_text = True
            start = i
        elif val <= threshold and in_text:
            in_text = False
            line_regions.append((start, i))
    if in_text:
        line_regions.append((start, len(smoothed_proj) - 1))
    # If any text region was found, extract text-based features
    if line_regions:
        # Use the tallest region as the main text line
        main_region = max(line_regions, key=lambda x: x[1] - x[0])
        start, end = main_region
        line_img = image[start:end, :]
        # Ink per column within the main line (also used for the projection feature)
        vertical_proj = np.sum(1 - line_img, axis=0)
        # Baseline position (bottom of the letters): the row with the most ink.
        # (The original used np.argmax(vertical_proj), a column index, as a row
        # coordinate; the row-wise projection is used here instead.)
        row_proj = horizontal_proj[start:end]
        if len(row_proj) > 0:
            baseline_pos = start + int(np.argmax(row_proj))
        else:
            baseline_pos = image.shape[0] // 2
        # Band mask covering the lowercase (x-height) zone
        mask_height = int((end - start) * 0.6)  # x-height is roughly 60% of the line height
        mask_start = max(0, baseline_pos - mask_height)
        # Feature: blank/ink profile of the x-height band
        mask_region = image[mask_start:baseline_pos, :]
        if mask_region.size > 0:
            mask_feature = np.mean(mask_region, axis=0)
            if len(mask_feature) > 0:
                # Sample a fixed number of points
                step = max(1, len(mask_feature) // 10)
                mask_feature = mask_feature[::step]
                if len(mask_feature) > 10:
                    mask_feature = mask_feature[:10]
                features[5:5 + len(mask_feature)] = mask_feature
        # Feature: text edges (Laplacian-style kernel); cast to a signed dtype
        # so the negative kernel taps do not wrap around in uint8
        edge_kernel = np.array([[-1, -1, -1],
                                [-1, 8, -1],
                                [-1, -1, -1]])
        edges = convolve(image.astype(np.int16), edge_kernel, mode='reflect')
        edge_feature = np.mean(np.abs(edges), axis=0)
        if len(edge_feature) > 0:
            step = max(1, len(edge_feature) // 10)
            edge_feature = edge_feature[::step]
            if len(edge_feature) > 10:
                edge_feature = edge_feature[:10]
            features[15:15 + len(edge_feature)] = edge_feature
        # Feature: sampled projections
        proj_feature = np.concatenate([
            horizontal_proj[::max(1, len(horizontal_proj) // 10)],
            vertical_proj[::max(1, len(vertical_proj) // 10)]
        ])
        if len(proj_feature) > 0:
            if len(proj_feature) > 20:
                proj_feature = proj_feature[:20]
            features[25:25 + len(proj_feature)] = proj_feature
        # Feature: column-wise text density
        text_density = np.mean(1 - image, axis=0)
        if len(text_density) > 0:
            step = max(1, len(text_density) // 10)
            text_density = text_density[::step]
            if len(text_density) > 10:
                text_density = text_density[:10]
            features[45:45 + len(text_density)] = text_density
    # Guard against NaN/inf values
    features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
    return features
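# Layout of the 100-dim vector: [0:2] margins, [2:5] global stats,
# [5:15] x-height band profile, [15:25] edge profile, [25:45] projections,
# [45:55] column density; the remaining slots stay zero.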
# Extract the enhanced features of every fragment
features_list = []
for i in range(img.shape[0]):
    features = extract_enhanced_features(img[i], i)
    features_list.append(features)
# 4. Cluster into 16 classes (with iterative re-clustering of oversized clusters)
def iterative_clustering(features, initial_clusters=16, max_size=19, max_iter=5):
    # Initial clustering; every fragment is considered valid
    valid_indices = [i for i in range(len(features))]
    valid_features = features
    if len(valid_features) == 0:
        return np.full(len(features), -1), initial_clusters
    # Standardize the features
    scaler = StandardScaler()
    X = np.array(valid_features)
    X_scaled = scaler.fit_transform(X)
    # Initial k-means
    kmeans = KMeans(n_clusters=initial_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)
    # Build the full label array
    full_labels = np.full(len(features), -1)
    for idx, label in zip(valid_indices, cluster_labels):
        full_labels[idx] = label
    # Iterative re-clustering
    for iteration in range(max_iter):
        print(f"Re-clustering pass {iteration + 1}")
        oversized_clusters = []
        # Look for oversized clusters
        for cluster_id in range(initial_clusters):
            cluster_indices = np.where(full_labels == cluster_id)[0]
            if len(cluster_indices) > max_size:
                print(f"Cluster {cluster_id} is too large ({len(cluster_indices)} fragments); it will be split")
                oversized_clusters.append(cluster_id)
        if not oversized_clusters:
            print("All clusters are within the size limit; stopping")
            break
        # Split each oversized cluster
        for cluster_id in oversized_clusters:
            cluster_indices = np.where(full_labels == cluster_id)[0]
            # Collect that cluster's features
            cluster_features = []
            for idx in cluster_indices:
                cluster_features.append(features[idx])
            if len(cluster_features) < 2:
                continue
            # Sub-cluster into 2 classes
            sub_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
            sub_labels = sub_kmeans.fit_predict(scaler.transform(np.array(cluster_features)))
            # Assign the new labels
            new_label1 = cluster_id  # keep the original label
            new_label2 = initial_clusters  # brand-new label
            initial_clusters += 1  # one more cluster overall
            for i, idx in enumerate(cluster_indices):
                if sub_labels[i] == 0:
                    full_labels[idx] = new_label1
                else:
                    full_labels[idx] = new_label2
        print(f"Total clusters after this pass: {initial_clusters}")
    return full_labels, initial_clusters
# Run the iterative clustering
full_cluster_labels, final_cluster_count = iterative_clustering(features_list, initial_clusters=16)
print(f"Final number of clusters: {final_cluster_count}")
# 5. Visualize the clustering result (one image per cluster)
os.makedirs('clusters', exist_ok=True)
# One image per cluster
for cluster_id in range(final_cluster_count):
    cluster_indices = np.where(full_cluster_labels == cluster_id)[0]
    if len(cluster_indices) == 0:
        continue
    print(f"Cluster {cluster_id} has {len(cluster_indices)} fragments: {cluster_indices.tolist()}")
    # Grid size
    cols = min(5, len(cluster_indices))  # at most 5 fragments per row
    rows = (len(cluster_indices) + cols - 1) // cols
    # Blank canvas
    cluster_img = np.zeros((rows * 100 + 50, cols * 100 + 20), dtype=np.uint8) + 255
    # Title (ASCII only: cv2.putText cannot render CJK glyphs)
    title = f"Cluster {cluster_id} (fragments: {len(cluster_indices)})"
    cv2.putText(cluster_img, title, (10, 30), cv2.FONT_HERSHEY_SIMPLEX,
                0.7, 0, 2, cv2.LINE_AA)
    # Lay out the fragments
    for i, idx in enumerate(cluster_indices):
        row = i // cols
        col = i % cols
        # Resize the fragment to the cell size
        frag = img[idx]
        resized = cv2.resize(frag, (100, 100))
        resized = resized * 255
        # Draw the fragment index
        cv2.putText(resized, str(idx), (5, 15), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, 0, 1, cv2.LINE_AA)
        # Paste into the canvas
        y_start = 50 + row * 100
        x_start = 10 + col * 100
        cluster_img[y_start:y_start + 100, x_start:x_start + 100] = resized
    # Save
    cv2.imwrite(f'clusters/cluster_{cluster_id}.png', cluster_img)
# 6. Build an overview image of all clusters
overview_img = np.zeros((1000, 1600), dtype=np.uint8) + 255
cv2.putText(overview_img, "Clusters overview", (50, 50), cv2.FONT_HERSHEY_SIMPLEX,
            1.2, 0, 2, cv2.LINE_AA)
# Arrange every cluster image in a grid
cols = 4
rows = (final_cluster_count + cols - 1) // cols
for cluster_id in range(final_cluster_count):
    row = cluster_id // cols
    col = cluster_id % cols
    # Try to load the per-cluster image
    img_path = f'clusters/cluster_{cluster_id}.png'
    if os.path.exists(img_path):
        cluster_img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if cluster_img is not None:
            # Resize to fit the grid cell
            cluster_img = cv2.resize(cluster_img, (380, 150))
            # Paste into the overview
            y_start = 100 + row * 180
            x_start = 50 + col * 390
            if y_start + 150 < overview_img.shape[0] and x_start + 380 < overview_img.shape[1]:
                overview_img[y_start:y_start + 150, x_start:x_start + 380] = cluster_img
    else:
        print(f"Warning: image for cluster {cluster_id} does not exist")
# Save the overview
cv2.imwrite('clusters_overview.png', overview_img)
print("Saved the clusters overview to clusters_overview.png")
# Show the overview
plt.figure(figsize=(16, 10))
plt.imshow(overview_img, cmap='gray')
plt.title(f'Overview of {final_cluster_count} clusters')
plt.axis('off')
plt.show()
# 7. Manual step: merge the clusters into 11 text rows
print(f"\nUsing the images in the clusters directory, merge the {final_cluster_count} clusters into 11 rows")
print("Input format: a comma-separated list of cluster IDs per row (e.g. 1,3,5)")
# Initialize the row assignments
row_assignments = [[] for _ in range(11)]
# Read the user's input
for row_id in range(11):
    while True:
        user_input = input(f"Enter the cluster IDs for row {row_id} (comma-separated): ")
        try:
            cluster_ids = list(map(int, user_input.split(',')))
            # Validate the input
            valid = True
            for cid in cluster_ids:
                if cid < 0 or cid >= final_cluster_count:
                    print(f"Error: cluster ID {cid} is invalid (must be between 0 and {final_cluster_count - 1})")
                    valid = False
                    break
            if valid:
                # Collect all fragments of this row
                for cid in cluster_ids:
                    indices = np.where(full_cluster_labels == cid)[0]
                    row_assignments[row_id].extend(indices.tolist())
                break
        except ValueError:
            print("Malformed input, please try again")
# Show the row assignments
print("\nRow assignments:")
for row_id, fragments in enumerate(row_assignments):
    print(f"Row {row_id}: {len(fragments)} fragments - {fragments}")
# 8. Intra-row ordering (edge matching plus text continuity)
def sort_fragments_in_row(row_indices):
    if len(row_indices) < 2:
        return row_indices
    # Pairwise edge-match score matrix
    match_scores = np.zeros((len(row_indices), len(row_indices)))
    for i, idx_i in enumerate(row_indices):
        img_i = img[idx_i]
        right_edge = img_i[:, -20:]  # rightmost 20 columns
        for j, idx_j in enumerate(row_indices):
            if i == j:
                match_scores[i, j] = -np.inf  # no self-matching
                continue
            img_j = img[idx_j]
            left_edge = img_j[:, :20]  # leftmost 20 columns
            # Agreement score (same position is ink in both or blank in both)
            match_score = np.sum(right_edge == left_edge)
            # Weight ink-on-ink agreement more heavily
            text_match = np.sum((right_edge == 0) & (left_edge == 0))
            match_score += text_match * 2
            match_scores[i, j] = match_score
    # Build the order greedily
    sorted_indices = [0]  # start from the first fragment
    used = set([0])
    while len(used) < len(row_indices):
        current_idx = sorted_indices[-1]
        best_match = -1
        best_score = -np.inf
        # Find the best continuation
        for j in range(len(row_indices)):
            if j not in used and match_scores[current_idx, j] > best_score:
                best_score = match_scores[current_idx, j]
                best_match = j
        if best_match == -1:
            # No match found; fall back to any unused fragment
            unused = [j for j in range(len(row_indices)) if j not in used]
            best_match = unused[0]
        sorted_indices.append(best_match)
        used.add(best_match)
    # Map back to the original fragment indices
    return [row_indices[i] for i in sorted_indices]
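# Note that the greedy chain above is seeded at position 0, so the true leftmost
# fragment is not guaranteed to come first. A plausible seed (an assumption, not
# in the original) is the fragment with the widest left margin:
# seed = max(range(len(row_indices)), key=lambda k: left_margins[row_indices[k]])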
# Sort each row and visualize the result
sorted_rows = []
plt.figure(figsize=(15, 20))
for row_id, row_indices in enumerate(row_assignments):
    if not row_indices:
        sorted_rows.append([])
        continue
    # Sort
    sorted_frags = sort_fragments_in_row(row_indices)
    sorted_rows.append(sorted_frags)
    # Build a preview strip for the row
    row_img = np.zeros((100, 100 * len(sorted_frags)), dtype=np.uint8)
    for col_idx, frag_idx in enumerate(sorted_frags):
        frag_img = img[frag_idx]
        resized = cv2.resize(frag_img, (100, 100))
        # Black text on white, matching the per-cluster previews
        # (the original inverted this with (1 - resized) * 255)
        resized = resized * 255
        # Draw the fragment index
        cv2.putText(resized, str(frag_idx), (5, 15), cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, 0, 1, cv2.LINE_AA)
        row_img[:, col_idx * 100:(col_idx + 1) * 100] = resized
    # Show this row's ordering
    plt.subplot(11, 1, row_id + 1)
    plt.imshow(row_img, cmap='gray')
    plt.title(f'Row {row_id} ordering ({len(sorted_frags)} fragments)')
    plt.axis('off')
plt.tight_layout()
plt.suptitle('Intra-row ordering results', fontsize=16)
plt.subplots_adjust(top=0.95)
plt.savefig('row_sorting.png')
plt.show()
# Manual adjustment of the intra-row order
print("Review the intra-row ordering and adjust it if necessary")
print("Input format: a comma-separated list of fragment IDs (e.g. 15,23,42)")
adjusted_rows = []
for row_id in range(11):
    while True:
        user_input = input(f"Enter the fragment order for row {row_id} (press Enter to keep it): ")
        if user_input.strip() == "":
            adjusted_rows.append(sorted_rows[row_id])
            break
        else:
            try:
                frag_ids = list(map(int, user_input.split(',')))
                # Validate the input
                valid = True
                for fid in frag_ids:
                    if fid not in row_assignments[row_id]:
                        print(f"Error: fragment {fid} does not belong to row {row_id}")
                        valid = False
                        break
                if valid:
                    adjusted_rows.append(frag_ids)
                    break
                else:
                    print("Make sure every fragment ID belongs to this row")
            except ValueError:
                print("Malformed input, please try again")
# 9. Inter-row ordering (matching the top/bottom text bounds)
def sort_rows(rows):
    if len(rows) < 2:
        return rows
    # Top and bottom text bounds of every row
    top_bounds = []
    bottom_bounds = []
    for row in rows:
        if not row:
            top_bounds.append(0)
            bottom_bounds.append(0)
            continue
        top = img.shape[1]  # initialize to the fragment height
        bottom = 0
        for frag_idx in row:
            frag = img[frag_idx]
            # Locate the text band inside the fragment
            horizontal_proj = np.sum(1 - frag, axis=1)
            text_indices = np.where(horizontal_proj > 0)[0]
            if len(text_indices) > 0:
                frag_top = text_indices.min()
                frag_bottom = text_indices.max()
                if frag_top < top:
                    top = frag_top
                if frag_bottom > bottom:
                    bottom = frag_bottom
        top_bounds.append(top)
        bottom_bounds.append(bottom)
    # Pairwise inter-row match scores
    match_scores = np.zeros((len(rows), len(rows)))
    for i in range(len(rows)):
        for j in range(len(rows)):
            if i == j:
                match_scores[i, j] = -np.inf
                continue
            # Spacing: distance between the lower row's top and the upper row's bottom
            gap = abs(top_bounds[j] - bottom_bounds[i])
            # Overlap of the two text bands (in fragment-local coordinates)
            overlap = bottom_bounds[i] - top_bounds[j]
            # Score: small spacing scores high, large overlap scores low
            if overlap > 0:
                match_score = 1.0 / (1 + gap)
            else:
                match_score = 1.0 - min(1.0, abs(overlap) / img.shape[1])
            match_scores[i, j] = match_score
    # Build the row order greedily
    sorted_indices = [0]  # start from the first row
    used = set([0])
    while len(used) < len(rows):
        current_idx = sorted_indices[-1]
        best_match = -1
        best_score = -np.inf
        # Find the best continuation
        for j in range(len(rows)):
            if j not in used and match_scores[current_idx, j] > best_score:
                best_score = match_scores[current_idx, j]
                best_match = j
        if best_match == -1:
            # No match found; fall back to any unused row
            unused = [j for j in range(len(rows)) if j not in used]
            best_match = unused[0]
        sorted_indices.append(best_match)
        used.add(best_match)
    return [rows[i] for i in sorted_indices]
# Sort the rows
sorted_row_order = sort_rows(adjusted_rows)
# Show the inter-row ordering
print("Inter-row ordering:")
for i, row in enumerate(sorted_row_order):
    print(f"Position {i}: fragments {row}")
# Manual adjustment of the row order
print("Review the inter-row ordering and adjust it if necessary")
print("Input format: row IDs in the new order, comma-separated (e.g. 0,2,1,3,4,5,6,7,8,9,10)")
while True:
    user_input = input("Enter the row order (press Enter to keep it): ")
    if user_input.strip() == "":
        final_row_order = sorted_row_order
        break
    else:
        try:
            new_order = list(map(int, user_input.split(',')))
            if len(new_order) != 11:
                print("Error: the order must contain all 11 rows")
                continue
            if set(new_order) != set(range(11)):
                print("Error: the row IDs must cover every value from 0 to 10")
                continue
            final_row_order = [adjusted_rows[i] for i in new_order]
            break
        except ValueError:
            print("Malformed input, please try again")
# 10. Final stitching and output
full_image = None
for row in final_row_order:
    if not row:
        continue
    row_img = img[row[0]]
    for frag_id in row[1:]:
        row_img = np.hstack((row_img, img[frag_id]))
    if full_image is None:
        full_image = row_img
    else:
        # Blank separator between rows (all 1s = background)
        separator = np.ones((10, row_img.shape[1]), dtype=row_img.dtype)
        full_image = np.vstack((full_image, separator))
        full_image = np.vstack((full_image, row_img))
# Save the result
if full_image is not None:
    # Map background (1) to white and ink (0) to black; the original used
    # (1 - full_image) * 255, which inverted the page and turned the blank
    # separators black
    full_image = full_image * 255
    full_image = full_image.astype(np.uint8)
    final_img = Image.fromarray(full_image)
    final_img.save('result4.png')
    print("Final stitched image saved as 'result4.png'")
    # Save the fragment-order table
    result_table = np.full((11, 19), -1)
    for row_idx, row in enumerate(final_row_order):
        for col_idx, frag_id in enumerate(row[:19]):
            result_table[row_idx, col_idx] = frag_id
    pd.DataFrame(result_table).to_csv('result4.csv', index=False, header=False)
    print("Fragment-order table saved as 'result4.csv'")
else:
    print("Error: nothing to stitch")
# 11. Save a log of the manual adjustments
with open('adjustment_log.txt', 'w', encoding='utf-8') as f:
    f.write("Row assignments:\n")
    for row_id, frag_ids in enumerate(adjusted_rows):
        f.write(f"Row {row_id}: {frag_ids}\n")
    f.write("\nRow order:\n")
    f.write(f"Final row order (fragments per row): {final_row_order}\n")
print("人工调控记录已保存为 'adjustment_log.txt'")行内排序算法需要优化,可以聚少一点的类,然后进行行内排序,对于聚类较多的类,两个边缘都有碎片可以高相似度匹配的留下,其他碎片进入备选
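# A minimal sketch of that idea, assuming the binarized stack `img` and the
# 20-column edge width used above. `edge_similarity` and `filter_by_edge_support`
# are hypothetical helpers, not part of the original pipeline: they keep the
# fragments whose left AND right edges each find a neighbor above a similarity
# threshold, and return everything else as a candidate pool.
def edge_similarity(a, b, width=20):
    # Fraction of pixels on which a's right edge agrees with b's left edge
    return np.mean(a[:, -width:] == b[:, :width])

def filter_by_edge_support(indices, threshold=0.9):
    kept, pool = [], []
    for i in indices:
        # Does some fragment plausibly continue i on the right?
        right_ok = any(edge_similarity(img[i], img[j]) >= threshold
                       for j in indices if j != i)
        # Does some fragment plausibly precede i on the left?
        left_ok = any(edge_similarity(img[j], img[i]) >= threshold
                      for j in indices if j != i)
        (kept if left_ok and right_ok else pool).append(i)
    return kept, pool

# Example usage: order the well-supported fragments first, then splice the
# candidate pool back in by hand:
# kept, pool = filter_by_edge_support(row_assignments[0])
# ordered = sort_fragments_in_row(kept)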