Binary Gap(二进制空白)

本文深入探讨了二进制间隙算法,详细介绍了如何找出正整数二进制表示中最长的连续零序列,该序列的两端均被 1 包围。通过实例说明了算法的实现过程,并提供了一种创新的字符串处理方法来简化计算。

中文标题【二进制空白】

英文描述

A binary gap within a positive integer N is any maximal sequence of consecutive zeros that is surrounded by ones at both ends in the binary representation of N.

For example, number 9 has binary representation 1001 and contains a binary gap of length 2. The number 529 has binary representation 1000010001 and contains two binary gaps: one of length 4 and one of length 3. The number 20 has binary representation 10100 and contains one binary gap of length 1. The number 15 has binary representation 1111 and has no binary gaps. The number 32 has binary representation 100000 and has no binary gaps.

Write a function:

class Solution { public int solution(int N); }

that, given a positive integer N, returns the length of its longest binary gap. The function should return 0 if N doesn't contain a binary gap.

For example, given N = 1041 the function should return 5, because N has binary representation 10000010001 and so its longest binary gap is of length 5. Given N = 32 the function should return 0, because N has binary representation '100000' and thus no binary gaps.

Write an efficient algorithm for the following assumptions:

N is an integer within the range [1..2,147,483,647].

中文描述

这里我不按照原文一字一字的翻译,但是尽量按照题目的要求把题目解释清楚。

这里题目的要求是,将整数类型的数据 N 转换为 2 进制的字符串,然后在这个字符串中找出两端都被 1 包围的连续 0,并返回其中最大的长度。

例如 529 转换为 2 进制的字符串为:1000010001,在这里,将会存在以 1 为分割的字符串  0000 和 000,这 2 个字符串的长度分别为 4 和 3。

我们的算法需要返回的值为 4。

思路和点评

这个题目的思路其实比较简单,你需要首先将 N 这个整数,转换为 0 和 1 的字符串。然后在转换成功的字符串中返回以 1 分割的 0 的长度。

这里可能需要考虑下面的几种情况。

| 情况 | 结果 |
| --- | --- |
| 11 | 这个情况应该返回的长度为 0 |
| 10 | 这个情况因为没有被 1 这个字符串封闭,因此应该返回长度为 0 |

传统的思路应该是采取字符串分割的方式,进行遍历后获得结果。

我们在这里采取一种相对不是非常常规的方式,例如在 10000010001 字符串中插入 #,将字符串变为 #1#00000#1#000#1#。

然后将字符串按照 1 进行分割,那么分割后的数组应该分别存储的数据为:#,#00000#,#000#,#

这里我们只需要找到 #...# 中长度最大的连续 0 字符串就可以了。基本上可以使用 1 个字符串替换函数和一个字符串分割函数就可以了,并不需要多次存储和遍历。

源代码

源代码和有关代码的更新请访问 GitHub:

https://github.com/cwiki-us/java-tutorial/blob/master/src/test/java/com/ossez/lang/tutorial/tests/codility/CodilityBinaryGapTest.java

代码思路请参考:



package com.ossez.lang.tutorial.tests.codility;

import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * <p>
 * More details about question see link below
 * <ul>
 * <li>@see <a href= "https://www.cwiki.us/display/ITCLASSIFICATION/Binary+Gap">https://www.cwiki.us/display/ITCLASSIFICATION/Binary+Gap</a>
 * </li>
 * </ul>
 * </p>
 * 
 * @author YuCheng
 *
 */
public class CodilityBinaryGapTest {

	private final static Logger logger = LoggerFactory.getLogger(CodilityBinaryGapTest.class);

	/**
	 * Returns the length of the longest binary gap of {@code n}: the longest run
	 * of zeros that has a one on both sides in the binary representation.
	 * <p>
	 * Every {@code 1} is first wrapped in {@code #} markers and the string is then
	 * split on {@code 1}. A piece that both starts and ends with {@code #} lies
	 * between two ones, so it is a closed gap; a trailing run of zeros (as in
	 * {@code 100000}) is missing the closing marker and is ignored.
	 * </p>
	 *
	 * @param n the number to inspect; the problem statement restricts it to
	 *          {@code [1..2,147,483,647]}
	 * @return the length of the longest binary gap, or {@code 0} when there is none
	 */
	static int solution(int n) {
		// e.g. 10000010001 -> #1#00000#1#000#1#
		String marked = Integer.toBinaryString(n).replace("1", "#1#");

		int maxCount = 0;
		for (String piece : marked.split("1")) {
			// "#" and "##" carry no zeros, hence the length > 2 requirement.
			if (piece.length() > 2 && piece.startsWith("#") && piece.endsWith("#")) {
				// A closed piece is exactly "#" + zeros + "#".
				maxCount = Math.max(maxCount, piece.length() - 2);
			}
		}
		return maxCount;
	}

	/**
	 * Logs the longest binary gap of 529 and checks {@link #solution(int)} against
	 * the examples given in the problem statement.
	 */
	@Test
	public void testMain() {
		logger.debug("BEGIN");

		int N = 529;
		int maxCount = solution(N);

		logger.debug("MAX COUNT: [{}]", maxCount);

		// {input, expected longest gap} pairs taken from the problem statement.
		int[][] examples = { { 9, 2 }, { 529, 4 }, { 20, 1 }, { 15, 0 }, { 32, 0 }, { 1041, 5 } };
		for (int[] example : examples) {
			int actual = solution(example[0]);
			if (actual != example[1]) {
				throw new AssertionError(
						"solution(" + example[0] + ") returned " + actual + ", expected " + example[1]);
			}
		}
	}

}



https://www.cwiki.us/display/ITCLASSIFICATION/Binary+Gap

import cv2
import os
import numpy as np
import matplotlib.pyplot as plt
import skimage.io as io
from collections import Counter
from PIL import Image
import pandas as pd
from scipy.ndimage import gaussian_filter1d, convolve
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import warnings

# Silence library warnings.
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 1. Load the fragment images and preprocess them.
data_dir = './附件4'
path = data_dir + '/*.bmp'
coll = io.ImageCollection(path)
img_num = len(coll)

# Stack into a single array and binarize in place (pixel values become 0 or 1).
img = np.asarray(coll)
for i in range(len(coll)):
    img[i] = cv2.adaptiveThreshold(
        src=img[i],
        maxValue=1,
        adaptiveMethod=cv2.ADAPTIVE_THRESH_MEAN_C,
        thresholdType=cv2.THRESH_BINARY,
        blockSize=13,
        C=2
    )
print("图像数据形状:", img.shape)


# 2. Compute the left/right margin of every fragment.
def calculate_margins(image):
    """Return ``(left, right)``: the number of leading and trailing columns of
    ``image`` that contain no zero-valued pixel."""
    # Left margin: scan columns from the left until one contains a 0 pixel.
    left = 0
    for y in range(image.shape[1]):
        if np.any(image[:, y] == 0):
            break
        left += 1

    # Right margin: same scan, starting from the right-most column.
    right = 0
    for y in range(image.shape[1] - 1, -1, -1):
        if np.any(image[:, y] == 0):
            break
        right += 1

    return left, right


left_margins = []
right_margins = []
for i in range(img.shape[0]):
    left, right = calculate_margins(img[i])
    left_margins.append(left)
    right_margins.append(right)


# 3. Feature extraction with a fixed-length feature vector.
def extract_enhanced_features(image, index):
    """Build a 100-element feature vector for one binarized fragment.

    Slots: 0-1 margins (read from the module-level ``left_margins`` /
    ``right_margins`` at ``index``), 2-4 global statistics, 5-14 mask-band
    means, 15-24 edge response, 25-44 projections, 45-54 text density.
    Unused slots stay zero.
    """
    feature_length = 100
    features = np.zeros(feature_length)

    # Margin features.
    features[0] = left_margins[index]
    features[1] = right_margins[index]

    # Global statistics.
    features[2] = np.mean(image)
    features[3] = np.std(image) if np.std(image) > 0 else 0.001
    features[4] = np.sum(image == 0) / image.size

    # Horizontal projection (count of 0-valued pixels per row), smoothed.
    horizontal_proj = np.sum(1 - image, axis=1)
    smoothed_proj = gaussian_filter1d(horizontal_proj, sigma=1.5)

    # Detect row ranges whose smoothed projection exceeds the threshold.
    line_regions = []
    in_text = False
    start = 0
    threshold = 0.1 * np.max(smoothed_proj) if np.max(smoothed_proj) > 0 else 0.1
    for i, val in enumerate(smoothed_proj):
        if val > threshold and not in_text:
            in_text = True
            start = i
        elif val <= threshold and in_text:
            in_text = False
            line_regions.append((start, i))
    if in_text:
        line_regions.append((start, len(smoothed_proj) - 1))

    # NOTE(review): the original indentation was lost. All text-derived
    # features are kept under this branch so that ``vertical_proj`` is always
    # bound before it is used — confirm against the original layout.
    if line_regions:
        # Use the tallest region as the main text line.
        main_region = max(line_regions, key=lambda x: x[1] - x[0])
        start, end = main_region
        line_img = image[start:end, :]

        # Baseline position.
        # NOTE(review): vertical_proj sums over rows (axis=0), so argmax yields
        # a column index, yet it is used below as a row bound — confirm the
        # intended axis.
        vertical_proj = np.sum(1 - line_img, axis=0)
        if len(vertical_proj) > 0:
            baseline_pos = np.argmax(vertical_proj)
        else:
            baseline_pos = image.shape[0] // 2

        # Mask band: 60% of the detected line height above the baseline.
        mask_height = int((end - start) * 0.6)
        mask_start = max(0, baseline_pos - mask_height)

        # Feature: column means inside the mask band, sampled to <= 10 values.
        mask_region = image[mask_start:baseline_pos, :]
        if mask_region.size > 0:
            mask_feature = np.mean(mask_region, axis=0)
            if len(mask_feature) > 0:
                step = max(1, len(mask_feature) // 10)
                mask_feature = mask_feature[::step]
                if len(mask_feature) > 10:
                    mask_feature = mask_feature[:10]
                features[5:5 + len(mask_feature)] = mask_feature

        # Feature: edge response.
        edge_kernel = np.array([[-1, -1, -1],
                                [-1, 8, -1],
                                [-1, -1, -1]])
        # Convolve in floating point: ndimage.convolve returns the input dtype
        # by default, so negative responses on the uint8 image would wrap.
        edges = convolve(image.astype(np.float64), edge_kernel, mode='reflect')
        edge_feature = np.mean(np.abs(edges), axis=0)
        if len(edge_feature) > 0:
            step = max(1, len(edge_feature) // 10)
            edge_feature = edge_feature[::step]
            if len(edge_feature) > 10:
                edge_feature = edge_feature[:10]
            features[15:15 + len(edge_feature)] = edge_feature

        # Feature: sampled horizontal and vertical projections (<= 20 values).
        proj_feature = np.concatenate([
            horizontal_proj[::max(1, len(horizontal_proj) // 10)],
            vertical_proj[::max(1, len(vertical_proj) // 10)]
        ])
        if len(proj_feature) > 0:
            if len(proj_feature) > 20:
                proj_feature = proj_feature[:20]
            features[25:25 + len(proj_feature)] = proj_feature

        # Feature: per-column text density, sampled to <= 10 values.
        text_density = np.mean(1 - image, axis=0)
        if len(text_density) > 0:
            step = max(1, len(text_density) // 10)
            text_density = text_density[::step]
            if len(text_density) > 10:
                text_density = text_density[:10]
            features[45:45 + len(text_density)] = text_density

    # Make sure no NaN/inf value reaches the clustering step.
    features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)
    return features


# Extract the features of every fragment.
features_list = []
for i in range(img.shape[0]):
    features = extract_enhanced_features(img[i], i)
    features_list.append(features)


# 4. Cluster into 16 classes, re-splitting oversized clusters.
def iterative_clustering(features, initial_clusters=16, max_size=19, max_iter=5):
    """Cluster ``features`` with KMeans and split clusters larger than ``max_size``.

    Each of at most ``max_iter`` rounds splits every oversized cluster in two;
    one half keeps the old label and the other receives a new one.

    Returns ``(labels, cluster_count)`` where ``labels`` holds one label per
    feature vector and ``cluster_count`` is the number of labels handed out.
    """
    valid_indices = [i for i in range(len(features))]  # every fragment is used
    valid_features = features

    if len(valid_features) == 0:
        return np.full(len(features), -1), initial_clusters

    # Standardize the features.
    scaler = StandardScaler()
    X = np.array(valid_features)
    X_scaled = scaler.fit_transform(X)

    # Initial clustering.
    kmeans = KMeans(n_clusters=initial_clusters, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)

    # Full label array, one entry per fragment.
    full_labels = np.full(len(features), -1)
    for idx, label in zip(valid_indices, cluster_labels):
        full_labels[idx] = label

    # Repeated splitting of oversized clusters.
    for iteration in range(max_iter):
        print(f"迭代聚类: 第 {iteration + 1} 轮")
        oversized_clusters = []

        # Find clusters that exceed max_size.
        for cluster_id in range(initial_clusters):
            cluster_indices = np.where(full_labels == cluster_id)[0]
            if len(cluster_indices) > max_size:
                print(f"聚类 {cluster_id} 过大 ({len(cluster_indices)} 个碎片),将重新聚类")
                oversized_clusters.append(cluster_id)

        if not oversized_clusters:
            print("所有聚类大小合适,停止迭代")
            break

        # Split every oversized cluster.
        for cluster_id in oversized_clusters:
            cluster_indices = np.where(full_labels == cluster_id)[0]

            # Collect the features of this cluster.
            cluster_features = []
            for idx in cluster_indices:
                cluster_features.append(features[idx])

            if len(cluster_features) < 2:
                continue

            # Sub-cluster into two groups, reusing the already fitted scaler.
            sub_kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
            sub_labels = sub_kmeans.fit_predict(scaler.transform(cluster_features))

            # Assign the labels: group 0 keeps the old label, group 1 gets a new one.
            new_label1 = cluster_id
            new_label2 = initial_clusters
            initial_clusters += 1  # one more cluster in total

            for i, idx in enumerate(cluster_indices):
                if sub_labels[i] == 0:
                    full_labels[idx] = new_label1
                else:
                    full_labels[idx] = new_label2

        print(f"本轮迭代后总聚类数: {initial_clusters}")

    return full_labels, initial_clusters


# Run the iterative clustering.
full_cluster_labels, final_cluster_count = iterative_clustering(features_list, initial_clusters=16)
print(f"最终聚类数: {final_cluster_count}")

# 5. Show the clustering result (one image per cluster).
os.makedirs('clusters', exist_ok=True)

for cluster_id in range(final_cluster_count):
    cluster_indices = np.where(full_cluster_labels == cluster_id)[0]
    if len(cluster_indices) == 0:
        continue

    print(f"聚类 {cluster_id} 包含 {len(cluster_indices)} 个碎片: {cluster_indices.tolist()}")

    # Grid size: at most 5 fragments per row.
    cols = min(5, len(cluster_indices))
    rows = (len(cluster_indices) + cols - 1) // cols

    # Blank white canvas.
    cluster_img = np.zeros((rows * 100 + 50, cols * 100 + 20), dtype=np.uint8) + 255

    # Title. Kept ASCII-only: cv2.putText replaces symbols the Hershey fonts
    # cannot render with question marks, so CJK text would be unreadable.
    title = f"Cluster {cluster_id} (n={len(cluster_indices)})"
    cv2.putText(cluster_img, title, (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.7, 0, 2, cv2.LINE_AA)

    # Lay out the fragments.
    for i, idx in enumerate(cluster_indices):
        row = i // cols
        col = i % cols

        # Resize the fragment and scale 0/1 to 0/255.
        frag = img[idx]
        resized = cv2.resize(frag, (100, 100))
        resized = resized * 255

        # Stamp the fragment index.
        cv2.putText(resized, str(idx), (5, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1, cv2.LINE_AA)

        # Paste onto the canvas.
        y_start = 50 + row * 100
        x_start = 10 + col * 100
        cluster_img[y_start:y_start + 100, x_start:x_start + 100] = resized

    cv2.imwrite(f'clusters/cluster_{cluster_id}.png', cluster_img)

# 6. Build the cluster overview image.
cols = 4
rows = (final_cluster_count + cols - 1) // cols
# Grow the canvas when there are more than five grid rows so that clusters
# created by the re-splitting step are not silently dropped.
overview_height = max(1000, 100 + rows * 180)
overview_img = np.zeros((overview_height, 1600), dtype=np.uint8) + 255
# ASCII-only for the same cv2.putText reason as the per-cluster titles.
cv2.putText(overview_img, "Cluster overview", (50, 50),
            cv2.FONT_HERSHEY_SIMPLEX, 1.2, 0, 2, cv2.LINE_AA)

for cluster_id in range(final_cluster_count):
    row = cluster_id // cols
    col = cluster_id % cols

    # Try to read the per-cluster image written above.
    img_path = f'clusters/cluster_{cluster_id}.png'
    if os.path.exists(img_path):
        cluster_img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if cluster_img is not None:
            # Resize to the grid cell.
            cluster_img = cv2.resize(cluster_img, (380, 150))

            y_start = 100 + row * 180
            x_start = 50 + col * 390
            # Inclusive bounds: a tile ending exactly on the canvas edge fits.
            # The strict '<' used before skipped the whole 4th column
            # (x_start + 380 == 1600).
            if (y_start + 150 <= overview_img.shape[0]
                    and x_start + 380 <= overview_img.shape[1]):
                overview_img[y_start:y_start + 150, x_start:x_start + 380] = cluster_img
    else:
        print(f"警告: 聚类 {cluster_id} 图片不存在")

# Save the overview.
cv2.imwrite('clusters_overview.png', overview_img)
print("已保存聚类概览图到 clusters_overview.png")

# Show the overview.
plt.figure(figsize=(16, 10))
plt.imshow(overview_img, cmap='gray')
plt.title(f'{final_cluster_count}类聚类概览')
plt.axis('off')
plt.show()

# 7. Manual step: merge the clusters into 11 rows.
print(f"\n请根据 clusters 目录中的聚类图片,将{final_cluster_count}类合并为11行")
print("输入格式: 目标行ID:原聚类ID1,原聚类ID2,... (例如: 0:1,3,5)")

row_assignments = [[] for _ in range(11)]

for row_id in range(11):
    while True:
        user_input = input(f"输入行 {row_id} 对应的聚类ID (逗号分隔): ")
        try:
            cluster_ids = list(map(int, user_input.split(',')))
            # Validate the ids.
            valid = True
            for cid in cluster_ids:
                if cid < 0 or cid >= final_cluster_count:
                    print(f"错误: 聚类ID {cid} 无效 (应在0-{final_cluster_count - 1}之间)")
                    valid = False
                    break

            if valid:
                # Collect every fragment of the chosen clusters.
                for cid in cluster_ids:
                    indices = np.where(full_cluster_labels == cid)[0]
                    row_assignments[row_id].extend(indices.tolist())
                break
        except ValueError:
            print("输入格式错误,请重新输入")

# Show the row assignment.
print("\n行分配结果:")
for row_id, fragments in enumerate(row_assignments):
    print(f"行 {row_id}: 包含 {len(fragments)} 个碎片 - {fragments}")


# 8. Ordering inside a row (edge matching).
def sort_fragments_in_row(row_indices):
    """Greedily order the fragments of one row by edge similarity.

    Score ``[i, j]`` compares the right-most 20 columns of fragment ``i`` with
    the left-most 20 columns of fragment ``j``. The chain always starts at the
    first entry of ``row_indices`` and repeatedly appends the best-scoring
    unused fragment. Rows with fewer than two fragments are returned as is.
    """
    if len(row_indices) < 2:
        return row_indices

    # Pairwise edge-match scores.
    match_scores = np.zeros((len(row_indices), len(row_indices)))

    for i, idx_i in enumerate(row_indices):
        img_i = img[idx_i]
        right_edge = img_i[:, -20:]  # right-most 20 columns

        for j, idx_j in enumerate(row_indices):
            if i == j:
                match_scores[i, j] = -np.inf  # never match a fragment with itself
                continue

            img_j = img[idx_j]
            left_edge = img_j[:, :20]  # left-most 20 columns

            # Count positions where both edges agree.
            match_score = np.sum(right_edge == left_edge)

            # Positions where both edges are 0 get extra weight.
            text_match = np.sum((right_edge == 0) & (left_edge == 0))
            match_score += text_match * 2

            match_scores[i, j] = match_score

    # Greedy chain, starting from the first fragment.
    sorted_indices = [0]
    used = set([0])

    while len(used) < len(row_indices):
        current_idx = sorted_indices[-1]
        best_match = -1
        best_score = -np.inf

        # Best unused successor.
        for j in range(len(row_indices)):
            if j not in used and match_scores[current_idx, j] > best_score:
                best_score = match_scores[current_idx, j]
                best_match = j

        if best_match == -1:
            # No successor scored above -inf: fall back to the first unused one.
            unused = [j for j in range(len(row_indices)) if j not in used]
            best_match = unused[0]

        sorted_indices.append(best_match)
        used.add(best_match)

    # Map back to the original fragment indices.
    return [row_indices[i] for i in sorted_indices]


# Order every row and visualize the result.
sorted_rows = []
plt.figure(figsize=(15, 20))

for row_id, row_indices in enumerate(row_assignments):
    if not row_indices:
        sorted_rows.append([])
        continue

    sorted_frags = sort_fragments_in_row(row_indices)
    sorted_rows.append(sorted_frags)

    # Row preview image.
    row_img = np.zeros((100, 100 * len(sorted_frags)), dtype=np.uint8)

    for col_idx, frag_idx in enumerate(sorted_frags):
        frag_img = img[frag_idx]
        resized = cv2.resize(frag_img, (100, 100))
        resized = (1 - resized) * 255

        # Stamp the fragment index.
        cv2.putText(resized, str(frag_idx), (5, 15),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, 0, 1, cv2.LINE_AA)

        row_img[:, col_idx * 100:(col_idx + 1) * 100] = resized

    # Show the ordering of this row.
    plt.subplot(11, 1, row_id + 1)
    plt.imshow(row_img, cmap='gray')
    plt.title(f'行 {row_id} 排序结果 (碎片数: {len(sorted_frags)})')
    plt.axis('off')

plt.tight_layout()
plt.suptitle('行内排序结果', fontsize=16)
plt.subplots_adjust(top=0.95)
plt.savefig('row_sorting.png')
plt.show()

# Manual adjustment of the order inside each row.
print("请检查行内排序结果,如有需要可进行调整")
print("输入格式: 行ID:碎片ID1,碎片ID2,... (例如: 0:15,23,42)")

adjusted_rows = []
for row_id in range(11):
    while True:
        user_input = input(f"输入行 {row_id} 的碎片顺序 (直接回车保持原样): ")
        if user_input.strip() == "":
            adjusted_rows.append(sorted_rows[row_id])
            break
        else:
            try:
                frag_ids = list(map(int, user_input.split(',')))
                # Every id must belong to this row.
                valid = True
                for fid in frag_ids:
                    if fid not in row_assignments[row_id]:
                        print(f"错误: 碎片 {fid} 不属于行 {row_id}")
                        valid = False
                        break

                if valid:
                    adjusted_rows.append(frag_ids)
                    break
                else:
                    print("请确保所有碎片ID都属于该行")
            except ValueError:
                # Only a failed int() conversion means "bad format"; a bare
                # except would also hide unrelated errors.
                print("输入格式错误,请重新输入")


# 9. Ordering between rows (top/bottom bound matching).
def sort_rows(rows):
    """Greedily order ``rows`` using the top/bottom text bounds of their fragments.

    The chain always starts at ``rows[0]``. Fewer than two rows are returned
    unchanged.
    """
    if len(rows) < 2:
        return rows

    # Top and bottom text bound of every row.
    top_bounds = []
    bottom_bounds = []

    for row in rows:
        if not row:
            top_bounds.append(0)
            bottom_bounds.append(0)
            continue

        top = img.shape[1]  # start from the largest possible value
        bottom = 0

        for frag_idx in row:
            frag = img[frag_idx]
            # Rows of the fragment that contain at least one 0-valued pixel.
            horizontal_proj = np.sum(1 - frag, axis=1)
            text_indices = np.where(horizontal_proj > 0)[0]

            if len(text_indices) > 0:
                frag_top = text_indices.min()
                frag_bottom = text_indices.max()

                if frag_top < top:
                    top = frag_top
                if frag_bottom > bottom:
                    bottom = frag_bottom

        top_bounds.append(top)
        bottom_bounds.append(bottom)

    # Pairwise row scores.
    match_scores = np.zeros((len(rows), len(rows)))

    for i in range(len(rows)):
        for j in range(len(rows)):
            if i == j:
                match_scores[i, j] = -np.inf
                continue

            # Distance between the top of row j and the bottom of row i.
            gap = abs(top_bounds[j] - bottom_bounds[i])

            overlap = bottom_bounds[i] - top_bounds[j]

            # NOTE(review): the original comments labelled the ``overlap > 0``
            # branch "no overlap" and the other branch "overlap", which looks
            # inverted relative to the sign of ``overlap`` — confirm the
            # intended scoring before relying on it.
            if overlap > 0:
                match_score = 1.0 / (1 + gap)
            else:
                match_score = 1.0 - min(1.0, abs(overlap) / img.shape[1])

            match_scores[i, j] = match_score

    # Greedy chain, starting from the first row.
    sorted_indices = [0]
    used = set([0])

    while len(used) < len(rows):
        current_idx = sorted_indices[-1]
        best_match = -1
        best_score = -np.inf

        # Best unused successor.
        for j in range(len(rows)):
            if j not in used and match_scores[current_idx, j] > best_score:
                best_score = match_scores[current_idx, j]
                best_match = j

        if best_match == -1:
            # No successor scored above -inf: fall back to the first unused row.
            unused = [j for j in range(len(rows)) if j not in used]
            best_match = unused[0]

        sorted_indices.append(best_match)
        used.add(best_match)

    return [rows[i] for i in sorted_indices]


# Order the rows.
sorted_row_order = sort_rows(adjusted_rows)

# Show the row order.
print("行间排序结果:")
for i, row in enumerate(sorted_row_order):
    print(f"位置 {i}: 碎片 {row}")

# Manual adjustment of the row order.
print("请检查行间排序结果,如有需要可进行调整")
print("输入格式: 新顺序的行ID (用逗号分隔, 例如: 0,2,1,3,4,5,6,7,8,9,10)")

while True:
    user_input = input("输入行顺序 (直接回车保持原样): ")
    if user_input.strip() == "":
        final_row_order = sorted_row_order
        break
    else:
        try:
            new_order = list(map(int, user_input.split(',')))
            if len(new_order) != 11:
                print("错误: 必须指定11行的顺序")
                continue

            if set(new_order) != set(range(11)):
                print("错误: 行ID必须包含0到10的所有值")
                continue

            final_row_order = [adjusted_rows[i] for i in new_order]
            break
        except ValueError:
            print("输入格式错误,请重新输入")

# 10. Final stitching and output.
full_image = None
for row in final_row_order:
    if not row:
        continue

    # Concatenate the fragments of the row left to right.
    row_img = img[row[0]]
    for frag_id in row[1:]:
        row_img = np.hstack((row_img, img[frag_id]))

    if full_image is None:
        full_image = row_img
    else:
        # Insert a 10-pixel separator of ones between rows.
        separator = np.ones((10, row_img.shape[1]), dtype=row_img.dtype)
        full_image = np.vstack((full_image, separator))
        full_image = np.vstack((full_image, row_img))

# Save the result.
if full_image is not None:
    # NOTE(review): this inverts the image (1 -> 0, 0 -> 255), unlike the
    # per-cluster previews which use ``* 255`` directly — confirm the intended
    # polarity of result4.png.
    full_image = (1 - full_image) * 255
    full_image = full_image.astype(np.uint8)
    final_img = Image.fromarray(full_image)
    final_img.save('result4.png')
    print("最终拼接结果已保存为 'result4.png'")

    # Save the fragment order as an 11 x 19 table (-1 marks an empty cell).
    result_table = np.full((11, 19), -1)
    for row_idx, row in enumerate(final_row_order):
        for col_idx, frag_id in enumerate(row[:19]):
            result_table[row_idx, col_idx] = frag_id

    pd.DataFrame(result_table).to_csv('result4.csv', index=False, header=False)
    print("碎片顺序表格已保存为 'result4.csv'")
else:
    print("错误: 无法拼接图像")

# 11. Save the manual-adjustment log.
# Recover which adjusted row ended up at each final position. Both ordering
# paths above reuse the list objects held in adjusted_rows, so an identity
# match is exact even when several rows are empty.
final_row_ids = [
    next(i for i, candidate in enumerate(adjusted_rows) if candidate is row)
    for row in final_row_order
]

# Explicit encoding: the log contains non-ASCII text.
with open('adjustment_log.txt', 'w', encoding='utf-8') as f:
    f.write("行分配调整:\n")
    for row_id, frag_ids in enumerate(adjusted_rows):
        f.write(f"行 {row_id}: {frag_ids}\n")

    f.write("\n行顺序调整:\n")
    # Previously this wrote range(len(final_row_order)), i.e. always 0..n-1.
    f.write(f"最终行顺序: {final_row_ids}\n")

print("人工调控记录已保存为 'adjustment_log.txt'")

# TODO (request that followed the script in the original post, translated):
# the in-row ordering algorithm needs optimizing. Cluster into fewer classes
# first and then order within each row; for the larger clusters, keep the
# fragments that have a high-similarity match on both edges and move the
# remaining fragments into a candidate pool.
最新发布
08-12
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值