# Goal: discover noise in the text data.
import numpy as np
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
import umap
# 1. Load the embedding data.
X = np.load("your_embedding.npy")  # replace with your own path

# 2. Cluster with HDBSCAN.
# HDBSCAN's default metric is Euclidean, but this analysis is meant to use
# cosine distance. For L2-normalised rows, squared Euclidean distance equals
# 2 * (1 - cosine similarity), so clustering the unit vectors orders
# neighbours exactly as cosine distance would. X itself is left untouched.
# NOTE(review): assumes X is 2-D (n_samples, dim) — confirm for your data.
row_norms = np.linalg.norm(X, axis=1, keepdims=True)
# Divide by 1 instead of 0 so all-zero rows stay zero rather than becoming NaN.
X_unit = X / np.where(row_norms > 0, row_norms, 1.0)
clusterer = hdbscan.HDBSCAN(min_cluster_size=30, min_samples=10)
cluster_labels = clusterer.fit_predict(X_unit)
probabilities = clusterer.probabilities_

# 3. Extract the noise samples (label == -1).
noise_mask = (cluster_labels == -1)
noise_indices = np.where(noise_mask)[0]
print(f"识别出噪声样本数量: {len(noise_indices)} / {len(X)}")

# 4. Optional: pick representative (high-confidence) samples, i.e. the
# indices of the 100 largest membership probabilities, in descending order.
representative_indices = probabilities.argsort()[::-1][:100]
# 5. Reduce to 2-D with UMAP (cosine distance) for visualisation.
# The metric must be passed explicitly: UMAP defaults to Euclidean, which
# would contradict the cosine-distance claim made in the plot title below.
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, metric="cosine")
X_2d = reducer.fit_transform(X)

# 6. Plot the 2-D projection, coloured by cluster label (noise is label -1).
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x=X_2d[:, 0],
    y=X_2d[:, 1],
    hue=cluster_labels,
    palette='tab20',
    s=10,
    legend=False,  # no legend: there may be many clusters
)
plt.title("HDBSCAN 聚类 + 噪声识别(使用余弦距离)")
plt.show()