This is a machine-learning experiment. Below are the results I got without any tuning (the experiment requirements are included; please flesh them out, since this goes into the lab report). I also want you to do the optimization, i.e. part "1.1. Analyze how the three programs perform clustering; by tuning parameters and the like, find a better clustering result and analyze it." Please give a detailed optimization plan, with code for each option, but keep the changes to the code minimal.
1. Objectives
(1) Understand the ideas and principles of C-means (k-means) clustering and hierarchical clustering, and be able to implement these algorithms.
(2) Analyze the strengths and weaknesses of these algorithms, and apply them to cluster analysis for a given application scenario and dataset.
2. Experiment Content
(1) For the given dataset, write a program that partitions it using Euclidean distance as the metric.
(2) Implement Chinese text clustering.
3. Requirements
4. Summary
Experiment content:
1.1. Based on the k-means and related algorithms, analyze how the three programs perform clustering; by tuning parameters and the like, find a better clustering result and analyze it. The baseline results below were obtained without any tuning; a shared parameter-sweep sketch follows them.
Baseline results (no tuning):

Birch
adjusted_rand_score: 0.9801380101427927
FMI: 0.9801380101427927
silhouette: 0.39163465652931684
CHI: 606.2796653633279

DBSCAN
Estimated number of noise points: 107
adjusted_rand_score: 0.7786634889087207
FMI: 0.7786634889087207
silhouette: 0.29598759738536534
CHI: 316.99337264612177

K-means
adjusted_rand_score: 0.9934238060352091
FMI: 0.9934238060352091
silhouette: 0.39324595540258134
CHI: 610.2807494055157

(Note: FMI prints the same value as adjusted_rand_score because all three scripts mistakenly reused metrics.adjusted_rand_score for it; the listings below fix this with metrics.fowlkes_mallows_score.)
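Since all three scripts end by computing the same four metrics, one low-friction way to make the tuning in 1.1 systematic, without modifying the scripts themselves, is a small shared helper run in the same session. This is a minimal sketch under the assumption that trainingData and labels_true are already built exactly as in the listings below; the helper name evaluate is my own illustration, not part of the original code:

# -*- coding: utf-8 -*-
"""Shared evaluation helper for the parameter sweeps (illustrative sketch)."""
from sklearn import metrics

def evaluate(name, estimator, X, labels_true):
    """Fit one clusterer and print the same four metrics the scripts report."""
    labels = estimator.fit_predict(X)
    # silhouette/CHI are undefined for fewer than 2 clusters (noise label -1 excluded)
    n_found = len(set(labels)) - (1 if -1 in labels else 0)
    if n_found < 2:
        print(name, "collapsed to %d cluster(s), skipped" % n_found)
        return None
    ari = metrics.adjusted_rand_score(labels_true, labels)
    fmi = metrics.fowlkes_mallows_score(labels_true, labels)  # the actual FMI
    sil = metrics.silhouette_score(X, labels)
    chi = metrics.calinski_harabasz_score(X, labels)
    print("%s ARI=%.4f FMI=%.4f silhouette=%.4f CHI=%.1f" % (name, ari, fmi, sil, chi))
    return ari

The per-algorithm sweeps shown after each listing just call clusterers in a loop like this. Picking the winner by ARI is legitimate here because labeled_data.csv provides ground truth; silhouette/CHI would be the fallback criteria without labels.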
birch.py
# -*- coding: utf-8 -*-
"""
Birch
"""
import sys
sys.path.append('D:/作业/机器学习与模式学习/机器学习实验/实验三/reference1-3-11.9/tools')  # replace with the actual path to your 'tools' directory
from sklearn.cluster import Birch
from tools.my_preprocess import *
from tools.visualizer import plot_result
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from tools.labelText import LabelText
import settings
import time
import pandas as pd
from sklearn import metrics
"""
loading source
载入资源
文件详情参照本文件夹README
"""
print('------Loading Source...')
ori_path = settings.SOURCE_DATA + 'cut_data.csv'
sentences = loading_source(file_name=ori_path)
# content_lines = loading_source(file_name=ori_path)
# start = time.time()
# cut_source(content_lines, sentences)
# end = time.time()
# print('------- cutting cost', end - start)
"""
Vertorizer
向量化
"""
print('------Vertorizer...')
start = time.time()
# tf-idf weight matrix of the documents
vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
# Fit raw documents
freq_words_matrix = vertorizer.fit_transform(sentences)
# Get the bag-of-words vocabulary
words = vertorizer.get_feature_names_out()
# TfidfVectorizer already returns tf-idf weights, so the extra
# TfidfTransformer pass in the original draft was redundant and is dropped.
weight = freq_words_matrix.toarray()
end = time.time()
print("Shape: Documents(Class) / Words")
print(weight.shape)
print('------ vectorizer cost', end-start)
"""
Dimension Reduction
降维
"""
pca = PCA(n_components=10)
trainingData = pca.fit_transform(weight)
# svd = TruncatedSVD(n_components=10, n_iter=10, random_state=42)
# trainingData = svd.fit_transform(weight)
"""
Compute Birch
"""
numOfClass: int = 4
start = time.time()
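# threshold: maximum radius of a CF subcluster; smaller values build finer-grained subclusters.
# branching_factor: maximum number of CF subclusters per tree node before it is split.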
clf = Birch(n_clusters=numOfClass, branching_factor=10, threshold=0.01)
result = clf.fit(trainingData)
source = list(clf.predict(trainingData))
end = time.time()
label = clf.labels_
labelAndText = LabelText(label, ori_path)
labelAndText.sortByLabel(show=False, write=True)
"""
Result
生成各个指标并写入文件
"""
content = pd.read_csv(settings.SOURCE_DATA + 'labeled_data.csv')
labels_true = content.flag.to_list()
ars = metrics.adjusted_rand_score(labels_true, label)
print("adjusted_rand_score: ", ars)
# Fowlkes-Mallows index (the original draft reused adjusted_rand_score here by mistake)
fmi = metrics.fowlkes_mallows_score(labels_true, label)
print("FMI: ", fmi)
silhouette = metrics.silhouette_score(trainingData, label)
print("silhouette: ", silhouette)
CHI = metrics.calinski_harabasz_score(trainingData, label)
print("CHI: ", CHI)
with open(settings.DST_DATA + time.strftime("%Y-%m-%d %H-%M-%S", time.localtime()) + 'result.txt', 'w') as w:
    w.write("-------Birch Experiment-------\n")
    w.write("adjusted_rand_score: %f\n" % ars)
    w.write("FMI: %f\n" % fmi)
    w.write("Silhouette: %f\n" % silhouette)
    w.write("CHI: %f\n" % CHI)
    w.write("------End------")
plot_result(trainingData, source, numOfClass)
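Optimization option 1 (Birch): keep birch.py as-is and append a small grid over threshold and branching_factor, the two knobs that shape the CF tree. A minimal sketch, assuming it runs after the code above so numOfClass, trainingData, and labels_true are in scope; the grid values are illustrative:

# Grid over the two Birch-specific parameters; everything else is reused unchanged.
best_params, best_ari = None, -1.0
for threshold in (0.005, 0.01, 0.05, 0.1, 0.3):
    for branching in (10, 25, 50):
        labels = Birch(n_clusters=numOfClass, branching_factor=branching,
                       threshold=threshold).fit_predict(trainingData)
        ari = metrics.adjusted_rand_score(labels_true, labels)
        print("threshold=%.3f branching_factor=%d ARI=%.4f" % (threshold, branching, ari))
        if ari > best_ari:
            best_params, best_ari = (threshold, branching), ari
print("best (threshold, branching_factor):", best_params, "ARI=%.4f" % best_ari)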
DBSCAN.py
# -*- coding: utf-8 -*-
"""
DBSCAN
"""
import sys
sys.path.append('D:/作业/机器学习与模式学习/机器学习实验/实验三/reference1-3-11.9/tools')  # replace with the actual path to your 'tools' directory
from sklearn.cluster import DBSCAN
from tools.my_preprocess import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from tools.labelText import LabelText
import settings
import time
import pandas as pd
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np  # used for the core-sample mask below; may also come from the wildcard import above
"""
loading source
载入资源
文件详情参照本文件夹README
"""
print('------Loading Source...')
ori_path = settings.SOURCE_DATA + 'cut_data.csv'
sentences = loading_source(file_name=ori_path)
# start = time.time()
# end = time.time()
# print('------- cutting cost', end - start)
"""
Vertorizer
向量化
"""
print('------Vertorizer...')
start = time.time()
# tf-idf weight matrix of the documents
vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
# Fit raw documents
freq_words_matrix = vertorizer.fit_transform(sentences)
# Get the bag-of-words vocabulary
words = vertorizer.get_feature_names_out()
# TfidfVectorizer already returns tf-idf weights; the redundant TfidfTransformer pass is dropped.
weight = freq_words_matrix.toarray()
end = time.time()
print("Shape: Documents(Class) / Words")
print(weight.shape)
print('------ vectorizer cost', end-start)
"""
Dimension Reduction
降维
"""
pca = PCA(n_components=8)
trainingData = pca.fit_transform(weight)
"""
Compute DBSCAN
"""
numOfClass: int = 4
start = time.time()
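# eps: neighborhood radius; min_samples: neighbors (incl. the point) needed to form a dense core.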
db = DBSCAN(eps=0.08, min_samples=7)
result = db.fit(trainingData)
source = list(db.labels_)  # reuse the fitted labels instead of refitting via fit_predict
end = time.time()
label = db.labels_
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
labelAndText = LabelText(label, ori_path)
labelAndText.sortByLabel(show=False, write=True, algorithm="DB")
"""
Visualize
考虑到 DBSCAN 算法有检测噪声的能力,单独实现一个可视化
"""
def plot_res(labels, n_cluster: int, num: int):
    colors = plt.cm.Spectral(np.linspace(0, 1, len(set(labels))))
    for k, col in zip(set(labels), colors):
        if k == -1:
            # Black used for noise.
            col = 'k'
        class_member_mask = (labels == k)
        # Core samples are drawn larger than border/noise samples.
        xy = trainingData[class_member_mask & core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=10)
        xy = trainingData[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
                 markeredgecolor='k', markersize=6)
    plt.title('DBSCAN')
    plt.savefig(settings.PLOT_DIR + 'db-%d-%d.png' % (n_cluster, num))
    plt.show()
"""
Result
生成各个指标并写入文件
"""
n_clusters_ = len(set(label)) - (1 if -1 in label else 0)
n_noise_ = int(list(label).count(-1))
print('Estimated number of noise points: %d \n' % n_noise_)
content = pd.read_csv(settings.SOURCE_DATA + 'labeled_data.csv')
labels_true = content.flag.to_list()
ars = metrics.adjusted_rand_score(labels_true, label)
print("adjusted_rand_score: ", ars)
# Fowlkes-Mallows index (the original draft reused adjusted_rand_score here by mistake)
fmi = metrics.fowlkes_mallows_score(labels_true, label)
print("FMI: ", fmi)
# Note: noise points (label -1) are treated as one extra cluster by these internal metrics.
silhouette = metrics.silhouette_score(trainingData, label)
print("silhouette: ", silhouette)
CHI = metrics.calinski_harabasz_score(trainingData, label)
print("CHI: ", CHI)
with open(settings.DST_DATA + time.strftime("%Y-%m-%d %H-%M-%S", time.localtime()) + 'result.txt', 'w') as w:
    w.write("------DBSCAN Experiment-------\n")
    w.write("adjusted_rand_score: %f\n" % ars)
    w.write("FMI: %f\n" % fmi)
    w.write("Silhouette: %f\n" % silhouette)
    w.write("CHI: %f\n" % CHI)
    w.write('Estimated number of noise points: %d \n' % n_noise_)
    w.write("------End------")
plot_res(label, n_clusters_, n_clusters_)
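Optimization option 2 (DBSCAN): the baseline lags the other two mainly because eps and min_samples are guesses. Appending the standard k-distance heuristic plus a small grid leaves the script itself untouched. A sketch, assuming it runs after the code above so trainingData and labels_true are in scope; the grid values are illustrative:

from sklearn.neighbors import NearestNeighbors

# k-distance curve: sort each point's distance to its k-th nearest neighbor;
# the "elbow" of this curve is a common heuristic for choosing eps.
k = 7  # keep in line with min_samples
dist, _ = NearestNeighbors(n_neighbors=k).fit(trainingData).kneighbors(trainingData)
plt.plot(np.sort(dist[:, -1]))
plt.xlabel('points sorted by k-distance')
plt.ylabel('distance to %d-th nearest neighbor' % k)
plt.show()

# Small grid around the baseline values, scored against the ground truth.
best_params, best_ari = None, -1.0
for eps in (0.05, 0.08, 0.10, 0.15, 0.20):
    for ms in (5, 7, 10):
        labels = DBSCAN(eps=eps, min_samples=ms).fit_predict(trainingData)
        if len(set(labels)) - (1 if -1 in labels else 0) < 2:
            continue  # degenerate clustering, skip
        ari = metrics.adjusted_rand_score(labels_true, labels)
        print("eps=%.2f min_samples=%d ARI=%.4f" % (eps, ms, ari))
        if ari > best_ari:
            best_params, best_ari = (eps, ms), ari
print("best (eps, min_samples):", best_params, "ARI=%.4f" % best_ari)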
kmeans.py
# -*- coding: utf-8 -*-
"""
K-means-Single-Test
"""
import sys
sys.path.append('D:/作业/机器学习与模式学习/机器学习实验/实验三/reference1-3-11.9/tools')  # replace with the actual path to your 'tools' directory
import os
os.environ["OMP_NUM_THREADS"] = "7"  # must be set before sklearn is imported to take effect
from sklearn.cluster import KMeans
from tools.my_preprocess import *
from tools.visualizer import plot_result
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, TruncatedSVD
from tools.labelText import LabelText
import settings
import time
import pandas as pd
from sklearn import metrics
"""
loading source
载入资源
文件详情参照本文件夹README
"""
print('------Loading Source...')
ori_path = settings.SOURCE_DATA + 'cut_data.csv'
sentences = loading_source(file_name=ori_path)
# content_lines = loading_source(file_name=ori_path)
# start = time.time()
# cut_source(content_lines, sentences, write=True)
# end = time.time()
# print('------- cutting cost', end - start)
"""
Vertorizer
向量化
"""
print('------Vertorizer...')
start = time.time()
# tf-idf weight matrix of the documents
vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
# Fit raw documents
freq_words_matrix = vertorizer.fit_transform(sentences)
# Get the bag-of-words vocabulary
words = vertorizer.get_feature_names_out()
# TfidfVectorizer already returns tf-idf weights; the redundant TfidfTransformer pass is dropped.
weight = freq_words_matrix.toarray()
end = time.time()
print("Shape: Documents(Class) / Words")
print(weight.shape)
print('------ vectorizer cost', end-start)
"""
Dimension Reduction
降维
"""
pca = PCA(n_components=10)
trainingData = pca.fit_transform(weight)
# svd = TruncatedSVD(n_components=10, n_iter=10, random_state=42)
# trainingData = svd.fit_transform(weight)
"""
Compute K-Means
"""
numOfClass: int = 4
start = time.time()
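# init="k-means++" spreads the initial centroids apart; tol/max_iter only control convergence.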
clf = KMeans(n_clusters=numOfClass, max_iter=10000, init="k-means++", tol=1e-6)
result = clf.fit(trainingData)
source = list(clf.predict(trainingData))
end = time.time()
label = clf.labels_
labelAndText = LabelText(label, ori_path)
labelAndText.sortByLabel(show=False, write=True)
"""
Result
生成各个指标并写入文件
"""
content = pd.read_csv(settings.SOURCE_DATA + 'labeled_data.csv')
labels_true = content.flag.to_list()
ars = metrics.adjusted_rand_score(labels_true, label)
print("adjusted_rand_score: ", ars)
# Fowlkes-Mallows index (the original draft reused adjusted_rand_score here by mistake)
fmi = metrics.fowlkes_mallows_score(labels_true, label)
print("FMI: ", fmi)
silhouette = metrics.silhouette_score(trainingData, label)
print("silhouette: ", silhouette)
CHI = metrics.calinski_harabasz_score(trainingData, label)
print("CHI: ", CHI)
with open(settings.DST_DATA + time.strftime('KM' + "%Y-%m-%d %H-%M-%S", time.localtime()) + 'result.txt', 'w') as w:
    w.write("------K-Means Experiment-------\n")
    w.write("adjusted_rand_score: %f\n" % ars)
    w.write("FMI: %f\n" % fmi)
    w.write("Silhouette: %f\n" % silhouette)
    w.write("CHI: %f\n" % CHI)
    w.write("------End------")
plot_result(trainingData, source, numOfClass)
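Optimization option 3 (K-means): the baseline is already near the ceiling, so the remaining gains come from stabilizing the initialization (n_init, random_state) and treating the PCA dimensionality as a parameter in its own right. A sketch, assuming it runs after the code above so weight, numOfClass, and labels_true are in scope; the component grid and random_state=42 are illustrative choices:

# Sweep the PCA dimensionality and refit K-means with more restarts.
for n_comp in (5, 10, 20, 50):
    X = PCA(n_components=n_comp, random_state=42).fit_transform(weight)
    labels = KMeans(n_clusters=numOfClass, init="k-means++",
                    n_init=20, random_state=42).fit_predict(X)
    print("PCA n_components=%d ARI=%.4f silhouette=%.4f" % (
        n_comp,
        metrics.adjusted_rand_score(labels_true, labels),
        metrics.silhouette_score(X, labels)))

Fixing random_state makes the reported numbers reproducible across runs, and raising n_init reduces the chance that a poor k-means++ draw distorts the comparison between PCA settings.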