Python Text Mining

# Clustering
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from sklearn import metrics
from wordcloud import WordCloud
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer

def get_stop():
    # the stopword file here is GBK-encoded; use encoding="utf-8" if yours is UTF-8
    with open("data/stopword.txt", "r", encoding="GBK") as f:
        stop = [line.strip() for line in f]  # one stopword per line; strip the trailing newline
    return stop

def rm_stops(words, stops):
    # keep only the tokens that are not stopwords
    words_no_stop = []
    for i in words:
        if i not in stops:
            words_no_stop.append(i)
    return words_no_stop
stops=get_stop()
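
# A minimal alternative sketch (not from the original): converting stops to a
# set makes each membership test O(1) instead of O(len(stops)), which matters
# for large corpora.
# def rm_stops(words, stops):
#     stop_set = set(stops)
#     return [w for w in words if w not in stop_set]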

def preprocess_text(docs, sentences, label):
    # tokenize each document with jieba, drop stopwords, and store the
    # space-joined tokens together with the given class label
    for doc in docs:
        words = jieba.lcut(doc)
        words_new = rm_stops(words, stops)
        sentences.append((" ".join(words_new), label))
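
# Example (hypothetical input; actual jieba segmentation may differ):
# preprocess_text(["我爱文本挖掘"], sentences, 1) would append something like
# ("爱 文本 挖掘", 1) once stopwords such as "我" are filtered out.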

# Data visualization
def Seedata(cv_x, num_cluster):
    # 7.1 Reduce to 2-D with PCA, then cluster in the reduced space
    p = PCA(n_components=2)  # project the count vectors down to 2 dimensions
    new_x = p.fit_transform(cv_x)
    clf = KMeans(n_clusters=num_cluster)
    y_pre = clf.fit_predict(new_x)
    result = list(y_pre)
    # 7.2 Plot the samples, one colour/marker per cluster
    plt.figure(2)
    Lab = [[] for i in range(num_cluster)]
    for index, labi in enumerate(result):
        Lab[labi].append(index)
    color = ['oy', 'ob', 'og', 'cs']  # matplotlib format strings; extend if num_cluster > 4
    for i in range(num_cluster):
        x1 = []
        y1 = []
        for ind1 in new_x[Lab[i]]:
            x1.append(ind1[0])
            y1.append(ind1[1])
        plt.plot(x1, y1, color[i])
    # 7.3 Plot the cluster centres
    x1 = []
    y1 = []
    for ind1 in clf.cluster_centers_:
        x1.append(ind1[0])
        y1.append(ind1[1])
    plt.plot(x1, y1, "rv")  # red triangles mark the centres
    plt.show()
    return Lab
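
# Note (a design caveat, not from the original): Seedata fits a fresh KMeans on
# the 2-D PCA projection, so the clusters it draws (and the Lab index lists it
# returns) can differ from clt.labels_ computed on the full count matrix in
# step 4; to visualize the original model instead, one could project cv_x with
# PCA but colour the points by clt.labels_.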

# Word clouds, one per cluster
def Wc(Lab):
    for i in range(num_cluster):
        s = " ".join(x[j] for j in Lab[i])  # concatenate the documents assigned to cluster i
        w = WordCloud(font_path="SIMLI.TTF",  # a font with CJK glyphs (msyh.ttc, Microsoft YaHei, also works)
                      background_color='white',  # background colour
                      max_words=30,  # maximum number of words shown
                      min_font_size=8,  # smallest font size
                      random_state=12,  # fixed seed so the colour layout is reproducible
                      mask=plt.imread("data/heart.jpg")  # shape mask for the cloud
                      )  # configuration
        w.generate(s)  # build the cloud from the text
        plt.imshow(w)
        plt.show()
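
# Note on the mask (wordcloud's documented behaviour): pure-white (#FFFFFF)
# pixels of the mask image are masked out, so data/heart.jpg should contain a
# dark shape on a white background for words to fill the heart.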

if __name__ == '__main__':
    # 1. Text collection
    doc = pd.read_csv("data/protein2.csv")
    doc.dropna(inplace=True)  # drop rows with missing values
    ym = doc["segment"].values.tolist()
    print(ym)

    # 2. Tokenization
    sentences=[]
    preprocess_text(ym,sentences,1)
    # print(sentences)
    # random.shuffle(sentences)

    # 3. Vectorization: raw counts here; TF-IDF etc. are drop-in alternatives
    # Separate the texts from their labels
    x = [a for a, b in sentences]
    y = [b for a, b in sentences]
    cv = CountVectorizer()
    cv_x = cv.fit_transform(x).toarray()
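
    # A minimal TF-IDF sketch for the alternative mentioned in step 3
    # (TfidfVectorizer shares CountVectorizer's fit_transform API):
    # from sklearn.feature_extraction.text import TfidfVectorizer
    # tv = TfidfVectorizer()
    # cv_x = tv.fit_transform(x).toarray()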

    # 4. Model fitting: KMeans (or hierarchical clustering)
    num_cluster = 3
    clt = KMeans(n_clusters=num_cluster)
    # clt = AgglomerativeClustering(n_clusters=num_cluster)
    clt.fit(cv_x)
    # print(clt.inertia_)
    print(clt.labels_)
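
    # A minimal hierarchical-clustering sketch for the commented alternative
    # above; scipy's dendrogram draws the merge tree behind Ward linkage:
    # from scipy.cluster.hierarchy import linkage, dendrogram
    # Z = linkage(cv_x, method="ward")
    # dendrogram(Z)
    # plt.show()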

    # 5. Model evaluation
    lk = metrics.silhouette_score(cv_x, clt.labels_)  # silhouette coefficient
    print("Silhouette coefficient:", lk)
    # Note: y is the constant label 1 for every document here, so this ARI is
    # not informative; supply real class labels for a meaningful score
    ars = metrics.adjusted_rand_score(y, clt.labels_)  # adjusted Rand index: agreement between two labelings
    print("Adjusted Rand index:", ars)

    # 6. Choosing k: elbow plot of KMeans inertia against the number of clusters
    li = []
    for i in range(1, 30):
        clf = KMeans(n_clusters=i)
        clf.fit(cv_x)
        li.append(clf.inertia_)
    plt.plot(range(1, 30), li)
    plt.show()
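
    # Reading the plot (a standard heuristic): inertia always decreases as k
    # grows, so pick the "elbow" where the curve flattens; that k trades off
    # cluster compactness against model complexity.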

    # 7. Visualization
    L = Seedata(cv_x, num_cluster)
    # 8. Word clouds
    Wc(L)