Python Text Mining

# Clustering
import pandas as pd
import jieba
import matplotlib.pyplot as plt
from sklearn import metrics
from wordcloud import WordCloud
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.feature_extraction.text import CountVectorizer

def get_stop():
    # the stopword file here is GBK-encoded; use encoding="utf-8" if yours is UTF-8
    with open("data/stopword.txt", "r", encoding="GBK") as f:
        stop = [line.strip() for line in f]  # one stopword per line; strip the trailing newline
    return stop

def rm_stops(words, stops):
    # keep only the tokens that are not stopwords
    words_no_stop = []
    for i in words:
        if i not in stops:
            words_no_stop.append(i)
    return words_no_stop
stops=get_stop()
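
# A minimal alternative sketch (not from the original): converting stops to a
# set makes each membership test O(1) instead of O(len(stops)), which matters
# for large corpora.
# def rm_stops(words, stops):
#     stop_set = set(stops)
#     return [w for w in words if w not in stop_set]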

def preprocess_text(docs, sentences, label):
    # tokenize each document with jieba, drop stopwords, and store the
    # space-joined tokens together with the given class label
    for doc in docs:
        words = jieba.lcut(doc)
        words_new = rm_stops(words, stops)
        sentences.append((" ".join(words_new), label))
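
# Example (hypothetical input; actual jieba segmentation may differ):
# preprocess_text(["我爱文本挖掘"], sentences, 1) would append something like
# ("爱 文本 挖掘", 1) once stopwords such as "我" are filtered out.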

# Data visualization
def Seedata(cv_x, num_cluster):
    # 7.1 Reduce to 2-D with PCA, then cluster in the reduced space
    p = PCA(n_components=2)  # project the count vectors down to 2 dimensions
    new_x = p.fit_transform(cv_x)
    clf = KMeans(n_clusters=num_cluster)
    y_pre = clf.fit_predict(new_x)
    result = list(y_pre)
    # 7.2 Plot the samples, one colour/marker per cluster
    plt.figure(2)
    Lab = [[] for i in range(num_cluster)]
    for index, labi in enumerate(result):
        Lab[labi].append(index)
    color = ['oy', 'ob', 'og', 'cs']  # matplotlib format strings; extend if num_cluster > 4
    for i in range(num_cluster):
        x1 = []
        y1 = []
        for ind1 in new_x[Lab[i]]:
            x1.append(ind1[0])
            y1.append(ind1[1])
        plt.plot(x1, y1, color[i])
    # 7.3 Plot the cluster centres
    x1 = []
    y1 = []
    for ind1 in clf.cluster_centers_:
        x1.append(ind1[0])
        y1.append(ind1[1])
    plt.plot(x1, y1, "rv")  # red triangles mark the centres
    plt.show()
    return Lab
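
# Note (a design caveat, not from the original): Seedata fits a fresh KMeans on
# the 2-D PCA projection, so the clusters it draws (and the Lab index lists it
# returns) can differ from clt.labels_ computed on the full count matrix in
# step 4; to visualize the original model instead, one could project cv_x with
# PCA but colour the points by clt.labels_.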

# Word clouds, one per cluster
def Wc(Lab):
    for i in range(num_cluster):
        s = " ".join(x[j] for j in Lab[i])  # concatenate the documents assigned to cluster i
        w = WordCloud(font_path="SIMLI.TTF",  # a font with CJK glyphs (msyh.ttc, Microsoft YaHei, also works)
                      background_color='white',  # background colour
                      max_words=30,  # maximum number of words shown
                      min_font_size=8,  # smallest font size
                      random_state=12,  # fixed seed so the colour layout is reproducible
                      mask=plt.imread("data/heart.jpg")  # shape mask for the cloud
                      )  # configuration
        w.generate(s)  # build the cloud from the text
        plt.imshow(w)
        plt.show()
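
# Note on the mask (wordcloud's documented behaviour): pure-white (#FFFFFF)
# pixels of the mask image are masked out, so data/heart.jpg should contain a
# dark shape on a white background for words to fill the heart.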

if __name__ == '__main__':
    # 1. Text collection
    doc = pd.read_csv("data/protein2.csv")
    doc.dropna(inplace=True)  # drop rows with missing values
    ym = doc["segment"].values.tolist()
    print(ym)

    # 2. Tokenization
    sentences=[]
    preprocess_text(ym,sentences,1)
    # print(sentences)
    # random.shuffle(sentences)

    # 3. Vectorization: raw counts here; TF-IDF etc. are drop-in alternatives
    # Separate the texts from their labels
    x = [a for a, b in sentences]
    y = [b for a, b in sentences]
    cv = CountVectorizer()
    cv_x = cv.fit_transform(x).toarray()
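
    # A minimal TF-IDF sketch for the alternative mentioned in step 3
    # (TfidfVectorizer shares CountVectorizer's fit_transform API):
    # from sklearn.feature_extraction.text import TfidfVectorizer
    # tv = TfidfVectorizer()
    # cv_x = tv.fit_transform(x).toarray()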

    # 4. Model fitting: KMeans (or hierarchical clustering)
    num_cluster = 3
    clt = KMeans(n_clusters=num_cluster)
    # clt = AgglomerativeClustering(n_clusters=num_cluster)
    clt.fit(cv_x)
    # print(clt.inertia_)
    print(clt.labels_)
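
    # A minimal hierarchical-clustering sketch for the commented alternative
    # above; scipy's dendrogram draws the merge tree behind Ward linkage:
    # from scipy.cluster.hierarchy import linkage, dendrogram
    # Z = linkage(cv_x, method="ward")
    # dendrogram(Z)
    # plt.show()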

    # 5. Model evaluation
    lk = metrics.silhouette_score(cv_x, clt.labels_)  # silhouette coefficient
    print("Silhouette coefficient:", lk)
    # Note: y is the constant label 1 for every document here, so this ARI is
    # not informative; supply real class labels for a meaningful score
    ars = metrics.adjusted_rand_score(y, clt.labels_)  # adjusted Rand index: agreement between two labelings
    print("Adjusted Rand index:", ars)

    # 6. Choosing k: elbow plot of KMeans inertia against the number of clusters
    li = []
    for i in range(1, 30):
        clf = KMeans(n_clusters=i)
        clf.fit(cv_x)
        li.append(clf.inertia_)
    plt.plot(range(1, 30), li)
    plt.show()
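
    # Reading the plot (a standard heuristic): inertia always decreases as k
    # grows, so pick the "elbow" where the curve flattens; that k trades off
    # cluster compactness against model complexity.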

    # 7. Visualization
    L = Seedata(cv_x, num_cluster)
    # 8. Word clouds
    Wc(L)