Information Content Security: Weibo Topic Detection and Analysis

This post implements Chinese microblog (Weibo) news-topic detection following the paper cited below: posts are grouped into time windows, normalized and segmented, topic words are selected per window, and the topic words are then grouped by incremental clustering.

Introduction

Based on the paper 郑斐然, 苗夺谦, 张志飞, 高灿, 《一种中文微博新闻话题检测的方法》 (A Method for Detecting Chinese Microblog News Topics), with modifications.

Code

# Information Content Security lab: (Weibo) topic detection and analysis
# Reference: 郑斐然, 苗夺谦, 张志飞, 高灿, 《一种中文微博新闻话题检测的方法》


import jieba
import math     # used by the commented-out composite-weight computation below


# Return True if the character is a Chinese character (CJK unified ideograph)
def is_chinese(char):
    return '\u4e00' <= char <= '\u9fa5'


# Normalization: keep only Chinese characters and drop "#topic#" segments
def standardize(text):
    st_text = ""
    inside_hashtag = False
    for char in text:
        if not inside_hashtag:
            if is_chinese(char):
                st_text = st_text + char
            elif char == "#":          # opening "#": skip the "#topic#" segment
                inside_hashtag = True
        elif char == "#":              # closing "#": resume normal processing
            inside_hashtag = False

    with open(st, 'a', encoding='utf-8') as s:
        s.write("\nOriginal text:\n%s" % text)
        s.write("\n-------------------------------------------------------------------")
        s.write("\nNormalized text:\n%s" % st_text)
    return st_text
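# For example (illustrative): standardize("今天#某话题#天气abc不错") skips the
# "#...#" segment and the non-Chinese characters, returning "今天天气不错".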


# Split the corpus into time windows and preprocess each post
def set_time_window():
    # Time window size in hours (declared for reference; the grouping below
    # keys directly on the timestamp digits extracted from each line)
    time_window = 1
    # Posts grouped by time window: key = timestamp digits, value = list of
    # normalized post texts falling into that window
    time_window_dict = {}

    # Read the corpus line by line (one line = one post); the corpus has a
    # fixed layout: the timestamp digits occupy character positions 55-63 and
    # the post body starts at position 73, terminated by a single quote
    with open(material, 'r', encoding='utf-8') as file:
        for line in file:
            count = 0
            flag = 0
            time = ""
            text = ""
            for c in line:
                count += 1
                # Extract the publication time
                if 55 <= count <= 63:
                    if c.isdigit():
                        time = time + c
                # Extract the post body
                if count >= 73:
                    if c == "'":
                        flag = 1
                    if flag == 0:
                        text = text + c
                    else:
                        break
            st_text = standardize(text)
            time_window_dict.setdefault(time, []).append(st_text)
    return time_window_dict
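# The resulting dict maps the extracted timestamp digits to normalized post
# texts, e.g. (hypothetical data) {"010213": ["今天天气不错", ...]}; the main
# block below prints each key as month-day hour via k[0:2], k[2:4], k[4:6].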


# Word segmentation
def divide(st_text):
    di_list = jieba.lcut(st_text, cut_all=False)       # precise-mode segmentation
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------------")
        s.write("\nSegmentation result:\n")
        s.write('/'.join(di_list))
    return di_list
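# For example, jieba's own README sample sentence segments in precise mode as:
# divide("我来到北京清华大学") -> ['我', '来到', '北京', '清华大学']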


# Filter out stopwords
def delete_stopword(di_list):
    # Load the stopword list into a set (one word per line); exact membership
    # tests avoid the accidental substring matching that `word in <raw text>`
    # would perform
    with open(cn_stopword_list, 'r', encoding='utf-8') as c:
        stopwords = set(c.read().split())
    de_list = [word for word in di_list if word not in stopwords]
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\nFiltering result:\n")
        s.write('/'.join(de_list))
    return de_list
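# For example, assuming "的" appears in baidu_stopwords.txt (it is in common
# Chinese stopword lists): delete_stopword(['我', '的', '书']) -> ['我', '书']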


# Topic-word detection
def sub_word_detect(processed_dict):
    # Count how often each word occurs in each time window
    fre_dict = {}
    for k in processed_dict.keys():
        temp = {}
        for de in processed_dict[k]:
            for word in de:
                temp[word] = temp.get(word, 0) + 1
        # Keep only the ten most frequent words of this window
        t1 = sorted(temp.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        del t1[10:]
        fre_dict.update({k: t1})

    '''
    # The corpus lacks runs of consecutive publication times, so the paper's
    # growth coefficient and composite weight cannot be computed here; raw word
    # frequency is used to select topic words instead. The intended computation
    # is kept below for reference (a repaired sketch, not runnable on this corpus):
    k = 3                                    # look-back window size
    w_dict = {}
    for f in fre_dict.keys():
        for i in fre_dict[f]:
            fi = i[1]                        # frequency in the current window
            u = int(f)
            # Total frequency of this word over the k-1 preceding windows
            all_fu = 0
            for r in range(1, k):
                all_fu += dict(fre_dict[str(u - r)]).get(i[0], 0)

            # Growth coefficient G, rounded to three decimals
            G = round((fi * k) / all_fu, 3)

            # Composite weight w, with tuning parameter a; f_max is assumed to
            # be the maximum word frequency in the current window
            a = 1.3
            f_max = max(fre for _, fre in fre_dict[f])
            w = math.log(G) + a * math.log(fi / f_max)

            w_dict.setdefault(f, []).append((i[0], w))

    topic_detect_dict = w_dict
    '''
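    # The quantities above, written out (reconstructed from the sketch; taking
    # f_max as the window's maximum word frequency is an assumption): for word
    # i with frequency f_i in window u, look-back size k and tuning parameter a,
    #   G_i = k * f_i / (f_{i,u-1} + ... + f_{i,u-(k-1)})     growth coefficient
    #   w_i = log(G_i) + a * log(f_i / f_max)                 composite weight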

    topic_detect_dict = fre_dict

    return topic_detect_dict

# Topic-word clustering
def clustering(processed_dict, topic_detect_dict):
    cluster = {}        # final clusters, keyed by time window
    D = 8               # distance threshold D

    # Incremental clustering
    for k in topic_detect_dict.keys():
        for t in topic_detect_dict[k]:      # t = (word, frequency in this window)
            if cluster.get(k):
                pw = t[1]
                ad = []                     # distance from this word to each cluster

                for c in cluster[k]:        # c = one cluster in this window
                    p = []                  # p(ci|w) for each word in the cluster
                    for word in c:
                        # p(ci|w): number of posts in which the unassigned word
                        # t[0] co-occurs with the cluster word, divided by the
                        # frequency of t[0]
                        pcw = 0
                        for de in processed_dict[k]:
                            if word in de and t[0] in de:
                                pcw += 1
                        p.append(round(pcw / pw, 5))

                    # Distance from word to cluster: d = 1 / max p(ci|w), with a
                    # large sentinel when there is no co-occurrence at all
                    if max(p) == 0:
                        d = 99999
                    else:
                        d = round(1 / max(p), 5)
                    ad.append(d)

                # Open a new cluster or join the nearest existing one
                mind = min(ad)
                index = ad.index(mind)

                if mind > D:
                    # No cluster is close enough: start a new one
                    cluster[k].append([t[0]])
                else:
                    # Join the nearest cluster
                    cluster[k][index].append(t[0])

            else:
                # First word of this window seeds the initial cluster
                cluster.update({k: [[t[0]]]})

    return cluster


if __name__ == '__main__':
    # Corpus, Chinese stopword list, and output file
    material = '5千条微博语料.txt'
    cn_stopword_list = 'baidu_stopwords.txt'
    st = '处理结果.txt'

    # Corpus preprocessing
    time_window_dict = set_time_window()
    # Segmentation and stopword filtering; the result dict maps publication
    # time to a list of per-post token lists
    processed_dict = {}
    for k in time_window_dict.keys():
        for t in time_window_dict[k]:
            di_list = divide(t)
            de_list = delete_stopword(di_list)
            processed_dict.setdefault(k, []).append(de_list)

    topic_detect_dict = sub_word_detect(processed_dict)
    topic_dict = clustering(processed_dict, topic_detect_dict)

    # Print the detected topics per time window
    for k in sorted(topic_dict.keys()):
        print("---------------------------------------------")
        print("Time: %s-%s %s" % (k[0:2], k[2:4], k[4:6]))
        for c in topic_dict[k]:
            print("Topic:", c)