Information Content Security - Weibo Topic Detection and Analysis
Introduction
Based on the paper by Zheng Feiran, Miao Duoqian, Zhang Zhifei, and Gao Can, 《一种中文微博新闻话题检测的方法》 (A Topic Detection Method for Chinese Microblog News), with some modifications.
Code
# Information Content Security Lab - (Weibo) Topic Detection and Analysis
# Reference: Zheng Feiran, Miao Duoqian, Zhang Zhifei, Gao Can, 《一种中文微博新闻话题检测的方法》
import jieba
import math

# Return True if the character is a Chinese character
def is_chinese(char):
    return u'\u4e00' <= char <= u'\u9fa5'
# Normalize a post: keep only Chinese characters and strip "#topic#" tags
def standardize(text):
    st_text = ""
    flag = 0
    for char in text:
        if flag == 0:
            # Keep Chinese characters
            if is_chinese(char):
                st_text = st_text + char
            # Skip everything inside a "#topic name#" tag
            elif char == "#":
                flag = 1
            else:
                pass
        else:
            if char == "#":
                flag = 0
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\nOriginal text:\n%s" % text)
        s.write("\n-------------------------------------------------------------------")
        s.write("\nNormalized text:\n%s" % st_text)
    return st_text
# Group the corpus into time windows and preprocess each post
def set_time_window():
    # Time window size (in hours)
    time_window = 1
    # Posts grouped by time window: key = time string, value = list of posts in that window
    time_window_dict = {}
    # Read the corpus line by line (one post per line)
    file = open(material, 'r', encoding='utf-8')
    for line in file.readlines():
        count = 0
        flag = 0
        time = ""
        text = ""
        for c in line:
            count += 1
            # Extract the posting time (fixed character positions in this corpus)
            if count >= 55 and count <= 63:
                if c.isdigit():
                    time = time + c
            # Extract the post content
            if count >= 73:
                if c == "'":
                    flag = 1
                if flag == 0:
                    text = text + c
                else:
                    break
        st_text = standardize(text)
        if time_window_dict.get(time):
            time_window_dict.setdefault(time, []).append(st_text)
        else:
            temp = {time: [st_text]}
            time_window_dict.update(temp)
    file.close()
    return time_window_dict
# Word segmentation
def divide(st_text):
    di_list = jieba.lcut(st_text, cut_all=False)  # precise-mode segmentation
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------------")
        s.write("\nSegmentation result:\n")
        s.write('/'.join(di_list))
    return di_list
# Filter out stop words
def delete_stopword(di_list):
    de_list = []
    with open(cn_stopword_list, 'r', encoding='utf-8') as c:
        # Load stop words as a set (one per line); substring matching against the
        # raw file text would over-filter single-character tokens
        stopwords = set(c.read().split())
    for word in di_list:
        if word in stopwords:
            continue
        else:
            de_list.append(word)
    with open(st, 'a', encoding='utf-8') as s:
        s.write("\n-------------------------------------------------------------")
        s.write("\nStop-word filtering result:\n")
        s.write('/'.join(de_list))
    return de_list
# Topic word detection
def sub_word_detect(processed_dict):
    # Count word frequencies within each time window
    fre_dict = {}
    for k in processed_dict.keys():
        temp = {}
        for de in processed_dict[k]:
            for word in de:
                if temp.get(word):
                    temp[word] += 1
                else:
                    temp.update({word: 1})
        # Keep the 10 most frequent words of the window
        t1 = sorted(temp.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
        del t1[10:]
        fre_dict.update({k: t1})
    '''
    # The corpus lacks posts with consecutive posting times, so the growth coefficient
    # and composite weight cannot be computed; raw frequency is used to select topic
    # words instead. (See the sketch after the code listing for the intended formulas.)
    t2 = {}
    # Size of the look-back window
    k = 3
    w_dict = {}
    for f in fre_dict.keys():
        for i in f:
            flag = 0
            fu = []
            fi = i[1]
            u = int(f)
            for r in range(1, k):
                fu.append(fre_dict[str(u - r)])
            all_fu = 0
            for u in fu:
                all_fu += u
            # Growth coefficient G, rounded to three decimals
            G = round((fi * k) / all_fu, 3)
            # Composite weight w
            # Tuning parameter a
            a = 1.3
            w = math.log(G) + a * (math.log(fi / max(sorted(fre_dict.keys()))))
            w_dict[k] = [(i, w)]
    topic_detect_dict = w_dict
    '''
    topic_detect_dict = fre_dict
    return topic_detect_dict
# Topic word clustering
def clustering(processed_dict, topic_detect_dict):
    cluster = {}  # final clusters per time window
    D = 8  # distance threshold D
    # Incremental clustering
    for k in topic_detect_dict.keys():
        for t in topic_detect_dict[k]:  # t is a (word, frequency in this window) tuple
            if cluster.get(k):
                pw = t[1]
                ad = []  # distance from the word to each existing cluster
                for c in cluster[k]:  # c is one cluster of this window
                    p = []  # p(ci|w) for each word already in the cluster
                    for word in c:
                        # Compute p(ci|w)
                        pcw = 0
                        temp = 0.00000
                        # Count posts where both the unassigned word and this cluster word appear
                        for de in processed_dict[k]:
                            if word in de:
                                if t[0] in de:
                                    pcw += 1
                        temp = pcw / pw
                        temp = round(temp, 5)
                        p.append(temp)
                    # Distance from the word to the cluster: d = 1 / max p(ci|w)
                    d = 0.00000
                    if max(p) == 0:
                        d = 99999
                    else:
                        d = 1 / max(p)
                    d = round(d, 5)
                    ad.append(d)
                # Decide whether to start a new cluster
                mind = min(ad)
                index = ad.index(mind)
                if mind > D:
                    # Start a new cluster
                    cluster[k].append([t[0]])
                else:
                    # Add the word to the nearest cluster
                    cluster[k][index].append(t[0])
            else:
                # First word of this window: use it as the initial cluster
                tc = []  # temporary cluster
                tc.append(t[0])
                cluster.update({k: [tc]})
    return cluster
if __name__ == '__main__':
    # Corpus, Chinese stop-word list, and output file
    material = '5千条微博语料.txt'
    cn_stopword_list = 'baidu_stopwords.txt'
    st = '处理结果.txt'
    # Group the corpus by time window
    time_window_dict = set_time_window()
    # Word segmentation and stop-word filtering
    # Key = posting time, value = list of token lists (one per post)
    processed_dict = {}
    for k in time_window_dict.keys():
        for t in time_window_dict[k]:
            di_list = divide(t)
            de_list = delete_stopword(di_list)
            if processed_dict.get(k):
                processed_dict.setdefault(k, []).append(de_list)
            else:
                temp = {k: [de_list]}
                processed_dict.update(temp)
    topic_detect_dict = sub_word_detect(processed_dict)
    topic_dict = clustering(processed_dict, topic_detect_dict)
    # Print the result
    temp = sorted(topic_dict.keys())
    for k in temp:
        print("---------------------------------------------")
        print("Time: %s-%s %s" % (k[0:2], k[2:4], k[4:6]))
        for c in topic_dict[k]:
            print("Topic:", c)
This post walks through Chinese microblog news topic detection following the referenced paper, and illustrates how information content security techniques can be applied in practice on the Weibo platform.