1. English word frequency count
Download the lyrics of an English song or an English article.
Replace all separators such as , . ? ! ' : with spaces.
Convert all uppercase letters to lowercase.
Generate the word list.
Generate the word frequency counts.
Sort by frequency.
Exclude grammatical (function) words: pronouns, articles, conjunctions.
Output the 20 most frequent words.
Save the text to be analyzed as a UTF-8 encoded file and obtain the content for analysis by reading that file (see the commented file-reading code at the end of the script below).
news = '''《Faded》 lyrics'''   # placeholder: paste the full lyrics here
sep = ''',.?!'":'''
exclude = {'you', 'and', 'to'}
for c in sep:
    news = news.replace(c, ' ')    # replace each separator with a space
wordList = news.lower().split()    # lowercase, then split into words
wordDict = {}
# Alternative: count with dict.get(), then delete the excluded words afterwards
# for w in wordList:
#     wordDict[w] = wordDict.get(w, 0) + 1
# for w in exclude:
#     del wordDict[w]    # (raises KeyError if the word is absent; wordDict.pop(w, None) is safer)
wordSet = set(wordList) - exclude      # unique words minus the excluded ones
for w in wordSet:
    wordDict[w] = wordList.count(w)    # occurrences of each word
dictList = list(wordDict.items())
dictList.sort(key=lambda x: x[1], reverse=True)    # sort by count, descending
for i in range(min(20, len(dictList))):            # top 20 (or fewer)
    print(dictList[i])
# print(dictList)
# for w in wordDict:
#     print(w, wordDict[w])
# for w in wordList:
#     print(w)
# Read the analysis text from a UTF-8 file instead (step 10):
# f = open('w.txt', 'r', encoding='utf-8')
# news = f.read()
# f.close()
# print(news)
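As a sketch of a more compact alternative to the script above, the counting and sorting steps can be collapsed with collections.Counter; this assumes the lyrics have been saved to w.txt, the file name used in the commented code:

from collections import Counter

exclude = {'you', 'and', 'to'}
with open('w.txt', 'r', encoding='utf-8') as f:    # w.txt: the saved lyrics file
    news = f.read()
for c in ''',.?!'":''':
    news = news.replace(c, ' ')
counts = Counter(w for w in news.lower().split() if w not in exclude)
for word, n in counts.most_common(20):             # top 20 by frequency
    print(word, n)

Counter.most_common(20) returns at most 20 pairs, so no separate length check is needed.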
2. Chinese word frequency count
Download a long Chinese article.
Read the text to be analyzed from a file:
news = open('gzccnews.txt', 'r', encoding='utf-8').read()   # read() returns the contents as a string
Install jieba and use it for Chinese word segmentation (a minimal cut/lcut illustration follows this step list):
pip install jieba
import jieba
jieba.lcut(news)   # lcut already returns a list, so wrapping it in list() is redundant
Generate the word frequency counts.
Sort by frequency.
Exclude grammatical (function) words: pronouns, articles, conjunctions.
Output the 20 most frequent words (or write the results to a file).
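Before the full script, a minimal illustration of the difference between the two segmentation calls used below (the sample sentence is the one commonly used in jieba's documentation):

import jieba

s = '我来到北京清华大学'      # sample sentence from jieba's documentation
print(jieba.lcut(s))          # ['我', '来到', '北京', '清华大学'] (a list)
print(list(jieba.cut(s)))     # same tokens; cut() returns a generator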
import jieba
f = open('gzccnews.txt','r', encoding='utf-8')
text = f.read()
f.close()
sep = ''',。‘’“”:;()!?、 '''
a = {    # stop words and whitespace to exclude
    '\n', '\u3000', '的',
    '他', '之', '不', '人', '一', '我', '下', '大',
}
for i in sep:
    text = text.replace(i, '')    # strip punctuation
t = jieba.lcut(text)              # lcut returns the segmented words as a list
print(t)
count = {}
wl = list(set(t) - a)    # unique words minus the stop-word set
print(wl)
for w in wl:
    count[w] = t.count(w)    # count segmented tokens; text.count(w) would also match w inside longer words
cl = list(count.items())
cl.sort(key=lambda x: x[1], reverse=True)
print(cl)
f = open('cpCount.txt', 'a', encoding='utf-8')    # append the top 20 to the output file
for i in range(min(20, len(cl))):                 # top 20 (or fewer)
    f.write(cl[i][0] + ':' + str(cl[i][1]) + '\n')
f.close()
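For reference, a sketch of the same Chinese pipeline with collections.Counter, counting segmented tokens directly so substrings inside longer words are not over-counted; it assumes the same gzccnews.txt input and rewrites cpCount.txt each run ('w' instead of 'a'):

from collections import Counter
import jieba

with open('gzccnews.txt', 'r', encoding='utf-8') as f:
    text = f.read()
stop = {'\n', '\u3000', '的', '他', '之', '不', '人', '一', '我', '下', '大'}
tokens = [w for w in jieba.lcut(text) if w not in stop and w.strip()]
with open('cpCount.txt', 'w', encoding='utf-8') as out:
    for word, n in Counter(tokens).most_common(20):    # top 20 tokens
        out.write(word + ':' + str(n) + '\n')

The w.strip() filter drops whitespace-only tokens, which makes the explicit '\n' and '\u3000' entries redundant but harmless; they are kept for parity with the script above.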