读入 Excel 文件,内容如下:

处理程序如下:
# coding: utf-8
import pandas as pd
def read_txt(src):
    """Read a word list from a UTF-8 text file, one entry per line.

    Every line is stripped of surrounding whitespace. Duplicate entries are
    dropped while the order of first appearance is kept.

    Args:
        src: Path of the text file to read.

    Returns:
        A list of the distinct stripped lines in first-seen order. A blank
        line contributes an empty string, like any other entry.
    """
    # The context manager closes the handle even if reading fails.
    with open(src, 'r', encoding='utf-8') as f:
        # dict keys keep insertion order, so this is an O(n) ordered
        # de-duplication instead of a list membership test per line.
        return list(dict.fromkeys(line.strip() for line in f))
if __name__ == '__main__':
    # Count how often each word occurs in the third column of the 'WWW'
    # sheet, skipping stop words, and write the sorted counts to 005.xlsx.
    src = '汇总_更新.xlsx'
    # A set gives O(1) stop-word lookups inside the counting loop.
    split_words = set(read_txt('split.txt'))
    df = pd.read_excel(src, sheet_name='WWW')

    # Split the third column of every row on whitespace.
    # NOTE(review): presumably this column holds the paper titles - confirm
    # against the workbook layout.
    paper_name = []
    for paper in df.values.tolist():
        title = paper[2]
        if isinstance(title, str):
            paper_name.append(title.split())
        else:
            # Non-text cells (for example an empty cell read as NaN) cannot
            # be split; report the value and skip the row.
            print(title)

    # Stop-word filtering is case-insensitive, but the counts themselves are
    # keyed on the word exactly as written.
    word_dict = {}
    for name in paper_name:
        for word in name:
            if word.lower() in split_words:
                continue
            word_dict[word] = word_dict.get(word, 0) + 1

    df_new = pd.DataFrame.from_dict(word_dict, orient='index', columns=['number'])
    df_new = df_new.sort_values(by='number', ascending=False)

    # ExcelWriter.save() no longer exists in pandas 2.x; the context manager
    # writes and closes the workbook on exit.
    with pd.ExcelWriter('005.xlsx') as tgt:
        df_new.to_excel(tgt, sheet_name='WWW')
最终结果
英文停用词表来源:
https://blog.youkuaiyun.com/shijiebei2009/article/details/39696523/
2048

被折叠的 条评论
为什么被折叠?



