library(jiebaR)
library(wordcloud2)
# Read the input file: records separated by newlines, UTF-8 encoded;
# what = '' makes scan() return each record as a character string.
file <- scan('ciyun.txt', sep = '\n', what = '', encoding = "UTF-8")
txtList <- lapply(file, strsplit, "\\s+")  # tokenize each line on whitespace
txtChar <- tolower(unlist(txtList))
txtChar <- txtChar[nchar(txtChar) > 2]  # drop words of length <= 2 characters
txtChar <- table(txtChar)  # count word frequencies
# grepl is like grep but returns a logical vector (does each name match?)
txtChar <- txtChar[!grepl('^[0-9-]+$', names(txtChar), perl = TRUE)]  # drop pure numbers
# BUG FIX: the original pattern '^and|the$' matched any word *starting with*
# "and" or *ending with* "the" (e.g. "android", "breathe"), because '|' binds
# looser than the anchors. Grouping restricts it to the exact stopwords.
txtChar <- txtChar[!grepl('^(and|the)$', names(txtChar), perl = TRUE)]  # drop "and"/"the"
# Sort by frequency (descending) and keep at most the 100 most frequent words.
# head() avoids the NA entries that sort(...)[1:100] produces when fewer than
# 100 distinct words survive the filters above.
txtChar <- head(sort(txtChar, decreasing = TRUE), 100)
data <- data.frame(txtChar)
data <- dplyr::filter(data, Freq >= 2)  # keep words occurring at least twice
# Strip a '.pdf' substring from the word labels before plotting
data$txtChar <- sub('\\.pdf', '', data$txtChar)
# Render the word cloud (star-shaped layout)
wordcloud2(data, size = 1, shape = 'star')
# Reference:
# https://zhuanlan.zhihu.com/p/22601260