import time
from urllib import request

import jieba.analyse
import matplotlib.pyplot as plt
from imageio import imread
from lxml import etree
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

# Chapter page scraped by default and the XPath that selects its paragraph text.
CHAPTER_URL = 'https://read.qidian.com/chapter/_AaqI-dPJJ4uTkiRw_sFYA2/eSlFKP1Chzg1'
CHAPTER_XPATH = '//*[@id="chapter-339991957"]/div/div/p/text()'
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
)
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def _scrape_chapter(url=CHAPTER_URL, xpath=CHAPTER_XPATH, out_file='sanwen.txt'):
    """Download *url*, extract the text nodes matched by *xpath*, save them.

    The stripped text nodes are concatenated without separators and written
    to *out_file* as UTF-8, replacing any previous content.

    Args:
        url: Page to download; also sent as the ``Referer`` header.
        xpath: XPath expression evaluated against the parsed HTML.
        out_file: Path of the text file to (over)write.

    Raises:
        urllib.error.URLError: If the request fails or times out.
    """
    headers = {
        'Referer': url,
        'User-Agent': USER_AGENT,
    }
    # Build the request, then send it; the context manager closes the
    # connection even if reading the body fails.
    req = request.Request(url=url, headers=headers)
    with request.urlopen(req, timeout=REQUEST_TIMEOUT) as response:
        content = response.read()

    paragraphs = etree.HTML(content).xpath(xpath)
    with open(out_file, 'w', encoding='utf-8') as f:
        f.write(''.join(p.strip() for p in paragraphs))

    # Pause kept from the original script (presumably a courtesy delay
    # between requests -- TODO confirm it is still wanted).
    time.sleep(2)


class wc():
    """Word cloud built from the top keywords of a text file.

    Attributes set by ``__init__``:
        f: The (already closed) file object the text was read from.
        txt: Full content of the text file.
        tags: Up to 100 keywords extracted by ``jieba.analyse.extract_tags``.
        img: Mask image array loaded with ``imageio.imread``.
        text: The keywords joined with single spaces.
        wc: The generated ``WordCloud`` instance.
    """

    def __init__(self, txt_file, img_file, front):
        """Read *txt_file*, extract keywords and generate the word cloud.

        Args:
            txt_file: Path of a UTF-8 text file to analyse.
            img_file: Path of the image used as the cloud's mask.
            front: Accepted for backward compatibility but not used; the
                font is fixed to ``./fonts/simhei.ttf`` (see below).

        Raises:
            FileNotFoundError: If *txt_file* does not exist.
        """
        # ``as self.f`` keeps the historical ``f`` attribute while making
        # sure the handle is closed even when ``read()`` raises.
        with open(txt_file, 'r', encoding='utf-8') as self.f:
            self.txt = self.f.read()
        self.tags = jieba.analyse.extract_tags(self.txt, topK=100)
        self.img = imread(img_file)
        self.text = ' '.join(self.tags)
        # font_path: font file to render with. wordcloud's bundled font does
        #   not support Chinese, so one must be supplied, otherwise the
        #   output image is nothing but empty boxes.
        # background_color: defaults to black; set to white here.
        # max_words: maximum number of words displayed.
        # mask: background (shape) image.
        # max_font_size: largest font size used.
        self.wc = WordCloud(
            font_path='./fonts/simhei.ttf',
            background_color='white',
            max_words=100,
            mask=self.img,
            max_font_size=80,
        ).generate(self.text)

    def get_url_content(self, url=CHAPTER_URL, xpath=CHAPTER_XPATH,
                        out_file='sanwen.txt'):
        """Scrape a chapter page and write its text to *out_file*.

        This does not touch the word cloud held by this instance: the cloud
        was generated in ``__init__`` from the file content at that time.
        Create a new ``wc`` afterwards to visualise the freshly scraped text.

        Args:
            url: Page to download.
            xpath: XPath expression selecting the text nodes to keep.
            out_file: Path of the text file to (over)write.
        """
        _scrape_chapter(url=url, xpath=xpath, out_file=out_file)

    def show_wc(self):
        """Display the word cloud in a matplotlib window and save it.

        The image is written to ``result.png`` after the window is closed.
        """
        plt.imshow(self.wc)
        # plt.imshow(self.wc.recolor(color_func=img_color)) can be used
        # instead to make the font colours follow the mask image's colours.
        plt.axis("off")
        plt.show()
        self.wc.to_file('result.png')


if __name__ == '__main__':
    text_path = 'sanwen.txt'
    # Crawl first: the wc constructor reads the text file immediately, so the
    # scraped content has to exist before the object is created.
    _scrape_chapter(out_file=text_path)
    # Word cloud filling the outline of our mask image.
    mywc = wc(text_path, 'timg.png', 'AGENCYR.TTF')
    mywc.show_wc()
# Example: combining a web crawler (爬虫) with a word cloud (词云)
# Source page note: latest recommended article published 2023-12-20 12:32:59