Crawling the News Content
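The script below fetches a news-list page from news.hist.edu.cn, follows each article link it finds, and writes every article's paragraphs to a separate text file under ./news/.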
import os
import urllib.request

from bs4 import BeautifulSoup

# Fetch the news list page and collect the article links
response = urllib.request.urlopen('https://news.hist.edu.cn/kyyw/378.htm')
content = response.read().decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
divs = soup.find_all('div', {'class': 'sec-a'})
lis = divs[0].find_all('li')

os.makedirs('./news', exist_ok=True)  # make sure the output directory exists

def save_article(href, title):
    """Download one article page and append its paragraphs to ./news/<title>.txt."""
    response = urllib.request.urlopen('https://news.hist.edu.cn/' + href)
    content = response.read().decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    divs = soup.find_all('div', {'id': 'vsb_content_501'})
    if len(divs) == 0:  # some pages carry no article body
        return
    ps = divs[0].find_all('p')
    file = './news/' + title + '.txt'
    with open(file, 'a', encoding='utf-8') as f:
        for p in ps:
            f.write(p.text + '\n')

for li in lis:
    a = li.find_all('a')[0]
    href = a.get('href')
    if not href:  # skip list items without a link
        continue
    title = a.get('title')
    # Crawl the article pages one by one; href[2:] strips the leading '..' from the relative link
    save_article(href[2:], title)
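A bare urlopen call raises on any transient network hiccup or HTTP error and aborts the whole crawl. Below is a minimal hardening sketch; the fetch_html helper, its retries/delay parameters, and the User-Agent string are illustrative additions, not part of the original script:

import time
import urllib.error
import urllib.request

def fetch_html(url, retries=3, delay=1.0):
    """Fetch a URL as UTF-8 text, retrying on transient errors (illustrative helper)."""
    request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    for attempt in range(retries):
        try:
            with urllib.request.urlopen(request, timeout=10) as response:
                return response.read().decode('utf-8')
        except (urllib.error.URLError, TimeoutError):
            if attempt == retries - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # back off briefly before retrying

save_article could call fetch_html instead of urlopen directly, and a short time.sleep between articles keeps the load on the server light.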
Word Segmentation
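With the articles saved, the second script reads every file under ./news/, tokenizes the text with jieba, counts word frequencies (skipping single-character tokens, which are mostly particles and punctuation), and prints the ten most common words.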
import os

import jieba

ciyu = {}  # word -> frequency
rootdir = './news/'
files = os.listdir(rootdir)
for name in files:
    path = os.path.join(rootdir, name)
    with open(path, 'r', encoding='utf-8') as f:
        data = f.read()
    words = jieba.lcut(data)
    for word in words:
        if len(word) <= 1:  # skip single characters and punctuation
            continue
        ciyu[word] = ciyu.get(word, 0) + 1

# Print the ten most frequent words
sort_ciyu = sorted(ciyu.items(), key=lambda x: x[1], reverse=True)[:10]
print(sort_ciyu)
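The manual dictionary bookkeeping can also be expressed with collections.Counter from the standard library, which keeps the same counting logic in fewer lines. This is an equivalent sketch of the script above, not a required change:

import os
from collections import Counter

import jieba

counts = Counter()
for name in os.listdir('./news/'):
    with open(os.path.join('./news/', name), 'r', encoding='utf-8') as f:
        words = jieba.lcut(f.read())
    # Count only tokens longer than one character, as above
    counts.update(word for word in words if len(word) > 1)

print(counts.most_common(10))  # same top-10 output as the sorted() version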