Requirement scenario:
With one click, scrape the top headlines from 人民网 (People's Daily Online), 新华网 (Xinhua), 光明网 (Guangming Online), 求是网 (Qiushi), and 半月谈 (Banyuetan), then display each headline's title and link for a quick overview of the day's news.
Run result:
(screenshot of the console output omitted)
Implementation code:
import requests
from bs4 import BeautifulSoup
import chardet

wz = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}  # headers that disguise the request as a normal browser visit
url = 'http://www.people.com.cn/'  # 人民网 (People's Daily Online)
res = requests.get(url, headers=wz)  # request the URL and get the response
res.encoding = chardet.detect(res.content)['encoding']  # let chardet detect the page encoding automatically
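# Note: requests can do this without the chardet import:
# res.encoding = res.apparent_encoding  # charset detection built into requests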
soup = BeautifulSoup(res.text, 'lxml')  # parse the HTML
lis = soup.find_all('h1', class_='fbold')  # filter: the idea is to narrow down from large to small; in practice, match the smallest thing first and widen the scope if it is wrong
for l in lis:
    a = l.find('a')
    link = a.get('href')
    topic = a.text
    print('人民网:', topic, link)
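# The same four steps (request, set encoding, parse, filter) repeat for
# each site below; only the URL and the tags/classes being matched change.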
url = 'http://www.news.cn/'  # 新华网 (Xinhua)
res = requests.get(url, headers=wz)  # request the URL and get the response
res.encoding = 'utf-8'  # set the response encoding to UTF-8
soup = BeautifulSoup(res.text, 'lxml')  # parse the HTML
lis = soup.find_all('h1')  # filter: start small, widen the scope if it misses
for l in lis:
    a = l.find('a')
    link = a.get('href')
    topic = a.text
    print('新华网:', topic, link)
url = 'https://www.gmw.cn/'  # 光明网 (Guangming Online)
res = requests.get(url, headers=wz)  # request the URL and get the response
res.encoding = 'utf-8'  # set the response encoding to UTF-8
soup = BeautifulSoup(res.text, 'lxml')  # parse the HTML
lis1 = soup.find_all('div', class_='m_zy')[0]  # grab the containing block first
lis = lis1.find_all('span')  # then the headline <span>s inside it
for l in lis:
    a = l.find('a')
    link = a.get('href')
    topic = a.text
    print('光明网:', topic, link)
url = 'http://www.qstheory.cn/'  # 求是网 (Qiushi)
res = requests.get(url, headers=wz)  # request the URL and get the response
res.encoding = 'utf-8'  # set the response encoding to UTF-8
soup = BeautifulSoup(res.text, 'lxml')  # parse the HTML
lis1 = soup.find_all('div', class_='headtitle')[0]  # grab the containing block first
lis = lis1.find_all('div')  # then the headline <div>s inside it
for l in lis:
    a = l.find('a')
    link = a.get('href')
    topic = a.text
    print('求是网:', topic, link)
url = 'http://www.banyuetan.org/'  # 半月谈 (Banyuetan)
res = requests.get(url, headers=wz)  # request the URL and get the response
res.encoding = 'utf-8'  # set the response encoding to UTF-8
soup = BeautifulSoup(res.text, 'lxml')  # parse the HTML
lis1 = soup.find_all('div', class_='hot_tt')[0]  # grab the containing block first
lis = lis1.find_all('h3')  # then the headline <h3>s inside it
for l in lis:
    a = l.find('a')
    link = a.get('href')
    topic = a.text
    print('半月谈:', topic, link)
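The five blocks above repeat one request/parse/print pattern, so the whole job can also be driven from a single config table. Below is a minimal consolidated sketch; the CSS selectors are assumptions derived from the classes used above and will need updating whenever a site changes its markup, and urljoin is added so relative hrefs print as absolute links:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}

# (site label, front page, CSS selector for headline links); the selectors
# mirror the find_all() calls above and are assumptions about current markup
SITES = [
    ('人民网', 'http://www.people.com.cn/', 'h1.fbold a'),
    ('新华网', 'http://www.news.cn/', 'h1 a'),
    ('光明网', 'https://www.gmw.cn/', 'div.m_zy span a'),
    ('求是网', 'http://www.qstheory.cn/', 'div.headtitle div a'),
    ('半月谈', 'http://www.banyuetan.org/', 'div.hot_tt h3 a'),
]

for name, url, selector in SITES:
    try:
        res = requests.get(url, headers=HEADERS, timeout=10)
        res.encoding = res.apparent_encoding  # auto-detect the encoding
        soup = BeautifulSoup(res.text, 'lxml')
        for a in soup.select(selector):
            link = urljoin(url, a.get('href', ''))  # resolve relative hrefs
            print(f'{name}:', a.get_text(strip=True), link)
    except Exception as e:
        print(f'{name}: failed ({e})')  # one bad site should not stop the rest

Keeping the per-site knowledge in data rather than code means adding a sixth site is a one-line change.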