# coding: utf-8
# In[4]:
#导入包
import requests
from bs4 import BeautifulSoup
# Fetch the Sina "China" news landing page.
# The timeout keeps the script from blocking indefinitely on a stalled
# connection; raise_for_status() stops an HTTP error page from being
# parsed as if it were the news list.
res = requests.get("https://news.sina.com.cn/china/", timeout=10)
res.raise_for_status()
# Decode the response body as UTF-8 when res.text is read.
res.encoding = 'utf-8'
# Parse the HTML into a BeautifulSoup tree.
soup = BeautifulSoup(res.text, 'html.parser')
# Walk every element with class "news-1" and print the text of each
# <li> nested inside it. An empty match simply prints nothing.
for news in soup.select('.news-1'):
    li = news.select('li')
    for item in li:
        print(item.text)
#a = '<a href = "#" abc = 456 def = 123> i am a link </a>'
#soup = BeautifulSoup(a, 'html.parser')
#print(soup.select('a')[0]['href'])#输出"#"
#print(soup.select('a')[0]['abc'])#输出"456"
#print(soup.select('a')[0]['def'])#输出"123"
#soup.select('#main-title')[0].text
# In[5]:
# Collect every element whose id is "_blank", then evaluate the text of the
# first match (shown as the cell's output when run in a notebook).
# Raises IndexError when no element on the page has that id.
_blank_hits = soup.select("#_blank")
_blank_hits[0].text
# In[6]:
#导入包
import requests
from bs4 import BeautifulSoup
# Fetch a single Sina news article page.
# NOTE(review): the hostname ends with a trailing dot ("news.sina.com.cn.");
# it is kept exactly as in the original -- confirm it is intentional.
# The timeout keeps the script from blocking indefinitely on a stalled
# connection; raise_for_status() stops an HTTP error page from being
# parsed as if it were the article.
res = requests.get(
    "http://news.sina.com.cn./c/nd/2016-08-20/doc-ifxvctcc8121090.shtml",
    timeout=10,
)
res.raise_for_status()
# Decode the response body as UTF-8 when res.text is read.
res.encoding = 'utf-8'
# Parse the HTML into a BeautifulSoup tree (the original call was missing
# its closing parenthesis).
soup = BeautifulSoup(res.text, 'html.parser')
# Scrape the page's title, time, source, body text, author, comment count, and news id
# (Blog page footer: latest recommended article published 2024-06-23 09:53:38)