一、爬取校园新闻
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin  # used below to resolve relative article links

# Scrape the campus-news index page and, for every listed article, print the
# entry's time text, its title, the article link and the article body text.
NEWS_INDEX_URL = "http://news.gzcc.cn/html/xiaoyuanxinwen/"
# requests applies no timeout by default; without one a stalled server would
# hang the script forever.
REQUEST_TIMEOUT = 10  # seconds

index_response = requests.get(NEWS_INDEX_URL, timeout=REQUEST_TIMEOUT)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were news.
index_response.raise_for_status()
index_response.encoding = "utf-8"
soup = BeautifulSoup(index_response.text, 'html.parser')

for news in soup.select('li'):
    title_nodes = news.select('.news-list-title')
    if not title_nodes:
        # This <li> carries no news title, so it is not a news entry.
        continue

    info_nodes = news.select('.news-list-info')
    links = news.select('a')
    # Skip malformed entries rather than aborting the whole crawl with an
    # IndexError/KeyError on the first one encountered.
    if not info_nodes or not info_nodes[0].contents:
        continue
    if not links or not links[0].get('href'):
        continue

    # First child of the info element; the original script treats it as the
    # entry's time stamp.
    time_text = info_nodes[0].contents[0].text
    title = title_nodes[0].text
    # urljoin leaves absolute links untouched and resolves relative ones
    # against the index page, so the follow-up request always gets a full URL.
    href = urljoin(NEWS_INDEX_URL, links[0]['href'])

    article_response = requests.get(href, timeout=REQUEST_TIMEOUT)
    article_response.raise_for_status()
    article_response.encoding = "utf-8"
    article_soup = BeautifulSoup(article_response.text, 'html.parser')

    # An article page without a .show-content element yields an empty body
    # instead of crashing the loop.
    content_nodes = article_soup.select('.show-content')
    body_text = content_nodes[0].text if content_nodes else ''

    print(time_text, title, href, body_text)
二、爬取自己感兴趣的网页
import requests
from bs4 import BeautifulSoup
# Scrape the Gamersky PC-game page and print, for every <li> that contains a
# link, the text and the href of its first anchor.
jq = 'http://www.gamersky.com/pcgame/'
# requests applies no timeout by default; 10 seconds keeps a stalled server
# from hanging the script forever.
res = requests.get(jq, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page.
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')

for news in soup.select('li'):
    # Run the selector once per item instead of once per field.
    anchors = news.select('a')
    if not anchors:
        continue

    first_anchor = anchors[0]
    # .get() avoids the KeyError that ['href'] raises on an anchor without an
    # href attribute; such anchors have no link to report and are skipped.
    url = first_anchor.get('href')
    if url is None:
        continue

    title = first_anchor.text
    print(title, url)