import requests
from bs4 import BeautifulSoup

# Fetch the ifeng.com homepage and print its top headlines.
con = requests.get('http://www.ifeng.com/')
# Some pages are not served as utf-8: requests falls back to iso-8859-1,
# so re-encode the mis-decoded text back to bytes, then decode it as utf-8.
text = con.text.encode('iso-8859-1').decode('utf-8')
html = BeautifulSoup(text, 'html5lib')
# print(html)
# Headline anchors under the "headLineDefault" section.
topNews = html.select('#headLineDefault > ul > ul > li > a')
# Iterate the result set directly instead of indexing with range(len(...)).
for news in topNews:
    print(news.get_text())
# 1.2 查询一个网页的编码
import requests
from bs4 import BeautifulSoup

# Request the page and report which character encoding requests detected
# from the response headers.
response = requests.get('http://www.ifeng.com/')
print(response.encoding)
# 1.3 抓取邮政编码
import requests
from bs4 import BeautifulSoup

# Goal: print every postal code listed for every province.
# Index page offering the list of all provinces.
commonUrl = 'http://www.ip138.com/post'
# Browser user-agent string so the server treats us as a normal client.
userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
# Request headers shared by every request in this script.
setHeaders = {'user-agent': userAgent}
# Fetch the province index page.
allProvince = requests.get(commonUrl, headers=setHeaders)
# Encoding requests detected for the page.
pageEncoding = allProvince.encoding
# The page is actually gbk-encoded: re-encode the mis-decoded text and
# decode it as gbk.
text = allProvince.text.encode(pageEncoding).decode('gbk')
# Parse the page into a DOM.
htmlDom = BeautifulSoup(text, 'html5lib')
# Anchor elements, one per province.
singalProvince = htmlDom.select('#newAlexa tbody tr td a')
# Iterate the provinces directly instead of indexing with range(len(...)).
for province in singalProvince:
    # Relative URL of the current province's postal-code page.
    provinceUrl = province.get('href')
    # Fetch the province page. Bug fix: the original omitted the headers
    # here, so only this request looked like a non-browser client; send the
    # same headers as the index request for consistency.
    provinceAllPostPage = requests.get('http://www.ip138.com/' + provinceUrl,
                                       headers=setHeaders)
    # Same gbk re-decoding dance as for the index page.
    provincePageEncoding = provinceAllPostPage.encoding
    provinceText = provinceAllPostPage.text.encode(provincePageEncoding).decode('gbk')
    provinceHtmlDom = BeautifulSoup(provinceText, 'html5lib')
    # Table rows holding the postal codes for this province.
    provinceAllPost = provinceHtmlDom.select('.t12 tr')
    for row in provinceAllPost:
        # Print every cell (area name / postal code) in the row.
        for cell in row.select('td'):
            print(cell.text)
    # NOTE(review): SOURCE indentation was stripped, so the original nesting
    # of this separator is ambiguous; printing it once per province seems to
    # match the intent — confirm against the original script.
    print('----------------------------------------------------------')