requests
What is requests? Like urllib, it is a library that simulates a browser sending HTTP requests.
requests is a further layer of wrapping on top of urllib, and the interface it exposes is far more human-friendly and simple.
http://docs.python-requests.org/zh_CN/latest/index.html
Installation: pip install requests
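A quick way to check the install is to fetch any page and look at the status code; a minimal sketch (the URL is only an example):

import requests

r = requests.get('http://www.baidu.com/')
print(r.status_code)  # 200 means the request went through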
Sending GET requests
Customizing request headers
Inspecting the response
    string content        r.text
    bytes content         r.content
    response URL          r.url
    response status code  r.status_code
    response headers      r.headers
    character set         r.encoding (it can also be set explicitly)
Sending POST requests
r = requests.post(url=url, headers=headers, data=data)
AJAX POST
r.json() is equivalent to json.loads(r.text)
Proxies
proxies=proxy
Cookies
What are they? They exist because HTTP itself is stateless.
How to use the session mechanism to save and carry cookies automatically:
s = requests.Session()
Create a session; from then on send every request with s.get() / s.post() and cookies are handled for you.
Exceptions
All of them live in the requests.exceptions module.
ConnectionError: the counterpart of urllib's URLError
HTTPError: the counterpart of urllib's HTTPError
Timeout: raised when a request takes too long
Triggered by passing the timeout parameter to the request.
Certificate handling
Ignore certificate verification: r = requests.get(url=url, verify=False)
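Certificate handling is not covered by the full examples below, so here is a minimal sketch; the URL is just a placeholder for any site whose certificate fails verification:

import requests
import urllib3

# Silence the InsecureRequestWarning that urllib3 emits when verification is skipped
urllib3.disable_warnings()

url = 'https://example.com/'  # placeholder URL
r = requests.get(url=url, verify=False)  # skip SSL certificate verification
print(r.status_code)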
requests: GET request
import requests
url = 'https://www.baidu.com/s?'
data = {
    'ie': 'utf8',
    'wd': '中国历史文化'
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
# params takes a plain Python dict; requests URL-encodes it for you.
r = requests.get(url=url, params=data, headers=headers)
'''
Inspecting the response:
    r.text         string content
    r.content      bytes content
    r.url          response URL
    r.status_code  response status code
    r.headers      response headers
    r.encoding     character set (it can also be set explicitly)
'''
with open('baidu.html', 'wb') as fp:
    fp.write(r.content)
r.encoding = 'gbk'  # the charset can be overridden; note this only affects how r.text is decoded
print(r.url, r.status_code, r.headers, r.encoding)
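Note that r.encoding only controls how r.text decodes the raw bytes in r.content, so assign it before reading r.text; a small sketch (the charset value is just an assumption for demonstration):

import requests

r = requests.get('http://www.baidu.com/')
r.encoding = 'utf-8'  # assign the charset first ...
html = r.text         # ... then r.text is decoded with it
print(r.encoding)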
requests: POST request
import requests
url = 'https://cn.bing.com/ttranslationlookup?&IG=70498A7A7CB44CCAAFF6A962C489282A&IID=translator.5036.12'
formdata = {
    'from': 'zh-CHS',
    'to': 'en',
    'text': '土豆',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
r = requests.post(url=url, headers=headers, data=formdata)
# print(r.text)
obj = r.json()
print(obj)
print(type(obj))
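As the outline says, r.json() is just a shortcut for parsing r.text yourself; a quick standalone check (httpbin.org is a public echo service used here purely for illustration):

import json
import requests

r = requests.get('http://httpbin.org/get')
obj_a = r.json()            # requests parses the JSON body
obj_b = json.loads(r.text)  # same result as parsing r.text manually
print(obj_a == obj_b)       # True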
Working with proxies
import requests
proxy = {
    'http': '120.92.74.189:3128'
}
url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
r = requests.get(url=url, headers=headers, proxies=proxy)
with open('daili.html', 'wb') as fp:
    fp.write(r.content)
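The proxies dict maps URL schemes to proxy addresses, so a request to an https URL needs an 'https' entry as well; a sketch (the address below is made up and must be replaced with a live proxy):

import requests

proxy = {
    'http': 'http://1.2.3.4:3128',   # hypothetical proxy address
    'https': 'http://1.2.3.4:3128',  # hypothetical proxy address
}
r = requests.get('http://www.baidu.com/', proxies=proxy, timeout=5)
print(r.status_code)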
Working with cookies
import requests
import time
# Create a session
s = requests.Session()
# From here on, send every request through s.get() / s.post() so cookies are saved and carried automatically
post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2018931129711'
# Form data for the login request
formdata = {
    'email': '17701256561',
    'icode': '',
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '1d3f6dedf850c58d9a712c4b5197ad886a9c05c567c61465a31cbf11877df493',
    'rkey': '9f8c50df15ec91a9248c02a50bbb36da',
    'f': 'http%3A%2F%2Fwww.renren.com%2F960481378%2Fprofile',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
r_post = s.post(url=post_url, headers=headers, data=formdata)
print(r_post.text)
time.sleep(2)
# Visit a page that requires login
info_url = 'http://www.renren.com/960481378/profile'
r_info = s.get(url=info_url, headers=headers)
with open('renren1.html', 'wb') as fp:
    fp.write(r_info.content)
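To see what the session is actually carrying, the cookies collected so far live on the Session object itself; a minimal sketch (any site that sets a cookie will do, Baidu is only an example):

import requests

s = requests.Session()
s.get('http://www.baidu.com/')  # Set-Cookie headers from the response are stored on the session
print(s.cookies.get_dict())     # these cookies are sent automatically by later s.get()/s.post() calls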
Exceptions: timeout
import requests
import requests.exceptions
url = 'http://www.baidu.com/'
try:
    r = requests.get(url, timeout=0.01)
except requests.exceptions.Timeout as e:
    print(e)
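The outline also lists ConnectionError and HTTPError, which the example above does not trigger. A hedged sketch: requests only raises HTTPError when you call raise_for_status() yourself, and the hostname below is deliberately unreachable so that ConnectionError fires.

import requests
import requests.exceptions

try:
    r = requests.get('http://nonexistent.invalid/', timeout=3)  # .invalid never resolves
    r.raise_for_status()  # raises HTTPError for 4xx/5xx responses
except requests.exceptions.Timeout as e:
    print('timed out:', e)
except requests.exceptions.ConnectionError as e:
    print('connection failed:', e)
except requests.exceptions.HTTPError as e:
    print('bad status code:', e)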
Crawling bus routes
import requests
import time
from bs4 import BeautifulSoup
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}
def parse_first_page(url):
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Find the route-list links grouped by leading digit and by leading letter
    number_a_list = soup.select('.bus_kt_r1 > a')
    char_a_list = soup.select('.bus_kt_r2 > a')
    # Extract the href from every <a> and build absolute URLs
    a_list = number_a_list + char_a_list
    href_list = []
    for oa in a_list:
        href = url.rstrip('/') + oa['href']
        href_list.append(href)
    return href_list
def parse_second_page(url, href):
    r = requests.get(url=href, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Find all the bus-route detail links on this index page
    bus_a_list = soup.select('#con_site_1 > a')
    href_list = []
    for oa in bus_a_list:
        href = url.rstrip('/') + oa['href']
        href_list.append(href)
    return href_list
def parse_third_page(href, fp):
    r = requests.get(href, headers=headers)
    soup = BeautifulSoup(r.text, 'lxml')
    # Route name
    route_name = soup.select('.bus_i_t1 > h1')[0].string
    print('Crawling ---%s---...' % route_name)
    # Operating hours
    run_time = soup.select('.bus_i_content > p')[0].string.lstrip('运行时间:')
    # Fare information
    price_info = soup.select('.bus_i_content > p')[1].string.lstrip('票价信息:')
    # Bus company
    company = soup.select('.bus_i_content > p > a')[0].string
    # Last updated
    update_time = soup.select('.bus_i_content > p')[-1].string.lstrip('最后更新:')
    # Number of stops in the outbound (up) direction
    up_total = soup.select('.bus_line_top > span')[0].string.strip('共站').strip()
    # Stop names in the outbound direction
    up_name_list = []
    number = int(up_total)
    up_a_list = soup.select('.bus_site_layer > div > a')[:number]
    for oa in up_a_list:
        up_name_list.append(oa.string)
    # Number of stops and stop names in the return (down) direction
    down_a_list = soup.select('.bus_site_layer > div > a')[number:]
    down_total = len(down_a_list)
    down_name_list = []
    for oa in down_a_list:
        down_name_list.append(oa.string)
    # Collect everything into a dict
    item = {
        '线路名称': route_name,
        '运行时间': run_time,
        '票价信息': price_info,
        '公交公司': company,
        '更新时间': update_time,
        '上行个数': up_total,
        '上行站牌': up_name_list,
        '下行个数': down_total,
        '下行站牌': down_name_list,
    }
    string = json.dumps(item, ensure_ascii=False)
    fp.write(string + '\n')
    print('Finished crawling ---%s---' % route_name)
    # time.sleep(1)
def main():
    url = 'http://beijing.8684.cn/'
    number_char_list = parse_first_page(url)
    fp = open('北京.txt', 'w', encoding='utf8')
    # Request every digit/letter index page and parse the second-level pages
    for href in number_char_list:
        bus_href_list = parse_second_page(url, href)
        # Walk every bus-route detail page and extract the route details
        for href_detail in bus_href_list:
            parse_third_page(href_detail, fp)
    fp.close()
if __name__ == '__main__':
    main()
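Each line written to 北京.txt is one JSON object, so reading the crawl result back is straightforward; a small sketch that assumes the crawl above has already produced the file:

import json

with open('北京.txt', 'r', encoding='utf8') as fp:
    routes = [json.loads(line) for line in fp if line.strip()]
print(len(routes))           # number of routes crawled
print(routes[0]['线路名称'])  # name of the first route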