Requests爬虫学习(比较全面)

Requests

  • get请求
# GET request with query-string parameters
import requests

query = {'name': 'germey', 'age': 22}
response = requests.get('http://httpbin.org/get', params=query)
print(response.text)
# .json() parses the JSON response body into a Python dict
parsed = response.json()
print(parsed)
print(type(parsed))  # dict
  • 抓取网页
# Fetch a page with a browser-like User-Agent and extract link text by regex
import re
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
page = requests.get("https://www.zhihu.com/explore", headers=headers)
# re.S lets '.' match newlines, so a title may span several lines of HTML
title_re = re.compile('data-za-detail-view-id="5799">(.*?)</a>', re.S)
titles = title_re.findall(page.text)
print(titles)

  • 抓取二进制数据(图片,音频,视频)
# Download binary data (images, audio, video)
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
response = requests.get("https://static.zhihu.com/heifetz/assets/logo.f6eef033.png", headers=headers)
# Open the target file in binary mode and write the raw response body to it
with open('favicon.ico', 'wb') as out_file:
    out_file.write(response.content)
# Print the body both as decoded text and as raw bytes for comparison
print(response.text)
print(response.content)

  • post请求

通过 data 参数传递表单数据

# POST request: the dict passed as data= is sent as the request body
import requests

form = {
    'name': 'germey',
    'age': '22',
}
response = requests.post('http://httpbin.org/post', data=form)
print(response.text)
  • 内置状态码
# Built-in status codes
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}
r = requests.get('http://www.jianshu.com', headers=headers)
# requests.codes.ok is the library's built-in named constant for a
# successful response; compare against it instead of a bare number.
if r.status_code != requests.codes.ok:
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist in every interpreter.
    raise SystemExit('request failed with status %d' % r.status_code)
else:
    print('requests successfully')

  • 文件上传
# File upload
import requests

# Open the file in a with-block so the handle is closed once the upload
# finishes (the dict passed as files= maps the form field name to the file).
with open('favicon.ico', 'rb') as upload:
    files = {'file': upload}
    r = requests.post('http://httpbin.org/post', files=files)
print(r.text)
  • cookies
# Send a request with a Cookie header copied from a browser (author's note: this worked)
# NOTE(review): the Cookie value below looks like a real logged-in session;
# it should presumably be redacted or loaded from configuration - confirm.
import requests
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    'Cookie': '_zap=0ed08b6f-0536-4fe7-af62-4f6d8ed6f4dd; _xsrf=F6Y90l9FURL5ECK8gpYlQGQps7Yuivfk; _ga=GA1.2.1860576024.1590395089; _gid=GA1.2.1869275334.1590395089; d_c0="AIASYHrPUhGPTpVkggw52w-Mh96pfWj1TjA=|1590395092"; l_n_c=1; n_c=1; tst=r; _gat_gtag_UA_149949619_1=1; SESSIONID=jkiAxmQ7ZDL2AVy0A6H2PpVApGT52r439qr1fwSUNry; JOID=V1kQAEhA5dksKo32OEZXTMWDIacpLoq2bVTkowkcpZEYa92LDAyGE3QojfQ-WedhwryiHnxwLk17_0_1haUlr6E=; osd=U1sUAUNE590tIYn0PEdcSMeHIKwtLI63ZlDmpwgXoZMcataPDgiHGHAqifU1XeVlw7emHHhxJUl5-07-gachrqo=; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1590397983,1590398005,1590452552,1590452558; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1590452558; capsion_ticket="2|1:0|10:1590452558|14:capsion_ticket|44:OGM4Y2Y5YjRlMmMzNDg1NDg1ZDAyZjJiZTVmYjMwMmQ=|417320164b89b182d1b748aa5bc038129659f288d652a60944b3f07e98bc578f"; r_cap_id="ZmE5MTRmMzAxYzE5NDJlNWJlYzEyZDU5NWZkMTRjYzI=|1590452574|34668f68864e61b885bc460d2c978ad111aaf3fe"; cap_id="YzM5OGE4Mzk4MGE1NDk3OWFlY2JiMzVlNDkyZTI5YTY=|1590452574|2fecbf69fe55407269237f025db14726b9c516b5"; l_cap_id="N2UyNTcwYjhmODA4NDlhMTk5YzAwZWRhMWRkZjY2NTQ=|1590452574|a21b1162ec4e63037e5057e0781b81f2a014afcf"; z_c0=Mi4xZklkdUVBQUFBQUFBZ0JKZ2VzOVNFUmNBQUFCaEFsVk5iSy01WHdDUXRYSUxHMUhBa2hyc0dOdzcxc2gtMGRQa2dn|1590452588|c0a7c1e8e1b382331ac9396de95eab32e664564c; KLBRSID=d6f775bb0765885473b0cba3a5fa9c12|1590452589|1590452548',
    'Host': 'www.zhihu.com'
}
# The cookie travels as an ordinary request header alongside User-Agent and Host
r = requests.get('https://www.zhihu.com', headers=headers)
print(r.text)
# Cookies: read the cookies attached to a response
import requests

resp = requests.get("https://www.tianyancha.com")
jar = resp.cookies
print(jar)
# items() exposes the cookies as (name, value) pairs
for name, val in jar.items():
    print('='.join((name, val)))
  • session
# Keep cookies across requests by using a Session
import requests

session = requests.Session()
# First request through the session: this endpoint sets a cookie
session.get('http://httpbin.org/cookies/set/number/123456789')
# Second request through the same session: ask which cookies were sent
reply = session.get('http://httpbin.org/cookies')
# The cookie set above shows up in this response
print(reply.text)
# A session imitates one browser opening several pages of the same site;
# it is typically used to carry on with further steps after a simulated login.
  • SSL证书验证
# SSL certificate verification
import requests

# verify=False tells requests not to verify the server's certificate
status = requests.get('https://www.12306.cn', verify=False).status_code
print(status)
  • Prepared Request
# Prepared Request: build a request object first, then send it via a session
from requests import Request, Session

url = 'http://httpbin.org/post'
data = {'name': 'germey'}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
}

session = Session()
# Describe the request with Request(...), then let the session turn it
# into a prepared request
prepared = session.prepare_request(
    Request('POST', url, headers=headers, data=data)
)
# send() transmits the prepared request
response = session.send(prepared)
print(response.status_code)

Max retries exceeded报错!

在requests请求中加上verify=False,禁用证书验证

# headers must be passed by keyword: the second positional parameter of
# requests.get is params, so a positional dict would become the query string.
requests.get(url, headers=headers, verify=False)

加完verify=False之后还会有警告信息,在requests请求之前加上

import urllib3

# Turn off urllib3's warnings before issuing the unverified request
urllib3.disable_warnings()
# headers must be passed by keyword: the second positional parameter of
# requests.get is params, so a positional dict would become the query string.
requests.get(url, headers=headers, verify=False)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值