requests模块
功能:模拟浏览器发请求
使用:
指定url:
发起请求:
获取响应数据:
持久化存储:
简单示例
import requests

# Page we want to fetch.
target_url = 'https://cn.bing.com/'
# Issue the GET request; requests returns a Response object.
resp = requests.get(url=target_url)
# Response.text is the body decoded to a str.
html = resp.text
# Persist the page to disk.
with open('./bing.html', 'w', encoding='utf-8') as f:
    f.write(html)
UA伪装(反爬策略)
案例1:简易网页采集器
import requests

# Sogou web-search endpoint.
search_url = 'https://www.sogou.com/web'
# Spoof a browser User-Agent (UA masquerading) so the site serves the
# normal page instead of blocking the default requests UA.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
}
# Search term supplied interactively; passed as a URL query parameter.
keyword = input('enter a word:')
query_params = {'query': keyword}
# Fire the GET request with the headers and query string attached.
resp = requests.get(url=search_url, headers=request_headers, params=query_params)
# Save the raw HTML under <keyword>.html.
out_name = keyword + '.html'
with open(out_name, 'w', encoding='utf-8') as out:
    out.write(resp.text)
案例2:破解百度翻译
import requests
import json

# Baidu Translate's "sug" endpoint: POST a keyword, get suggestion
# data back as JSON.
url = 'https://fanyi.baidu.com/sug'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36 Edg/88.0.705.74'
}
# Word to look up, sent as form data under the key 'kw'.
kw = input('input your keyword:')
data = {
    'kw': kw
}
response = requests.post(url=url, data=data, headers=headers)
# The endpoint answers with JSON; decode straight into a dict.
dic_obj = response.json()
# FIX: the original left fp open (open() with no close); use a context
# manager so the file is always closed and flushed.
with open('./dog.json', 'w', encoding='utf-8') as fp:
    json.dump(dic_obj, fp=fp, ensure_ascii=False)
案例3:爬取豆瓣电影信息
import requests
import json

# Douban's movie chart API returns a JSON list of top-rated movies.
url = 'https://movie.douban.com/j/chart/top_list'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
}
# Query parameters: genre 24, rating interval 100:90, first 20 entries.
param = {
    'type': '24',
    'interval_id': '100:90',
    'action': '',
    'start': '1',
    'limit': '20'
}
list_data = requests.get(url=url, headers=headers, params=param).json()
# FIX: the original left fp open (resource leak); a with-block closes
# it deterministically.
with open('./douban.json', mode='w', encoding='utf-8') as fp:
    json.dump(list_data, fp=fp, ensure_ascii=False)
案例4:爬取肯德基餐厅位置
import requests
import json

# KFC store-locator endpoint: POST a keyword, get matching stores as JSON.
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
}
# Form data: search for stores matching the keyword (Beijing), page 1,
# 10 results per page.
data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': '1',
    'pageSize': '10'
}
page_json = requests.post(url=url, headers=headers, data=data).json()
# FIX: close the output file via a context manager instead of leaking
# the handle as the original did.
with open('./kfc.json', mode='w', encoding='utf-8') as fp:
    json.dump(page_json, fp=fp, ensure_ascii=False)
案例5:爬取药监局数据
import requests
import json

# NMPA (drug administration) license list endpoint: one POST fetches a
# page of license summaries, each carrying an ID used for a detail lookup.
url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:85.0) Gecko/20100101 Firefox/85.0'
}
# Form data: page 1, 15 rows, no name filters.
# NOTE(review): 'appltsn' looks like a typo for 'applysn', but it is what
# the server was sent originally — kept byte-for-byte.
data = {
    'on': 'true',
    'page': '1',
    'pageSize': '15',
    'productName': '',
    'conditionType': '1',
    'applyname': '',
    'appltsn': ''
}
# Collect every license ID from the summary page.
data_obj = requests.post(url=url, data=data, headers=headers).json()
# FIX: comprehension replaces the manual append loop; also avoids the
# original's loop variable `id`, which shadowed the builtin.
id_list = [item['ID'] for item in data_obj['list']]
# Fetch the detail record for each ID.
all_data_list = []
url_by_id = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for license_id in id_list:
    detail_json = requests.post(url=url_by_id,
                                headers=headers,
                                data={'id': license_id}).json()
    all_data_list.append(detail_json)
# FIX: the original never closed fp; the with-block guarantees the JSON
# is flushed and the handle released.
with open('./alldata.json', mode='w', encoding='utf-8') as fp:
    json.dump(all_data_list, fp=fp, ensure_ascii=False)