requests模块
- python中一款基于网络请求的第三方模块(需通过pip单独安装,并非内置),功能非常强大,简单便捷,效率极高
- 作用:模拟浏览器发请求。
- 如何使用:( requests模块的编码流程)
- 环境安装:
pip install requests
实战编码:
- 需求:爬取搜狗首页的页面数据
import requests
def get_sogou():
    """Fetch the Sogou homepage and persist its HTML to ./sogou.html.

    Side effects: prints the page source, writes ./sogou.html (UTF-8).
    """
    url = 'https://www.sogou.com/'
    # Consistency fix: every other example in this file masks the UA to
    # look like a real browser; this one was missing it, which can get the
    # request served an anti-bot page instead of the real homepage.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    response = requests.get(url=url, headers=headers)
    page_txt = response.text
    print(page_txt)
    with open('./sogou.html', 'w', encoding='utf-8') as f:
        f.write(page_txt)
    print('爬虫结束-----')


if __name__ == '__main__':
    get_sogou()
实战巩固
- 需求:爬取搜狗指定词条对应的搜索结果页面(简易网页采集器)
import requests
def get_sogou():
    """Prompt for a search term, query Sogou web search with a browser-like
    User-Agent, and save the result page as '<term>.html' (UTF-8)."""
    # Masquerade as a desktop Chrome browser so the request is not flagged.
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    search_url = 'https://www.sogou.com/web'
    keyword = input('请输入要查询的字符:')
    # The search term rides in the query string (?query=<keyword>).
    resp = requests.get(url=search_url, params={'query': keyword}, headers=ua_headers)
    with open(keyword + '.html', 'w', encoding='utf-8') as out_file:
        out_file.write(resp.text)
    print('保存成功')


if __name__ == '__main__':
    get_sogou()
- 需求:破解百度翻译
import requests
import json
def get_baidu():
    """Query Baidu Fanyi's /sug suggestion endpoint for a word and save the
    JSON response as '<word>.json' (UTF-8, non-ASCII kept readable).

    The endpoint expects a POST with a 'kw' form field and returns JSON.
    """
    url = 'https://fanyi.baidu.com/sug'
    # UA masquerading, consistent with the other examples in this file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    kw = input('请输入要翻译的字符')
    data = {
        'kw': kw
    }
    response = requests.post(url=url, data=data, headers=headers)
    dic_obj = response.json()
    # Bug fix: the original opened the file with a bare open() and never
    # closed it, leaking the handle; a context manager guarantees the data
    # is flushed and the file closed even if json.dump raises.
    with open(kw + '.json', 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Chinese text human-readable in the file.
        json.dump(dic_obj, fp=f, ensure_ascii=False)
    print('翻译成功')


if __name__ == '__main__':
    get_baidu()
- 需求:爬取豆瓣电影分类排行榜 https://movie.douban.com/ 中的电影详情数据
import json
import requests
def get_douban():
    """Fetch the first 20 entries of a Douban movie chart (type 11, rating
    interval 100:90) and save the JSON list to ./douban.json (UTF-8).
    """
    url = 'https://movie.douban.com/j/chart/top_list'
    # UA masquerading, consistent with the other examples in this file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    # Query-string parameters expected by the chart endpoint:
    # start/limit page through the list; interval_id is the rating band.
    param = {
        'type': '11',
        'interval_id': '100:90',
        'action': '',
        'start': '0',
        'limit': '20'
    }
    response = requests.get(url=url, params=param, headers=headers)
    list_data = response.json()
    # Bug fix: the original leaked the file handle (open() without close());
    # the with-block guarantees flush/close even if json.dump raises.
    with open('./douban.json', 'w', encoding='utf-8') as f:
        json.dump(list_data, fp=f, ensure_ascii=False)
    print('爬取成功')


if __name__ == '__main__':
    get_douban()
- 需求:爬取肯德基餐厅查询 http://www.kfc.com.cn/kfccda/index.aspx 中指定地点的餐厅数据
import json
import requests
def get_kfc():
    """Query the KFC store-locator API for stores matching keyword '深圳'
    and save the complete result set to ./kfc.json (UTF-8).

    The API is a POST whose form fields carry the search filters while the
    operation selector ('op') rides in the query string.
    """
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx'
    # UA masquerading, consistent with the other examples in this file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    data = {
        'cname': '',
        'pid': '',
        'keyword': '深圳',
        'pageIndex': '1',
        'pageSize': '10'
    }
    params = {
        'op': 'keyword'
    }
    response = requests.post(url=url, headers=headers, data=data, params=params)
    result = response.json()
    # The first page reports the total row count; when more than one page
    # exists, refetch with pageSize=total so a single request captures
    # every matching store.
    total = result['Table'][0]['rowcount']
    if total > 10:
        data['pageSize'] = total
        response = requests.post(url=url, headers=headers, data=data, params=params)
        result = response.json()
    # Bug fix: the original leaked the file handle (open() without close());
    # the with-block guarantees flush/close even if json.dump raises.
    with open('./kfc.json', 'w', encoding='utf-8') as f:
        json.dump(result, fp=f, ensure_ascii=False)
    print('爬取成功')


if __name__ == '__main__':
    get_kfc()