Requests优点
- 底层实现为urllib
- 方法在Python2和Python3中通用
- 自动对gzip等压缩方式进行解压
作用
发送简单请求
# -*- coding:utf-8 -*-
# Minimal requests example: send a GET and inspect the Response object.
import requests

# BUGFIX: the original left this assignment empty ("url ="), a SyntaxError.
url = 'http://www.baidu.com'
response = requests.get(url)

response.text             # body decoded to str (encoding guessed from HTTP headers)
response.content          # raw body as bytes
response.status_code      # HTTP status code, e.g. 200
response.request.headers  # headers that were sent with the request
response.headers          # headers returned by the server
response.url              # final URL after any redirects
- .text 和 .content的区别
- response.text
- 类型:str
- 解码类型: 根据HTTP 头部对响应的编码作出有根据的推测,推测的文本编码
- 如何修改编码方式:response.encoding = "gbk"
- response.content
- 类型:bytes
- 解码类型: 没有指定
- 如何修改编码方式:response.content.decode("utf8")
一般使用response.content.decode()来获取网页内容
# coding=utf-8
# demo: send a GET with a custom User-Agent header.
import requests

url = 'http://www.baidu.com'
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.15 Safari/537.36'}
# BUGFIX: requests.get() takes the keyword `headers=`, not `header=`
# (the original raised TypeError: unexpected keyword argument).
response = requests.get(url, headers=header)
# BUGFIX: baidu.com serves UTF-8, so use the default utf-8 decode, not gbk
# (decoding a UTF-8 body as gbk raises UnicodeDecodeError or garbles text).
response.content.decode()
发送带参数请求
# BUGFIX: dict(a:'50') is a SyntaxError — dict() takes keywords (dict(a='50'));
# a dict literal is the clearest form.
kw = {'a': '50'}
# Query-string parameters go in `params`; requests URL-encodes them onto the URL.
response = requests.get(url, params=kw)
小案例:实现爬取任意网站内容并保存
import requests
class TieBaSpider(object):
    """Crawl the pages of one Baidu Tieba forum and save each page as an HTML file."""

    def __init__(self, search_name):
        # Name of the forum (贴吧) to crawl; also used as the output file prefix.
        self.search_name = search_name
        # BUGFIX: the original embedded ("+self.search_name+") literally inside
        # the string; build the URL by actual concatenation instead. The trailing
        # {} is filled with the page offset below.
        temp_url = 'http://tieba.baidu.com/f?kw=' + search_name + '&ie=utf-8&pn={}'
        # Pre-build every page URL; Tieba paginates in steps of 50 posts.
        self.tie_ba_url = [temp_url.format(i * 50) for i in range(1000)]
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.15 Safari/537.36'}

    def parse_url(self, url):
        """Fetch one page and return its decoded HTML text."""
        # BUGFIX: the UA dict must be sent as `headers=`; the original passed it
        # as `params=`, appending it to the query string instead.
        response = requests.get(url, headers=self.header)
        # Tieba serves UTF-8; the original decoded with gbk and would fail.
        return response.content.decode()

    def save_html(self, html, page_num):
        """Write one page's HTML to '<forum>_<page_num>.html'."""
        file_path = self.search_name + '_' + str(page_num) + '.html'
        # Explicit encoding so the UTF-8 text round-trips on any platform.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html)
        print('保存成功')

    def run(self):
        """Download and save every prepared page URL in order."""
        # enumerate replaces the original list.index() call, which was an
        # O(n) lookup on every iteration (O(n^2) overall).
        for page_num, url in enumerate(self.tie_ba_url, start=1):
            html = self.parse_url(url)
            self.save_html(html, page_num)
if __name__ == "__main__":
    # Ask which forum to crawl, then hand control to the spider.
    forum_name = input("请输入要爬取的贴吧名字:")
    spider = TieBaSpider(forum_name)
    spider.run()