1.urllib.request module
1.1 Versions
Python 2: urllib2, urllib
Python 3: urllib and urllib2 were merged into urllib.request
1.2 Common methods
- urllib.request.urlopen("URL") sends a request to the site and returns the response
- bytes = response.read()
- string = response.read().decode("utf-8")
- urllib.request.Request("URL", headers=dict): needed because urlopen() by itself cannot set a custom User-Agent
1.3 The response object
- read() reads the content of the server's response
- getcode() returns the HTTP status code
- geturl() returns the URL the data actually came from (useful when redirects happen)
import urllib.request
url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
# 1 create the request object with Request()
req = urllib.request.Request(url, headers=headers)
# 2 get the response object with urlopen()
res = urllib.request.urlopen(req)
# 3 read the response content with read()
html = res.read().decode('utf-8')
print(html)
print(res.getcode())  # the HTTP status code
print(res.geturl())   # the URL that was actually requested
2.urllib.parse module
2.1 Common methods
- urlencode(dict): the argument is a dict
- quote(string): the argument is a single string (see the quote() sketch after the example below)
import urllib.parse
import urllib.request
te = {'wd': '海贼王'}
result = urllib.parse.urlencode(te)
print(result)
# search for a keyword and save the result page locally as html
baseurl = 'https://www.baidu.com/s?'
key = input('请输入你要搜索的内容:')
# encode the query string with urlencode()
w = {'wd': key}
k = urllib.parse.urlencode(w)
# build the full url
url = baseurl + k
print(url)
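quote() handles a single string rather than a dict. A minimal sketch of the same Baidu search URL built with quote() (the keyword is just an example):
import urllib.parse
# quote() percent-encodes one string value, e.g. a Chinese keyword
kw = urllib.parse.quote('海贼王')
print(kw)  # %E6%B5%B7%E8%B4%BC%E7%8E%8B
print('https://www.baidu.com/s?wd=' + kw)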
3.Request methods
- GET: the query parameters are visible in the URL
- POST
- add a data parameter to the Request() call (see the sketch below)
- urllib.request.Request(url, data=data, headers=headers)
- data: the form data must be submitted as bytes, not str
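A minimal sketch of the POST pattern, using http://httpbin.org/post as a stand-in endpoint (the endpoint and form fields are not from the original notes):
import urllib.request
import urllib.parse
# the form data has to be encoded to bytes before it is passed as data=
data = urllib.parse.urlencode({'wd': 'hello'}).encode('utf-8')
req = urllib.request.Request('http://httpbin.org/post', data=data,
                             headers={'User-Agent': 'Mozilla/5.0'})
res = urllib.request.urlopen(req)
print(res.read().decode('utf-8'))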
Baidu Tieba: GET request example
import urllib.request
import urllib.parse
class BaiduSpider():
    # put the values that never change into __init__
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
        }
        self.baseurl = 'https://tieba.baidu.com/f?'
    def readPage(self, url):
        req = urllib.request.Request(url, headers=self.headers)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        return html
    def writePage(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
        print('写入成功')
    def main(self):
        name = input('请输入贴吧的名字:')
        begin = int(input('请输入起始页:'))
        end = int(input('请输入结束页:'))
        kw = {'kw': name}
        kw = urllib.parse.urlencode(kw)
        for i in range(begin, end + 1):
            pn = (i - 1) * 50
            # baseurl = 'https://tieba.baidu.com/f?'
            url = self.baseurl + kw + '&pn=' + str(pn)
            # fetch the page, then save it
            html = self.readPage(url)
            filename = '第' + str(i) + '页.html'
            self.writePage(filename, html)
if __name__ == '__main__':
    # to call main() we need to instantiate the class
    spider = BaiduSpider()
    spider.main()
Youdao Translate: POST request example
import urllib.request
import urllib.parse
import json
# prompt for the text to translate
key = input('请输入你要翻译的内容:')
# the form data of the request
data = {
    'i': key,
    'from': 'AUTO',
    'smartresult': 'dict',
    'client': 'fanyideskweb',
    'salt': '15880623642174',
    'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
    'ts': '1588062364217',
    'bv': '42160534cfa82a6884077598362bbc9d',
    'doctype': 'json',
    'version': '2.1',
    'keyfrom': 'fanyi.web',
    'action': 'FY_BY_CLICKBUTTION'
}
data = urllib.parse.urlencode(data)
data = bytes(data, 'utf-8')
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
req = urllib.request.Request(url, data=data, headers=headers)
res = urllib.request.urlopen(req)
html = res.read().decode('utf-8')
# print(type(html), html)
'''
{"type":"ZH_CN2EN","errorCode":0,"elapsedTime":1,"translateResult":[[{"src":"你好","tgt":"hello"}]]}
'''
# convert the JSON string into a Python dict
r_dict = json.loads(html)
# print(type(r_dict), r_dict)
# peel off the nesting layer by layer:
# [[{"src":"你好","tgt":"hello"}]]
# [{"src":"你好","tgt":"hello"}]
# {"src":"你好","tgt":"hello"}
# 'hello'
r = r_dict['translateResult'][0][0]['tgt']
print(r)
4.requests module
4.1 Why learn requests instead of urllib
1. requests is built on top of urllib3 under the hood
2. requests works the same way on Python 2 and Python 3, with identical methods
3. requests is simple and easy to use
4. requests automatically decompresses (gzip-compressed) page content for us
4.2 What requests does
Purpose: send network requests and return the response data
Chinese documentation: http://docs.python-requests.org/zh_CN/latest/
4.3 Installation
- pip install requests
- or install it from within your IDE
4.4 Common methods
- requests.get(URL)
4.5 Methods and attributes of the response object
- response.text returns the data as a decoded str (unicode)
- response.content returns the raw bytes
- response.content.decode('utf-8') decodes the bytes manually
- response.url returns the URL of the response
- response.encoding is the encoding used to decode response.text
4.6 Ways to handle encoding in requests
response.content.decode()
response.content.decode('gbk')
response.text
response.encoding = 'utf-8'
4.7 The difference between response.text and response.content
response.text
- type: str
- change the encoding with: response.encoding = 'utf-8'
response.content
- type: bytes
- decode it yourself with: response.content.decode('utf-8')
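A minimal sketch contrasting the two (Baidu's homepage is used only as an example URL):
import requests
response = requests.get('https://www.baidu.com/')
print(type(response.content))  # <class 'bytes'>: the raw body
print(type(response.text))     # <class 'str'>: decoded with the guessed encoding
response.encoding = 'utf-8'    # tell requests which codec .text should use
print(response.text[:100])
print(response.content.decode('utf-8')[:100])  # or decode the bytes yourself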
Sending a simple request
response = requests.get(url)
Common methods of response:
response.text
response.content
response.status_code
response.request.headers
response.headers
Downloading an image
import requests
response = requests.get('https://www.baidu.com/img/bd_logo1.png?where=su')
with open('baidu.png', 'wb') as f:
    f.write(response.content)
Sending a request with headers
Why does the request need headers?
To imitate a browser and fool the server into returning the same content a browser would get
Headers take the form of a dict:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6)'}
# Usage: requests.get(url, headers=headers)
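A minimal sketch that sends a custom User-Agent (reusing the UA string from the earlier examples) and checks what was actually sent:
import requests
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'}
response = requests.get('https://www.baidu.com/', headers=headers)
print(response.request.headers['User-Agent'])  # the User-Agent that was actually sent
print(response.status_code)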
Tieba exercise
import requests
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=0   page 1
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=50  page 2
# http://tieba.baidu.com/f?kw=python&ie=utf-8&pn=100 page 3
class TiebaSpider(object):
    def __init__(self, tieba_name):
        self.tieba_name = tieba_name
        self.url = "http://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
    def get_url_list(self):
        """Build the list of URLs"""
        # url_list = []
        # for i in range(10):
        #     url_list.append(self.url.format(i*50))
        # print(url_list)
        # the same thing as a list comprehension
        return [self.url.format(i * 50) for i in range(10)]
    def parse_url(self, url):
        """Send the request and get the response"""
        response = requests.get(url)
        return response.text
    def save_html(self, page_num, tb_html):
        """Save the page"""
        file_path = 'html/{}-第{}页.html'.format(self.tieba_name, page_num)
        # e.g. python-第1页.html
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(tb_html)
    def run(self):
        """The main workflow"""
        # 1. build the URL list
        tieba_url_list = self.get_url_list()
        # 2. send a request for each URL and get the response
        for tburl in tieba_url_list:
            print(tburl)
            tb_html = self.parse_url(tburl)
            # 3. save the page
            page_num = tieba_url_list.index(tburl) + 1
            self.save_html(page_num, tb_html)
if __name__ == '__main__':
    tb_spider = TiebaSpider('lol')
    tb_spider.run()
4.8 Sending POST requests
Where POST requests are used:
1. Login and registration (POST is safer than GET)
2. When a large amount of text has to be sent (POST puts no limit on the data length)
A crawler needs to imitate the browser and send POST requests in these two cases as well
Usage:
response = requests.post("http://www.baidu.com/", data=data, headers=headers)
Youdao Translate
import requests
import json
key = input('请输入您要翻译的内容:')
data = {
'i': key,
'from': 'AUTO',
'smartresult': 'dict',
'client': 'fanyideskweb',
'salt': '15880623642174',
'sign': 'c6c2e897040e6cbde00cd04589e71d4e',
'ts': '1588062364217',
'bv': '42160534cfa82a6884077598362bbc9d',
'doctype': 'json',
'version': '2.1',
'keyfrom':'fanyi.web',
'action': 'FY_BY_CLICKBUTTION'
}
url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36'
}
res = requests.post(url,data=data,headers=headers)
res.encoding = 'utf-8'
html = res.text
print(html)
4.9 Setting a proxy in requests
- keep a pool of IP addresses and pick one at random for each request
- with requests, a proxy is used simply by passing the proxies parameter to the request method
- proxy sites
- Xici free proxies: http://www.xicidaili.com/
- Kuaidaili: http://www.kuaidaili.com/
- Dailiyun: http://www.dailiyun.com/
import requests
# set up the proxy (requests matches the scheme keys in lowercase)
proxy = {
    'https': '223.199.21.229:9999',
    'http': '163.204.241.204:9999'
}
url = 'http://httpbin.org/ip'  # this URL echoes the origin IP the target site sees
res = requests.get(url, proxies=proxy)
print(res.text)
- Checking whether a proxy IP is usable
- 1 online proxy-checking sites
- 2 check it with requests (see the sketch below)
- a status code of 200 means the proxy works
- http://httpbin.org/ip returns the origin IP seen by the site you visit
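A minimal sketch of checking a proxy with requests; the proxy address is just a placeholder from the earlier example and the timeout value is an arbitrary choice:
import requests
proxy = {'http': '163.204.241.204:9999'}  # placeholder IP from a free proxy list
try:
    res = requests.get('http://httpbin.org/ip', proxies=proxy, timeout=5)
    print(res.status_code)  # 200 means the proxy responded
    print(res.text)         # the origin IP the target site saw
except requests.exceptions.RequestException as e:
    print('proxy unusable:', e)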
4.10 Handling untrusted SSL certificates
import requests
url = 'https://inv-veri.chinatax.gov.cn/'
res = requests.get(url,verify=False)
print(res.text)
4.11 Cookies and sessions
- Cookie data is stored in the client's browser; session data is stored on the server.
- Cookies are not very secure: anyone can inspect the cookies saved locally and use them for cookie spoofing.
- A session is kept on the server for a certain period of time; as traffic grows, it eats into the server's performance.
- A single cookie cannot hold more than 4 KB of data, and many browsers limit a site to at most 20 cookies.
- How a crawler handles cookies and sessions
- The benefit of carrying cookies/a session: you can request pages that require login.
- The drawback: a set of cookies/a session usually maps to one user; if that one user sends too many requests, the server can easily identify it as a crawler.
- Avoid cookies when you do not need them, but to get pages behind a login we have to send requests that carry cookies.
# requests provides a Session class that keeps the session alive between client and server
# Usage:
# 1 create a session object
# 2 send GET or POST requests through the session
session = requests.session()
response = session.get(url, headers=headers)
- How to request a site that requires login:
- 1 create a session
- 2 first use the session to send the login request, so the cookies are kept in the session
- 3 then use the same session to request the pages behind the login; the session automatically carries the cookies saved when the login succeeded
# Log in to renren.com
import requests
from configparser import ConfigParser
cfg = ConfigParser()
r = cfg.read('password.ini')
pwd = cfg.get('password', 'rr_pwd')
# print(pwd)
session = requests.session()
# print(session)
post_url = 'http://www.renren.com/PLogin.do'
post_data = {
    'email': '',
    'password': pwd
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36'
}
# the cookies returned by the login are stored in the session
session.post(post_url, data=post_data, headers=header)
# request a page that is only visible after login
response = session.get('http://www.renren.com/474133869/profile', headers=header)
with open('renren.html', 'w', encoding='utf-8') as f:
    f.write(response.text)
# password.ini
[password]
rr_pwd:123
[username]
name:xxx
- Getting the logged-in page with cookies alone, without sending a POST request
- works for sites whose cookies have a long expiry time
- grab all the data before the cookie expires; this can be combined with another program that is dedicated to fetching fresh cookies while the current program only requests pages
import requests
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.75 Safari/537.36',
# option 1: put the whole cookie string directly in the header (commented out below)
# 'Cookie': 'anonymid=k2oc3i563739.157310916 5-b85a-c47b678c6576a9; taihe_bi_sdk_session=71536662d9085ea132e779d1f3bdecc9; ick=148dfccc-031c-4b09-82df-2ac13756cbf5; __utma=151146938.373373739.1573109168.1573109270.1578395623.2; __utmc=151146938; __utmz=151146938.1578395623.2.2.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; first_login_flag=1; ln_uact=844297347@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn221/20130723/1740/h_main_w77W_bf650000112f113e.jpg; jebe_key=af8c32b9-c6ec-4dc2-85d7-f08316801751%7C190f352586cb7e166243b51272b9d865%7C1578395705525%7C1%7C1578395705014; wp=1; jebecookies=86f9275f-2134-4759-9923-6ccaa6c9b3ea|||||; _de=1FAF01AFA458C343C5BC13FF67F7809D696BF75400CE19CC; p=0156b2612644e97a069aac78e97273b79; t=587cc526c47870cb39330ab35ec056f09; societyguester=587cc526c47870cb39330ab35ec056f09; id=474133869; xnsid=7fb455c1; ver=7.0; loginfrom=null; wp_fold=0'
}
cookie = 'ano57310916gM0sT-9w; ick_logdae86336a; taihe_bi_sdk_uid=9fcc8763c918200f1bc47b678c6576a9; taihe_bi_sdk_session=71536662d9085ea132e779d1f3bdecc9; ick=148dfccc-031c-4b09-82df-2ac13756cbf5; __utma=151146938.373373739.1573109168.1573109270.1578395623.2; __utmc=151146938; __utmz=151146938.1578395623.2.2.utmcsr=renren.com|utmccn=(referral)|utmcmd=referral|utmcct=/; first_login_flag=1; ln_uact=844297347@qq.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn221/20130723/1740/h_main_w77W_bf650000112f113e.jpg; jebe_key=af8c32b9-c6ec-4dc2-85d7-f08316801751%7C190f352586cb7e166243b51272b9d865%7C1578395705525%7C1%7C1578395705014; wp=1; jebecookies=86f9275f-2134-4759-9923-6ccaa6c9b3ea|||||; _de=1FAF01AFA458C343C5BC13FF67F7809D696BF75400CE19CC; p=0156b2612644e97a069aac78e97273b79; t=587cc526c47870cb39330ab35ec056f09; societyguester=587cc526c47870cb39330ab35ec056f09; id=474133869; xnsid=7fb455c1; ver=7.0; loginfrom=null; wp_fold=0'
# cookie = {'anonymid': 'k2oc3i56ypqdrc'}
# 'xnsid:7fb455c1'
# option 2: convert the cookie string into a dict and pass it via cookies=
cookies = {i.split('=', 1)[0]: i.split('=', 1)[1] for i in cookie.split('; ') if '=' in i}
print(cookies)
url = 'http://www.renren.com/474133869/profile'  # this page is only visible after login
r = requests.get(url, headers=header, cookies=cookies)
with open('renren3.html', 'w', encoding='utf-8') as f:
    f.write(r.text)