Using urllib
Using urlopen
Method signature:
urllib.request.urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)
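Of these parameters, the examples below cover url, data and timeout; the remaining keyword arguments configure HTTPS certificate verification (cafile, capath and cadefault are older alternatives to context). A minimal sketch, using only the standard library, that passes an explicit SSL context:
import ssl
import urllib.request

# Build a default SSL context (verifies server certificates against the
# system CA store) and hand it to urlopen via the context parameter
context = ssl.create_default_context()
response = urllib.request.urlopen("https://httpbin.org/get", context=context)
print(response.status)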
A simple GET request: open a URL and print the response body
import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))
A simple POST request passing form parameters
import urllib.request
import urllib.parse
data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
{
  "args": {},
  "data": "",
  "files": {},
  "form": {
    "word": "hello"
  },
  "headers": {
    "Accept-Encoding": "identity",
    "Connection": "close",
    "Content-Length": "10",
    "Content-Type": "application/x-www-form-urlencoded",
    "Host": "httpbin.org",
    "User-Agent": "Python-urllib/3.7"
  },
  "json": null,
  "origin": "117.61.146.4",
  "url": "http://httpbin.org/post"
}
Timeout settings
import urllib.request
import urllib.error
import socket
# Set the timeout to 1s; the request succeeds as long as the server responds within 1s
response = urllib.request.urlopen("https://httpbin.org/get",timeout=1)
print(response.read().decode("utf-8"))
# Set the timeout to 0.1s; the request times out if the server does not respond within 0.1s
try:
    response = urllib.request.urlopen("https://httpbin.org/get", timeout=0.1)
except Exception as e:
    print("Exception:", e)
{
  "args": {},
  "headers": {
    "Accept-Encoding": "identity",
    "Connection": "close",
    "Host": "httpbin.org",
    "User-Agent": "Python-urllib/3.7"
  },
  "origin": "117.61.146.4",
  "url": "https://httpbin.org/get"
}
Exception: <urlopen error timed out>
Response
Response type
import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(type(response))
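On Python 3 this prints the response class from the http.client module, typically:
<class 'http.client.HTTPResponse'>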
Status code and response headers
import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))
200
[('Connection', 'close'), ('Server', 'gunicorn/19.9.0'), ('Date', 'Fri, 07 Sep 2018 04:15:35 GMT'), ('Content-Type', 'application/json'), ('Content-Length', '234'), ('Access-Control-Allow-Origin', '*'), ('Access-Control-Allow-Credentials', 'true'), ('Via', '1.1 vegur')]
gunicorn/19.9.0
Request
Construct a Request object, attach the headers, data and URL, then send the request
from urllib import request,parse
url="http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Host": "httpbin.org"
}
params = {
    "name": "domain"
}
data = bytes(parse.urlencode(params), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
Adding a header with add_header(key, value)
from urllib import request,parse
url="http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
params = {
    "name": "domain"
}
data = bytes(parse.urlencode(params), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
req.add_header("Host", "httpbin.org")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
Handler
ProxyHandler (proxy handler)
import urllib.request
# Only URLs whose scheme matches a key in the mapping are routed through the
# proxy, so both http and https are mapped to the (sample) proxy address here
proxy_handler = urllib.request.ProxyHandler({
    "http": "114.82.109.134:8118",
    "https": "114.82.109.134:8118"
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("https://www.baidu.com")
print("Response body:", response.read().decode("utf-8"))
Response body: <html>
<head>
<script>
location.replace(location.href.replace("https://","http://"));
</script>
</head>
<body>
<noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
Cookie
Getting cookies
import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()  # create a CookieJar object to hold the cookies
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the CookieJar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=ADCB88EA2EA49502F0371FBBC84CCBA4:FG=1
BIDUPSID=ADCB88EA2EA49502F0371FBBC84CCBA4
H_PS_PSSID=1449_21081_27112
PSTM=1536299781
BDSVRTM=0
BD_HOME=0
delPer=0
Saving cookies to a text file in Mozilla (Firefox) cookie format
import http.cookiejar, urllib.request
file_name="cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(file_name)  # MozillaCookieJar, a CookieJar subclass, saves cookies to a file in Mozilla format
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the CookieJar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True, ignore_expires=True)  # save() writes cookie.txt to the working directory
BAIDUID=407D2C4DF98E5FC997B72296A2BF061A:FG=1
BIDUPSID=407D2C4DF98E5FC997B72296A2BF061A
H_PS_PSSID=1454_25810_26910_21118_22072
PSTM=1536300020
BDSVRTM=0
BD_HOME=0
delPer=0
Saving cookies to a text file in LWP Cookies 2.0 format
import http.cookiejar, urllib.request
file_name="cookie.txt"
cookie = http.cookiejar.LWPCookieJar(file_name)  # LWPCookieJar, another CookieJar subclass, saves cookies in LWP Cookies 2.0 format
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the CookieJar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True,ignore_expires=True)
BAIDUID=1C3F3DADCA2959F54C5205D52E251B41:FG=1
BIDUPSID=1C3F3DADCA2959F54C5205D52E251B41
H_PS_PSSID=1458_21114_26350_20718
PSTM=1536300536
BDSVRTM=0
BD_HOME=0
delPer=0
Loading a cookie text file in LWP Cookies 2.0 format; for a Mozilla-format file, simply replace LWPCookieJar with MozillaCookieJar
import http.cookiejar, urllib.request
cookie = http.cookiejar.LWPCookieJar()  # LWPCookieJar can read back cookies stored in LWP Cookies 2.0 format
cookie.load("cookie.txt", ignore_discard=True, ignore_expires=True)  # load the saved cookie file into the jar
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the CookieJar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))
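Since build_opener accepts any number of handlers, the proxy handler and the cookie handler shown above can be combined into a single opener. A sketch, assuming the sample proxy address from the earlier example is still usable:
import http.cookiejar
import urllib.request

# Requests made through this opener go via the proxy and collect cookies
proxy_handler = urllib.request.ProxyHandler({
    "http": "114.82.109.134:8118",
    "https": "114.82.109.134:8118"
})
cookie = http.cookiejar.CookieJar()
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(proxy_handler, cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)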
Exception handling
There are two exception classes, URLError and HTTPError. HTTPError is a subclass of URLError, so the usual pattern is to catch HTTPError first and then fall back to URLError.
from urllib import request,error
try:
    response = request.urlopen("https://www.cuiqingcai.com/index.html")
except error.HTTPError as e:
    print("code:", e.code, "reason:", e.reason, "headers:", e.headers, sep="\n")
except error.URLError as e:
    print(e.reason)
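Note that e.reason is not always a plain string; on a timeout it holds the underlying socket.timeout exception, so it can be used to tell a timeout apart from other URLError causes. A small sketch, again using httpbin.org as the target:
import socket
from urllib import request, error

try:
    # A very small timeout so the request is almost certain to time out
    response = request.urlopen("https://httpbin.org/get", timeout=0.01)
except error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("TIME OUT")
    else:
        print(e.reason)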
URL parsing
urlparse (parsing a URL)
urlparse signature:
urlparse(url, scheme='', allow_fragments=True)
- url: the URL to parse
- scheme: the default scheme, used only when the URL itself does not specify one
- allow_fragments: whether to split out the fragment (anchor); when False the fragment is not separated and stays attached to the query, or to the path if there is no query
Parsing a URL into its components
from urllib.parse import urlparse
#urlparse(url, scheme='', allow_fragments=True)
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Setting a default scheme (it only takes effect when the URL itself omits one, so the output here is unchanged)
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Ignoring the fragment (allow_fragments=False)
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https",allow_fragments=False)
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')
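When the URL contains no query string, the unsplit fragment stays attached to the path instead, as noted in the parameter list above. A small check using a similar URL that has no params or query:
result = urlparse("https://www.baidu.com/index.html#comment", allow_fragments=False)
print(result)
which should print:
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')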
urlunparse (assembling a URL)
from urllib.parse import urlunparse
data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))
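This reassembles the six components in ParseResult order (scheme, netloc, path, params, query, fragment) and should print:
http://www.baidu.com/index.html;user?a=6#comment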
urljoin (joining URLs)
When joining, any component present in the second URL takes precedence; missing components are filled in from the base URL.
from urllib.parse import urljoin
print(urljoin("http://www.baidu.com","FAQ.html"))
print(urljoin("http://www.baidu.com","https://www.caiqingcai.com/FAQ.html"))
print(urljoin("http://www.baidu.com/about","https://www.caiqingcai.com/FAQ.html"))
http://www.baidu.com/FAQ.html
https://www.caiqingcai.com/FAQ.html
https://www.caiqingcai.com/FAQ.html
urlencode (converting a dict into URL query parameters)
from urllib.parse import urlencode
params = {
    "name": "domian",
    "age": 17
}
base_url = "https://www.baidu.com?"
url = base_url + urlencode(params)
print(url)
https://www.baidu.com?name=domian&age=17