Python Web Scraping: Learning urllib


Using urllib

Using urlopen

Method signature:

urllib.request.urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)
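
The cafile, capath, and cadefault parameters deal with HTTPS certificate verification and are deprecated in recent Python 3 releases in favor of passing an ssl.SSLContext through context. A minimal sketch of setting a timeout and an explicit SSL context (the URL is just an example endpoint):

import ssl
import urllib.request

# Build a context from the system's default CA certificates
context = ssl.create_default_context()
response = urllib.request.urlopen("https://httpbin.org/get", timeout=5, context=context)
print(response.status)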

A simple GET request that fetches a URL and prints the body:

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))

A simple POST that sends form parameters:

import urllib.request
import urllib.parse

# Encode the form fields and convert to bytes; supplying data makes the request a POST
data = bytes(
    urllib.parse.urlencode({"word": "hello"}),
    encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "word": "hello"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "json": null, 
  "origin": "117.61.146.4", 
  "url": "http://httpbin.org/post"
}

Timeout settings

import urllib.request
import urllib.error
import socket

# Timeout of 1s: the server responds within 1s, so no exception is raised
response = urllib.request.urlopen("https://httpbin.org/get", timeout=1)
print(response.read().decode("utf-8"))

# Timeout of 0.1s: the server does not respond in time, so an exception is raised
try:
    response = urllib.request.urlopen("https://httpbin.org/get", timeout=0.1)
except Exception as e:
    print("Exception:", e)
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "117.61.146.4", 
  "url": "https://httpbin.org/get"
}

Exception: <urlopen error timed out>
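
To tell a timeout apart from other connection problems, a common pattern (a sketch using the modules already imported above) is to catch URLError and check whether its reason is a socket.timeout:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen("https://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):  # the wrapped reason is the underlying socket timeout
        print("TIME OUT")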

Response

Response type

import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(type(response))
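For an HTTP or HTTPS URL this prints the HTTPResponse class from http.client:
<class 'http.client.HTTPResponse'>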

Status code and response headers

import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))
200
[('Connection', 'close'), ('Server', 'gunicorn/19.9.0'), ('Date', 'Fri, 07 Sep 2018 04:15:35 GMT'), ('Content-Type', 'application/json'), ('Content-Length', '234'), ('Access-Control-Allow-Origin', '*'), ('Access-Control-Allow-Credentials', 'true'), ('Via', '1.1 vegur')]
gunicorn/19.9.0

Request

Construct a Request object carrying the url, data, and headers, then send it:

from urllib import request, parse

url = "http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Host": "httpbin.org"
}
form = {
    "name": "domain"
}
data = bytes(parse.urlencode(form), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
Alternatively, add headers after constructing the Request with add_header(key, value):
from urllib import request, parse

url = "http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
form = {
    "name": "domain"
}
data = bytes(parse.urlencode(form), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
req.add_header("Host", "httpbin.org")
response = request.urlopen(req)
print(response.read().decode("utf-8"))

Handler

ProxyHandler (proxy handler)

import urllib.request

# Map request schemes to proxy addresses; only http requests go through this proxy
proxy_handler = urllib.request.ProxyHandler(
    {
        "http": "114.82.109.134:8118"
    }
)
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("https://www.baidu.com")
print("Response:", response.read().decode("utf-8"))

Response: <html>
<head>
    <script>
        location.replace(location.href.replace("https://","http://"));
    </script>
</head>
<body>
    <noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
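
Note that the mapping above only covers the http scheme, so https requests are not routed through the proxy. To proxy both schemes, supply both keys; a sketch with a hypothetical local proxy address (replace it with a reachable proxy):

import urllib.request

# Hypothetical proxy address, used here only as a placeholder
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://127.0.0.1:8118",
    "https": "http://127.0.0.1:8118",
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("https://httpbin.org/get")
print(response.read().decode("utf-8"))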

Getting cookies

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()  # a CookieJar object to hold the cookies
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=ADCB88EA2EA49502F0371FBBC84CCBA4:FG=1
BIDUPSID=ADCB88EA2EA49502F0371FBBC84CCBA4
H_PS_PSSID=1449_21081_27112
PSTM=1536299781
BDSVRTM=0
BD_HOME=0
delPer=0

Saving cookies to a text file in the Firefox (Mozilla/Netscape) cookie format


import http.cookiejar, urllib.request

file_name = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(file_name)  # MozillaCookieJar (a CookieJar subclass) can save cookies as a Firefox-format text file
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True, ignore_expires=True)  # write cookie.txt into the working directory
BAIDUID=407D2C4DF98E5FC997B72296A2BF061A:FG=1
BIDUPSID=407D2C4DF98E5FC997B72296A2BF061A
H_PS_PSSID=1454_25810_26910_21118_22072
PSTM=1536300020
BDSVRTM=0
BD_HOME=0
delPer=0

Saving cookies to a text file in LWP Cookie 2.0 format

import http.cookiejar, urllib.request

file_name = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(file_name)  # LWPCookieJar (a CookieJar subclass) saves cookies in LWP Cookie 2.0 format
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True, ignore_expires=True)
BAIDUID=1C3F3DADCA2959F54C5205D52E251B41:FG=1
BIDUPSID=1C3F3DADCA2959F54C5205D52E251B41
H_PS_PSSID=1458_21114_26350_20718
PSTM=1536300536
BDSVRTM=0
BD_HOME=0
delPer=0

Loading a cookie file saved in LWP Cookie 2.0 format; for a Firefox-format file, simply replace LWPCookieJar with MozillaCookieJar

import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()  # an empty LWPCookieJar for LWP Cookie 2.0 format cookies
cookie.load("cookie.txt", ignore_discard=True, ignore_expires=True)  # load the saved cookie file into the jar
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))

Exception handling

urllib.error defines two main exception classes: URLError and HTTPError. HTTPError is a subclass of URLError, so catch HTTPError first and then URLError.

from urllib import request, error

try:
    response = request.urlopen("https://www.cuiqingcai.com/index.html")
except error.HTTPError as e:
    print("code:", e.code, "reason:", e.reason, "headers:", e.headers, sep="\n")
except error.URLError as e:
    print(e.reason)

URL parsing

urlparse (parse a URL)

urlparse signature:
urlparse(url, scheme='', allow_fragments=True)
  • url: the URL to parse
  • scheme: default scheme, used only when the URL itself carries no scheme
  • allow_fragments: whether to split out the fragment (anchor). When False, the fragment is not separated; it stays attached to the query, or, if there is no query, to the preceding component (params or path).
Parse a URL and inspect its components:
from urllib.parse import urlparse

#urlparse(url, scheme='', allow_fragments=True)
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Setting the default scheme (no effect here, because the URL already carries its own scheme):
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Ignoring the fragment with allow_fragments=False:
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https",allow_fragments=False)
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')

urlunparse (assemble a URL)

from urllib.parse import urlunparse

data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))
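The six elements map to scheme, netloc, path, params, query, and fragment, so the assembled URL is:
http://www.baidu.com/index.html;user?a=6#comment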

urljoin (join URLs)

urljoin resolves the second URL against the first: components supplied by the second URL take precedence, and any missing parts are filled in from the base URL.

from urllib.parse import urljoin

print(urljoin("http://www.baidu.com","FAQ.html"))
print(urljoin("http://www.baidu.com","https://www.caiqingcai.com/FAQ.html"))
print(urljoin("http://www.baidu.com/about","https://www.caiqingcai.com/FAQ.html"))



http://www.baidu.com/FAQ.html
https://www.caiqingcai.com/FAQ.html
https://www.caiqingcai.com/FAQ.html

urlencode (convert a dict into URL query parameters)

from urllib.parse import urlencode

params = {
    "name": "domain",
    "age": 17
}
base_url = "https://www.baidu.com?"
url = base_url + urlencode(params)
print(url)
https://www.baidu.com?name=domain&age=17
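
urlencode can also expand sequence values into repeated parameters by passing doseq=True; a small sketch:

from urllib.parse import urlencode

params = {"name": "domain", "tag": ["python", "spider"]}
print(urlencode(params, doseq=True))  # name=domain&tag=python&tag=spider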
