Python Web Scraping: Learning urllib


Using urllib

Using urlopen

Method signature:

urllib.request.urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)
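
The cafile, capath, and cadefault parameters deal with HTTPS certificate verification and are deprecated in recent Python 3 releases in favor of passing an ssl.SSLContext through context. A minimal sketch of setting a timeout and an explicit SSL context (the URL is just an example endpoint):

import ssl
import urllib.request

# Build a context from the system's default CA certificates
context = ssl.create_default_context()
response = urllib.request.urlopen("https://httpbin.org/get", timeout=5, context=context)
print(response.status)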

A simple GET request that fetches a URL and prints the body:

import urllib.request
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode("utf-8"))

A simple POST that sends form parameters:

import urllib.request
import urllib.parse

# Encode the form fields and convert to bytes; supplying data makes the request a POST
data = bytes(
    urllib.parse.urlencode({"word": "hello"}),
    encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(response.read().decode("utf-8"))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "word": "hello"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "json": null, 
  "origin": "117.61.146.4", 
  "url": "http://httpbin.org/post"
}

Timeout settings

import urllib.request
import urllib.error
import socket

# Timeout of 1s: the server responds within 1s, so no exception is raised
response = urllib.request.urlopen("https://httpbin.org/get", timeout=1)
print(response.read().decode("utf-8"))

# Timeout of 0.1s: the server does not respond in time, so an exception is raised
try:
    response = urllib.request.urlopen("https://httpbin.org/get", timeout=0.1)
except Exception as e:
    print("Exception:", e)
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "117.61.146.4", 
  "url": "https://httpbin.org/get"
}

Exception: <urlopen error timed out>
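
To tell a timeout apart from other connection problems, a common pattern (a sketch using the modules already imported above) is to catch URLError and check whether its reason is a socket.timeout:

import socket
import urllib.error
import urllib.request

try:
    response = urllib.request.urlopen("https://httpbin.org/get", timeout=0.1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):  # the wrapped reason is the underlying socket timeout
        print("TIME OUT")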

Response

Response type

import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(type(response))
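For an HTTP or HTTPS URL this prints the HTTPResponse class from http.client:
<class 'http.client.HTTPResponse'>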

Status code and response headers

import urllib.request
response = urllib.request.urlopen("https://httpbin.org/get")
print(response.status)
print(response.getheaders())
print(response.getheader("Server"))
200
[('Connection', 'close'), ('Server', 'gunicorn/19.9.0'), ('Date', 'Fri, 07 Sep 2018 04:15:35 GMT'), ('Content-Type', 'application/json'), ('Content-Length', '234'), ('Access-Control-Allow-Origin', '*'), ('Access-Control-Allow-Credentials', 'true'), ('Via', '1.1 vegur')]
gunicorn/19.9.0

Request

Construct a Request object carrying the url, data, and headers, then send it:

from urllib import request, parse

url = "http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
    "Host": "httpbin.org"
}
form = {
    "name": "domain"
}
data = bytes(parse.urlencode(form), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
response = request.urlopen(req)
print(response.read().decode("utf-8"))
Alternatively, add headers after constructing the Request with add_header(key, value):
from urllib import request, parse

url = "http://httpbin.org/post"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}
form = {
    "name": "domain"
}
data = bytes(parse.urlencode(form), encoding="utf-8")
req = request.Request(url=url, data=data, headers=headers, method="POST")
req.add_header("Host", "httpbin.org")
response = request.urlopen(req)
print(response.read().decode("utf-8"))

Handler

ProxyHandler (proxy handler)

import urllib.request

# Map request schemes to proxy addresses; only http requests go through this proxy
proxy_handler = urllib.request.ProxyHandler(
    {
        "http": "114.82.109.134:8118"
    }
)
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("https://www.baidu.com")
print("Response:", response.read().decode("utf-8"))

Response: <html>
<head>
    <script>
        location.replace(location.href.replace("https://","http://"));
    </script>
</head>
<body>
    <noscript><meta http-equiv="refresh" content="0;url=http://www.baidu.com/"></noscript>
</body>
</html>
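
Note that the mapping above only covers the http scheme, so https requests are not routed through the proxy. To proxy both schemes, supply both keys; a sketch with a hypothetical local proxy address (replace it with a reachable proxy):

import urllib.request

# Hypothetical proxy address, used here only as a placeholder
proxy_handler = urllib.request.ProxyHandler({
    "http": "http://127.0.0.1:8118",
    "https": "http://127.0.0.1:8118",
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open("https://httpbin.org/get")
print(response.read().decode("utf-8"))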

Getting cookies

import http.cookiejar, urllib.request

cookie = http.cookiejar.CookieJar()  # a CookieJar object to hold the cookies
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=ADCB88EA2EA49502F0371FBBC84CCBA4:FG=1
BIDUPSID=ADCB88EA2EA49502F0371FBBC84CCBA4
H_PS_PSSID=1449_21081_27112
PSTM=1536299781
BDSVRTM=0
BD_HOME=0
delPer=0

Saving cookies to a text file in the Firefox (Mozilla/Netscape) cookie format


import http.cookiejar, urllib.request

file_name = "cookie.txt"
cookie = http.cookiejar.MozillaCookieJar(file_name)  # MozillaCookieJar (a CookieJar subclass) can save cookies as a Firefox-format text file
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True, ignore_expires=True)  # write cookie.txt into the working directory
BAIDUID=407D2C4DF98E5FC997B72296A2BF061A:FG=1
BIDUPSID=407D2C4DF98E5FC997B72296A2BF061A
H_PS_PSSID=1454_25810_26910_21118_22072
PSTM=1536300020
BDSVRTM=0
BD_HOME=0
delPer=0

Saving cookies to a text file in LWP Cookie 2.0 format

import http.cookiejar, urllib.request

file_name = "cookie.txt"
cookie = http.cookiejar.LWPCookieJar(file_name)  # LWPCookieJar (a CookieJar subclass) saves cookies in LWP Cookie 2.0 format
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
for item in cookie:
    print(item.name + "=" + item.value)
cookie.save(ignore_discard=True, ignore_expires=True)
BAIDUID=1C3F3DADCA2959F54C5205D52E251B41:FG=1
BIDUPSID=1C3F3DADCA2959F54C5205D52E251B41
H_PS_PSSID=1458_21114_26350_20718
PSTM=1536300536
BDSVRTM=0
BD_HOME=0
delPer=0

Loading a cookie file saved in LWP Cookie 2.0 format; for a Firefox-format file, simply replace LWPCookieJar with MozillaCookieJar

import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()  # an empty LWPCookieJar for LWP Cookie 2.0 format cookies
cookie.load("cookie.txt", ignore_discard=True, ignore_expires=True)  # load the saved cookie file into the jar
cookie_handler = urllib.request.HTTPCookieProcessor(cookie)  # build a cookie handler from the jar
opener = urllib.request.build_opener(cookie_handler)
response = opener.open("http://www.baidu.com")
print(response.read().decode("utf-8"))

Exception handling

urllib.error defines two main exception classes: URLError and HTTPError. HTTPError is a subclass of URLError, so catch HTTPError first and then URLError.

from urllib import request, error

try:
    response = request.urlopen("https://www.cuiqingcai.com/index.html")
except error.HTTPError as e:
    print("code:", e.code, "reason:", e.reason, "headers:", e.headers, sep="\n")
except error.URLError as e:
    print(e.reason)

URL parsing

urlparse (parse a URL)

urlparse signature:
urlparse(url, scheme='', allow_fragments=True)
  • url: the URL to parse
  • scheme: default scheme, used only when the URL itself carries no scheme
  • allow_fragments: whether to split out the fragment (anchor). When False, the fragment is not separated; it stays attached to the query, or, if there is no query, to the preceding component (params or path).
Parse a URL and inspect its components:
from urllib.parse import urlparse

#urlparse(url, scheme='', allow_fragments=True)
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Setting the default scheme (no effect here, because the URL already carries its own scheme):
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https")
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')
Ignoring the fragment with allow_fragments=False:
result = urlparse("https://www.baidu.com/index.html;user?id=5#comment",scheme="https",allow_fragments=False)
print(result)
ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=5#comment', fragment='')

urlunparse (assemble a URL)

from urllib.parse import urlunparse

data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
print(urlunparse(data))
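The six elements map to scheme, netloc, path, params, query, and fragment, so the assembled URL is:
http://www.baidu.com/index.html;user?a=6#comment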

urljoin (join URLs)

urljoin resolves the second URL against the first: components supplied by the second URL take precedence, and any missing parts are filled in from the base URL.

from urllib.parse import urljoin

print(urljoin("http://www.baidu.com","FAQ.html"))
print(urljoin("http://www.baidu.com","https://www.caiqingcai.com/FAQ.html"))
print(urljoin("http://www.baidu.com/about","https://www.caiqingcai.com/FAQ.html"))



http://www.baidu.com/FAQ.html
https://www.caiqingcai.com/FAQ.html
https://www.caiqingcai.com/FAQ.html

urlencode (convert a dict into URL query parameters)

from urllib.parse import urlencode

params = {
    "name": "domain",
    "age": 17
}
base_url = "https://www.baidu.com?"
url = base_url + urlencode(params)
print(url)
https://www.baidu.com?name=domain&age=17
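
urlencode can also expand sequence values into repeated parameters by passing doseq=True; a small sketch:

from urllib.parse import urlencode

params = {"name": "domain", "tag": ["python", "spider"]}
print(urlencode(params, doseq=True))  # name=domain&tag=python&tag=spider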
