Python 3.x
3.0版本中已经将urllib2、urlparse、和robotparser并入了urllib中,并且修改urllib模块,其中包含5个子模块,即是help()中看到的那五个名字。
help(urllib)
2.x版本的python可以直接使用import urllib来进行操作,但是3.x版本的python使用的是import urllib.request来进行
import urllib.request

# Fetch a page and decode the body: in Python 3, read() returns bytes,
# so an explicit decode is needed before printing as text.
fp = urllib.request.urlopen('https://www.baidu.com')
mybytes = fp.read()
mystr = mybytes.decode('utf8')
fp.close()
print(mystr)
2.x版本
import urllib
print
urllib.
urlopen('http://www.google.com')
.
read()
最简单的方式:
#coding=utf-8
import urllib.request

# Simplest form: open the URL, read the raw bytes, decode to text.
response = urllib.request.urlopen('http://python.org/')
buff = response.read()
# display
html = buff.decode("utf8")
response.close()
print(html)
使用Request的方式:
#coding=utf-8
import urllib.request

# Same fetch, but going through an explicit Request object.
req = urllib.request.Request('http://www.voidspace.org.uk')
response = urllib.request.urlopen(req)
buff = response.read()
# display
the_page = buff.decode("utf8")
response.close()
print(the_page)
这种方式同样可以用来处理其他URL,例如FTP:
#coding=utf-8
import urllib.request

# Request/urlopen also handle other URL schemes, e.g. FTP.
req = urllib.request.Request('ftp://ftp.pku.edu.cn/')
response = urllib.request.urlopen(req)
buff = response.read()
# display
the_page = buff.decode("utf8")
response.close()
print(the_page)
使用POST请求:
import urllib.parse
import urllib.request

# POST request: passing a data argument to Request makes it a POST.
url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
# urlencode() returns a str; in Python 3 the POST body must be bytes,
# so encode it before handing it to Request.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
使用GET请求:
import urllib.request
import urllib.parse

# GET request: the encoded values are appended to the URL after '?'.
data = {}
data['name'] = 'Somebody Here'
data['location'] = 'Northampton'
data['language'] = 'Python'
url_values = urllib.parse.urlencode(data)
print(url_values)
# prints e.g.: name=Somebody+Here&language=Python&location=Northampton
# (dict ordering determines the parameter order)
url = 'http://www.example.com/example.cgi'
full_url = url + '?' + url_values
data = urllib.request.urlopen(full_url)
添加header:
import urllib.parse
import urllib.request

# POST with custom headers (here a User-Agent string).
url = 'http://www.someserver.com/cgi-bin/register.cgi'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
# The .encode() is mandatory: without it Request raises a TypeError,
# because POST data must be bytes in Python 3.
data = urllib.parse.urlencode(values).encode(encoding='UTF8')
req = urllib.request.Request(url, data, headers)
#req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
错误处理:
import urllib.request
import urllib.error  # urllib.error is not imported by urllib.request's import

# Error handling: URLError carries the failure reason.
req = urllib.request.Request('http://www.pretend_server.org')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    print(e.reason)
返回的错误代码:
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
responses = {
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),

    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted', 'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No Content', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),

    300: ('Multiple Choices', 'Object has several resources -- see URI list'),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not Modified', 'Document has not changed since given time'),
    305: ('Use Proxy',
          'You must use proxy specified in Location to access this '
          'resource.'),
    307: ('Temporary Redirect', 'Object moved temporarily -- see URI list'),

    400: ('Bad Request', 'Bad request syntax or unsupported method'),
    401: ('Unauthorized', 'No permission -- see authorization schemes'),
    402: ('Payment Required', 'No payment -- see charging schemes'),
    403: ('Forbidden', 'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed', 'Specified method is invalid for this server.'),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: ('Proxy Authentication Required',
          'You must authenticate with '
          'this proxy before proceeding.'),
    408: ('Request Timeout', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone', 'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable', 'Cannot satisfy request range.'),
    417: ('Expectation Failed', 'Expect condition could not be satisfied.'),

    500: ('Internal Server Error', 'Server got itself in trouble'),
    501: ('Not Implemented', 'Server does not support this operation'),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: ('Service Unavailable',
          'The server cannot process the request due to a high load'),
    504: ('Gateway Timeout',
          'The gateway server did not receive a timely response'),
    505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
}
1、最简单
import urllib.request

# 1. Simplest form. (URL literals must not contain stray spaces —
#    the padded ' http://... ' from the original would fail to parse.)
response = urllib.request.urlopen('http://python.org/')
html = response.read()
2、使用 Request
import urllib.request

# 2. Using a Request object.
req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
3、发送数据
#!/usr/bin/env python3
import urllib.parse
import urllib.request

# 3. Sending POST data with an extra Referer header.
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456'
}
# encode() is required: Request() rejects a str POST body in Python 3.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode('utf8'))
4、发送数据和header
#!/usr/bin/env python3
import urllib.parse
import urllib.request

# 4. Sending POST data together with a headers dict.
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456'
}
headers = {'User-Agent': user_agent}
# encode() is required: the POST body must be bytes in Python 3.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode('utf8'))
5、http 错误
#!/usr/bin/env python3
import urllib.request
import urllib.error  # needed: urllib.error is a separate submodule

# 5. HTTP errors: HTTPError exposes the status code and the error body.
req = urllib.request.Request('http://www.python.org/fish.html')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.read().decode('utf8'))
6、异常处理1
#!/usr/bin/env python3
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

# 6. Exception handling, variant 1: catch HTTPError before URLError
#    (HTTPError is a subclass of URLError, so order matters).
req = Request('http://twitter.com/')
try:
    response = urlopen(req)
except HTTPError as e:
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print('good!')
    print(response.read().decode('utf8'))
7、异常处理2
#!/usr/bin/env python3
from urllib.request import Request, urlopen
from urllib.error import URLError

# 7. Exception handling, variant 2: one except clause, attribute checks.
# NOTE(review): HTTPError instances have both .reason and .code, so the
# elif branch is unlikely to run — kept as in the original tutorial.
req = Request('http://twitter.com/')
try:
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
else:
    print('good!')
    print(response.read().decode('utf8'))
8、HTTP 认证
#!/usr/bin/env python3
import urllib.request

# 8. HTTP basic authentication via a password manager + auth handler.
# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = 'https://cms.tetx.com/'
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)
# use the opener to fetch a URL
a_url = 'https://cms.tetx.com/'
x = opener.open(a_url)
print(x.read())
# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen(a_url).read().decode('utf8')
print(a)
9、使用代理
#!/usr/bin/env python3
import urllib.request

# 9. Using a proxy.
# NOTE(review): ProxyHandler keys are URL schemes such as 'http'/'https';
# 'sock5' is not a scheme urllib handles natively — confirm the intended
# proxy type before relying on this.
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
a = urllib.request.urlopen('http://g.cn').read().decode('utf8')
print(a)
10、超时
#!/usr/bin/env python3
import socket
import urllib.request

# 10. Timeouts via the socket module's process-wide default.
# timeout in seconds
timeout = 2
socket.setdefaulttimeout(timeout)
# this call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module
req = urllib.request.Request('http://twitter.com/')
a = urllib.request.urlopen(req).read()
print(a)