Python 3.x
3.0版本中已经将urllib2、urlparse、和robotparser并入了urllib中,并且修改urllib模块,其中包含5个子模块,即是help()中看到的那五个名字。
help(urllib)
2.x版本的python可以直接使用import urllib来进行操作,但是3.x版本的python使用的是import urllib.request来进行
import urllib.request

# Fetch a page and decode the body: in Python 3, read() returns bytes,
# so an explicit decode is needed before printing as text.
fp = urllib.request.urlopen('https://www.baidu.com')
mybytes = fp.read()
mystr = mybytes.decode('utf8')
fp.close()
print(mystr)
2.x版本
import urllib
print
urllib.
urlopen('http://www.google.com')
.
read()
最简单的方式:
#coding=utf-8
import urllib.request

# Simplest form: open the URL, read the raw bytes, decode to text.
response = urllib.request.urlopen('http://python.org/')
buff = response.read()
# display
html = buff.decode("utf8")
response.close()
print(html)
使用Request的方式:
#coding=utf-8
import urllib.request

# Same fetch, but going through an explicit Request object.
req = urllib.request.Request('http://www.voidspace.org.uk')
response = urllib.request.urlopen(req)
buff = response.read()
# display
the_page = buff.decode("utf8")
response.close()
print(the_page)
这种方式同样可以用来处理其他URL,例如FTP:
#coding=utf-8
import urllib.request

# Request/urlopen also handle other URL schemes, e.g. FTP.
req = urllib.request.Request('ftp://ftp.pku.edu.cn/')
response = urllib.request.urlopen(req)
buff = response.read()
# display
the_page = buff.decode("utf8")
response.close()
print(the_page)
使用POST请求:
import urllib.parse
import urllib.request

# POST request: passing a data argument to Request makes it a POST.
url = 'http://www.someserver.com/cgi-bin/register.cgi'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
# urlencode() returns a str; in Python 3 the POST body must be bytes,
# so encode it before handing it to Request.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
使用GET请求:
import urllib.request
import urllib.parse

# GET request: the encoded values are appended to the URL after '?'.
data = {}
data['name'] = 'Somebody Here'
data['location'] = 'Northampton'
data['language'] = 'Python'
url_values = urllib.parse.urlencode(data)
print(url_values)
# prints e.g.: name=Somebody+Here&language=Python&location=Northampton
# (dict ordering determines the parameter order)
url = 'http://www.example.com/example.cgi'
full_url = url + '?' + url_values
data = urllib.request.urlopen(full_url)
添加header:
import urllib.parse
import urllib.request

# POST with custom headers (here a User-Agent string).
url = 'http://www.someserver.com/cgi-bin/register.cgi'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {'name': 'Michael Foord',
          'location': 'Northampton',
          'language': 'Python'}
headers = {'User-Agent': user_agent}
# The .encode() is mandatory: without it Request raises a TypeError,
# because POST data must be bytes in Python 3.
data = urllib.parse.urlencode(values).encode(encoding='UTF8')
req = urllib.request.Request(url, data, headers)
#req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
错误处理:
import urllib.request
import urllib.error  # urllib.error is not imported by urllib.request's import

# Error handling: URLError carries the failure reason.
req = urllib.request.Request('http://www.pretend_server.org')
try:
    urllib.request.urlopen(req)
except urllib.error.URLError as e:
    print(e.reason)
返回的错误代码:
# Table mapping response codes to messages; entries have the
# form {code: (shortmessage, longmessage)}.
responses = {
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),

    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
    202: ('Accepted', 'Request accepted, processing continues off-line'),
    203: ('Non-Authoritative Information', 'Request fulfilled from cache'),
    204: ('No Content', 'Request fulfilled, nothing follows'),
    205: ('Reset Content', 'Clear input form for further input.'),
    206: ('Partial Content', 'Partial content follows.'),

    300: ('Multiple Choices', 'Object has several resources -- see URI list'),
    301: ('Moved Permanently', 'Object moved permanently -- see URI list'),
    302: ('Found', 'Object moved temporarily -- see URI list'),
    303: ('See Other', 'Object moved -- see Method and URL list'),
    304: ('Not Modified', 'Document has not changed since given time'),
    305: ('Use Proxy',
          'You must use proxy specified in Location to access this '
          'resource.'),
    307: ('Temporary Redirect', 'Object moved temporarily -- see URI list'),

    400: ('Bad Request', 'Bad request syntax or unsupported method'),
    401: ('Unauthorized', 'No permission -- see authorization schemes'),
    402: ('Payment Required', 'No payment -- see charging schemes'),
    403: ('Forbidden', 'Request forbidden -- authorization will not help'),
    404: ('Not Found', 'Nothing matches the given URI'),
    405: ('Method Not Allowed', 'Specified method is invalid for this server.'),
    406: ('Not Acceptable', 'URI not available in preferred format.'),
    407: ('Proxy Authentication Required',
          'You must authenticate with '
          'this proxy before proceeding.'),
    408: ('Request Timeout', 'Request timed out; try again later.'),
    409: ('Conflict', 'Request conflict.'),
    410: ('Gone', 'URI no longer exists and has been permanently removed.'),
    411: ('Length Required', 'Client must specify Content-Length.'),
    412: ('Precondition Failed', 'Precondition in headers is false.'),
    413: ('Request Entity Too Large', 'Entity is too large.'),
    414: ('Request-URI Too Long', 'URI is too long.'),
    415: ('Unsupported Media Type', 'Entity body in unsupported format.'),
    416: ('Requested Range Not Satisfiable', 'Cannot satisfy request range.'),
    417: ('Expectation Failed', 'Expect condition could not be satisfied.'),

    500: ('Internal Server Error', 'Server got itself in trouble'),
    501: ('Not Implemented', 'Server does not support this operation'),
    502: ('Bad Gateway', 'Invalid responses from another server/proxy.'),
    503: ('Service Unavailable',
          'The server cannot process the request due to a high load'),
    504: ('Gateway Timeout',
          'The gateway server did not receive a timely response'),
    505: ('HTTP Version Not Supported', 'Cannot fulfill request.'),
}
1、最简单
import urllib.request

# 1. Simplest form. (URL literals must not contain stray spaces —
#    the padded ' http://... ' from the original would fail to parse.)
response = urllib.request.urlopen('http://python.org/')
html = response.read()
2、使用 Request
import urllib.request

# 2. Using a Request object.
req = urllib.request.Request('http://python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
3、发送数据
#!/usr/bin/env python3
import urllib.parse
import urllib.request

# 3. Sending POST data with an extra Referer header.
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456'
}
# encode() is required: Request() rejects a str POST body in Python 3.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data)
req.add_header('Referer', 'http://www.python.org/')
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode('utf8'))
4、发送数据和header
#!/usr/bin/env python3
import urllib.parse
import urllib.request

# 4. Sending POST data together with a headers dict.
url = 'http://localhost/login.php'
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
values = {
    'act': 'login',
    'login[email]': 'yzhang@i9i8.com',
    'login[password]': '123456'
}
headers = {'User-Agent': user_agent}
# encode() is required: the POST body must be bytes in Python 3.
data = urllib.parse.urlencode(values).encode('utf8')
req = urllib.request.Request(url, data, headers)
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page.decode('utf8'))
5、http 错误
#!/usr/bin/env python3
import urllib.request
import urllib.error  # needed: urllib.error is a separate submodule

# 5. HTTP errors: HTTPError exposes the status code and the error body.
req = urllib.request.Request('http://www.python.org/fish.html')
try:
    urllib.request.urlopen(req)
except urllib.error.HTTPError as e:
    print(e.code)
    print(e.read().decode('utf8'))
6、异常处理1
#!/usr/bin/env python3
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError

# 6. Exception handling, variant 1: catch HTTPError before URLError
#    (HTTPError is a subclass of URLError, so order matters).
req = Request('http://twitter.com/')
try:
    response = urlopen(req)
except HTTPError as e:
    print('The server couldn\'t fulfill the request.')
    print('Error code: ', e.code)
except URLError as e:
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print('good!')
    print(response.read().decode('utf8'))
7、异常处理2
#!/usr/bin/env python3
from urllib.request import Request, urlopen
from urllib.error import URLError

# 7. Exception handling, variant 2: one except clause, attribute checks.
# NOTE(review): HTTPError instances have both .reason and .code, so the
# elif branch is unlikely to run — kept as in the original tutorial.
req = Request('http://twitter.com/')
try:
    response = urlopen(req)
except URLError as e:
    if hasattr(e, 'reason'):
        print('We failed to reach a server.')
        print('Reason: ', e.reason)
    elif hasattr(e, 'code'):
        print('The server couldn\'t fulfill the request.')
        print('Error code: ', e.code)
else:
    print('good!')
    print(response.read().decode('utf8'))
8、HTTP 认证
#!/usr/bin/env python3
import urllib.request

# 8. HTTP basic authentication via a password manager + auth handler.
# create a password manager
password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
# Add the username and password.
# If we knew the realm, we could use it instead of None.
top_level_url = 'https://cms.tetx.com/'
password_mgr.add_password(None, top_level_url, 'yzhang', 'cccddd')
handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
# create "opener" (OpenerDirector instance)
opener = urllib.request.build_opener(handler)
# use the opener to fetch a URL
a_url = 'https://cms.tetx.com/'
x = opener.open(a_url)
print(x.read())
# Install the opener.
# Now all calls to urllib.request.urlopen use our opener.
urllib.request.install_opener(opener)
a = urllib.request.urlopen(a_url).read().decode('utf8')
print(a)
9、使用代理
#!/usr/bin/env python3
import urllib.request

# 9. Using a proxy.
# NOTE(review): ProxyHandler keys are URL schemes such as 'http'/'https';
# 'sock5' is not a scheme urllib handles natively — confirm the intended
# proxy type before relying on this.
proxy_support = urllib.request.ProxyHandler({'sock5': 'localhost:1080'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)
a = urllib.request.urlopen('http://g.cn').read().decode('utf8')
print(a)
10、超时
#!/usr/bin/env python3
import socket
import urllib.request

# 10. Timeouts via the socket module's process-wide default.
# timeout in seconds
timeout = 2
socket.setdefaulttimeout(timeout)
# this call to urllib.request.urlopen now uses the default timeout
# we have set in the socket module
req = urllib.request.Request('http://twitter.com/')
a = urllib.request.urlopen(req).read()
print(a)