Opening a Web Page
- urllib.request.urlopen(url)
import urllib.request
# Open the page; returns an HTTP response object
response = urllib.request.urlopen('http://www.baidu.com')
# Status code
print(response.getcode())
# URL that was actually fetched (after any redirects)
print(response.geturl())
# Response headers
print(response.getheaders())
# Read the body (bytes)
print(response.read())
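urlopen also accepts a timeout in seconds, and the response object works as a context manager, which closes the connection automatically. A minimal sketch (example.com is just a placeholder URL, and 10 seconds is an arbitrary choice):

import urllib.request

# Placeholder URL and timeout, for illustration only
with urllib.request.urlopen('http://example.com', timeout=10) as response:
    html = response.read().decode('utf-8')  # decode bytes to str
    print(html[:100])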
Browser Spoofing
import urllib.request

url = 'http://www.baidu.com'
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
}  # pretend to be a browser
# Create a Request object carrying the custom headers
request = urllib.request.Request(url, headers=header)
# Pass the Request object to urlopen; returns a response object
response = urllib.request.urlopen(request)
# Read and decode the body
text = response.read().decode('utf-8')
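The same Request object can also carry a POST body: urlencode the form fields and pass them as bytes via the data parameter (a non-None data turns the request into a POST). A sketch, using httpbin.org/post as a throwaway test endpoint:

import urllib.request
from urllib import parse

url = 'http://httpbin.org/post'
header = {'User-Agent': 'Mozilla/5.0'}
# data must be bytes, so encode the urlencoded string
data = parse.urlencode({'q': 'python'}).encode('utf-8')
request = urllib.request.Request(url, data=data, headers=header)
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))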
URL Encoding and Decoding
- from urllib import parse
- Encode: urlencode (takes a dict)
- Encode: quote (takes a string)
- Decode: unquote
from urllib import parse

keyword = 'Python爬虫'
# urlencode: dict -> query string
dic = {"wd": keyword, 'q': 120}
wd = parse.urlencode(dic)
print(wd)
# quote: percent-encode a single string
quote = parse.quote(keyword)
print(quote)
# unquote: decode back
unquote = parse.unquote(quote)
print(unquote)
Output
wd=Python%E7%88%AC%E8%99%AB&q=120
Python%E7%88%AC%E8%99%AB
Python爬虫
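Note that quote leaves '/' unescaped by default (its safe parameter defaults to '/'), while quote_plus also escapes '/' and encodes spaces as '+'. A quick sketch:

from urllib.parse import quote, quote_plus

s = 'a b/c'
print(quote(s))           # a%20b/c   ('/' kept, space -> %20)
print(quote_plus(s))      # a+b%2Fc   ('/' escaped, space -> '+')
print(quote(s, safe=''))  # a%20b%2Fc (escape everything)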
Encoding example: building Taobao search URLs
from urllib import parse

# Search keyword
keyword = 'python人工智能'
# Base Taobao search URL
original_url = 'https://s.taobao.com/search?'
for i in range(9):
    # Page offset (Taobao pages step by 44 items) + keyword
    page = i * 44
    kw = parse.urlencode({"s": page, "q": keyword})
    url = original_url + kw
    print(url)
Output
https://s.taobao.com/search?s=0&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=44&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=88&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=132&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=176&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=220&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=264&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=308&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=352&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
Splitting URLs
from urllib.parse import urlsplit, urljoin

url1 = 'https://blog.youkuaiyun.com/Yellow_python'
url2 = '/Yellow_python/article/details/80801731'
# Split the URL into its parts
urls = urlsplit(url1)
print(urls)
# Rebuild the domain (scheme + netloc)
domain = urls[0] + '://' + urls[1]
print(domain)
# Join the base URL and the relative URL
url12 = urljoin(domain, url2)
print(url12)
Output
SplitResult(scheme='https', netloc='blog.youkuaiyun.com', path='/Yellow_python', query='', fragment='')
https://blog.youkuaiyun.com
https://blog.youkuaiyun.com/Yellow_python/article/details/80801731
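urlsplit returns a SplitResult, which is a named tuple: indexing by position works, but the named attributes read better, and geturl() reassembles the original. A sketch:

from urllib.parse import urlsplit

parts = urlsplit('https://www.baidu.com/s?ie=UTF-8&wd=scrapy')
print(parts.scheme)    # https
print(parts.netloc)    # www.baidu.com
print(parts.path)      # /s
print(parts.query)     # ie=UTF-8&wd=scrapy
print(parts.geturl())  # https://www.baidu.com/s?ie=UTF-8&wd=scrapy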
Joining URLs
urlsplit, urljoin
from urllib.parse import urlsplit, urljoin

print('\033[035m{}\033[0m\n'.format(urlsplit('https://www.baidu.com/s?ie=UTF-8&wd=scrapy')))

def join_url(url, postfix, real):
    tu = urlsplit(url)
    domain = tu[0] + '://' + tu[1]
    url_total = urljoin(domain, postfix)
    for i in ['url', 'domain', 'url_total', 'real', 'url_total==real']:
        print('\033[033m%-15s\033[0m' % i, eval(i))
    print()

ls = [
    ('https://blog.youkuaiyun.com/Yellow_python',
     'https://blog.youkuaiyun.com/Yellow_python/article/details/94435972',
     'https://blog.youkuaiyun.com/Yellow_python/article/details/94435972'),
    ('https://k.autohome.com.cn/314/#pvareaid=2099126',
     '/spec/36144/',
     'https://k.autohome.com.cn/spec/36144/'),
]
for url, postfix, real in ls:
    join_url(url, postfix, real)
Function (copy-paste ready)
from urllib.parse import urlsplit, urljoin

def join_url(url, postfix):
    tu = urlsplit(url)
    domain = tu[0] + '://' + tu[1]
    return urljoin(domain, postfix)

url = 'https://github.com/AryeYellow'
postfix = '/AryeYellow/NLP'
url_total = join_url(url, postfix)
print(url_total)
Output
https://github.com/AryeYellow/NLP
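Stripping the base down to scheme://netloc first matters: plain urljoin resolves the suffix relative to the base path, so a suffix without a leading '/' would land next to the base's last path segment instead of at the site root. A sketch of the difference, with illustrative paths:

from urllib.parse import urljoin

base = 'https://github.com/AryeYellow'
# A root-relative suffix ('/...') replaces the whole path either way
print(urljoin(base, '/AryeYellow/NLP'))  # https://github.com/AryeYellow/NLP
# A bare relative suffix resolves against the base path
print(urljoin(base, 'NLP'))              # https://github.com/NLP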
parse
Function (copy-paste ready)
from urllib import parse

def encode_url(url, dt):
    wd = parse.urlencode(dt)
    return url + wd

url = 'https://www.baidu.com/s?'
dt = {'ie': 'UTF-8', 'wd': 'K房'}
print(encode_url(url, dt))  # https://www.baidu.com/s?ie=UTF-8&wd=K%E6%88%BF
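Going the other way, parse_qs turns a query string back into a dict (values come back as lists, since a key may repeat). A quick sketch:

from urllib.parse import parse_qs, urlsplit

url = 'https://www.baidu.com/s?ie=UTF-8&wd=K%E6%88%BF'
query = urlsplit(url).query
print(parse_qs(query))  # {'ie': ['UTF-8'], 'wd': ['K房']}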
Handler
Proxy IP
- urllib.request.ProxyHandler
import urllib.request

url = 'http://httpbin.org/get'
# Proxy handler; called with no argument it picks up proxies
# from the environment (e.g. the http_proxy variable)
proxy_handler = urllib.request.ProxyHandler()
opener = urllib.request.build_opener(proxy_handler)
response = opener.open(url)
print(response.read())
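To route through a specific proxy, pass ProxyHandler a dict mapping scheme to proxy address. A sketch; the address 127.0.0.1:8888 is a hypothetical placeholder, substitute a real proxy:

import urllib.request

# Hypothetical proxy address, for illustration only
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}
proxy_handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read().decode('utf-8'))  # httpbin echoes the requesting IP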
Cookie
- Saving cookies
import http.cookiejar, urllib.request

file_name = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(file_name)
handler = urllib.request.HTTPCookieProcessor(cookie)  # create the cookie processor
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)  # install the opener globally
response = opener.open('http://www.baidu.com')  # open the page
cookie.save(ignore_discard=True, ignore_expires=True)
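LWPCookieJar stores cookies in libwww-perl format; if you want a Netscape/Mozilla-format cookies.txt instead (the format browser exports typically use), swap in MozillaCookieJar, with everything else unchanged:

import http.cookiejar

# Same save/load API as LWPCookieJar, different on-disk format
cookie = http.cookiejar.MozillaCookieJar('cookie_mozilla.txt')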
- Loading cookies
import http.cookiejar, urllib.request

file_name = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar()
cookie.load(file_name, ignore_discard=True, ignore_expires=True)  # load from file
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)
response = opener.open('http://www.baidu.com')
print(response.read().decode())
Exceptions
from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print('--------------------- e')
    print(e)
    print('--------------------- e.reason')
    print(e.reason)
    print('--------------------- e.code')
    print(e.code)
    print('--------------------- e.headers')
    print(e.headers)
Output
--------------------- e
HTTP Error 404: Not Found
--------------------- e.reason
Not Found
--------------------- e.code
404
--------------------- e.headers
Server: nginx/1.10.3 (Ubuntu)
Date: Mon, 30 Jul 2018 10:14:42 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"
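HTTPError only covers responses that arrived with an error status; network-level failures (DNS errors, refused connections, timeouts) raise URLError, its parent class, which has a reason but no code. The usual pattern is to catch the more specific HTTPError first. A sketch:

from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm', timeout=5)
except error.HTTPError as e:   # the server replied with 4xx/5xx
    print('HTTP error:', e.code, e.reason)
except error.URLError as e:    # the request never got a valid response
    print('URL error:', e.reason)
else:
    print('OK:', response.getcode())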