Python 3 Web Scraping with urllib

Opening a Web Page

  • urllib.request.urlopen(url)
import urllib.request
# Open the page; returns an HTTPResponse object
response = urllib.request.urlopen('http://www.baidu.com')
# Status code
print(response.getcode())
# URL that was actually fetched (after any redirects)
print(response.geturl())
# Response headers
print(response.getheaders())
# Read the body (bytes)
print(response.read())
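
In practice the body is bytes and usually gets decoded to text; below is a minimal sketch using a with block (the 5-second timeout and the UTF-8 charset are assumptions, not part of the example above):

import urllib.request

url = 'http://www.baidu.com'
# timeout is in seconds; the value here is an assumption
with urllib.request.urlopen(url, timeout=5) as response:
    raw = response.read()          # bytes
    text = raw.decode('utf-8')     # assuming the page is UTF-8 encoded
    print(response.getcode(), len(text))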

Browser Spoofing (Faking the User-Agent)

import urllib.request
url = 'http://www.baidu.com'
header = {
   'User-Agent':
       'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36'
}  # Pretend to be a browser
# Create a Request object
request = urllib.request.Request(url, headers=header)
# Pass the Request object to urlopen; returns a response object
response = urllib.request.urlopen(request)
# Read and decode the data
text = response.read().decode('utf-8')
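
Other request headers can be attached in the same way, either in the headers dict or one at a time with add_header; a small sketch (the Referer value is only an illustrative assumption):

import urllib.request

url = 'http://www.baidu.com'
request = urllib.request.Request(url)
# Headers can also be added after the Request is built
request.add_header('User-Agent',
                   'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                   '(KHTML, like Gecko) Chrome/58.0.3029.96 Safari/537.36')
request.add_header('Referer', 'http://www.baidu.com')  # illustrative value
response = urllib.request.urlopen(request)
print(response.getcode())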

URL Encoding and Decoding

  • from urllib import parse
  • Encode: urlencode (takes a dict)
  • Encode: quote (takes a string)
  • Decode: unquote
from urllib import parse
keyword = 'Python爬虫'
# urlencode
dic = {"wd": keyword, 'q': 120}
wd = parse.urlencode(dic)
print(wd)
# quote
quote = parse.quote(keyword)
print(quote)
# Decode
unquote = parse.unquote(quote)
print(unquote)

Output

wd=Python%E7%88%AC%E8%99%AB&q=120
Python%E7%88%AC%E8%99%AB
Python爬虫
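
The reverse direction is also available: a query string produced by urlencode can be parsed back into a dict with parse_qs; a minimal sketch:

from urllib import parse

query = 'wd=Python%E7%88%AC%E8%99%AB&q=120'
# parse_qs decodes the percent-encoding and groups each value into a list
print(parse.parse_qs(query))  # {'wd': ['Python爬虫'], 'q': ['120']}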

Encoding example: building Taobao search URLs

from urllib import parse
# Search keyword
keyword = 'python人工智能'
# Base Taobao search URL
original_url = 'https://s.taobao.com/search?'
for i in range(9):
    # Page offset (44 items per page) + search keyword
    page = i * 44
    kw = parse.urlencode({"s": page, "q": keyword})
    url = original_url + kw
    print(url)

Output

https://s.taobao.com/search?s=0&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=44&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=88&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=132&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=176&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=220&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=264&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=308&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
https://s.taobao.com/search?s=352&q=python%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD

URL Splitting

from urllib.parse import urlsplit, urljoin
url1 = 'https://blog.youkuaiyun.com/Yellow_python'
url2 = '/Yellow_python/article/details/80801731'
# Split the URL into its components
urls = urlsplit(url1)
print(urls)
# Rebuild the scheme + domain
domain = urls[0] + '://' + urls[1]
print(domain)
# Join the domain with the relative path
url12 = urljoin(domain, url2)
print(url12)

Output

SplitResult(scheme='https', netloc='blog.youkuaiyun.com', path='/Yellow_python', query='', fragment='')
https://blog.youkuaiyun.com
https://blog.youkuaiyun.com/Yellow_python/article/details/80801731
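
The SplitResult returned by urlsplit is a named tuple, so the components can also be read by name and reassembled with urlunsplit; a small sketch:

from urllib.parse import urlsplit, urlunsplit

parts = urlsplit('https://blog.youkuaiyun.com/Yellow_python')
# Named attributes instead of numeric indices
print(parts.scheme, parts.netloc, parts.path)
# urlunsplit puts the five components back together
print(urlunsplit(parts))  # https://blog.youkuaiyun.com/Yellow_python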

URL Joining

urlsplit, urljoin

from urllib.parse import urlsplit, urljoin

# urlsplit breaks a URL into (scheme, netloc, path, query, fragment)
print('\033[035m{}\033[0m\n'.format(urlsplit('https://www.baidu.com/s?ie=UTF-8&wd=scrapy')))

def join_url(url, postfix, real):
    # Keep only scheme + domain, then join it with the relative path
    tu = urlsplit(url)
    domain = tu[0] + '://' + tu[1]
    url_total = urljoin(domain, postfix)
    # Print each intermediate value (eval looks the names up in the local scope)
    for i in ['url', 'domain', 'url_total', 'real', 'url_total==real']:
        print('\033[033m%-15s\033[0m' % i, eval(i))
    print()

# Each tuple: (source URL, relative or absolute postfix, expected joined result)
ls = [
    ('https://blog.youkuaiyun.com/Yellow_python',
     'https://blog.youkuaiyun.com/Yellow_python/article/details/94435972',
     'https://blog.youkuaiyun.com/Yellow_python/article/details/94435972'),
    ('https://k.autohome.com.cn/314/#pvareaid=2099126',
     '/spec/36144/',
     'https://k.autohome.com.cn/spec/36144/'),
]
for url, postfix, real in ls:
    join_url(url, postfix, real)

Function (for copy-paste)

from urllib.parse import urlsplit, urljoin

def join_url(url, postfix):
    # Keep only scheme + domain of url, then join it with the relative path
    tu = urlsplit(url)
    domain = tu[0] + '://' + tu[1]
    return urljoin(domain, postfix)

url = 'https://github.com/AryeYellow'
postfix = '/AryeYellow/NLP'
url_total = join_url(url, postfix)
print(url_total)

Output

https://github.com/AryeYellow/NLP

Function (for copy-paste)

from urllib import parse

def encode_url(url, dt):
    wd = parse.urlencode(dt)
    return url + wd

url = 'https://www.baidu.com/s?'
keyword = 'Python爬虫'
dt = {'ie': 'UTF-8', 'wd': 'K房'}
print(encode_url(url, dt))  # https://www.baidu.com/s?ie=UTF-8&wd=K%E6%88%BF

Handler

Proxy IP

  • urllib.request.ProxyHandler
import urllib.request
ua = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
url = 'http://httpbin.org/get'
# Proxy handler: with no argument it picks up the system proxy settings;
# pass a dict like {'http': 'http://ip:port'} to force a specific proxy
proxy_handler = urllib.request.ProxyHandler()
opener = urllib.request.build_opener(proxy_handler)
# Attach the User-Agent so the request still looks like a browser
request = urllib.request.Request(url, headers={'User-Agent': ua})
response = opener.open(request)
print(response.read())
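
A sketch of forcing requests through a specific proxy (the address 127.0.0.1:8888 is just a placeholder, not a working proxy):

import urllib.request

# Placeholder proxy address; replace with a real ip:port
proxies = {'http': 'http://127.0.0.1:8888', 'https': 'http://127.0.0.1:8888'}
proxy_handler = urllib.request.ProxyHandler(proxies)
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://httpbin.org/get')
print(response.read().decode('utf-8'))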

Cookie

  • Save cookies
import http.cookiejar, urllib.request
file_name = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar(file_name)
handler = urllib.request.HTTPCookieProcessor(cookie)  # Create the cookie handler
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)  # Install the opener globally
response = opener.open('http://www.baidu.com')  # Open the page
cookie.save(ignore_discard=True, ignore_expires=True)
  • Load cookies
import http.cookiejar, urllib.request
file_name = 'cookie.txt'
cookie = http.cookiejar.LWPCookieJar()
cookie.load(file_name, ignore_discard=True, ignore_expires=True)  # Read the cookie file
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
urllib.request.install_opener(opener)
response = opener.open('http://www.baidu.com')
print(response.read().decode())
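
Because install_opener was called, the module-level urlopen also goes through the cookie-aware opener, so the loaded cookies are sent even without calling opener.open; a minimal sketch:

import http.cookiejar, urllib.request

cookie = http.cookiejar.LWPCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
urllib.request.install_opener(opener)
# Plain urlopen now carries the loaded cookies as well
response = urllib.request.urlopen('http://www.baidu.com')
print(response.getcode())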

Exception Handling

from urllib import request, error
try:
    response = request.urlopen('http://cuiqingcai.com/index.htm')
except error.HTTPError as e:
    print('--------------------- e')
    print(e)
    print('--------------------- e.reason')
    print(e.reason)
    print('--------------------- e.code')
    print(e.code)
    print('--------------------- e.headers')
    print(e.headers)

Output

--------------------- e
HTTP Error 404: Not Found
--------------------- e.reason
Not Found
--------------------- e.code
404
--------------------- e.headers
Server: nginx/1.10.3 (Ubuntu)
Date: Mon, 30 Jul 2018 10:14:42 GMT
Content-Type: text/html; charset=UTF-8
Transfer-Encoding: chunked
Connection: close
Vary: Cookie
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Cache-Control: no-cache, must-revalidate, max-age=0
Link: <https://cuiqingcai.com/wp-json/>; rel="https://api.w.org/"
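
HTTPError only covers the case where the server actually responded with an error status; network-level failures (DNS errors, refused connections, timeouts) raise URLError, its parent class. A more defensive sketch that handles both (the timeout value is an assumption):

from urllib import request, error

try:
    response = request.urlopen('http://cuiqingcai.com/index.htm', timeout=5)
except error.HTTPError as e:
    # The server responded, but with an error status code
    print('HTTPError:', e.code, e.reason)
except error.URLError as e:
    # The request never got a response
    print('URLError:', e.reason)
else:
    print('OK:', response.getcode())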