########### Web Crawler ###########
## Understanding crawlers
## What is a crawler
Put simply, a crawler uses a series of techniques to fetch information from a web page (such as images, URLs, videos, and files) and save it locally.
## What happens when you browse a web page
Browser (request) --- you enter a URL (e.g. http://www.baidu.com/index.html, file:///mnt, ftp://172.25.254.40)
The scheme (http) and the domain to visit (www.baidu.com) are determined --- a DNS server resolves the domain to an IP address --- the server locates the requested content and returns the page to the browser (response).
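A minimal sketch of the DNS-resolution step, using the standard socket module (the hostname is only an example):
from socket import gethostbyname
# Resolve a domain name to an IP address, just as the DNS lookup behind the browser does
ip = gethostbyname('www.baidu.com')
print(ip)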
## Fetching a web page
## Basic method
from urllib import request
from urllib.error import URLError
try:
    # Fetch the page with a 10-second timeout and decode the response body as UTF-8
    response = request.urlopen('http://www.baidu.com', timeout=10)
    content = response.read().decode('utf-8')
    print(content)
except URLError as e:
    print(e.reason)
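Besides read(), the response object returned by urlopen exposes some basic metadata; a minimal sketch (reusing the same URL):
from urllib import request
response = request.urlopen('http://www.baidu.com', timeout=10)
print(response.status)                      # HTTP status code, e.g. 200
print(response.getheader('Content-Type'))   # a single response header
print(response.geturl())                    # final URL after any redirects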
## Using a Request object (extra header information can be added)
from urllib import request
from urllib.error import URLError
url = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
# The headers are passed as a dict; copy the User-Agent from the browser's developer tools ("inspect element")
headers = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"}
try:
    # Instantiate a Request object so the request headers can be customized
    """
    class Request:
        def __init__(self, url, data=None, headers={},
                     origin_req_host=None, unverifiable=False,
                     method=None):
    """
    req = request.Request(url, headers=headers)
    # urlopen accepts either a URL string or a Request object
    content = request.urlopen(req).read().decode('utf-8')
    print(content)
except URLError as e:
    print(e.reason)
else:
    print('success')
## Adding header information afterwards with add_header
from urllib import request
from urllib.error import URLError
url = 'http://www.cbrc.gov.cn/chinese/jrjg/index.html'
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"
try:
    req = request.Request(url)
    """
    def add_header(self, key, val):
        # useful for something like authentication
        self.headers[key.capitalize()] = val
    """
    # Note: the header name must be 'User-Agent' (with a hyphen), not 'User_Agent'
    req.add_header('User-Agent', user_agent)
    content = request.urlopen(req).read().decode('utf-8')
    print(content)
except URLError as e:
    print(e.reason)
else:
    print('success')
## Countermeasures against anti-crawler defenses
## Disguise the crawler as a browser, i.e. add a browser User-Agent header as above; some common User-Agent strings are listed below, and a sketch that picks one at random follows the list
1.Android
Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19
Mozilla/5.0 (Linux; U; Android 4.0.4; en-gb; GT-I9300 Build/IMM76D) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30
Mozilla/5.0 (Linux; U; Android 2.2; en-gb; GT-P1000 Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1
2.Firefox
Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0
Mozilla/5.0 (Android; Mobile; rv:14.0) Gecko/14.0 Firefox/14.0
3.Google Chrome
Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36
Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19
4.iOS
Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3
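A minimal sketch that picks a random User-Agent from a small pool and attaches it to the request (the target URL is only an example):
import random
from urllib import request
# A few User-Agent strings taken from the list above
user_agents = [
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36",
    "Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3",
]
req = request.Request('http://httpbin.org/get')
# Use a different browser identity on each run
req.add_header('User-Agent', random.choice(user_agents))
print(request.urlopen(req).read().decode('utf-8'))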
## IP proxies
Why use proxy IPs?
A crawler runs fast, so a single fixed IP ends up accessing the site at a very high frequency;
if the site has anti-crawler measures in place, that IP will get banned.
How to deal with it?
- Add a delay, e.g. time.sleep(random.randint(1, 5)); a minimal sketch follows this list
- Use an IP proxy so that another IP accesses the site in place of yours; proxy lists are available from sites such as http://www.xicidaili.com/
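A minimal sketch of the delay approach, pausing a random 1-5 seconds between requests (the URLs are only examples):
import random
import time
from urllib import request
urls = ['http://httpbin.org/get', 'http://httpbin.org/headers']
for url in urls:
    print(request.urlopen(url).read().decode('utf-8'))
    # Sleep between requests so the access frequency stays low
    time.sleep(random.randint(1, 5))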
Implementation steps:
1). Call urllib.request.ProxyHandler(proxies=None) --- roughly analogous to creating a Request object
2). Build an opener --- similar to urlopen, but customized
3). Install the opener
4). Choose a proxy IP
from urllib import request
from urllib.error import URLError
url = 'http://httpbin.org/get'
# Put the proxy addresses in a dict keyed by scheme (these sample proxies may no longer be alive)
proxy = {'http': '110.40.13.5:80', 'https': '183.129.207.74:14823'}
user_agent = "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"
# 1). Call urllib.request.ProxyHandler(proxies=None) --- roughly analogous to creating a Request object
proxy_support = request.ProxyHandler(proxy)
# 2). Build an opener --- similar to urlopen, but customized
opener = request.build_opener(proxy_support)
# Disguise the crawler as a browser
opener.addheaders = [('User-Agent', user_agent)]
# 3). Install the opener so that plain urlopen calls go through it
request.install_opener(opener)
# 4). The request now goes out via the chosen proxy IP
response = request.urlopen(url)
content = response.read().decode('utf-8')
print(content)
## Saving cookie information
What is a cookie?
A cookie is data stored on the user's local terminal, used by some websites to identify the user and to track the session.
Some pages can only be visited after logging in; the cookie carries the session, keeping the user's information (such as the username) on the local terminal.
Example cookie: "BAIDUID=E0A0BD91DBF7E9279D4E99A384ECE48C:FG=1; BIDUPSID=E0A0BD91DBF7E9279D4E99A384ECE48C; PSTM=1523083153; BD_UPN=133352; pgv_pvi=4895693824; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_HOME=0; H_PS_PSSID=1465_27213_21113_20930; BD_CK_SAM=1; PSINO=1; H_PS_645EC=21aadTAcLlASAsdA66DzYMSv%2BDOn3p2CCCffSHVxjXiIOyNx3Owlj6%2B3Bbs; pgv_si=s3991780352"
## Retrieving cookie information
from http import cookiejar
from urllib.request import HTTPCookieProcessor
from urllib import request
# How to save cookie information into a variable or a file;
# class hierarchy: CookieJar ---> FileCookieJar ---> MozillaCookieJar
# Instantiate a CookieJar to hold the cookies in memory
cookie = cookiejar.CookieJar()
# Build a cookie handler from the CookieJar with HTTPCookieProcessor
handler = HTTPCookieProcessor(cookie)
# Build an opener from the cookie handler
# (the default opener used by urlopen has no cookie handling)
opener = request.build_opener(handler)
# Open the page with the opener's open() method
response = opener.open('http://www.baidu.com')
# Print the cookie information collected from the page
print(cookie)
for item in cookie:
    print(item)
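Each item in the jar is an http.cookiejar.Cookie object; a minimal sketch of joining the jar into the usual "name=value; name=value" form:
cookie_string = '; '.join('%s=%s' % (item.name, item.value) for item in cookie)
print(cookie_string)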
## Saving cookie information to a file in a specific format
from http import cookiejar
from urllib.request import HTTPCookieProcessor
from urllib import request
# Name of the file the cookies will be saved to
cookieFile = 'cookie.txt'
# Instantiate a MozillaCookieJar, which can hold cookies and write them to a file
cookie = cookiejar.MozillaCookieJar(filename=cookieFile)
handler = request.HTTPCookieProcessor(cookie)
opener = request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(type(cookie))
# ignore_discard: save cookies even if they are marked to be discarded;
# ignore_expires: save cookies even if they have expired (the file is overwritten if it already exists)
cookie.save(ignore_discard=True, ignore_expires=True)
## Reading cookie information from a file and using it to visit a page
from http import cookiejar
from urllib.request import HTTPCookieProcessor
from urllib import request
# Location of the cookie file saved above
cookiefile = 'cookie.txt'
# Instantiate a MozillaCookieJar; here it is used to read the cookies back in
cookie = cookiejar.MozillaCookieJar()
# Load the cookie contents from the file
cookie.load(filename=cookiefile)
handler = HTTPCookieProcessor(cookie)
opener = request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))
## Common urllib exception handling
- Exceptions
exception urllib.error.URLError
exception urllib.error.HTTPError
exception urllib.error.ContentTooShortError(msg, content)
# A common case: the timeout exception
from urllib import request, error
import socket
try:
    url = 'https://www.baidu.com'
    # timeout=0.01 is deliberately far too short, so the request times out
    response = request.urlopen(url, timeout=0.01)
    print(response.read().decode('utf-8'))
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
    if isinstance(e.reason, socket.timeout):
        print("timed out")
else:
    print("success")
## URL parsing
# urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
# - Purpose: splits a URL into 6 parts and returns a named tuple:
#   scheme, netloc (host:port), path, params, query, fragment
from urllib import parse
url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=hello&rsv_pq=d0f841b10001fab6&rsv_t=2d43603JgfgVkvPtTiNX%2FIYssE6lWfmSKxVCtgi0Ix5w1mnjks2eEMG%2F0Gw&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug1=4&rsv_sug7=101&rsv_sug2=0&inputT=838&rsv_sug4=1460'
print(parse.urlparse(url))
print(type(parse.urlparse(url)))
parsed = parse.urlparse(url)
print(parsed.netloc)
print(parsed.path)
print(parsed.scheme)
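The query component returned by urlparse is still a raw string; a minimal sketch of decoding it into a dict with parse.parse_qs:
# Split the query string into a dict mapping parameter names to lists of values
query_dict = parse.parse_qs(parsed.query)
print(query_dict['wd'])    # ['hello']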
## urlunparse
from urllib import parse
url = 'https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=hello&rsv_pq=d0f841b10001fab6&rsv_t=2d43603JgfgVkvPtTiNX%2FIYssE6lWfmSKxVCtgi0Ix5w1mnjks2eEMG%2F0Gw&rqlang=cn&rsv_enter=1&rsv_sug3=6&rsv_sug1=4&rsv_sug7=101&rsv_sug2=0&inputT=838&rsv_sug4=1460'
parsed = parse.urlparse(url)
# urlunparse reassembles the parsed 6-tuple back into a URL string
print(parse.urlunparse(parsed))
## urlencode
from urllib.parse import urlencode
# urlencode turns a dict of parameters into a query string
params = {
    'name': 'kobe',
    'age': 40
}
url1 = 'http://www.baidu.com?'
url2 = url1 + urlencode(params)
print(url2)
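Related helpers are parse.quote and parse.unquote, which percent-encode and decode individual URL components (this is where the %2F sequences in the search URL above come from); a minimal sketch:
from urllib.parse import quote, unquote
# quote percent-encodes characters that are unsafe in a URL (by default '/' is left alone)
print(quote('hello world'))
# unquote reverses the encoding, e.g. %2F back to '/'
print(unquote('2d43603JgfgVkvPtTiNX%2FIYssE6lWfmSKxVCtgi0Ix5w1mnjks2eEMG%2F0Gw'))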