Parsers and Selenium
1. Scraping Douyu
import requests
from bs4 import BeautifulSoup
import csv
import re
import json
def get_net_data():
    url = 'https://www.douyu.com/g_LOL'
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    print('Failed to fetch data!')
def analysis_data(html: str):
    # Approach 1: parse the JSON data embedded in the page directly
    result = re.findall(r'window\.\$DATA\s*=\s*(\{.*\});', html)
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(result[0])
    data = json.loads(result[0])
    all_data = []
    for item in data['list']:
        title = item['rn']                        # room title
        anchor = item['nn']                       # streamer nickname
        hot = item['ol']                          # popularity (online viewers)
        tag = item.get('od', 'No description yet')  # verification tag, with a default
        image = item['rs1']                       # cover image URL
        url = f'https://www.douyu.com/topic/lolzxz?rid={item["url"][1:]}'
        all_data.append([title, anchor, hot, tag, image, url])
    print(all_data)
    return all_data
    # Approach 2: parse the HTML source with BeautifulSoup
    # soup = BeautifulSoup(html, 'lxml')
    # # get the li tag of every room card
    # li_list = soup.select('.layout-Cover-list>li')
    # all_data = []
    # for li in li_list:
    #     # title
    #     title = li.select_one('.DyListCover-intro').get_text()
    #     # streamer
    #     anchor = li.select_one('.DyListCover-userName').get_text()
    #     # popularity
    #     hot = li.select_one('.DyListCover-hot').get_text()
    #     # verification tag
    #     tag = li.select_one('.HeaderCell-label-wrap.is-od')
    #     if tag:
    #         tag = tag.get_text()
    #     else:
    #         tag = 'No description yet'
    #     # print(tag)
    #     all_data.append([title, anchor, hot, tag])
    # return all_data
def save_data(data):
    with open('data.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Streamer', 'Popularity', 'Tag', 'Image URL', 'Stream URL'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis_data(get_net_data())
    save_data(data)
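In practice Douyu may refuse bare requests traffic or serve a page without the embedded window.$DATA blob, so it can help to send browser-like headers. A minimal sketch of the same fetch function with a browser User-Agent; the exact header value is an assumption, not something the site documents:

# Sketch: get_net_data with an assumed browser User-Agent.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
}

def get_net_data():
    response = requests.get('https://www.douyu.com/g_LOL', headers=HEADERS, timeout=10)
    if response.status_code == 200:
        return response.text
    print('Failed to fetch data!')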
2. PyQuery

- Creating a PyQuery object
  A PyQuery object is essentially a container; its elements are the matched tags.
  PyQuery(html string to parse)

from pyquery import PyQuery

html = open('test.html', encoding='utf-8').read()
pq = PyQuery(html)   # the container pq holds only one element: the html tag
print(type(pq))
# print(pq)
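Besides an HTML string, the PyQuery constructor can also load a local file or fetch a URL directly. A quick sketch; the URL below is just a placeholder:

# Sketch: other ways to build a PyQuery object.
pq_from_file = PyQuery(filename='test.html')           # parse a local file
pq_from_url = PyQuery(url='https://www.example.com')   # fetch a page and parse it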
- Getting child tags with a selector
  PyQuery object(css selector) - returns a PyQuery object holding the tags matched by the CSS selector

ps = pq('p')
print(ps, type(ps))
print(ps[0])   # <Element p at 0x10c7e0d60>
for p in ps:
    print(p)

# look for b tags in the whole html (the container pq holds only the html tag)
print(pq('b'))   # <b>加粗1</b> <b>加粗2</b> <b>加粗3</b>
# look for b tags inside the p tags (the container ps holds every p tag on the page)
print(ps('b'))   # <b>加粗1</b><b>加粗3</b>

print(type(ps[0]))   # <class 'lxml.html.HtmlElement'>
p1 = PyQuery(ps[-1])
print(p1('b'))       # <b>加粗3</b>

a_list = pq('a')
p_list = pq('p')
- Getting tag content
  PyQuery object.text()

result = pq('h1').text()
print(result)   # 我是标题1

# get the content of all a tags at once
result = pq('a').text()
print(result, type(result))   # 我是超链接1 京东 <class 'str'>

# get the content of each a tag separately
for a in pq('a'):
    print(PyQuery(a).text())
- Getting tag attributes
  PyQuery object.attr(attribute name)

# get the href attribute of the first a tag
result = pq('a').attr('href')
print(result)

# get the href attribute of every a tag
for a in pq('a'):
    print(PyQuery(a).attr('href'))
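pyquery also offers .items(), which iterates the matched tags already wrapped as PyQuery objects, so the manual PyQuery(a) wrapping above can be skipped. A short sketch:

# Sketch: .items() yields each matched tag as a PyQuery object.
for a in pq('a').items():
    print(a.text(), a.attr('href'))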
3. xpath

- Building the tree structure and getting the root node

from lxml import etree

html = etree.HTML(open('test.html', encoding='utf-8').read())
- Getting nodes (tags) by path

# node object.xpath(path) - returns the list of all node objects matching the path
a_list = html.xpath('/html/body/div/a')
print(a_list)
- Paths

# 1) Absolute path: /absolute/path
# Note: an absolute path must start from the root of the tree
# (an html tree starts at html)
result1 = html.xpath('/html/body/h1')
result2 = a_list[0].xpath('/html/body/h1')
print(result1)   # [<Element h1 at 0x10e725600>]
print(result2)   # [<Element h1 at 0x10e725600>]
result3 = html.xpath('/body/h1')   # an absolute path can only start from html
print(result3)   # []

# 2) Relative path
# .  - the current node (whoever calls xpath is the current node); './' can be omitted
# .. - the parent of the current node
result4 = html.xpath('./body/h1')
print(result4)   # [<Element h1 at 0x104de7700>]
div = html.xpath('/html/body/div')[0]
print(div)       # <Element div at 0x101771180>
img1 = div.xpath('/html/body/div/img')
print(img1)      # [<Element img at 0x10d5cd500>]
img2 = div.xpath('./img')
print(img2)      # [<Element img at 0x10d5cd500>]
result5 = div.xpath('../b')
print(result5)   # [<Element b at 0x108cc8480>]

# 3) // - search the whole document
# //p     - all p tags anywhere on the page
# //div/p - all p tags directly under a div, anywhere on the page
result6 = html.xpath('//p')
print(result6)   # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
result7 = div.xpath('//p')
print(result7)   # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
result8 = html.xpath('/html/body/div//p/text()')
print(result8)
result9 = div.xpath('.//p/text()')
print(result9)
Note: appending '/text()' to a path extracts the tag's text content.
p = html.xpath('./body/p/text()')
print(p)   # ['我是段落1', '我是段落2']
- Getting content and attributes

# content   - append /text() to the path
# attribute - append /@attribute_name to the path
result10 = html.xpath('//a/text()')
print(result10)   # ['我是超链接1', '京东', '淘宝']
result11 = html.xpath('//a/@href')
print(result11)   # ['https://www.baidu.com', 'https://www.jd.com', 'https://www.jd.com']
print(html.xpath('//@id'))   # ['id1', 'id2', 'id3']
- Predicates - filtering conditions

# [N] - the N-th tag of the given name
result = div.xpath('./a[1]/text()')
print(result)
result = html.xpath('./body/div/p/text()')
print(result)   # ['我是段落3', '我是段落5', '我是段落11', '我是段落22']
result = html.xpath('./body/div/p[1]/text()')
print(result)   # ['我是段落3', '我是段落11']
result = html.xpath('./body/div[2]/p/text()')
print(result)   # ['我是段落11', '我是段落22']
result = html.xpath('./body/div[2]/p[1]/text()')
print(result)   # ['我是段落11']

# [last()]   - the last one
# [last()-1] - the second to last
# [last()-N] - the (N+1)-th from the end
result = html.xpath('./body/div/p[last()]/text()')
print(result)   # ['我是段落5', '我是段落22']

# [position()<3] - the first two
# [position()<N] - the first N-1
result = html.xpath('./body/div[2]/p[position()<3]/text()')
print(result)   # ['我是段落11', '我是段落22']
result = html.xpath('./body/div[2]/p[position()>=3]/text()')
print(result)   # ['我是段落33', '我是段落44', '我是段落55']

# [@attr] - tags that have the given attribute
result = html.xpath('./body/div[2]/p[@class]/text()')
print(result)   # ['我是段落22', '我是段落44', '我是段落55']

# [@attr=value] - tags whose attribute equals the given value
result = html.xpath('./body/div[2]/p[@class="c1"]/text()')
print(result)   # ['我是段落22', '我是段落44']

# [child>value], [child>=value], [child<value], [child<=value], [child=value]
# filter by the content of a child tag
result = html.xpath('./body/div[last()]/li[span=150]/p/text()')
print(result)
- Wildcard - *

# * matches any node or any attribute
result = html.xpath('./body/div[1]/*')
print(result)
result = html.xpath('./body/*/*')
print(result)
result = html.xpath('//img/@*')
print(result)

# | - branch (union of several paths)
result = html.xpath('./body/div[1]/p/text()|./body/div[1]/a/text()')
print(result)   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
div = html.xpath('./body/div[1]')[0]
print(div.xpath('./p/text()|./a/text()'))   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
# result = html.xpath('./body/div[1]/(p|a)/text()')
# print(result)
# result = html.xpath('./body/div[last()]/li/(span|p)/text()')
# print(result)
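Since the test.html used above is not included, here is a self-contained sketch that exercises the same ideas (paths, /text(), /@attr and predicates) on an inline HTML string; the markup is invented purely for illustration:

# Sketch: same xpath features on a made-up document.
from lxml import etree

doc = etree.HTML('''
<html><body>
  <div id="d1"><p class="c1">p1</p><p>p2</p><a href="https://www.example.com">link</a></div>
  <div id="d2"><p class="c1">p3</p><p class="c2">p4</p></div>
</body></html>''')

print(doc.xpath('//p/text()'))                       # ['p1', 'p2', 'p3', 'p4']
print(doc.xpath('//a/@href'))                        # ['https://www.example.com']
print(doc.xpath('./body/div[2]/p[1]/text()'))        # ['p3']
print(doc.xpath('//p[@class="c1"]/text()'))          # ['p1', 'p3']
print(doc.xpath('//div[last()]/p[last()]/text()'))   # ['p4']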
4. selenium
from selenium import webdriver
# create a browser object (Chrome)
b = webdriver.Chrome()
b.get('https://movie.douban.com/top250')
print(b.page_source)
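page_source can be handed straight to any of the parsers above. A minimal sketch, assuming Douban's top-250 movie titles still sit in <span class="title"> (an assumption about the current page structure), and remembering to close the browser afterwards:

# Sketch: parse the rendered page with lxml, then quit the browser.
from lxml import etree
from selenium import webdriver

b = webdriver.Chrome()
b.get('https://movie.douban.com/top250')
tree = etree.HTML(b.page_source)
print(tree.xpath('//span[@class="title"]/text()'))   # movie titles (assumed selector)
b.quit()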
Homework
import requests
from lxml import etree
def get_data():
    url = 'https://movie.douban.com/top250'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }
    res = requests.get(url, headers=headers)
    if res.status_code == 200:
        return res.text
    else:
        print(res)
def analysis_data(html: str):
    # build the tree from the html that was passed in
    tree = etree.HTML(html)
    all_data = []
    # cover image URLs
    images = tree.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@src')
    # movie names
    names = tree.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@alt')
    # director / cast / year info
    info = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]//p/text()')
    # ratings
    scores = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[2]/text()')
    # number of comments
    comments = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[4]/text()')
    # note: these are five parallel lists, one entry per movie
    all_data.append([images, names, info, scores, comments])
    return all_data
if __name__ == '__main__':
    print(analysis_data(get_data()))
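analysis_data returns five parallel lists. If per-movie records are wanted instead, zip can combine them; a small sketch under the assumption that names, scores, comments and images line up one entry per movie (the info list holds several text fragments per movie and would need extra cleaning first):

# Sketch: turn the parallel lists into one row per movie.
def to_rows(all_data):
    images, names, info, scores, comments = all_data[0]
    return [list(row) for row in zip(names, scores, comments, images)]

# Usage:
# rows = to_rows(analysis_data(get_data()))
# for row in rows:
#     print(row)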
This article walks through Python web-scraping techniques: fetching pages with requests, extracting embedded JSON with regular expressions and parsing HTML with BeautifulSoup, simplifying HTML parsing with PyQuery, using XPath selectors, and fetching dynamic pages with Selenium. The examples cover the full flow from data acquisition to content analysis and are suitable for beginners who want to move up a level.