day-4 Parsers and Selenium

This article covers Python web-scraping techniques in detail: fetching page data with requests, extracting embedded JSON with regular expressions (with BeautifulSoup as an alternative HTML parser), and simplifying HTML parsing with PyQuery. It also covers XPath selector syntax and shows how to scrape dynamic pages with Selenium. Complete examples walk through the whole flow from data retrieval to content analysis, suitable for beginners looking to advance.

Parsers and Selenium

1. Scraping Douyu
import requests
from bs4 import BeautifulSoup
import csv
import re
import json


def get_net_data():
    url = 'https://www.douyu.com/g_LOL'
    response = requests.get(url)
    if response.status_code == 200:
        return response.text

    print('Failed to fetch data!')


def analysis_data(html: str):
    # Parsing method 1: extract the embedded JSON from the page source and parse it directly
    result = re.findall(r'window\.\$DATA\s*=\s*(\{.*\});', html)
    with open('test.txt', 'w', encoding='utf-8') as f:
        f.write(result[0])
    data = json.loads(result[0])
    all_data = []
    for item in data['list']:
        title = item['rn']       # room title
        anchor = item['nn']      # anchor (streamer) name
        hot = item['ol']         # popularity / online count
        tag = item.get('od', 'No description yet')    # certification tag
        image = item['rs1']      # cover image URL
        url = f'https://www.douyu.com/topic/lolzxz?rid={item["url"][1:]}'
        all_data.append([title, anchor, hot, tag, image, url])

    print(all_data)
    return all_data

    # Parsing method 2: parse the HTML source with BeautifulSoup
    # soup = BeautifulSoup(html, 'lxml')
    # # get the li tag corresponding to every room
    # li_list = soup.select('.layout-Cover-list>li')
    # all_data = []
    # for li in li_list:
    #     # title
    #     title = li.select_one('.DyListCover-intro').get_text()
    #     # anchor
    #     anchor = li.select_one('.DyListCover-userName').get_text()
    #     # popularity
    #     hot = li.select_one('.DyListCover-hot').get_text()
    #     # certification tag
    #     tag = li.select_one('.HeaderCell-label-wrap.is-od')
    #     if tag:
    #         tag = tag.get_text()
    #     else:
    #         tag = 'No description yet'
    #     # print(tag)
    #     all_data.append([title, anchor, hot, tag])
    # return all_data


def save_data(data):
    with open('data.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Anchor', 'Popularity', 'Tag', 'Image URL', 'Room URL'])
        writer.writerows(data)


if __name__ == '__main__':
    data = analysis_data(get_net_data())
    save_data(data)
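
Note: douyu.com may block or alter responses to requests that carry no browser-like User-Agent; whether that happens depends on the site at the time. A minimal, hedged tweak to the request in get_net_data (the header value is just the one reused from the homework below):

import requests

# send a browser-like User-Agent so the request is less likely to be rejected
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
}
response = requests.get('https://www.douyu.com/g_LOL', headers=headers)
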
2. PyQuery
  1. Creating a PyQuery object

    A PyQuery object is essentially a container whose elements are the matched tags.

    PyQuery(HTML string to parse)

    from pyquery import PyQuery
    
    html = open('test.html', encoding='utf-8').read()
    pq = PyQuery(html)    # the container behind pq holds only the single html tag
    print(type(pq))
    # print(pq)
    
  2. Getting child tags with a selector

    PyQuery object(CSS selector) - returns a PyQuery object holding the tags matched by the CSS selector

    ps = pq('p')
    print(ps, type(ps))
    
    print(ps[0])    # <Element p at 0x10c7e0d60>
    for p in ps:
        print(p)
    
    # get b tags from the whole document (pq's container holds only the html tag)
    print(pq('b'))   # <b>加粗1</b> <b>加粗2</b> <b>加粗3</b>
    
    # get b tags inside all p tags (ps's container holds every p tag on the page)
    print(ps('b'))   # <b>加粗1</b><b>加粗3</b>
    
    print(type(ps[0]))  # <class 'lxml.html.HtmlElement'>
    p1 = PyQuery(ps[-1])
    print(p1('b'))    # <b>加粗3</b>
    
    
    a_list = pq('a')
    p_list = pq('p')
    
  3. Getting tag content

    PyQuery object.text()

    result = pq('h1').text()
    print(result)    # 我是标题1
    
    # get the text of all a tags at once
    result = pq('a').text()
    print(result, type(result))   # 我是超链接1 京东 <class 'str'>
    
    # get each a tag's text individually
    for a in pq('a'):
        print(PyQuery(a).text())
    
  4. Getting tag attributes

    PyQuery object.attr(attribute name)

    # get the href attribute of the first a tag
    result = pq('a').attr('href')
    print(result)
    
    # get the href attribute of every a tag
    for a in pq('a'):
        print(PyQuery(a).attr('href'))
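
    A side note not shown in the class code: pyquery also offers .items(), which yields each match already wrapped as a PyQuery object, so the manual PyQuery(a) re-wrapping above can be skipped:

    # .items() yields PyQuery-wrapped matches directly
    for a in pq('a').items():
        print(a.attr('href'), a.text())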
    
3. xpath
  1. Building the tree structure and getting the root node

    from lxml import etree
    
    html = etree.HTML(open('test.html', encoding='utf-8').read())
    
  2. Getting nodes (tags) by path

    # node.xpath(path)  -  returns a list of all node objects matching the given path
    
    a_list = html.xpath('/html/body/div/a')
    print(a_list)
    
  3. Paths

    # 1) Absolute path: /absolute/path
    # Note: an absolute path must start from the root of the tree (for an HTML tree, start from html)
    result1 = html.xpath('/html/body/h1')
    result2 = a_list[0].xpath('/html/body/h1')
    print(result1)   # [<Element h1 at 0x10e725600>]
    print(result2)   # [<Element h1 at 0x10e725600>]
    
    result3 = html.xpath('/body/h1')    # an absolute path can only start from html
    print(result3)    # []
    
    
    # 2) Relative paths
    # .  - the current node (whichever node .xpath() is called on); ./ can be omitted
    # .. - the parent of the current node
    result4 = html.xpath('./body/h1')
    print(result4)    # [<Element h1 at 0x104de7700>]
    
    div = html.xpath('/html/body/div')[0]
    print(div)     # <Element div at 0x101771180>
    
    img1 = div.xpath('/html/body/div/img')
    print(img1)   # [<Element img at 0x10d5cd500>]
    img2 = div.xpath('./img')
    print(img2)   # [<Element img at 0x10d5cd500>]
    
    result5 = div.xpath('../b')
    print(result5)   # [<Element b at 0x108cc8480>]
    
    
    # 3) // - search the whole document
    # //p      - all p tags anywhere in the page
    # //div/p  - all p tags directly under a div anywhere in the page
    result6 = html.xpath('//p')
    print(result6)    # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
    
    result7 = div.xpath('//p')
    print(result7)   # [<Element p at 0x105ac1740>, <Element p at 0x105ac1780>, <Element p at 0x105ac17c0>]
    
    result8 = html.xpath('/html/body/div//p/text()')
    print(result8)
    
    result9 = div.xpath('.//p/text()')
    print(result9)
    

Supplement: appending '/text()' to the end of a path extracts the tag's text content, for example:

p = html.xpath('./body/p/text()')
print(p)   # ['我是段落1', '我是段落2']
  4. Getting content and attributes

    # Get content    - append /text() to the end of the path
    # Get attribute  - append /@attribute_name to the end of the path
    result10 = html.xpath('//a/text()')
    print(result10)   # ['我是超链接1', '京东', '淘宝']
    
    result11 = html.xpath('//a/@href')
    print(result11)   # ['https://www.baidu.com', 'https://www.jd.com', 'https://www.jd.com']
    
    print(html.xpath('//@id'))   # ['id1', 'id2', 'id3']
    
  5. Predicates - filter conditions

    # [N] - the Nth matching tag
    result = div.xpath('./a[1]/text()')
    print(result)
    
    result = html.xpath('./body/div/p/text()')
    print(result)   # ['我是段落3', '我是段落5', '我是段落11', '我是段落22']
    
    result = html.xpath('./body/div/p[1]/text()')
    print(result)   # ['我是段落3', '我是段落11']
    
    result = html.xpath('./body/div[2]/p/text()')
    print(result)  # ['我是段落11', '我是段落22']
    
    result = html.xpath('./body/div[2]/p[1]/text()')
    print(result)    # ['我是段落11']
    
    # [last()]    - the last one
    # [last()-1]  - the second to last
    # [last()-N]  - the (N+1)th from the end
    result = html.xpath('./body/div/p[last()]/text()')
    print(result)   # ['我是段落5', '我是段落22']
    
    # [position()<3]  - the first two
    # [position()<N]  - the first N-1
    result = html.xpath('./body/div[2]/p[position()<3]/text()')
    print(result)   # ['我是段落11', '我是段落22']
    
    result = html.xpath('./body/div[2]/p[position()>=3]/text()')
    print(result)    # ['我是段落33', '我是段落44', '我是段落55']
    
    # [@attr]  - tags that have the given attribute
    result = html.xpath('./body/div[2]/p[@class]/text()')
    print(result)   # ['我是段落22', '我是段落44', '我是段落55']
    
    # [@attr=value]  - tags whose given attribute equals the given value
    result = html.xpath('./body/div[2]/p[@class="c1"]/text()')
    print(result)   # ['我是段落22', '我是段落44']
    
    # [child>value], [child>=value], [child<value], [child<=value], [child=value]
    # filter by the content of a child tag
    result = html.xpath('./body/div[last()]/li[span=150]/p/text()')
    print(result)
    
  6. The wildcard - *

    # * matches any node or any attribute
    result = html.xpath('./body/div[1]/*')
    print(result)
    
    result = html.xpath('./body/*/*')
    print(result)
    
    result = html.xpath('//img/@*')
    print(result)
    
    # | - alternation: combine the results of several paths
    result = html.xpath('./body/div[1]/p/text()|./body/div[1]/a/text()')
    print(result)   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
    
    div = html.xpath('./body/div[1]')[0]
    print(div.xpath('./p/text()|./a/text()'))   # ['我是段落3', '京东', '淘宝', '爱奇艺', '我是段落5']
    
    
    # result = html.xpath('./body/div[1]/(p|a)/text()')
    # print(result)
    
    # result = html.xpath('./body/div[last()]/li/(span|p)/text()')
    # print(result)
    
4. selenium
from selenium import webdriver

# create a browser object (Chrome)
b = webdriver.Chrome()
b.get('https://movie.douban.com/top250')
print(b.page_source)
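
The page_source grabbed this way can be handed to any of the parsers above. A minimal sketch of combining the two (this assumes a working chromedriver; the explicit wait and the span "title" XPath are illustrative assumptions, not part of the class code):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import etree

b = webdriver.Chrome()
b.get('https://movie.douban.com/top250')
# wait until the movie list has rendered before reading page_source
WebDriverWait(b, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'grid_view')))
html = etree.HTML(b.page_source)
print(html.xpath('//span[@class="title"]/text()'))   # movie titles on the current page
b.quit()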

Homework

import requests
from lxml import etree

def get_data():
    url = 'https://movie.douban.com/top250'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'
    }

    res = requests.get(url, headers=headers)

    if res.status_code == 200:
        return res.text
    else:
        print(res)


def analysis_data(html: str):
    html = etree.HTML(html)    # build the tree from the html string passed in
    all_data = []
    # cover image URLs
    imgs = html.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@src')
    # print(imgs)
    # movie titles
    names = html.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@alt')
    # print(names)
    # director / cast / year info lines
    infos = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]//p/text()')
    # print(infos)
    # ratings
    scores = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[2]/text()')
    # print(scores)
    # numbers of ratings
    comments = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[4]/text()')
    # print(comments)
    all_data.append([imgs, names, infos, scores, comments])
    return all_data



if __name__ == '__main__':
    print(analysis_data(get_data()))
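
The five XPath queries above return parallel lists (one list of images, one of titles, and so on). A hedged refinement, not part of the original assignment: zip them into one row per movie so the result can be written to CSV like the Douyu example. analysis_data_rows below is a hypothetical variant and is called the same way as analysis_data:

def analysis_data_rows(html: str):
    # hypothetical variant: one [title, score, ratings count, image URL] row per movie
    tree = etree.HTML(html)
    imgs = tree.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@src')
    names = tree.xpath('//*[@id="content"]/div/div[1]/ol/li//img/@alt')
    scores = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[2]/text()')
    counts = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[2]/div//span[4]/text()')
    return [list(row) for row in zip(names, scores, counts, imgs)]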