beautifulsoup_study

Using a concrete HTML document as the running example, this article shows how to parse web pages with the BeautifulSoup library: reading the document title, extracting links, and traversing the document tree.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id="1"><b>first</b><b>test<b><b>The Dormouse's story</b><b>two</b></b></b></p>

<p  id="1" class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, "lxml")
# print(soup.prettify())  # pretty-print the parsed tree for a structured view
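A note on the parser argument: "lxml" is a third-party parser that must be installed separately (pip install lxml). The standard library's "html.parser" also works, but the two parsers can repair the malformed nested <b> tags above differently, so outputs may not match exactly. A minimal fallback sketch, as an alternative to the line above:
try:
    import lxml  # noqa: F401  # imported only to check availability
    parser = "lxml"
except ImportError:
    parser = "html.parser"  # stdlib fallback; tag repair may differ from lxml
soup = BeautifulSoup(html_doc, parser)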
# The following bare expressions display their values in a REPL or notebook:
soup.title.name          # 'title'
soup.title.string        # "The Dormouse's story"
soup.title.parent.name   # 'head'
soup.p                   # the first <p> tag in the document
soup.p['class']          # ['title']
soup.a                   # the first <a> tag
soup.find_all('a')       # every <a> tag, returned as a ResultSet
soup.find(id="link1")    # the first tag whose id attribute is "link1"
print(soup.p['class'][0])
print(type(soup.p['class']))
print(type(soup.a['id']))
title
<class 'list'>
<class 'str'>
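Note the types above: class is defined as a multi-valued attribute in HTML, so BeautifulSoup returns it as a list, while id comes back as a plain str. A small sketch, grounded in the document above, that collects the attributes of every sister link:
# Sketch: gather href, id, and class from each <a class="sister"> link.
for link in soup.find_all("a", class_="sister"):
    print(link.get("href"), link.get("id"), link.get("class"))
# http://example.com/elsie link1 ['sister']
# http://example.com/lacie link2 ['sister']
# http://example.com/tillie link3 ['sister']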
print(soup.p)
print(soup.find_all("b"))
print(type(soup.find_all("b")))
<p class="title" id="1"><b>first</b><b><b>The Dormouse's story</b><b>two</b></b></p>
[<b>first</b>, <b><b>The Dormouse's story</b><b>two</b></b>, <b>The Dormouse's story</b>, <b>two</b>]
<class 'bs4.element.ResultSet'>
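A ResultSet subclasses list, so the result of find_all() can be indexed, sliced, and measured like any list. A quick sketch:
# Sketch: a ResultSet behaves like a plain list of Tag objects.
links = soup.find_all("a")
print(len(links))         # 3
print(links[0]["href"])   # http://example.com/elsie
print(links[-1].string)   # Tillie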
print((soup.find("a"))['href'])#get('href')
print(type(soup))
http://example.com/elsie
<class 'bs4.BeautifulSoup'>
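Subscript access like tag['href'] raises a KeyError when the attribute is missing, whereas tag.get('href') returns None (or a supplied default), mirroring dict behavior. A short sketch:
# Sketch: subscripting a missing attribute raises; .get() returns a default.
first_a = soup.find("a")
print(first_a.get("href"))             # http://example.com/elsie
print(first_a.get("target"))           # None: this <a> has no target attribute
print(first_a.get("target", "_self"))  # _self: explicit default
try:
    first_a["target"]
except KeyError:
    print("subscript access raises KeyError for missing attributes")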
print(type(soup.p.contents))
print(soup.p.contents)
print(type(soup.p.contents[0]))
print(type(soup.p.contents[0].contents))
print(type(soup.p.children))
<class 'list'>
[<b>first</b>, <b>test<b><b>The Dormouse's story</b><b>two</b></b></b>]
<class 'bs4.element.Tag'>
<class 'list'>
<class 'list_iterator'>
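.contents materializes the direct children as a list, while .children is a lazy iterator over the same nodes; neither descends into nested tags. A short sketch showing the equivalence:
# Sketch: .children yields the same direct children that .contents holds.
assert list(soup.p.children) == soup.p.contents
for child in soup.p.children:  # only depth-one children, no recursion
    print(child.name)          # prints 'b' twice for the first <p>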
print(type(soup.title.descendants))
for i in soup.p.descendants:
    print(i)
<class 'generator'>
<b>first</b>
first
<b>test<b><b>The Dormouse's story</b><b>two</b></b></b>
test
<b><b>The Dormouse's story</b><b>two</b></b>
<b>The Dormouse's story</b>
The Dormouse's story
<b>two</b>
two
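As the output shows, .descendants walks the whole subtree recursively, yielding every nested tag and every NavigableString, which is why both <b>two</b> and the bare string two appear. When only the text matters, .stripped_strings skips the tags and the whitespace-only strings. A sketch:
# Sketch: extract just the text of the first <p>, no tags, no blank strings.
for text in soup.p.stripped_strings:
    print(text)  # first, test, The Dormouse's story, two (one per line)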
