Class Notes
Parsing libraries: bs4:
'''
BeautifulSoup4
1. What is bs4?
   An HTML/XML parsing library. It does not do the parsing itself; it wraps
   a parser such as lxml or Python's built-in html.parser (it is not built
   on the re module).
'''

''' 1. Basic usage '''
from bs4 import BeautifulSoup as bs
# Note: lxml only needs to be installed; it is used internally by bs4
# and does not need to be imported.

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Instantiate BeautifulSoup to get a soup object
# arg 1: the text to parse; arg 2: the parser to use
soup = bs(html_doc, 'lxml')
print(soup)

# Pretty-print the parse tree
html = soup.prettify()
print(html)
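A side note not in the original lecture: if lxml is not installed, the built-in 'html.parser' can be passed as the second argument instead. A minimal sketch, which also shows that multi-valued attributes such as class come back as lists:

# Sketch: same call with the built-in parser, runnable without lxml.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="story small">hello</p>', 'html.parser')
print(soup.p.text)            # hello
print(soup.p.attrs['class'])  # ['story', 'small'] -- class is multi-valued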
Traversing the document tree:
from bs4 import BeautifulSoup as bs

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')

'''
1. Direct access
2. Get a tag's name
3. Get a tag's attributes
4. Get a tag's text content
5. Nested selection
6. Children and descendants
7. Parent and ancestors
8. Siblings
'''

''' 1. Direct access '''
# Find the first p tag
print(soup.p)
print(soup.a)

''' 2. Get a tag's name '''
print(soup.head.name)

''' 3. Get a tag's attributes '''
# All attributes of the first a tag
print(soup.a.attrs)
# The href attribute of the first a tag
print(soup.a.attrs['href'])

''' 4. Get a tag's text content '''
print(soup.p.text)

''' 5. Nested selection '''
print(soup.html.head)

''' 6. Children and descendants '''
# All direct children of body; returns an iterator
print(soup.body.children)
# Force into a list
print(list(soup.body.children))
# All descendants of body; returns a generator
print(soup.body.descendants)
# Force into a list
print(list(soup.body.descendants))

''' 7. Parent and ancestors '''
# The parent of the first p tag
print(soup.p.parent)
# All ancestors of the first p tag; returns a generator
print(soup.p.parents)
# Force into a list
print(list(soup.p.parents))

''' 8. Siblings '''
# The next sibling of the first p tag
print(soup.p.next_sibling)
# All following siblings of the first p tag
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# The previous sibling of the first a tag
print(soup.a.previous_sibling)
# All preceding siblings of the first a tag
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
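One pitfall when iterating children: whitespace between tags shows up as NavigableString nodes, not tags. A minimal sketch (document contents are illustrative) of filtering them out by checking .name:

# Sketch: NavigableString nodes have name == None, so filter on .name
# when you only want element children.
from bs4 import BeautifulSoup

doc = """<body>
<p>one</p>
<p>two</p>
</body>"""
soup = BeautifulSoup(doc, 'lxml')
for child in soup.body.children:
    if child.name:  # skip NavigableString (text) nodes
        print(child.name, child.text)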
Searching the document tree:
'''
Searching the document tree:
    find()      returns the first match
    find_all()  returns all matches

Searching by tag vs. by attribute:
    Tag:
        name   match by tag name
        attrs  match by attributes
        text   match by text
        - string filter   exact string match
        - regex filter    match via the re module
        - list filter     match any item in a list
        - bool filter     True matches any value for an attribute
        - method filter   custom function expressing which attributes
                          you do and do not want
    Attribute shortcuts:
        - class_
        - id
'''
from bs4 import BeautifulSoup as bs

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')

''' String filter '''
# name: find by tag name
p_tag = soup.find(name='p')
print(p_tag)
# Find all p tags
p_all_tag = soup.find_all(name='p')
print(p_all_tag)

# attrs: find the first node with class "sister"
p_class = soup.find(attrs={'class': 'sister'})
print(p_class)
# Find all nodes with class "sister"
p_all_class = soup.find_all(attrs={'class': 'sister'})
print(p_all_class)

# text: find by text
p_text = soup.find(text='$37')
print(p_text)

# Combined: the a tag whose id is "link2" and whose text is "Lacie"
p_all = soup.find(name='a', attrs={'id': 'link2'}, text='Lacie')
print(p_all)

''' Regex filter '''
import re
# name: find by tag-name pattern
p_tag = soup.find(name=re.compile('p'))
print(p_tag)

''' List filter '''
# name: match any of the listed names/patterns
p_tags = soup.find_all(name=['p', 'a', re.compile('html')])
print(p_tags)

''' Bool filter '''
# The first p tag that has an id attribute
p_tag = soup.find(name='p', attrs={'id': True})
print(p_tag)

''' Method filter '''
# Match a tags that have an id attribute but no class attribute
def have_id_not_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag

tag = soup.find(have_id_not_class)
print(tag)
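Beyond find()/find_all(), bs4 also accepts CSS selectors through select() and select_one(). A minimal sketch covering the same kinds of lookups as the filters above:

# Sketch: CSS selector equivalents of the tag/class/id lookups.
from bs4 import BeautifulSoup

doc = ('<p class="sister"><b>$37</b></p>'
       '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>')
soup = BeautifulSoup(doc, 'lxml')
print(soup.select_one('p.sister b').text)  # $37
print(soup.select('a.sister'))             # all <a> tags with class "sister"
print(soup.select_one('#link2').text)      # Lacie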
Scraping Wandoujia app data:
'''
URLs:
    https://www.wandoujia.com/category/6001
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup as bs


# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response


# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name  (attrs must be a dict, not a set: {'class': 'title'})
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    print(
        '''
        App name: {}
        Positive rating: {}
        Comments: {}
        Editor's review: {}
        Download link: {}
        '''
        .format(name, love, commit_num, commit, download)
    )


# Parse the category (index) page
def parse_text(data):
    soup = bs(data, 'lxml')
    '''
    Sample <li> markup from the category page, for reference:
    <li data-pn="com.tuyoo.fish.uc" class="card" data-suffix="">
        <div class="icon-wrap">
            <a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc">
                <img src="//img.ucdl.pp.uc.cn/upload_files/wdj_web/public/img/grey-128x128.png"
                     data-original="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png"
                     alt="捕鱼大作战" class="icon lazy" width="68" height="68">
            </a>
        </div>
        <div class="app-desc">
            <h2 class="app-title-h2"><a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc" title="捕鱼大作战" class="name">捕鱼大作战</a></h2>
            <div class="meta">
                <span class="install-count">13.9万人安装</span>
                <span class="dot">・</span>
                <span title="33.67MB">33.67MB</span>
            </div>
            <div class="comment">捕鱼大作战,经典街机新体验</div>
        </div>
        <a class="tag-link" href="https://www.wandoujia.com/category/6001?pos=w/cardtag/gamecategory_com.tuyoo.fish.uc">休闲益智</a>
        <a data-app-id="7471166" data-app-vid="700485088" data-app-name="捕鱼大作战" data-app-pname="com.tuyoo.fish.uc"
           data-app-vcode="41000" data-app-vname="4.1" data-app-categoryid="6001" data-app-subcategoryid=""
           data-app-icon="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png"
           data-app-rtype="1" class="detail-check-btn" href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc">查看</a>
    </li>
    '''
    # All app <li> tags
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon URL
        img = li.find(name='img').attrs['data-original']
        print('Icon URL: ' + img)
        # Install count
        count = li.find(name='span', attrs={'class': 'install-count'}).text
        print('Installs: ' + count)
        # Size (raw string for the regex escape)
        size = li.find(name='span', text=re.compile(r'\d+MB')).text
        print('Size: ' + size)
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        print('Detail page: ' + detail_url)
        # Fetch and parse the detail page
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)


def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)


if __name__ == '__main__':
    main()
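One hardening worth adding to get_page (an assumption, not part of the original notes): sites like this often reject bare requests, so sending a browser-like User-Agent, a timeout, and checking the status code makes the crawler more robust. A sketch:

# Sketch: hardened get_page(). The User-Agent value is illustrative;
# raise_for_status() aborts early on 4xx/5xx responses.
import requests

def get_page(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # hypothetical UA string
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response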
MongoDB:
'''
1. Install and run
    1. Download and install
        - https://www.mongodb.com/download-center#community
    2. Install to D:\MongoDB and add D:\MongoDB\bin to the PATH environment variable
    3. Create the directories and files:
        - D:\MongoDB\data\db
        - D:\MongoDB\log\mongod.log
    4. Create C:\data\db
        - the default data directory
    5. Run mongod to start the server
        Open a terminal as administrator and run mongod to start the MongoDB service
    6. Run mongo to open the MongoDB client (do not close the server)
        Open a new cmd window and run mongo to enter the client

2. Database operations
    1. Switch database
        SQL:     use admin;        -- switches if it exists, errors otherwise
        MongoDB: use tank          -- switches if it exists, otherwise creates it and switches
    2. List databases
        SQL:     show databases;
        MongoDB: show dbs          -- only lists databases that contain data
    3. Drop a database
        SQL:     drop database db_name;
        MongoDB: db.dropDatabase()

3. Collection operations (a collection is what MySQL calls a table)
    1. Create a collection
        SQL:     create table t(f1, f2, ...);
        MongoDB: db.student        # referenced with dot notation in the current
                                   # database; actually created on the first insert
    2. Insert data
        # Insert one document
        db.student.insert({"name": "hy"})
        # Insert multiple documents (pass an array)
        db.student.insert([{"name1": "hy1"}, {"name2": "hy2"}])
    3. Query data
        # Find all documents in the student collection
        db.student.find({})
        # Find documents whose name is "hy"
        db.student.find({"name": "hy"})
'''
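The database-level shell commands above map directly onto pymongo calls (inserts and queries are covered in the next section). A minimal sketch, assuming a local server on the default port:

# Sketch: shell-to-pymongo equivalents for the database operations above.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['tank']                  # like `use tank`: created lazily on first write
print(client.list_database_names())  # like `show dbs`: only databases holding data
client.drop_database('tank')         # like `db.dropDatabase()`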
pymongo:
from pymongo import MongoClient

# 1. Connect to the MongoDB server
# arg 1: MongoDB host; arg 2: MongoDB port (default 27017)
client = MongoClient('localhost', 27017)
print(client)

# 2. Access the hy_db database (created lazily if it does not exist)
print(client['hy_db'])

# 3. Access a collection (also created lazily)
print(client['hy_db']['people'])

# 4. Insert one document into hy_db.people
data1 = {'name': 'hy', 'age': '23', 'sex': 'male'}
client['hy_db']['people'].insert(data1)

# 5. Insert multiple documents (pass a list)
data1 = {'name': 'hy1', 'age': '23', 'sex': 'male'}
data2 = {'name': 'hy2', 'age': '22', 'sex': 'male'}
data3 = {'name': 'hy3', 'age': '21', 'sex': 'male'}
client['hy_db']['people'].insert([data1, data2, data3])

# Officially recommended instead of the deprecated insert():
# insert one document
client['hy_db']['people'].insert_one({'name': 'hy4', 'age': '20', 'sex': 'male'})
# insert many documents
client['hy_db']['people'].insert_many([data1, data2, data3])

# 6. Query data
# All documents
data_s = client['hy_db']['people'].find()
# Loop over and print every document
for data in data_s:
    print(data)

# One document
data = client['hy_db']['people'].find_one()
print(data)
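find() and find_one() also accept a filter document with the same syntax as the mongo shell, and insert_one() reports the generated _id. A short sketch, reconnecting for self-containment:

# Sketch: filtered queries and the _id returned by insert_one().
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
people = client['hy_db']['people']
result = people.insert_one({'name': 'hy5', 'age': '19', 'sex': 'male'})
print(result.inserted_id)                 # auto-generated ObjectId
for doc in people.find({'sex': 'male'}):  # all matching documents
    print(doc)
print(people.find_one({'name': 'hy1'}))   # first match, or None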
Today's homework:
'''
URLs:
    https://www.wandoujia.com/category/6001
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient


# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response


# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name  (attrs must be a dict, not a set)
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    # Description
    intro = soup.find(name='div', attrs={'class': 'desc-info'}).text
    # First user comment (stars, name, text, time); may be missing
    star_dict = {'width: 20%': '1 star', 'width: 40%': '2 stars',
                 'width: 60%': '3 stars', 'width: 80%': '4 stars',
                 'width: 100%': '5 stars'}
    try:
        star = soup.find(name='i', attrs={'class': 'score-current'}).attrs['style']
        star_text = star_dict[star]
        user_name = soup.find(name='span', attrs={'class': 'name'}).text
        time = soup.find(name='span', attrs={'class': 'time'}).text
        user_commit = soup.find(name='p', attrs={'class': 'cmt-content'}).text
    except AttributeError:  # find() returned None: the app has no comments
        star_text = None
        user_name = None
        time = None
        user_commit = None
    # URLs of screenshots 1-5 (data-index runs from 0 to 4)
    link = []
    for i in range(0, 5):
        link.append(soup.find(name='img', attrs={'data-index': '{}'.format(i)}).attrs['src'])
    # Store each app as a single document so its fields stay together
    client['wandoujia']['detail'].insert({
        'app_name': name,
        'love': love,
        'commit_num': commit_num,
        'commit': commit,
        'download_link': download,
        'intro': intro,
        'user_name': user_name,
        'time': time,
        'star': star_text,
        'user_commit': user_commit,
        'pic_link': link,
    })


# Parse the category (index) page
def parse_text(data):
    soup = bs(data, 'lxml')
    # All app <li> tags
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon URL
        img = li.find(name='img').attrs['data-original']
        # Install count
        download_count = li.find(name='span', attrs={'class': 'install-count'}).text
        # Size (raw string for the regex escape)
        size = li.find(name='span', text=re.compile(r'\w+B')).text
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        # One document per app
        client['wandoujia']['index'].insert({
            'icon_addr': img,
            'download_count': download_count,
            'size': size,
            'detail_url': detail_url,
        })
        # Fetch and parse the detail page
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)


def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)


if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    print(client)
    # the wandoujia.index and wandoujia.detail collections are created
    # lazily on the first insert
    main()
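After a run, a quick way to confirm the inserts landed is to read a few documents back. A sketch (count_documents assumes pymongo 3.7+):

# Sketch: sanity-check the stored data after the crawler finishes.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
for doc in client['wandoujia']['index'].find().limit(5):
    print(doc)
print(client['wandoujia']['detail'].count_documents({}))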