Class Notes
Parsing libraries: bs4:
'''
BeautifulSoup4
1. What is bs4?
   An HTML/XML parsing library. It does not do the parsing itself; it wraps
   a parser such as lxml or Python's built-in html.parser (it is not built
   on the re module).
'''

''' 1. Basic usage '''
from bs4 import BeautifulSoup as bs
# Note: lxml only needs to be installed; it is used internally by bs4
# and does not need to be imported.

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# Instantiate BeautifulSoup to get a soup object
# arg 1: the text to parse; arg 2: the parser to use
soup = bs(html_doc, 'lxml')
print(soup)

# Pretty-print the parse tree
html = soup.prettify()
print(html)
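A side note not in the original lecture: if lxml is not installed, the built-in 'html.parser' can be passed as the second argument instead. A minimal sketch, which also shows that multi-valued attributes such as class come back as lists:

# Sketch: same call with the built-in parser, runnable without lxml.
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="story small">hello</p>', 'html.parser')
print(soup.p.text)            # hello
print(soup.p.attrs['class'])  # ['story', 'small'] -- class is multi-valued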
Traversing the document tree:
from bs4 import BeautifulSoup as bs

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')

'''
1. Direct access
2. Get a tag's name
3. Get a tag's attributes
4. Get a tag's text content
5. Nested selection
6. Children and descendants
7. Parent and ancestors
8. Siblings
'''

''' 1. Direct access '''
# Find the first p tag
print(soup.p)
print(soup.a)

''' 2. Get a tag's name '''
print(soup.head.name)

''' 3. Get a tag's attributes '''
# All attributes of the first a tag
print(soup.a.attrs)
# The href attribute of the first a tag
print(soup.a.attrs['href'])

''' 4. Get a tag's text content '''
print(soup.p.text)

''' 5. Nested selection '''
print(soup.html.head)

''' 6. Children and descendants '''
# All direct children of body; returns an iterator
print(soup.body.children)
# Force into a list
print(list(soup.body.children))
# All descendants of body; returns a generator
print(soup.body.descendants)
# Force into a list
print(list(soup.body.descendants))

''' 7. Parent and ancestors '''
# The parent of the first p tag
print(soup.p.parent)
# All ancestors of the first p tag; returns a generator
print(soup.p.parents)
# Force into a list
print(list(soup.p.parents))

''' 8. Siblings '''
# The next sibling of the first p tag
print(soup.p.next_sibling)
# All following siblings of the first p tag
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# The previous sibling of the first a tag
print(soup.a.previous_sibling)
# All preceding siblings of the first a tag
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
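One pitfall when iterating children: whitespace between tags shows up as NavigableString nodes, not tags. A minimal sketch (document contents are illustrative) of filtering them out by checking .name:

# Sketch: NavigableString nodes have name == None, so filter on .name
# when you only want element children.
from bs4 import BeautifulSoup

doc = """<body>
<p>one</p>
<p>two</p>
</body>"""
soup = BeautifulSoup(doc, 'lxml')
for child in soup.body.children:
    if child.name:  # skip NavigableString (text) nodes
        print(child.name, child.text)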
Searching the document tree:
'''
Searching the document tree:
    find()      returns the first match
    find_all()  returns all matches

Searching by tag vs. by attribute:
    Tag:
        name   match by tag name
        attrs  match by attributes
        text   match by text
        - string filter   exact string match
        - regex filter    match via the re module
        - list filter     match any item in a list
        - bool filter     True matches any value for an attribute
        - method filter   custom function expressing which attributes
                          you do and do not want
    Attribute shortcuts:
        - class_
        - id
'''
from bs4 import BeautifulSoup as bs

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>hy</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.</p><p class="story">...</p>"""

soup = bs(html_doc, 'lxml')

''' String filter '''
# name: find by tag name
p_tag = soup.find(name='p')
print(p_tag)
# Find all p tags
p_all_tag = soup.find_all(name='p')
print(p_all_tag)

# attrs: find the first node with class "sister"
p_class = soup.find(attrs={'class': 'sister'})
print(p_class)
# Find all nodes with class "sister"
p_all_class = soup.find_all(attrs={'class': 'sister'})
print(p_all_class)

# text: find by text
p_text = soup.find(text='$37')
print(p_text)

# Combined: the a tag whose id is "link2" and whose text is "Lacie"
p_all = soup.find(name='a', attrs={'id': 'link2'}, text='Lacie')
print(p_all)

''' Regex filter '''
import re
# name: find by tag-name pattern
p_tag = soup.find(name=re.compile('p'))
print(p_tag)

''' List filter '''
# name: match any of the listed names/patterns
p_tags = soup.find_all(name=['p', 'a', re.compile('html')])
print(p_tags)

''' Bool filter '''
# The first p tag that has an id attribute
p_tag = soup.find(name='p', attrs={'id': True})
print(p_tag)

''' Method filter '''
# Match a tags that have an id attribute but no class attribute
def have_id_not_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and not tag.has_attr('class'):
        return tag

tag = soup.find(have_id_not_class)
print(tag)
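Beyond find()/find_all(), bs4 also accepts CSS selectors through select() and select_one(). A minimal sketch covering the same kinds of lookups as the filters above:

# Sketch: CSS selector equivalents of the tag/class/id lookups.
from bs4 import BeautifulSoup

doc = ('<p class="sister"><b>$37</b></p>'
       '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>')
soup = BeautifulSoup(doc, 'lxml')
print(soup.select_one('p.sister b').text)  # $37
print(soup.select('a.sister'))             # all <a> tags with class "sister"
print(soup.select_one('#link2').text)      # Lacie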
Scraping Wandoujia app data:
'''
URLs:
    https://www.wandoujia.com/category/6001
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup as bs


# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response


# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name  (attrs must be a dict, not a set: {'class': 'title'})
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    print(
        '''
        App name: {}
        Positive rating: {}
        Comments: {}
        Editor's review: {}
        Download link: {}
        '''
        .format(name, love, commit_num, commit, download)
    )


# Parse the category (index) page
def parse_text(data):
    soup = bs(data, 'lxml')
    '''
    Sample <li> markup from the category page, for reference:
    <li data-pn="com.tuyoo.fish.uc" class="card" data-suffix="">
        <div class="icon-wrap">
            <a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc">
                <img src="//img.ucdl.pp.uc.cn/upload_files/wdj_web/public/img/grey-128x128.png"
                     data-original="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png"
                     alt="捕鱼大作战" class="icon lazy" width="68" height="68">
            </a>
        </div>
        <div class="app-desc">
            <h2 class="app-title-h2"><a href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc" title="捕鱼大作战" class="name">捕鱼大作战</a></h2>
            <div class="meta">
                <span class="install-count">13.9万人安装</span>
                <span class="dot">・</span>
                <span title="33.67MB">33.67MB</span>
            </div>
            <div class="comment">捕鱼大作战,经典街机新体验</div>
        </div>
        <a class="tag-link" href="https://www.wandoujia.com/category/6001?pos=w/cardtag/gamecategory_com.tuyoo.fish.uc">休闲益智</a>
        <a data-app-id="7471166" data-app-vid="700485088" data-app-name="捕鱼大作战" data-app-pname="com.tuyoo.fish.uc"
           data-app-vcode="41000" data-app-vname="4.1" data-app-categoryid="6001" data-app-subcategoryid=""
           data-app-icon="https://android-artworks.25pp.com/fs08/2019/05/30/9/109_c9c161e9f3eca16f072b27cbfe759bab_con_130x130.png"
           data-app-rtype="1" class="detail-check-btn" href="https://www.wandoujia.com/apps/com.tuyoo.fish.uc">查看</a>
    </li>
    '''
    # All app <li> tags
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon URL
        img = li.find(name='img').attrs['data-original']
        print('Icon URL: ' + img)
        # Install count
        count = li.find(name='span', attrs={'class': 'install-count'}).text
        print('Installs: ' + count)
        # Size (raw string for the regex escape)
        size = li.find(name='span', text=re.compile(r'\d+MB')).text
        print('Size: ' + size)
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        print('Detail page: ' + detail_url)
        # Fetch and parse the detail page
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)


def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)


if __name__ == '__main__':
    main()
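One hardening worth adding to get_page (an assumption, not part of the original notes): sites like this often reject bare requests, so sending a browser-like User-Agent, a timeout, and checking the status code makes the crawler more robust. A sketch:

# Sketch: hardened get_page(). The User-Agent value is illustrative;
# raise_for_status() aborts early on 4xx/5xx responses.
import requests

def get_page(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # hypothetical UA string
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()
    return response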
MongoDB:
'''
1. Install and run
    1. Download and install
        - https://www.mongodb.com/download-center#community
    2. Install to D:\MongoDB and add D:\MongoDB\bin to the PATH environment variable
    3. Create the directories and files:
        - D:\MongoDB\data\db
        - D:\MongoDB\log\mongod.log
    4. Create C:\data\db
        - the default data directory
    5. Run mongod to start the server
        Open a terminal as administrator and run mongod to start the MongoDB service
    6. Run mongo to open the MongoDB client (do not close the server)
        Open a new cmd window and run mongo to enter the client

2. Database operations
    1. Switch database
        SQL:     use admin;        -- switches if it exists, errors otherwise
        MongoDB: use tank          -- switches if it exists, otherwise creates it and switches
    2. List databases
        SQL:     show databases;
        MongoDB: show dbs          -- only lists databases that contain data
    3. Drop a database
        SQL:     drop database db_name;
        MongoDB: db.dropDatabase()

3. Collection operations (a collection is what MySQL calls a table)
    1. Create a collection
        SQL:     create table t(f1, f2, ...);
        MongoDB: db.student        # referenced with dot notation in the current
                                   # database; actually created on the first insert
    2. Insert data
        # Insert one document
        db.student.insert({"name": "hy"})
        # Insert multiple documents (pass an array)
        db.student.insert([{"name1": "hy1"}, {"name2": "hy2"}])
    3. Query data
        # Find all documents in the student collection
        db.student.find({})
        # Find documents whose name is "hy"
        db.student.find({"name": "hy"})
'''
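The database-level shell commands above map directly onto pymongo calls (inserts and queries are covered in the next section). A minimal sketch, assuming a local server on the default port:

# Sketch: shell-to-pymongo equivalents for the database operations above.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['tank']                  # like `use tank`: created lazily on first write
print(client.list_database_names())  # like `show dbs`: only databases holding data
client.drop_database('tank')         # like `db.dropDatabase()`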
pymongo:
from pymongo import MongoClient

# 1. Connect to the MongoDB server
# arg 1: MongoDB host; arg 2: MongoDB port (default 27017)
client = MongoClient('localhost', 27017)
print(client)

# 2. Access the hy_db database (created lazily if it does not exist)
print(client['hy_db'])

# 3. Access a collection (also created lazily)
print(client['hy_db']['people'])

# 4. Insert one document into hy_db.people
data1 = {'name': 'hy', 'age': '23', 'sex': 'male'}
client['hy_db']['people'].insert(data1)

# 5. Insert multiple documents (pass a list)
data1 = {'name': 'hy1', 'age': '23', 'sex': 'male'}
data2 = {'name': 'hy2', 'age': '22', 'sex': 'male'}
data3 = {'name': 'hy3', 'age': '21', 'sex': 'male'}
client['hy_db']['people'].insert([data1, data2, data3])

# Officially recommended instead of the deprecated insert():
# insert one document
client['hy_db']['people'].insert_one({'name': 'hy4', 'age': '20', 'sex': 'male'})
# insert many documents
client['hy_db']['people'].insert_many([data1, data2, data3])

# 6. Query data
# All documents
data_s = client['hy_db']['people'].find()
# Loop over and print every document
for data in data_s:
    print(data)

# One document
data = client['hy_db']['people'].find_one()
print(data)
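find() and find_one() also accept a filter document with the same syntax as the mongo shell, and insert_one() reports the generated _id. A short sketch, reconnecting for self-containment:

# Sketch: filtered queries and the _id returned by insert_one().
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
people = client['hy_db']['people']
result = people.insert_one({'name': 'hy5', 'age': '19', 'sex': 'male'})
print(result.inserted_id)                 # auto-generated ObjectId
for doc in people.find({'sex': 'male'}):  # all matching documents
    print(doc)
print(people.find_one({'name': 'hy1'}))   # first match, or None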
Today's homework:
'''
URLs:
    https://www.wandoujia.com/category/6001
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=2sFZJXOEckN_7qBULrSyfvj9
    32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup as bs
from pymongo import MongoClient


# Send a GET request
def get_page(url):
    response = requests.get(url)
    return response


# Parse an app detail page
def parse_detail(data):
    soup = bs(data, 'lxml')
    # App name  (attrs must be a dict, not a set)
    name = soup.find(name='span', attrs={'class': 'title'}).text
    # Positive-rating percentage
    love = soup.find(name='span', attrs={'class': 'love'}).text
    # Number of comments
    commit_num = soup.find(name='a', attrs={'class': 'comment-open'}).text
    # Editor's review
    commit = soup.find(name='div', attrs={'class': 'con'}).text
    # Download link
    download = soup.find(name='a', attrs={'class': 'normal-dl-btn'}).attrs['href']
    # Description
    intro = soup.find(name='div', attrs={'class': 'desc-info'}).text
    # First user comment (stars, name, text, time); may be missing
    star_dict = {'width: 20%': '1 star', 'width: 40%': '2 stars',
                 'width: 60%': '3 stars', 'width: 80%': '4 stars',
                 'width: 100%': '5 stars'}
    try:
        star = soup.find(name='i', attrs={'class': 'score-current'}).attrs['style']
        star_text = star_dict[star]
        user_name = soup.find(name='span', attrs={'class': 'name'}).text
        time = soup.find(name='span', attrs={'class': 'time'}).text
        user_commit = soup.find(name='p', attrs={'class': 'cmt-content'}).text
    except AttributeError:  # find() returned None: the app has no comments
        star_text = None
        user_name = None
        time = None
        user_commit = None
    # URLs of screenshots 1-5 (data-index runs from 0 to 4)
    link = []
    for i in range(0, 5):
        link.append(soup.find(name='img', attrs={'data-index': '{}'.format(i)}).attrs['src'])
    # Store each app as a single document so its fields stay together
    client['wandoujia']['detail'].insert({
        'app_name': name,
        'love': love,
        'commit_num': commit_num,
        'commit': commit,
        'download_link': download,
        'intro': intro,
        'user_name': user_name,
        'time': time,
        'star': star_text,
        'user_commit': user_commit,
        'pic_link': link,
    })


# Parse the category (index) page
def parse_text(data):
    soup = bs(data, 'lxml')
    # All app <li> tags
    li_data = soup.find_all(name='li', attrs={'class': 'card'})
    for li in li_data:
        # Icon URL
        img = li.find(name='img').attrs['data-original']
        # Install count
        download_count = li.find(name='span', attrs={'class': 'install-count'}).text
        # Size (raw string for the regex escape)
        size = li.find(name='span', text=re.compile(r'\w+B')).text
        # Detail page link
        detail_url = li.find(name='a').attrs['href']
        # One document per app
        client['wandoujia']['index'].insert({
            'icon_addr': img,
            'download_count': download_count,
            'size': size,
            'detail_url': detail_url,
        })
        # Fetch and parse the detail page
        detail_response = get_page(detail_url)
        parse_detail(detail_response.text)


def main():
    for i in range(1, 33):
        url = 'https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={}&ctoken=2sFZJXOEckN_7qBULrSyfvj9'.format(i)
        response = get_page(url)
        # Deserialize the JSON response into a dict
        data = response.json()
        app_li = data['data']['content']
        # Parse the index-page HTML fragment
        parse_text(app_li)


if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    print(client)
    # the wandoujia.index and wandoujia.detail collections are created
    # lazily on the first insert
    main()
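After a run, a quick way to confirm the inserts landed is to read a few documents back. A sketch (count_documents assumes pymongo 3.7+):

# Sketch: sanity-check the stored data after the crawler finishes.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
for doc in client['wandoujia']['index'].find().limit(5):
    print(doc)
print(client['wandoujia']['detail'].count_documents({}))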