Beautiful Soup
Beautiful Soup 4 documentation:
https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
1. Install beautifulsoup4
pip install beautifulsoup4
2. Install a parser
pip install lxml
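If lxml is not installed, bs4 can fall back to Python's built-in html.parser. A minimal sketch (added here as an illustration, not part of the original) of picking a parser at runtime:

from bs4 import BeautifulSoup

try:
    import lxml  # noqa: F401 -- only checking that the package is importable
    parser = "lxml"
except ImportError:
    parser = "html.parser"  # stdlib fallback, no extra install required

soup = BeautifulSoup("<p>hello</p>", parser)
print(soup.p.text)  # hello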
Basic usage of bs4
from bs4 import BeautifulSoup
import lxml  # not strictly needed: BeautifulSoup loads the parser by its name string
import re
html = '''
<html>
<head>
<title>hello world</title>
</head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
</html>
'''
# create the BeautifulSoup object
soup = BeautifulSoup(html, "lxml")
# print(soup.prettify(),type(soup)) # <class 'bs4.BeautifulSoup'>
# Tag objects
# print(soup.head)
# print(soup.p) # the document has several p tags; attribute access returns only the first
# print(soup.p.b)
# text content inside a tag
# print(soup.p.b.text)
# print(soup.head.title.text)
# print(soup.head.title.string)  # .string is None when a tag has more than one child; .text joins all descendant text
# attributes
# print(soup.p.attrs) # {'class': ['title']}
# print(soup.a.attrs) # {'href': 'http://example.com/elsie', 'class': ['sister'], 'id': 'link1'}
# print(soup.a.attrs['id']) # link1
# print(soup.a['id']) # link1
# find_all(): returns every matching node
# print(soup.find_all('p'))
# print(soup.find_all("a"))
# print(soup.find_all("a",attrs={"id":"link1"})) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
# print(soup.find_all("a",id="link1"))
# print(soup.find_all("a",href=re.compile("cie$")))
# print(soup.find_all(class_="sister"))
# print(soup.find_all("a",limit=2)) # 表示取前两个a标签
# print(soup.find_all(["a","b"])) # 同时获取所有的a标签和b标签
# print(soup.p.find_all("b"))
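find_all() also accepts a callable as a filter, which is handy when attribute keywords are not expressive enough. A small added sketch, reusing the soup object created above:

# match <a> tags with an arbitrary predicate: every sister link except link1
print(soup.find_all(lambda tag: tag.name == "a" and tag.get("id") != "link1"))  # the link2 and link3 tags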
# select(): takes CSS selectors
'''
id selector, class selector, tag selector,
pseudo-class selector, group selector, child selector
'''
# print(soup.select("#link2"))
# print(soup.select(".sister"))
print(soup.select("p a"))
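select() returns a list of Tag objects, so text and attribute access work exactly as shown earlier. A short added sketch against the sample document:

# print href and text for every sister link matched by a descendant selector
for a in soup.select("p.story a.sister"):
    print(a["href"], a.text)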
Example: scraping Maoyan movie listings with bs4
import requests
from bs4 import BeautifulSoup
headers = {
"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
}
url = "https://maoyan.com/films?offset=0"
res = requests.get(url, headers=headers)
# print(res.text)
# create the BeautifulSoup object
soup = BeautifulSoup(res.text, 'lxml')
dd_list = soup.select('.movie-list dd')
# print(dd_list)
# open the output file once, then write one line per movie
with open('maoyan.csv', 'a', encoding='utf-8') as fp:
    for dd in dd_list:
        # movie title
        movie_name = dd.select('.movie-item-title')[0]['title']
        # rating
        scores = dd.select('.channel-detail-orange')[0].text
        # print(scores)
        fp.write(f"{movie_name},{scores}\n")
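Hand-built CSV lines break if a title ever contains a comma; the standard csv module handles quoting. A minimal alternative sketch, reusing dd_list and the selectors from the loop above:

import csv

rows = []
for dd in dd_list:
    rows.append((dd.select('.movie-item-title')[0]['title'],
                 dd.select('.channel-detail-orange')[0].text))
with open('maoyan.csv', 'w', encoding='utf-8', newline='') as fp:
    csv.writer(fp).writerows(rows)  # quoting and escaping handled automatically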
Example: scraping stock funds
import urllib
from urllib import request
from bs4 import BeautifulSoup
def download(url):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);"}
    req = urllib.request.Request(url, headers=headers)  # build the request with custom headers ("req" avoids shadowing the imported request module)
    data = urllib.request.urlopen(req).read()  # send the request and read the raw response bytes
    soup = BeautifulSoup(data, "lxml", from_encoding="gb2312")  # the page is GB2312-encoded
    mytable = soup.select("#datalist")
    for line in mytable[0].find_all("tr"):
        print(line.get_text())  # the full text of the row
        cells = line.select("td:nth-of-type(3)")  # a single cell: the third <td>
        if cells:  # header rows use <th>, so guard against rows with no <td>
            print(cells[0].text)
if __name__ == '__main__':
download("http://quote.stockstar.com/fund/stock.html")
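For comparison, the same page can be fetched with requests, setting the response encoding explicitly instead of passing from_encoding. A sketch only; the URL, encoding, and #datalist selector are taken from the example above:

import requests
from bs4 import BeautifulSoup

res = requests.get("http://quote.stockstar.com/fund/stock.html",
                   headers={"User-Agent": "Mozilla/5.0"})
res.encoding = "gb2312"  # decode the GB2312 page before parsing
soup = BeautifulSoup(res.text, "lxml")
table = soup.select_one("#datalist")
if table is not None:
    for row in table.find_all("tr"):
        print(row.get_text(strip=True))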
Saving to the database
import pymysql
# save results into MySQL
def save_job(tencent_job_list):
    # connect to the database
    db = pymysql.connect(host="127.0.0.1", port=3306, user='root', password="root", database='tencent1', charset='utf8')
    # cursor
    cursor = db.cursor()
    # insert one row per job; a parameterized query avoids SQL injection and quoting bugs
    sql = 'INSERT INTO job(name, address, type, num) VALUES (%s, %s, %s, %s)'
    for job in tencent_job_list:
        cursor.execute(sql, (job["name"], job["address"], job["type"], job["num"]))
    db.commit()
    cursor.close()
    db.close()
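A hypothetical usage sketch; the job table schema below is inferred from the INSERT statement and is an assumption, not part of the original:

# assumed schema (not in the original):
#   CREATE TABLE job (name VARCHAR(100), address VARCHAR(100),
#                     type VARCHAR(50), num VARCHAR(20));
jobs = [
    {"name": "backend engineer", "address": "Shenzhen", "type": "tech", "num": "2"},
    {"name": "data analyst", "address": "Beijing", "type": "data", "num": "1"},
]
save_job(jobs)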