1. Using BeautifulSoup4
from bs4 import BeautifulSoup
html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>
"""
Purpose: BeautifulSoup lets us search for and pick out the content we need from an HTML or XML document; bs4 is the Python module that implements it.
1.2 BeautifulSoup(arg1, arg2)
Argument 1: the page source as a string. Argument 2: the parser.
Parsing the page source with BeautifulSoup yields a bs4 (BeautifulSoup) object.
soup = BeautifulSoup(html, "lxml")
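If lxml is not installed, Python's built-in parser can be used instead; a minor alternative, not used elsewhere in these notes:
soup_builtin = BeautifulSoup(html, 'html.parser')  # no third-party parser required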
1.3 select
Searches by CSS selector: select returns every element in the page that matches the selector, collected into a list.
1.4 select_one
Searches by CSS selector: select_one returns only the first element that select would return.
1.5 prettify
Pretty-prints the source parsed by BeautifulSoup.
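For example, to print a neatly indented view of the tree parsed above:
print(soup.prettify())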
1.6 Note
Every element of the list returned by select, and the result of select_one, is a bs4 Tag object (select_one returns None if nothing matches).
# a. Get all the p tags under the body tag
p_list = soup.select('p')
print(p_list)
p = soup.select_one('p')
print(p, type(p))
1.7 Tag attributes
1.7.1 text
Gets the text inside an HTML tag (a paired open/close tag). Example: <p>abcde</p> --> 'abcde'.
1.7.2 attrs
Gets a tag's attributes as a dict. Example: <a href="http://www.baidu.com"></a> --> {'href': 'http://www.baidu.com'}.
# b. Get the text of the b tag inside the first p tag
b = soup.select_one('p.title > b').text
print(b, type(b))
# c. Get the href attribute of the third a tag inside the second p tag
href1 = soup.select_one('p.story > a#link3').attrs['href']
print(href1, type(href1))
href2 = soup.select_one('p.story > a:nth-child(3)').attrs['href']
print(href2, type(href2))
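Putting select, text and attrs together, a small sketch (using the sample html at the top of this section) that walks over every sister link:
# Every a.sister tag inside the second p tag; select returns them as a list of Tag objects
for a in soup.select('p.story > a.sister'):
    print(a.attrs['href'])   # e.g. http://example.com/elsie
    print(a.attrs['class'])  # class is a multi-valued attribute, so its value is a list: ['sister']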
2. CSS selector crawler for 中国新闻网 (chinanews.com.cn)
import requests
import csv
from bs4 import BeautifulSoup
from tqdm import tqdm
def requests_get(href):
Headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
resp = requests.get(url=href, headers=Headers)
if resp.status_code == 200:
return resp
else:
print(resp.status_code)
if __name__ == '__main__':
    # 1. Open the output file
    file = open('中国新闻网.csv', 'w', encoding='utf-8', newline='')
    # 2. Create a CSV writer
    my_writer = csv.writer(file)
    # 3. Write the header row
    my_writer.writerow(['新闻类型', '新闻标题', '新闻链接', '新闻时间'])
for page in tqdm(range(1,11), desc='进度条'):
URL = f'https://www.chinanews.com.cn/scroll-news/news{page}.html'
response = requests_get(URL)
response.encoding = 'utf-8'
# print(response.text)
        # 1. Parse first and convert the source into a bs4 object
        soup = BeautifulSoup(response.text, 'lxml')
        # 2. Get every li under the content_list ul
        li_list = soup.select('body > div.w1280.mt20 > div.content-left > div.content_list > ul > li')
# print(li_list)
for i in li_list:
# print(i)
            # If attrs is applied to the class attribute, the result is a list
            if i.text != '':
                # a. Get the news category
                class_ = i.select_one('li > .dd_lm > a').text
                # print(class_)
                # b. Get the news link
                href = 'https://www.chinanews.com.cn/' + i.select_one('li > .dd_bt > a').attrs['href']
                # print(href)
                # c. Get the news title
                title = i.select_one('li > .dd_bt > a').text
                # print(title)
                # d. Get the news time
                datetime_ = i.select_one('li > .dd_time').text
                # print(datetime_)
                print(class_, title, href, datetime_)
                new_list = [class_, title, href, datetime_]
                # 4. Write the data row
                my_writer.writerow(new_list)
file.close()
print('写入完毕!')
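As a quick sanity check (a small sketch, not part of the original script), the generated CSV can be read back with the csv module already imported above:
with open('中国新闻网.csv', 'r', encoding='utf-8', newline='') as check_file:
    for row_number, row in enumerate(csv.reader(check_file)):
        print(row)           # the first row is the header written above
        if row_number >= 3:  # only peek at the header plus a few data rows
            break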
3. Lianjia (链家) second-hand housing crawler
import requests
import csv
from bs4 import BeautifulSoup
def requests_get(href):
Headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
resp = requests.get(url=href, headers=Headers)
if resp.status_code == 200:
return resp
else:
print(resp.status_code)
if __name__ == '__main__':
with open('链家二手房.csv', 'w', encoding='utf-8', newline='') as f:
my_writer = csv.writer(f)
my_writer.writerow(['名称', '地址', '简介', '总价', '单价'])
for page in range(1,11):
URL = f'https://cd.lianjia.com/ershoufang/pg{page}/'
response = requests_get(URL)
# print(response.text)
            # 1. Parse first and convert the source into a bs4 object
            soup = BeautifulSoup(response.text, 'lxml')
            # 2. Get every li under the listing ul
            li_list = soup.select('body > div.content > div.leftContent > ul.sellListContent > li')
# print(li_list)
for i in li_list:
if i.text != '':
# print(i)
                    # a. Get the listing title
                    title = i.select_one('li > div.info.clear > div.title').text
                    # b. Get the location
                    # content > div.leftContent > ul > li:nth-child(1) > div.info.clear > div.flood > div
                    address = i.select_one('li > div.info.clear > div.flood > div.positionInfo').text
                    # c. Get the summary (layout, area, orientation, decoration, ...)
                    # content > div.leftContent > ul > li > div.info.clear > div.address > div
                    introduction = i.select_one('li > div.info.clear > div.address > div.houseInfo').text
                    # d. Total price
                    total_price = i.select_one('li > div.info.clear > div.priceInfo > div.totalPrice.totalPrice2').text
                    # e. Unit price
                    unit_price = i.select_one('li > div.info.clear > div.priceInfo > div.unitPrice').text
                    # print(title, address, introduction, total_price, unit_price)
                    new_list = [title, address, introduction, total_price, unit_price]
                    my_writer.writerow(new_list)
print('写入完成!')
4. Lianjia (链家) second-hand housing crawler (upgraded)
import re
import requests
import csv
from bs4 import BeautifulSoup
from tqdm import tqdm
"""
Requirements:
1. Crawl the listings district by district (by administrative district).
2. For every listing, get the title, location, layout, area, orientation, decoration, unit price and total price.
3. Crawl multiple pages.
4. Wrap the different pieces of functionality in functions.
5. Write the results to CSV files.
URL = 'https://cd.lianjia.com/ershoufang/'
"""
def requests_get(href):
Headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39'
}
resp = requests.get(url=href, headers=Headers)
if resp.status_code == 200:
return resp
else:
print(resp.status_code)
def total_page(href):
response_page = requests_get(href)
soup_page = BeautifulSoup(response_page.text, 'lxml')
# #content > div.leftContent > div.contentBottom.clear > div.page-box.fr > div > a:nth-child(5)
# #content > div.leftContent > div.contentBottom.clear > div.page-box.fr > div
page_list = soup_page.select('body > div.content > div.leftContent > div.contentBottom.clear > div.page-box.fr > div')
# print(page_list)
for p in page_list:
# [<div class="page-box house-lst-page-box" comp-module="page" page-data='{"totalPage":100,"curPage":1}' page-url="/ershoufang/pg{page}"></div>]
page = eval(p.attrs['page-data'])["totalPage"]
# print(href, page_list, page)
return page
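For reference, a minimal sketch of how the page-data attribute is turned into a page count; the example value comes from the comment inside total_page above, and json.loads works just as well as eval here because the attribute value is valid JSON:
import json  # only needed for this illustration

page_data = '{"totalPage":100,"curPage":1}'
print(eval(page_data)["totalPage"])        # 100 -- what total_page does
print(json.loads(page_data)["totalPage"])  # 100 -- an eval-free alternative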
URL = 'https://cd.lianjia.com/ershoufang/'
response = requests_get(URL)
# print(response.text)
# 1. Parse and convert the source into a bs4 object
soup = BeautifulSoup(response.text, 'lxml')
# 2. Get all the district links (<a> tags) on the current page
# body > div:nth-child(12) > div > div.position > dl:nth-child(2) > dd > div:nth-child(1) > div
region_a = soup.select('body > div:nth-child(12) > div > div.position > dl:nth-child(2) > dd > div:nth-child(1) a')
# print(region_a)
# 3. Get every district's link and name
for i in region_a:
region = i.text
href = 'https://cd.lianjia.com' + i.attrs['href']
# print(region, href)
response_region = requests_get(href)
    # 4. Get the total number of pages for this district
page = total_page(href)
# print(page)
    # One CSV per district (the files/ directory must already exist)
    f = open(f'files/{region}.csv', 'w', encoding='utf-8', newline='')
my_writer = csv.writer(f)
my_writer.writerow(['标题', '地理位置', '房子', '单价', '总价'])
    for j in tqdm(range(1, page + 1), desc=f'{region}进度'):
        href = 'https://cd.lianjia.com' + i.attrs['href'].rstrip('/') + f'/pg{j}/'
        # Request the current page; without this the loop would reparse page 1 every time
        response_region = requests_get(href)
        # print(response_region.text)
        # 1. Parse and convert the source into a bs4 object
        soup_region = BeautifulSoup(response_region.text, 'lxml')
        # 2. Get every li in the listing ul
        # content > div.leftContent > ul
        li_region = soup_region.select('body > div.content > div.leftContent > ul > li')
# print(li_region)
        # 3. Iterate over every listing and pull out the fields we want
        for i_region in li_region:
            # a. Title
            title = i_region.select_one('li > div.info.clear > div.title > a').text
            # b. Location
            position = i_region.select_one('li > div.info.clear > div.flood > div').text
            # c. Layout, area, orientation, decoration
            house_list = i_region.select_one('li > div.info.clear > div.address > div').text.split('|')
            # Keep only the first four fields (layout, area, orientation, decoration)
            house = ''.join(house_list[:4])
            # d. Unit price
            unitprice = i_region.select_one('li > div.info.clear > div.priceInfo > div.unitPrice > span').text
            # e. Total price
            total = i_region.select_one('li > div.info.clear > div.priceInfo > div.totalPrice.totalPrice2').text
            new_list = [title, position, house, unitprice, total]
            my_writer.writerow(new_list)
# print(title, position, house, unitprice, total)
f.close()
print(f'{region}写入完成!')