本篇以获取美食杰菜谱图片为例,分别采用正则表达式、XPath、bs4 三种方式获取同一内容,并进行对比,分析三者的区别。
# Listing page that every snippet below downloads.
url = 'https://www.meishij.net/chufang/diy/guowaicaipu1/japan/'
# Request headers shared by all three approaches; only a browser-style
# User-Agent is sent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) '
        'Gecko/20100101 Firefox/61.0'
    ),
}
1.正则方法:
import re
from urllib.request import Request,urlopen
# Build the request with the shared URL and headers.
request = Request(url, headers=headers)
# Fetch the page; the context manager closes the HTTP response as soon as
# the body has been read and decoded (default UTF-8).
with urlopen(request) as response:
    code = response.read().decode()
# Each match yields three captured groups per recipe card:
#   (title attribute of the "big" link, src of the image,
#    text of the first <span> inside div.c1).
# re.S lets '.' match newlines, so one match can span several lines of markup.
pattern = re.compile(r'<div class="listtyle1".*?>.*?<a.*?class="big".*?title="(.*?)".*?>.*?<img class="img".*?src="(.*?)".*?>.*?<div class="c1">.*?<span>(.*?)</span>',re.S)
result = pattern.findall(code)
for image in result:
    image_alt = image[0]
    image_src = image[1]
    # The 4th space-separated token of the span text is used as the
    # popularity figure -- assumes the text always has at least four
    # tokens; TODO confirm against the live page.
    image_rq = image[2].split(' ')[3]
    print(image_alt + '(' + image_rq + '人气' + ')', image_src)
2.xpath方法:
from lxml import etree
import requests
# Download the listing page with the shared URL and headers.
response = requests.get(url, headers=headers)
# Parse the raw response bytes into an element tree.
root = etree.HTML(response.content)
# Every <a> that is a direct child of <div class="listtyle1">.
image_list = root.xpath('//div[@class="listtyle1"]/a')
for image in image_list:
    # A leading '.' makes each expression relative to the current <a> node.
    image_alt = image.xpath('./img/@alt')[0]
    image_src = image.xpath('./img/@src')[0]
    # The 4th space-separated token of the first matching span text is used
    # as the popularity figure -- assumes that layout; TODO confirm.
    image_rq = image.xpath('./div/div/div/span/text()')[0].split(' ')[3]
    # Assemble and print "title(popularity人气) image-url".
    print(image_alt + '(' + image_rq + '人气' + ')', image_src)
3.bs4方法:
from bs4 import BeautifulSoup
import requests
# from lxml import etree
# Download the listing page and keep the raw bytes; BeautifulSoup decodes them.
response = requests.get(url, headers=headers).content
soup = BeautifulSoup(response, 'lxml')
# CSS selector: every <div> carrying the class "listtyle1".
image_list = soup.select('div.listtyle1')
for tag in image_list:
    value = tag.find_all('img')
    for image in value:
        image_alt = image.get('alt')
        image_src = image.get('src')
        # tag.span is the first <span> found under this div; the 4th
        # space-separated token of its text is used as the popularity
        # figure -- assumes that layout; TODO confirm.
        # NOTE(review): the original indentation was lost; the lookup and
        # the print are placed inside the inner loop so each image is
        # reported once -- confirm this matches the intended nesting.
        image_rq = tag.span.string.split(' ')[3]
        print(image_alt + '(' + image_rq + '人气' + ')', image_src)
三者输出结果如下: