CSS偏移反爬
说明:利用CSS样式将乱序的文字排版为人类正常阅读的顺序,但是爬虫获取到的是乱序的
例如:https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E4%B8%8A%E6%B5%B7&searchDepartureTime=2020-09-11&searchArrivalTime=2020-09-13&nextNDays=0&startSearch=true&fromCode=BJS&toCode=SHA&from=flight_dom_search&lowestPrice=null
去哪儿网的机票价格
解决办法:根据CSS样式中的宽度(width)和偏移量(left)还原数字的真实顺序;也可以退而使用光学字符识别技术从页面截图中提取文字
步骤:
1、分析数字规律;
2、定位数字所在标签,得到基准数据;
3、提取其他标签的偏移量和数字;
4、根据偏移量决定基准数据列表的覆盖元素。
import re
from parsel import Selector
from selenium import webdriver
# Decode flight prices on qunar.com that are scrambled with CSS offsets.
#
# As parsed below, each price sits in an <em class="rel"> element:
#   * the first <b> holds the base digits, one per <i>, and its CSS width
#     (divided by the per-digit width) says how many of them are shown;
#   * every following <b> carries one replacement digit plus a CSS `left`
#     offset that selects which base digit it covers.

# Pixel width of one rendered digit; widths and offsets are divided by it.
DIGIT_WIDTH = 16
# Non-greedy, digit-only patterns so that a style holding several `px`
# declarations cannot be over-captured.
WIDTH_RE = re.compile(r'width:\s*(\d+)px')
LEFT_RE = re.compile(r'left:\s*(-?\d+)px')

driver_path = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
# NOTE(review): `executable_path` is the Selenium 3 style argument; newer
# Selenium releases expect a Service object instead - confirm the version.
driver = webdriver.Chrome(executable_path=driver_path)
url = 'https://flight.qunar.com/site/oneway_list.htm?searchDepartureAirport=%E5%8C%97%E4%BA%AC&searchArrivalAirport=%E4%B8%8A%E6%B5%B7&searchDepartureTime=2020-09-11&searchArrivalTime=2020-09-13&nextNDays=0&startSearch=true&fromCode=BJS&toCode=SHA&from=flight_dom_search&lowestPrice=null'
try:
    # driver.get() returns None; the rendered HTML is exposed on the driver.
    # NOTE(review): the price list may render asynchronously - an explicit
    # wait before reading page_source may be required; verify on the live page.
    driver.get(url)
    sel = Selector(text=driver.page_source)
finally:
    # Always release the browser process, even when loading fails.
    driver.quit()

em = sel.css('em.rel').extract()  # one HTML fragment per obfuscated price
for element in em:
    element_b = Selector(text=element).css('b').extract()
    if not element_b:
        # No <b> children: nothing to decode for this element.
        continue

    # The first <b> wraps the <i> tags that hold the base digits.
    b1 = Selector(text=element_b.pop(0))
    b1_style = b1.css('b::attr("style")').extract_first() or ''
    width_match = WIDTH_RE.search(b1_style)
    if width_match is None:
        # Without a width the number of visible digits is unknown.
        continue
    # The base list can hold more digits than the <b> is wide; keep only
    # the ones that fit inside its width.
    number = int(width_match.group(1)) // DIGIT_WIDTH
    base_price = b1.css('i::text').extract()[:number]

    # Each remaining <b> overrides one base digit, chosen by its offset.
    for eb in element_b:
        eb = Selector(text=eb)
        style = eb.css('b::attr("style")').get() or ''
        left_match = LEFT_RE.search(style)
        if left_match is None:
            continue
        position = int(left_match.group(1))
        value = eb.css('b::text').get()
        # A negative offset yields a negative index, i.e. it counts from the
        # end of the list; int() truncates toward zero like the original.
        index = int(position / DIGIT_WIDTH)
        if -len(base_price) <= index < len(base_price):
            base_price[index] = value
    print(base_price)