爬虫 leboncoin 极验滑块与风控分析

最新推荐文章于 2024-06-04 00:39:57 发布

小木哟

最新推荐文章于 2024-06-04 00:39:57 发布

阅读量1w

点赞数

分类专栏：爬虫 python 文章标签： python 爬虫

本文链接：https://blog.youkuaiyun.com/u013291301/article/details/124401997

版权

python 同时被 2 个专栏收录

12 篇文章

订阅专栏

爬虫

8 篇文章

订阅专栏

目标网站：aHR0cHM6Ly93d3cubGVib25jb2luLmZyLw==

极验3.0

滑块图片通过 canvas 画布展示，即使用 pyppeteer 执行 canvas 的 toDataURL 也无法获取原图

在这里插入图片描述

用 pyppeteer 拦截图片 URL 并下载后，得到的图片是乱序的

在这里插入图片描述

# 有缺口的背景图
if 'static.geetest.com/pictures/gt/' and '.webp' and '/bg/' in request.url:
    if 'pagead2.googlesyndication' not in request.url:
        if self.picture_url_bg_gap == '':
            self.picture_url_bg_gap = request.url
            self.logger.error(request.url)
    await request.continue_()
# 滑块图
elif 'static.geetest.com/pictures/gt/' and '.png' and '/slice/' in request.url:
    if self.picture_url_slice == '':
        self.picture_url_slice = request.url
        self.logger.error(request.url)
    await request.continue_()

乱序图片还原算法

# 还原极验3.0乱序图片
def parse_bg_captcha(self, img, save_path=None):
    """Reassemble a shuffled GeeTest 3.0 background image.

    The source image is read as 52 tiles (2 rows x 26 columns, each tile
    10 x 80 px, source columns spaced 12 px apart starting at x = 1) and the
    tiles are pasted into a new 260 x 160 RGB image in a fixed order.

    Args:
        img: Path to the image (``str`` or ``Path``) or the raw image bytes.
        save_path: Where to write the restored image. When ``None`` nothing
            is written.

    Returns:
        The resolved ``save_path`` as a string, or ``None`` when no
        ``save_path`` was given (the restored image is then not returned).

    Raises:
        ValueError: If ``img`` is not ``str``, ``Path`` or ``bytes``.
    """
    if isinstance(img, bytes):
        shuffled = Image.open(io.BytesIO(img))
    elif isinstance(img, (str, Path)):
        shuffled = Image.open(img)
    else:
        raise ValueError(
            f'输入图片类型错误, 必须是<type str>/<type Path>/<type bytes>: {type(img)}')

    # Fixed permutation: entry k is the source tile shown at position k.
    tile_order = [39, 38, 48, 49, 41, 40, 46, 47, 35, 34, 50, 51, 33, 32, 28, 29, 27, 26, 36, 37, 31, 30, 44, 45, 43,
                  42, 12, 13, 23, 22, 14, 15, 21, 20, 8, 9, 25, 24, 6, 7, 3, 2, 0, 1, 11, 10, 4, 5, 19, 18, 16, 17]
    tile_w, tile_h = 10, 80

    restored = Image.new('RGB', (260, 160))
    for dst_idx, src_idx in enumerate(tile_order):
        # Indices 0-25 are the top row, 26-51 the bottom row.
        src_row, src_col = divmod(src_idx, 26)
        dst_row, dst_col = divmod(dst_idx, 26)
        left = src_col * 12 + 1
        top = src_row * tile_h
        tile = shuffled.crop((left, top, left + tile_w, top + tile_h))
        restored.paste(tile, (dst_col * tile_w, dst_row * tile_h))

    if save_path is not None:
        save_path = str(Path(save_path).resolve())
        restored.save(save_path)
    return save_path

识别缺口位置

# 识别缺口位置
def get_slice_x(self, bg_path, slice_path):
    """Locate the slider piece inside the background via edge matching.

    Both images are loaded with imread flag 0 (grayscale), reduced to Canny
    edges, and compared with normalised cross-correlation template matching.
    As a side effect the best match is outlined on the grayscale background
    and written to ``out.jpg`` in the current working directory.

    Args:
        bg_path: Path of the background image file.
        slice_path: Path of the slider piece image file.

    Returns:
        The x coordinate of the top-left corner of the best match.
    """
    background = cv2.imread(bg_path, 0)
    piece = cv2.imread(slice_path, 0)

    # Edge maps, expanded back to three channels before matching.
    bg_edges = cv2.cvtColor(cv2.Canny(background, 100, 200), cv2.COLOR_GRAY2RGB)
    piece_edges = cv2.cvtColor(cv2.Canny(piece, 100, 200), cv2.COLOR_GRAY2RGB)

    scores = cv2.matchTemplate(bg_edges, piece_edges, cv2.TM_CCOEFF_NORMED)
    # Only the location of the highest score is needed.
    _, _, _, top_left = cv2.minMaxLoc(scores)

    # Outline the matched region and save it for visual inspection.
    piece_h, piece_w = piece_edges.shape[:2]
    bottom_right = (top_left[0] + piece_w, top_left[1] + piece_h)
    cv2.rectangle(background, top_left, bottom_right, (0, 0, 255), 2)
    cv2.imwrite('out.jpg', background)
    return top_left[0]

为避免轨迹检测而设计的轨迹生成算法

# 生成轨迹
def slide_list(self, total_length):
    """Generate per-tick horizontal offsets for dragging the slider.

    The motion is simulated with constant acceleration (0.4 per tick) until
    3/5 of the distance is covered, then constant deceleration (-0.5 per
    tick). One list entry is produced per simulated tick.

    Each offset is the difference between successive rounded positions, and
    the simulated position is capped at ``total_length``. The offsets
    therefore always sum to ``round(total_length)``; rounding every step
    independently let the total drift and overshoot the target.

    Args:
        total_length: Distance to cover, in pixels.

    Returns:
        A list of integer offsets, one per tick. Empty when
        ``total_length`` is zero or negative.
    """
    velocity = 0.0
    tick = 1  # duration of one simulation step (arbitrary time unit)
    offsets = []
    position = 0.0  # exact simulated position
    emitted = 0  # sum of the integer offsets produced so far
    mid = total_length * 3 / 5  # deceleration starts once this is reached
    while position < total_length:
        # A smaller acceleration gives smaller steps and a longer track.
        accel = 0.4 if position < mid else -0.5
        step = velocity * tick + 0.5 * accel * tick ** 2
        # Never move past the target; the last step is shortened instead.
        position = min(position + step, total_length)
        velocity += accel * tick
        # Cumulative rounding keeps the running integer total on target.
        offset = round(position) - emitted
        offsets.append(offset)
        emitted += offset
    return offsets

根据生成的轨迹滑动滑块

length_list = self.slide_list(x)
for length in length_list:
    await self.page.mouse.move(self.page.mouse._x + length, self.page.mouse._y, {'delay': random.randint(1000, 2000), 'steps': 3})
await self.page.mouse.move(self.page.mouse._x - 1, self.page.mouse._y, {'delay': random.randint(1000, 2000), 'steps': 3})

用 pyppeteer 拦截验证通过后的请求（request）

如何获得request示例

try:
	proxy = {}
    req = {
        "headers": request.headers,
        "data": request.postData,
        "proxy": proxy,
        "timeout": 5,
        "ssl": False,
    }
    try:
        # 使用第三方库获取响应
        async with aiohttp_session.request(
            method=request.method, url=request.url, **req
        ) as response:
            body = await response.read()
    except Exception as e:
        body = ''
        self.logger.error(e)
        await request.abort()
    # 数据返回给浏览器
    resp = {"body": body, "headers": response.headers,
            "status": response.status}
    if response.status == 200:
        self.request = request
        self.stops = True
    await request.respond(resp)
except Exception as e:
    self.logger.error(e)

获得request后请求示例

async def get_data(self, request, i):
    """Fetch one search-result page, reusing a captured request's details.

    The method, headers and post data of ``request`` are reused against the
    search URL built from ``self.keyword`` and the page number.

    Args:
        request: Previously intercepted request whose ``method``,
            ``headers`` and ``postData`` are reused.
        i: Page number placed in the ``page`` query parameter.

    Returns:
        The aiohttp response object when the status is 200.

    Raises:
        Exception: On a non-200 status or any request failure. The original
            error is attached as the cause.
    """
    myurl = 'https://www.leboncoin.fr/recherche?text={}&page={}'.format(
        self.keyword, i)
    try:
        proxy = {}
        req = {
            "headers": request.headers,
            "data": request.postData,
            "proxy": proxy,
            "timeout": 5,
            "ssl": False,
        }
        # Fetch the response with the third-party HTTP client.
        async with aiohttp_session.request(
            method=request.method, url=myurl, **req
        ) as response:
            # Read the payload before the context manager exits.
            await response.read()
        if response.status == 200:
            return response
        raise Exception(myurl, response.status)
    except Exception as e:
        # Log one pre-formatted message: passing the URL as the first
        # argument made the logger treat it as a format string.
        self.logger.error(f'{myurl} {e}')
        raise Exception(e) from e

pyppeteer 的请求与响应监听

async def intercept_network_request(self, request):
    """Let an intercepted request proceed when its URL passes the filter.

    The filter is the empty string, which is contained in every URL, so
    every request is continued.

    NOTE(review): with a non-empty filter, requests that do not match would
    be neither continued nor aborted here — confirm that is intended.
    """
    if '' not in request.url:
        return
    await request.continue_()
async def intercept_network_response(self, response):
    """Observe a network response whose URL passes the filter.

    The filter is the empty string, which is contained in every URL.

    This handler performs no action on the response. It used to call
    ``request.continue_()``, but no ``request`` name exists in this scope,
    so every response raised ``NameError``.
    """
    if '' not in response.url:
        return
    # Matching responses can be inspected here.
# Turn on request interception for the page; the coroutine is run to
# completion on the event loop before the handlers are attached.
loop = asyncio.get_event_loop()
loop.run_until_complete(self.page.setRequestInterception(True))
# The lambdas wrap each coroutine handler in a task instead of awaiting it.
self.page.on('request', lambda request: asyncio.create_task(self.intercept_network_request(request)))
self.page.on('response', lambda response: asyncio.create_task(self.intercept_network_response(response)))