"""
Selenium是驱动浏览器(chrome, firefox, IE)进行浏览器相关操作(打开url,点击网页中的按钮或链接、输入文本)的工具。
在Python程序中使用时,需要selenium的库和相应浏览器的驱动程序(Windows, Linux, Mac)
pip install selenium # 最新版本不一定好用,有可能打开浏览器的速度非常慢
pip --default-timeout=100 install selenium==4.1.1 -i https://pypi.tuna.tsinghua.edu.cn/simple
chrome://version/
https://googlechromelabs.github.io/chrome-for-testing/
executable_path:指定浏览器驱动的路径,一般将chromedriver.exe放在python解释器目录下或者创建系统环境变量后可以不指定
"""
import time
import requests
from selenium.webdriver.chrome.options import Options
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from spider_WE.utils.tujian import crack_code_img_b64
def get_lagou():
    """Search lagou.com for "python" jobs and print each posting's details.

    Opens the Lagou home page, closes the pop-up, submits the search, then
    opens every result in its own window and prints the job description and
    the work address.  The browser is always shut down, even when a step
    raises.
    """
    # Create the browser object.
    browser = Chrome()
    try:
        browser.get('https://www.lagou.com/')
        # Locate the pop-up's close button by XPath and click it.
        browser.find_element('xpath', '//*[@id="cboxClose"]').click()
        # Pause so the page can update after the click; without it the next
        # interaction fails with "element not interactable".
        time.sleep(1)
        # Type "python" into the search box and press Enter
        # (Keys.* sends keyboard keys).
        browser.find_element('xpath', '//*[@id="search_input"]').send_keys('python', Keys.ENTER)
        time.sleep(2)
        # Selenium can execute JavaScript in the page.  Remove the first
        # element with class "content-right__3l85R"; the guard keeps the
        # script from throwing when that element is not present.
        browser.execute_script("""
            var a=document.getElementsByClassName('content-right__3l85R')[0];
            if (a && a.parentNode) { a.parentNode.removeChild(a); }
        """)
        # Data extraction: one div per job posting.
        divs = browser.find_elements('xpath', '//*[@id="jobList"]/div[1]/div[@class="item__10RTO"]')
        for div in divs:
            div.find_element('xpath', './/*[@id="openWinPostion"]').click()
            # The click shows the detail page in a new window, but Selenium
            # is still attached to the list window, so switch to the newest
            # window handle.
            browser.switch_to.window(browser.window_handles[-1])
            job_detail = browser.find_element('xpath', '//*[@id="job_detail"]/dd[2]/div')
            job_address = browser.find_element('xpath', '//*[@id="job_detail"]/dd[3]')
            address_elements = job_address.find_elements(
                'xpath',
                './/div[@class="work_addr"]//a[position() < last()] | .//span')
            # Join the text of every address fragment with commas and drop
            # any trailing commas left by empty fragments.
            address_parts = [element.text.strip() for element in address_elements]
            job_address_text = ("工作地址:" + ",".join(address_parts)).rstrip(",")
            print(job_detail.text)
            print(job_address_text)
            print('-----------------------------------------------------')
            time.sleep(1)
            # Close the detail window.
            browser.close()
            time.sleep(1)
            # Point Selenium back at the list window.
            browser.switch_to.window(browser.window_handles[0])
        time.sleep(5)
    finally:
        # Shut the browser down on success and on failure alike.
        browser.quit()
"""
切换iframe,找到iframe标签
iframe = browser.find_element('xpath', '//*[@id="mplay"]')
browser.switch_to.frame(iframe)
跳出iframe
browser.switch_to.parent_frame()
"""
"""
下拉列表处理
"""
from selenium.webdriver.support.select import Select # 下拉列表 <select>
def get_pull_down():
    """Step through every option of a <select> drop-down and print the rows shown.

    The URL and XPath strings are placeholders that must be replaced with
    real values before the function can do useful work.  The browser is
    always shut down when the function finishes or fails.
    """
    browser = Chrome()
    try:
        browser.get('--------')
        sel = browser.find_element('xpath', '-----')
        sel_new = Select(sel)
        # Query the option list once and reuse it for printing and counting.
        options = sel_new.options
        print(options)  # all options
        # Select offers three ways to switch option:
        #   select_by_index(i)         - by position
        #   select_by_value(v)         - by the option's value attribute
        #   select_by_visible_text(t)  - by the displayed text
        # e.g. <option value='2021'>2021年</option> is index 0, value '2021'.
        for i in range(len(options)):
            sel_new.select_by_index(i)  # switch by position
            time.sleep(5)  # wait for the page to load after switching
            trs = browser.find_elements('xpath', '-------')
            for tr in trs:
                print(tr.text)
    finally:
        browser.quit()
"""
无头浏览器信息配置
from selenium.webdriver.chrome.options import Options
opt = Options()
opt.add_argument('--headless')
opt.add_argument('--disable-gpu')
browser = Chrome(options=opt)
"""
"""
超级鹰登录 验证码测试
"""
from spider_WE.utils.chaojiying import crack_code_img_by
def login_cjy():
    """Fill in the chaojiying.com login form, solving its captcha via crack_code_img_by.

    Screenshots the captcha image, sends the PNG bytes to
    ``crack_code_img_by`` and types the returned text into the form.  The
    browser is always shut down when the function finishes or fails.
    """
    form_xpath = '/html/body/div[3]/div/div[3]/div[1]/form'
    browser = Chrome()
    try:
        browser.get('https://www.chaojiying.com/user/login/')
        # Screenshot only the captcha <img> element, as PNG bytes.
        png = browser.find_element('xpath', form_xpath + '/div/img').screenshot_as_png
        v_code = crack_code_img_by(png)
        browser.find_element('xpath', form_xpath + '/p[1]/input').send_keys('账号')
        browser.find_element('xpath', form_xpath + '/p[2]/input').send_keys('密码')
        browser.find_element('xpath', form_xpath + '/p[3]/input').send_keys(v_code)
        # NOTE(review): this clicks the same p[3]/input the captcha was just
        # typed into; presumably the submit button (p[4]/input?) was
        # intended - confirm against the live page before changing it.
        browser.find_element('xpath', form_xpath + '/p[3]/input').click()
    finally:
        browser.quit()
"""
requests破解简单验证码登录
"""
import requests
from fake_useragent import UserAgent
def login_tj():
    """Log in to admin.ttshitu.com with requests, solving its captcha via crack_code_img_b64.

    Fetches the captcha (base64 image plus image id), has it recognised,
    posts the login form on the same session and prints the response body.
    The session is closed on exit and every request carries a timeout so the
    script cannot hang on a stalled connection.
    """
    verify_url = 'https://admin.ttshitu.com/captcha_v2'
    login_url = 'https://admin.ttshitu.com/common/api/login/user'
    with requests.Session() as session:
        # Request the captcha; the JSON carries the image as base64 under
        # 'img' and its id under 'imgId'.
        resp = session.get(verify_url, timeout=30)
        img = resp.json()
        # typeid 3 is used for this captcha.
        img_code = crack_code_img_b64(img['img'], 3)
        headers = {'User-Agent': UserAgent().random}
        data = {
            'captcha': img_code,
            'developerFlag': False,
            'imgId': img['imgId'],
            'needCheck': True,
            'password': "密码",
            'userName': "用户名",
        }
        resp = session.post(login_url, headers=headers, json=data, timeout=30)
        print(resp.text)
"""
selenium 拿cookie
"""
def get_cookies():
    """Collect cookies from a Selenium session and reuse them in a requests call.

    The browser is shut down as soon as the cookies have been read, whether
    or not reading them succeeded.
    """
    browser = Chrome()
    try:
        browser.get('https://www.baidu.com')
        cookies = browser.get_cookies()
    finally:
        browser.quit()
    print(cookies)
    # Selenium returns a list of cookie dicts; requests accepts a plain
    # name -> value mapping, so flatten the list.
    cookies_dic = {dic['name']: dic['value'] for dic in cookies}
    headers = {}
    # NOTE(review): 'url' is a placeholder and must be replaced with a real
    # URL before this request can succeed.
    requests.get('url', headers=headers, cookies=cookies_dic, timeout=30)
"""
----------等待------------
browser = Chrome()
browser.get('https://baidu.com')
time.sleep(2) # 必须等待2秒
browser.implicitly_wait(10) # 最多等待10秒,可能提前被唤醒,全局,隐式等待
browser.find_element('xpath', 'xxxxx')
browser.find_element('xpath', 'xxxxx')
# WebDriverWait(browser,10,0.5) # 显式等待,最多等待10秒,间隔0.5秒查一次
"""
"""
# 获取当前页面的源代码
page_source = browser.page_source
"""
"""
bili登录点选验证码(验证码通过后弹出手机短信验证页面,手动登录不弹出)
"""
# 事件链
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def login_bili():
    """Log in to bilibili.com through Chrome, solving the click-point captcha.

    Opens the login dialog, submits the credentials, screenshots the captcha,
    asks ``crack_code_img_b64`` for the click coordinates, clicks each point
    and confirms.  Finally hovers over an element in the page header and
    waits before closing.  The browser is always shut down, even when a step
    raises.
    """
    opt = Options()
    opt.add_argument('--disable-blink-features=AutomationControlled')
    browser = Chrome(options=opt)
    try:
        browser.get('https://www.bilibili.com/')
        browser.implicitly_wait(10)
        login_btn = browser.find_element(
            By.XPATH,
            '//*[@id="i_cecream"]/div[2]/div[1]/div[1]/ul[2]/li[1]/li/div[1]/div/span')
        login_btn.click()
        time.sleep(2)
        browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[2]/form/div[1]/input').send_keys('账号')
        browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[2]/form/div[3]/input').send_keys('密码')
        browser.find_element(By.XPATH, '/html/body/div[4]/div/div[4]/div[2]/div[2]/div[2]').click()
        time.sleep(5)
        # Grab the div that contains the whole captcha image.
        verify_div = browser.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[6]/div/div')
        # Option 1: save the captcha to a file (kept for inspection).
        verify_div.screenshot('img_code.png')
        # Option 2: use the captcha's base64 encoding directly.
        img_code_b64 = verify_div.screenshot_as_base64
        verify_code = crack_code_img_b64(img_code_b64, 27)
        print(verify_code)
        # The result is parsed as "x1,y1|x2,y2|...": one pair per click.
        for p in verify_code.split('|'):
            coords = p.split(',')
            x = int(coords[0])
            y = int(coords[1])
            print(x)
            print(y)
            # Check the element's visibility and size before clicking.
            print(f"Element visible: {verify_div.is_displayed()}")
            print(f"Element size: {verify_div.size}")
            # Move the pointer onto the captcha image.
            ActionChains(browser).move_to_element(verify_div).perform()
            # Wait until the element is clickable.  (presence_of_element_located
            # would only require the element to exist in the DOM, regardless
            # of visibility.)
            WebDriverWait(browser, 10, 0.5).until(
                EC.element_to_be_clickable(verify_div)
            )
            # NOTE(review): newer Selenium releases (4.3+, to my knowledge)
            # measure this offset from the element's center rather than its
            # top-left corner; if the solver's coordinates are relative to
            # the top-left, subtract half the element size - confirm for the
            # installed Selenium version.
            # .perform() submits the queued actions.
            ActionChains(browser).move_to_element_with_offset(verify_div, xoffset=x, yoffset=y).click().perform()
            time.sleep(1)
        browser.find_element(By.XPATH, '/html/body/div[5]/div[2]/div[6]/div/div/div[3]/a/div').click()
        time.sleep(5)
        # Locate the target element in the page header.
        target_element = browser.find_element(
            By.XPATH,
            '//*[@id="i_cecream"]/div[2]/div[1]/div[1]/ul[2]/li[1]/div[1]/a[1]/picture/img')
        # Hover the mouse over the target element.
        actions = ActionChains(browser)
        actions.move_to_element(target_element).perform()
        time.sleep(10)
    finally:
        browser.quit()
"""
selenium+lxml
"""
from lxml import etree
def get_page_source(url):
    """Load *url* in Chrome and return the page's HTML source.

    Waits up to 10 seconds (polling every 0.5 s) for the placeholder XPath to
    be present in the DOM before reading the source.  The browser is always
    shut down, so repeated calls do not accumulate Chrome processes.

    Args:
        url: Address of the page to load.

    Returns:
        The page source as reported by the browser.
    """
    browser = Chrome()
    try:
        browser.get(url)
        # Explicit wait: until() raises if the element never appears.
        WebDriverWait(browser, 10, 0.5).until(
            EC.presence_of_element_located((By.XPATH, 'xxxxxxx'))
        )
        return browser.page_source
    finally:
        browser.quit()
def get_job_name(page_source):
    """Parse *page_source* with lxml and print the nodes matched by the XPath.

    The XPath is a placeholder to be replaced with a real expression.
    """
    document = etree.HTML(page_source)
    print(document.xpath('xxxxxxxxx'))
"""
ac=ActionChains(browser)
ac.pause(0.35) # 事件休眠
滑块验证码
btn=browser.find_element(By.XPATH,'xxxxxxxxxxxxxx')
# 方法1,抓取滑动
ActionChains(browser).drag_and_drop_by_offset(btn,xoffset=300,yoffset=0).perform()
# 方法2,点击保持
# ActionChains(browser).click_and_hold(btn).move_by_offset(xoffset=300,yoffset=0)
"""
"""
浏览器版本是88以前,要去执行一段js代码
"""
# browser = Chrome()
# browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
# "source": """
# navigator.webdriver = undefined
# Object.defineProperty(navigator, 'webdriver', {
# get: () => undefined
# })
# """
# })
"""
selenium防检测,浏览器版本88以后的方案
opt = Options()
opt.add_argument('--disable-blink-features=AutomationControlled')
browser = Chrome(options=opt)
"""
def get_baidu():
    """Start a Chrome session and load the Baidu home page."""
    home_url = 'https://www.baidu.com'
    driver = Chrome()
    driver.get(home_url)
if __name__ == '__main__':
    # Run the bilibili login demo when the file is executed as a script.
    login_bili()
# ===== 超级鹰 (Chaojiying) 客户端 — spider_WE/utils/chaojiying.py =====
#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    Every request carries the account name, the MD5 hex digest of the
    password and the software id, plus a timeout so a stalled connection
    cannot hang the caller indefinitely.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store the credentials and build the shared request parameters.

        Args:
            username: Chaojiying account name.
            password: Plain-text password; only its MD5 hex digest is kept.
            soft_id: Software id generated in the Chaojiying user centre.
            timeout: Seconds to wait for each HTTP request (default 30).
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Credentials merged into the form data of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def _post(self, url, params, files=None):
        """POST *params* plus the credentials to *url* and return the decoded JSON."""
        payload = dict(params)
        payload.update(self.base_params)
        r = requests.post(url, data=payload, files=files, headers=self.headers,
                          timeout=self.timeout)
        return r.json()

    def PostPic(self, im, codetype):
        """Upload raw image bytes for recognition.

        Args:
            im: Image content as bytes.
            codetype: Captcha type code; see http://www.chaojiying.com/price.html

        Returns:
            The service's JSON response as a dict.
        """
        files = {'userfile': ('ccc.jpg', im)}
        return self._post('http://upload.chaojiying.net/Upload/Processing.php',
                          {'codetype': codetype}, files=files)

    def PostPic_base64(self, base64_str, codetype):
        """Upload a base64-encoded image for recognition.

        Args:
            base64_str: Image content encoded as a base64 string.
            codetype: Captcha type code; see http://www.chaojiying.com/price.html

        Returns:
            The service's JSON response as a dict.
        """
        params = {
            'codetype': codetype,
            'file_base64': base64_str,
        }
        return self._post('http://upload.chaojiying.net/Upload/Processing.php', params)

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha.

        Args:
            im_id: Picture id of the captcha being reported.

        Returns:
            The service's JSON response as a dict.
        """
        return self._post('http://upload.chaojiying.net/Upload/ReportError.php',
                          {'id': im_id})
def crack_code_img(filename):
    """Recognise the captcha stored in the image file *filename*.

    Args:
        filename: Path of a local image file.

    Returns:
        The ``pic_str`` field of the Chaojiying response.
    """
    # The third argument is the software id; generate your own in the
    # Chaojiying user centre and substitute it here.
    chaojiying = Chaojiying_Client('账号', '密码', '956724')
    # Read the image through a context manager so the handle is closed.
    with open(filename, 'rb') as image_file:
        im = image_file.read()
    resp = chaojiying.PostPic(im, 8001)
    # For a base64 string, use chaojiying.PostPic_base64(base64_str, codetype).
    return resp['pic_str']
def crack_code_img_by(png_bytes):
    """Recognise a captcha supplied as raw image bytes.

    Args:
        png_bytes: Image content as bytes (e.g. a Selenium PNG screenshot).

    Returns:
        The recognised text (``pic_str``) on success, otherwise the service's
        error description (``err_str``).
    """
    chaojiying = Chaojiying_Client('账号', '密码', '956724')
    resp = chaojiying.PostPic(png_bytes, 8001)
    # Chaojiying reports success as err_no == 0 with err_str == "OK", so
    # testing err_str for truthiness would treat every response as an error
    # and return "OK" instead of the recognised text.
    if int(resp.get('err_no', 0)) != 0:
        return resp['err_str']
    return resp['pic_str']
if __name__ == '__main__':
    # Nothing runs by default; call crack_code_img(<image path>) here to try
    # the client manually.
    pass
# ===== 图鉴 (ttshitu) 客户端 — spider_WE/utils/tujian.py =====
import base64
import json
import requests
# 一、图片文字类型(默认 3 数英混合):
# 1 : 纯数字
# 1001:纯数字2
# 2 : 纯英文
# 1002:纯英文2
# 3 : 数英混合
# 1003:数英混合2
# 4 : 闪动GIF
# 7 : 无感学习(独家)
# 11 : 计算题
# 1005: 快速计算题
# 16 : 汉字
# 32 : 通用文字识别(证件、单据)
# 66: 问答题
# 49 :recaptcha图片识别
# 二、图片旋转角度类型:
# 29 : 旋转类型
#
# 三、图片坐标点选类型:
# 19 : 1个坐标
# 20 : 3个坐标
# 21 : 3 ~ 5个坐标
# 22 : 5 ~ 8个坐标
# 27 : 1 ~ 4个坐标
# 48 : 轨迹类型
#
# 四、缺口识别
# 18 : 缺口识别(需要2张图 一张目标图一张缺口图)
# 33 : 单缺口识别(返回X轴坐标 只需要1张图)
# 五、拼图识别
# 53:拼图识别
def base64_api(uname, pwd, b64, typeid, timeout=30):
    """Send a base64-encoded image to the ttshitu recognition API.

    Args:
        uname: ttshitu account name.
        pwd: ttshitu account password.
        b64: Image content as a base64 string.
        typeid: Recognition type id (see the type table above this function).
        timeout: Seconds to wait for the HTTP request (default 30), so a
            stalled connection cannot hang the script.

    Returns:
        The recognition result when the API reports success, otherwise the
        API's error message.
    """
    data = {"username": uname, "password": pwd, "typeid": typeid, "image": b64}
    response = requests.post("http://api.ttshitu.com/predict", json=data, timeout=timeout)
    result = response.json()
    if result['success']:
        return result["data"]["result"]
    # NOTE: failures (e.g. no worker available) come back as a message, not
    # an exception; callers should detect this and retry the recognition.
    return result["message"]
def crack_code_img(img, num):
    """Recognise the captcha in image file *img* using recognition type *num*.

    Reads the file, base64-encodes its bytes and passes the result to
    ``base64_api``; whatever ``base64_api`` returns is returned unchanged.
    """
    with open(img, 'rb') as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes).decode()
    return base64_api(uname='账号', pwd='密码', b64=encoded, typeid=num)
def crack_code_img_b64(b64, num):
    """Recognise a captcha already encoded as the base64 string *b64*.

    *num* is the recognition type id; the value returned by ``base64_api``
    is passed through unchanged.
    """
    return base64_api(uname='账号', pwd='密码', b64=b64, typeid=num)
if __name__ == "__main__":
    # Nothing to run when this module is executed directly.
    pass