day-9 爬虫实例

最新推荐文章于 2025-09-30 11:13:39 发布

原创最新推荐文章于 2025-09-30 11:13:39 发布 · 2.3k 阅读

0 ·

CC 4.0 BY-SA版权

本文介绍了使用aiohttp进行爬虫实战，调用天行数据API获取信息，并展示了如何实现阿里云邮箱的自动登录，包括利用超级鹰平台解决验证码问题。同时，还涉及到了图片剪切和通过接码平台读取手机验证码的技术细节。

day-9 爬虫实例

1. aiohttp爬虫

import re

import aiohttp
import asyncio

# Named capture group: (?P<T>...) exposes the text between <title> and
# </title> as group 'T'.
# re.I - HTML tag names are case-insensitive, so <TITLE> must match too.
# re.S - lets '.' cross line breaks so a title wrapped over several lines
#        is still found.
pattern = re.compile(r'<title>(?P<T>.*?)</title>', re.I | re.S)

# Home pages whose titles are fetched.
urls = [
    'https://www.python.org/',
    'https://www.taobao.com/',
    'https://pypi.org/',
    'https://www.git-scm.com/',
    'https://www.jd.com/',
    'https://opendata.sz.gov.cn/',
    'https://www.tmall.com/',
]


async def show_title(url):
    """Fetch *url* and print the text of its <title> element.

    Nothing is printed when the page text does not match the module-level
    ``pattern``. Errors raised by aiohttp while requesting or reading the
    page are not caught here and propagate to the caller.
    """
    await asyncio.sleep(1)  # pause for one second before issuing the request
    async with aiohttp.ClientSession() as session:
        # Abort the request if it takes longer than 2 seconds in total.
        request_timeout = aiohttp.ClientTimeout(total=2)
        # ssl=False skips certificate verification for this request.
        async with session.get(url, timeout=request_timeout, ssl=False) as resp:
            html_code = await resp.text()
            matcher = pattern.search(html_code)
            if matcher:
                print(matcher.group('T'))
                
                
async def main():
    """Run show_title() for every entry in ``urls`` concurrently.

    A failure for one URL does not stop the others; each failure is printed
    together with the URL it belongs to.
    """
    # return_exceptions=True hands exceptions back as results instead of
    # propagating the first one, so every URL gets its chance to finish.
    outcomes = await asyncio.gather(
        *(show_title(url) for url in urls),
        return_exceptions=True,
    )
    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, Exception):
            print(f'{url} failed: {outcome!r}')


# asyncio.run() creates the event loop, runs main() to completion and closes
# the loop afterwards. Passing bare coroutine objects to asyncio.wait() is no
# longer accepted on Python 3.11+, hence gather() inside a coroutine.
asyncio.run(main())

2. 调用第三方API接口（天行数据）获取数据

import requests

# Request pages 1 to 5 of the top-news endpoint and print the title and URL
# of every news item returned.
for page in range(1, 6):
    resp = requests.get(
        'http://api.tianapi.com/topnews/index',
        params={
            # Placeholder value: replace it with the API key issued to you.
            'key': '自己申请的Key',
            'page': page,
            'num': 20,
        },
        timeout=10,  # fail instead of hanging forever if the API is unreachable
    )
    result_dict = resp.json()
    for news in result_dict['newslist']:
        print(news['title'])
        print(news['url'])

3. 阿里云邮箱自动登录

import io
import ssl

import easyocr

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# NOTE(review): this replaces the default HTTPS context factory with one that
# does not verify certificates, for the whole process. Presumably it is here
# so that easyocr can download its models - confirm it is really needed.
ssl._create_default_https_context = ssl._create_unverified_context

# Factor applied to element coordinates before cropping the screenshot.
# The screenshot is measured in device pixels, so use 1 when the display
# scaling is 100%, 1.25 for 125% scaling, and 2 on a macOS HiDPI screen.
# If the crop is still wrong, hard-code the crop box instead.
SCREENSHOT_SCALE = 1.25

browser = webdriver.Chrome()  # choose the browser
browser.set_window_size(1280, 960)  # set the window size
browser.get('http://mail.1000phone.com/')  # open the login page
# Implicit wait: element look-ups keep retrying for up to 10 seconds.
browser.implicitly_wait(10)
# Explicit wait: block until the outer login iframe is present in the page.
wait = WebDriverWait(browser, 10)
wait.until(
    expected_conditions.presence_of_element_located(
        (By.CSS_SELECTOR, '.login_panel_iframe')
    )
)

# Outer iframe: remember its offset, then switch into it.
iframe1 = browser.find_element(By.CSS_SELECTOR, '.login_panel_iframe')
x1, y1 = iframe1.location['x'], iframe1.location['y']
browser.switch_to.frame(iframe1)

# Inner iframe: remember its offset as well, then switch into it. The three
# offsets (two iframes + captcha image) are summed below to locate the
# captcha inside the full-window screenshot.
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)

# Fill in the user name and password as a user would type them.
# NOTE(review): demo credentials are hard-coded; do not commit real ones.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('luohao@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('Abc123!!')

# Wait until the captcha image is clickable before measuring it.
wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico')))
captcha_img = browser.find_element(By.CSS_SELECTOR, '#login_checkcode_ico')

# WebElement.size holds the element's width and height, WebElement.location
# its position.
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']

# Take a screenshot of the browser window as PNG bytes; io.BytesIO wraps the
# bytes in a file-like object that Image.open() can read.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))

# Crop the captcha out of the screenshot.
x, y = x1 + x2 + x3, y1 + y2 + y3
checkcode_image = browser_image.crop((
    x * SCREENSHOT_SCALE,
    y * SCREENSHOT_SCALE,
    (x + width) * SCREENSHOT_SCALE,
    (y + height) * SCREENSHOT_SCALE,
))
checkcode_image.save('result.png')

# Recognise the captcha text with easyocr.
reader = easyocr.Reader(['en'], gpu=False)
texts = reader.readtext('result.png', detail=0)
if not texts:
    # An empty result usually means the crop box missed the captcha.
    raise RuntimeError('OCR found no text in result.png; check the crop coordinates')
code = texts[0]

# Type the recognised code into the captcha field and submit the form.
checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(code)
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()

4. 阿里云邮箱自动登录 —> 利用超级鹰平台打码

import io

from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

from chaojiying import ChaojiyingClient

# Factor applied to element coordinates before cropping the screenshot.
# 2 suits a macOS HiDPI screen; on Windows set the display scaling to 100%
# and use 1, or adjust this factor to the scaling in use.
SCREENSHOT_SCALE = 2

browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')
# Implicit wait: element look-ups keep retrying for up to 10 seconds, so no
# explicit wait is needed before locating the outer iframe.
browser.implicitly_wait(10)

# Outer iframe: remember its offset, then switch into it.
iframe1 = browser.find_element(By.CSS_SELECTOR, '.login_panel_iframe')
x1, y1 = iframe1.location['x'], iframe1.location['y']
browser.switch_to.frame(iframe1)

# Inner iframe: remember its offset as well, then switch into it.
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)

# Fill in the user name and password as a user would type them.
# NOTE(review): demo credentials are hard-coded; do not commit real ones.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('luohao@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('Abc123!!')

# Explicit wait: block until the captcha image is clickable.
wait = WebDriverWait(browser, 10)
wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico')))
captcha_img = browser.find_element(By.CSS_SELECTOR, '#login_checkcode_ico')

# WebElement.size holds the element's width and height, WebElement.location
# its position.
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']

# Take a screenshot of the browser window as PNG bytes; io.BytesIO wraps the
# bytes in a file-like object that Image.open() can read.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))

# Crop the captcha out of the screenshot: the offsets of both iframes and of
# the captcha image are summed to get its position in the full window.
x, y = x1 + x2 + x3, y1 + y2 + y3
checkcode_image = browser_image.crop((
    x * SCREENSHOT_SCALE,
    y * SCREENSHOT_SCALE,
    (x + width) * SCREENSHOT_SCALE,
    (y + height) * SCREENSHOT_SCALE,
))
checkcode_image.save('result.png')

# Have the Chaojiying platform recognise the captcha.
# NOTE(review): account credentials are hard-coded; 1902 is presumably the
# platform's captcha-type code - confirm against the Chaojiying type table.
chaojiying = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('result.png', 'rb') as file:
    image_data = file.read()
    result_dict = chaojiying.post_pic(image_data, 1902)

# Type the recognised code into the captcha field and submit the form.
checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(result_dict['pic_str'])
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()

5. 剪切图片

from PIL import Image

from chaojiying import ChaojiyingClient

# Open the source picture and print its original (width, height).
captcha_image = Image.open('g.jpg')
print(captcha_image.size)
full_width, full_height = captcha_image.size

# Cut off the bottom 46 pixels, then shrink what is left so that it fits
# inside a box of half the cropped dimensions.
kept_height = full_height - 46
crop_box = (0, 0, full_width, kept_height)
captcha_image = captcha_image.crop(crop_box)
half_size = (full_width // 2, kept_height // 2)
captcha_image.thumbnail(half_size)
captcha_image.save('g2.jpg')

# Send the processed picture to Chaojiying (type code 9004) and print the
# response as returned by the client.
client = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('g2.jpg', 'rb') as image_file:
    payload = image_file.read()
print(client.post_pic(payload, 9004))

6. 超级鹰打码平台

from PIL import Image

from chaojiying import ChaojiyingClient

# Open the source picture and print its width and height.
source_image = Image.open('kk.png')
full_width, full_height = source_image.size
print(full_width, full_height)

# Shrink the picture in place so that it fits inside a half-size box.
half_size = (full_width // 2, full_height // 2)
source_image.thumbnail(half_size)
source_image.save('kk2.png')

# Send the shrunken picture to Chaojiying (type code 9101) and print the
# response as returned by the client.
client = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('kk2.png', 'rb') as image_file:
    payload = image_file.read()
print(client.post_pic(payload, 9101))

7. 通过接码平台读取手机验证码

import re

import bs4
import requests

# A verification code is taken to be a run of 4 to 6 digits.
pattern = re.compile(r'\d{4,6}')

# Page that lists the SMS messages received by the shared phone number.
SMS_PAGE_URL = 'https://www.yinsiduanxin.com/china-phone-number/verification-code-16521686439.html'
# Selector for the second cell of the first row of the message table.
FIRST_SMS_SELECTOR = (
    'body > div.container > div:nth-child(4) > div:nth-child(3) > div.main > '
    'div.layui-row > table > tbody > tr:nth-child(1) > td:nth-child(2)'
)


def extract_code(text):
    """Return the first run of 4 to 6 digits in *text*, or None if absent."""
    found = pattern.search(text)
    return found.group() if found else None


def main():
    """Download the SMS page and print the code found in the selected cell.

    Exits with an explanatory message when the table cell cannot be located
    or when it contains no 4-6 digit code.
    """
    resp = requests.get(SMS_PAGE_URL, timeout=10)
    soup = bs4.BeautifulSoup(resp.text, 'html.parser')
    td = soup.select_one(FIRST_SMS_SELECTOR)
    if td is None:
        # select_one() returns None when nothing matches the selector.
        raise SystemExit('SMS table cell not found; the page layout may have changed')
    code = extract_code(td.text)
    if code is None:
        raise SystemExit('no verification code found in the selected SMS cell')
    print(code)


if __name__ == '__main__':
    main()