day-9 爬虫实例
1. aiohttp爬虫
import re
import aiohttp
import asyncio
# Captures the text between <title> and </title> in the named group "T".
# IGNORECASE accepts <TITLE>...</TITLE>; DOTALL lets the title span several
# lines. The non-greedy body stops at the first closing tag.
pattern = re.compile(
    r'<title>(?P<T>.*?)</title>',
    re.IGNORECASE | re.DOTALL,
)

# Pages whose titles are fetched.
urls = [
    'https://www.python.org/',
    'https://www.taobao.com/',
    'https://pypi.org/',
    'https://www.git-scm.com/',
    'https://www.jd.com/',
    'https://opendata.sz.gov.cn/',
    'https://www.tmall.com/',
]
async def show_title(url):
    """Fetch the page at ``url`` and print the text of its ``<title>`` tag.

    The coroutine sleeps for one second, downloads the page through a fresh
    aiohttp session (2-second total timeout, certificate verification
    disabled via ``ssl=False``) and searches the body with the module-level
    ``pattern``. The ``T`` group is printed when the pattern matches; nothing
    is printed otherwise.

    :param url: address of the page to download.
    """
    await asyncio.sleep(1)
    # An explicit ClientTimeout replaces the bare number, which aiohttp only
    # accepts as a deprecated shorthand for a total timeout.
    timeout = aiohttp.ClientTimeout(total=2)
    async with aiohttp.ClientSession() as session:
        async with session.get(url, timeout=timeout, ssl=False) as resp:
            html_code = await resp.text()
    matcher = pattern.search(html_code)
    if matcher:
        print(matcher.group('T'))
async def main():
    """Run ``show_title`` concurrently for every address in ``urls``."""
    # asyncio.wait() no longer accepts bare coroutines (removed in Python
    # 3.11), so the coroutines are handed to gather() instead.
    # return_exceptions=True keeps one failing site from aborting the others,
    # matching the old behaviour where failures stayed inside their tasks.
    await asyncio.gather(
        *(show_title(url) for url in urls),
        return_exceptions=True,
    )


asyncio.run(main())
2. 调用第三方API接口获取数据 天行
import requests
# Request pages 1 through 5 of the top-news endpoint and print the title and
# URL of every item in each response's 'newslist'.
for page in range(1, 6):
    resp = requests.get(
        'http://api.tianapi.com/topnews/index',
        params={
            # Placeholder value: replace it with your own API key.
            'key': '自己申请的Key',
            'page': page,
            'num': 20,
        }
    )
    result_dict = resp.json()
    for news in result_dict['newslist']:
        print(news['title'])
        print(news['url'])
3. 阿里云邮箱自动登录
import io
import ssl
import easyocr
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
# Make urllib-based HTTPS downloads in this process skip certificate checks
# (presumably so easyocr can download its model files -- TODO confirm).
# NOTE(review): this switches off TLS verification process-wide; demo use only.
ssl._create_default_https_context = ssl._create_unverified_context

browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')
# NOTE(review): an implicit wait is combined with explicit waits below; the
# Selenium documentation advises against mixing them. Kept as in the original.
browser.implicitly_wait(10)
wait = WebDriverWait(browser, 10)
wait.until(
    expected_conditions.presence_of_element_located(
        (By.CSS_SELECTOR, '.login_panel_iframe')
    )
)

# The login form is reached through two nested iframes. Each frame's position
# is recorded before switching into it; the offsets are summed later to find
# the captcha inside the full-window screenshot.
iframe1 = browser.find_element(By.CSS_SELECTOR, '.login_panel_iframe')
x1, y1 = iframe1.location['x'], iframe1.location['y']
browser.switch_to.frame(iframe1)
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)

# NOTE(review): the account name and password are hard-coded; prefer reading
# them from the environment or a prompt.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('luohao@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('Abc123!!')

# Wait for the captcha image, then read its position and size.
wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico')))
captcha_img = browser.find_element(By.CSS_SELECTOR, '#login_checkcode_ico')
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']

# Screenshot the window and cut out the captcha region.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))
x, y = x1 + x2 + x3, y1 + y2 + y3
# Presumably the ratio of screenshot pixels to page coordinates (display
# scaling) -- adjust for the screen in use; TODO confirm.
scale = 1.25
checkcode_image = browser_image.crop((x * scale, y * scale, (x + width) * scale, (y + height) * scale))
checkcode_image.save('result.png')

# Recognise the captcha text with easyocr (CPU only) and use the first result.
reader = easyocr.Reader(['en'], gpu=False)
code = reader.readtext('result.png', detail=0)[0]

checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(code)
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()
4. 阿里云邮箱自动登录 → 利用超级鹰平台打码
import io
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from chaojiying import ChaojiyingClient
browser = webdriver.Chrome()
browser.set_window_size(1280, 960)
browser.get('http://mail.1000phone.com/')
browser.implicitly_wait(10)

# The login form is reached through two nested iframes. Each frame's position
# is recorded before switching into it; the offsets are summed later to find
# the captcha inside the full-window screenshot.
iframe1 = browser.find_element(By.CSS_SELECTOR, '.login_panel_iframe')
x1, y1 = iframe1.location['x'], iframe1.location['y']
browser.switch_to.frame(iframe1)
iframe2 = browser.find_element(By.CSS_SELECTOR, '#ding-login-iframe')
x2, y2 = iframe2.location['x'], iframe2.location['y']
browser.switch_to.frame(iframe2)

# NOTE(review): the account name and password are hard-coded; prefer reading
# them from the environment or a prompt.
username_input = browser.find_element(By.CSS_SELECTOR, '#username')
username_input.send_keys('luohao@1000phone.com')
password_input = browser.find_element(By.CSS_SELECTOR, '#password')
password_input.send_keys('Abc123!!')

# Wait for the captcha image, then read its position and size.
wait = WebDriverWait(browser, 10)
wait.until(expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, '#login_checkcode_ico')))
captcha_img = browser.find_element(By.CSS_SELECTOR, '#login_checkcode_ico')
size, location = captcha_img.size, captcha_img.location
x3, y3, width, height = location['x'], location['y'], size['width'], size['height']

# Screenshot the window and cut out the captcha region.
image_data = browser.get_screenshot_as_png()
browser_image = Image.open(io.BytesIO(image_data))
x, y = x1 + x2 + x3, y1 + y2 + y3
# Presumably the ratio of screenshot pixels to page coordinates (display
# scaling) -- adjust for the screen in use; TODO confirm.
scale = 2
checkcode_image = browser_image.crop((x * scale, y * scale, (x + width) * scale, (y + height) * scale))
checkcode_image.save('result.png')

# Send the cropped captcha to the Chaojiying service and type the returned
# 'pic_str' into the form.
# NOTE(review): service credentials are hard-coded. 1902 is presumably the
# captcha-type code expected by post_pic -- confirm against the service docs.
chaojiying = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('result.png', 'rb') as file:
    image_data = file.read()
result_dict = chaojiying.post_pic(image_data, 1902)

checkcode_input = browser.find_element(By.CSS_SELECTOR, '#login_checkcode')
checkcode_input.send_keys(result_dict['pic_str'])
login_button = browser.find_element(By.CSS_SELECTOR, '#login_submit_btn')
login_button.click()
5. 剪切图片
from PIL import Image
from chaojiying import ChaojiyingClient
# Open the captcha picture and print its (width, height).
captcha_image = Image.open('g.jpg')
print(captcha_image.size)
width, height = captcha_image.size

# Cut off the bottom 46 pixel rows, then shrink the result in place so it
# fits within half of the cropped width and height, and save it.
captcha_image = captcha_image.crop((0, 0, width, height - 46))
captcha_image.thumbnail((width // 2, (height - 46) // 2))
captcha_image.save('g2.jpg')

# Upload the processed picture and print the service's response.
# NOTE(review): service credentials are hard-coded. 9004 is presumably the
# captcha-type code expected by post_pic -- confirm against the service docs.
client = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('g2.jpg', 'rb') as file:
    image_data = file.read()
print(client.post_pic(image_data, 9004))
6. 超级鹰打码平台
from PIL import Image
from chaojiying import ChaojiyingClient
# Open the captcha picture and print its width and height.
captcha_image = Image.open('kk.png')
width, height = captcha_image.size
print(width, height)

# Shrink the picture in place so it fits within half its size, then save it.
captcha_image.thumbnail((width // 2, height // 2))
captcha_image.save('kk2.png')

# Upload the resized picture and print the service's response.
# NOTE(review): service credentials are hard-coded. 9101 is presumably the
# captcha-type code expected by post_pic -- confirm against the service docs.
client = ChaojiyingClient('jackfrued', '1Qaz2Wsx', '900260')
with open('kk2.png', 'rb') as file:
    image_data = file.read()
print(client.post_pic(image_data, 9101))
7. 通过接码平台读取手机验证码
import re
import bs4
import requests
# A run of four to six consecutive digits.
pattern = re.compile(r'\d{4,6}')

# Page listing the messages received by the shared phone number.
page_url = 'https://www.yinsiduanxin.com/china-phone-number/verification-code-16521686439.html'
# CSS path to the second cell of the first table row on that page.
cell_selector = (
    'body > div.container > div:nth-child(4) > div:nth-child(3) > '
    'div.main > div.layui-row > table > tbody > '
    'tr:nth-child(1) > td:nth-child(2)'
)

resp = requests.get(page_url)
soup = bs4.BeautifulSoup(resp.text, 'html.parser')
td = soup.select_one(cell_selector)

# Print the first digit run found in that cell's text.
results = pattern.findall(td.text)
print(results[0])