通过selenium爬取新浪微博

最新推荐文章于 2025-05-23 23:09:48 发布

原创 · 最新推荐文章于 2025-05-23 23:09:48 发布 · 417 阅读

1 ·

CC 4.0 BY-SA版权

Python爬虫专栏收录该内容

7 篇文章

订阅专栏

该博客详细介绍了如何使用Selenium自动化登录微博，并针对滑动验证码的处理进行了探讨，包括截图、图像处理和模拟滑动的过程。虽然最终未完全实现滑动验证的自动化，但展示了自动化工具在应对验证码挑战时的思路。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
import csv
import PIL.Image as image
from PIL import Image, ImageEnhance
import time, re, random
import requests
from io import StringIO

# Persist one scraped record
def towrite(item):
    """Append ``item`` as a single row to ``weibo.csv`` in the working directory.

    Args:
        item: Iterable of field values that make up the row.

    A failure while writing the row is reported on stdout rather than raised,
    so one bad record does not abort the caller.
    """
    # newline='' is what the csv module expects of its file object: the writer
    # already emits "\r\n", and letting text mode translate the "\n" again
    # yields an empty line after every row on platforms that do so.
    with open('weibo.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        try:
            writer.writerow(item)
        except Exception as err:  # not bare: Ctrl-C / SystemExit still propagate
            print("writer error!", err)

# # 对比RGB值
# def is_similar(image1, image2, x, y):
#     # 获取指定位置的RGB值
#     pixel1 = image1.getpixel((x,y))
#     pixel2 = image2.getpixel((x,y))
#     for i in range(0,3):
#         # 如果相差超过50则认为找到缺口位置
#         if abs(pixel1[i]-pixel2[i]) >= 50:
#             return False
#     return True
#
# # 计算缺口位置
# def get_diff_location(image1, image2):
#     i = 0
#     # 两张原始图的大小都是相同的260*160
#     # 通过两个for循环依次对比RGB值
#     # 如果相差50则认为找到了缺口的位置
#     for i in range(62, 260):
#         for j in range(0, 160):
#             if is_similar(image1, image2, i, j) == False:
#                 return i

# Main routine
def main():
    """Log in to mobile Weibo with Selenium and append scraped posts to CSV.

    Steps, in order:
      1. Open Chrome on the Weibo mobile login page.
      2. Type the account name and password (blank placeholders here) and
         submit the form.
      3. Click the captcha widget if it can be located.
      4. Read author and text from the feed container and hand each pair to
         ``towrite``.

    The slider-captcha automation is an unfinished experiment and stays
    commented out below.  The browser is always closed on exit, including
    when a step raises.
    """
    # Local import: ``find_element(By.XPATH, ...)`` replaces the
    # ``find_element_by_xpath`` helpers that Selenium 4 removed, and it is
    # also available on older Selenium releases.
    from selenium.webdriver.common.by import By

    login_url = "https://passport.weibo.cn/signin/login?entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F"
    driver = webdriver.Chrome()
    try:
        # Log in to Weibo
        driver.get(login_url)
        driver.implicitly_wait(60)  # element lookups wait for up to 60 s
        time.sleep(5)
        username = driver.find_element(By.XPATH, '//*[@id="loginName"]')
        username.send_keys('')  # fill in the account name
        password = driver.find_element(By.XPATH, '//*[@id="loginPassword"]')
        password.send_keys('')  # fill in the account password
        time.sleep(5)
        submit = driver.find_element(By.XPATH, '//*[@id="loginAction"]')
        submit.click()  # press the login button
        time.sleep(2)

        # Click the captcha widget; a missing widget is reported, not fatal.
        try:
            yanzheng = driver.find_element(
                By.XPATH, '//*[@id="embed-captcha"]/div/div[2]/div[1]/div[3]')
            yanzheng.click()
        except Exception as err:
            print(err)

        # NOTE(review): presumably this pause is the window for solving the
        # captcha by hand, since the automation below is disabled - confirm.
        time.sleep(5)

        # # Slider captcha (disabled, unfinished experiment)
        # driver.get_screenshot_as_file("D:/滑动验证.jpg") # screenshot of the whole page
        # imgelement = driver.find_element_by_xpath('/html/body/div[4]/div[2]/div[1]/div/div[1]/div[1]/div/a/div[1]/div/canvas[1]') # locate the captcha
        # location = imgelement.location # x/y coordinates of the captcha
        # size = imgelement.size # width and height of the captcha
        # rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']), int(location['y'] + size['height'])) # region to crop
        # i = Image.open("D:/滑动验证.jpg") # open the screenshot
        # i = i.convert('RGB')
        # frame1 = i.crop(rangle) # crop the region of interest out of the screenshot
        # frame1.save('D:/滑动验证new.jpg')
        # driver.find_element_by_xpath('/html/body/div[4]/div[2]/div[1]/div/div[1]/div[2]/div[2]').click()
        # time.sleep(3)
        #
        # driver.get_screenshot_as_file("D:/滑动验证.jpg")
        # imgelement = driver.find_element_by_xpath('/html/body/div[4]/div[2]/div[1]/div/div[1]/div[1]/div/a/div[1]/div/canvas[2]')
        # location = imgelement.location  # x/y coordinates of the captcha
        # size = imgelement.size  # width and height of the captcha
        # rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
        #           int(location['y'] + size['height']))  # region to crop
        # i = Image.open("D:/滑动验证.jpg")  # open the screenshot
        # i = i.convert('RGB')
        # frame2 = i.crop(rangle)  # crop the region of interest out of the screenshot
        # frame2.save('D:/滑动验证new2.jpg')
        #
        # # Compute the position of the gap
        # loc = get_diff_location(frame1, frame2)
        # print('-------------')
        # print(loc)
        # # Locate the slider knob
        # element = driver.find_element_by_xpath('/html/body/div[4]/div[2]/div[1]/div/div[1]/div[2]/div[2]')
        # location = element.location
        # # Vertical position of the slider knob
        # y = location['y']
        # # Press the mouse button on the element and hold it
        # print("点击按钮不放")
        # ActionChains(driver).click_and_hold(on_element=element).perform()
        # time.sleep(0.15)
        # # Drag
        # print("拖动按钮")
        # ActionChains(driver).move_to_element_with_offset(to_element=element, xoffset=loc + 30, yoffset=loc - 445).perform()
        # # Release the mouse button
        # ActionChains(driver).release(on_element=element).perform()

        # Scrape the post information and save it
        try:
            all_weibo = driver.find_elements(
                By.XPATH, '//*[@id="app"]/div[1]/div[2]/div[2]')
            print("all_weibo:", all_weibo)
            for weibo in all_weibo:
                # The XPath must end on the <h3> element itself: a trailing
                # /text() selects a text node, which WebDriver cannot return
                # as an element, so the lookup would always fail.
                authors = weibo.find_elements(
                    By.XPATH, 'div/div/div/header/div[2]/div/a/h3')
                bodies = weibo.find_elements(
                    By.XPATH, 'div/div/article/div/div/div[1]')
                if not authors or not bodies:
                    # Skip a container lacking either field instead of
                    # letting an IndexError abort the remaining ones.
                    continue
                item = [authors[0].text, bodies[0].text]
                towrite(item)
        except Exception as err:
            print("爬取失败！", err)
    finally:
        # Always shut the browser and its driver process down.
        driver.quit()

# Script entry point: run the scraper only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()