爬取csdn的个人博客

最新推荐文章于 2023-11-03 13:52:14 发布

原创最新推荐文章于 2023-11-03 13:52:14 发布 · 1.1k 阅读

4 ·

CC 4.0 BY-SA版权

python 专栏收录该内容

22 篇文章

订阅专栏

本文介绍如何利用selenium登录csdn并爬取个人博客。首先安装selenium和pyperclip库，还需下载chrome浏览器驱动。通过selenium模拟登录后，实现博客内容的全选与复制。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

爬取csdn的个人博客

全文构思：
本文使用账号密码进行登录，所需依赖的安装命令如下：
sudo pip3 install selenium
sudo pip3 install pyperclip
除此之外还需要安装 Chrome 浏览器，并下载与浏览器版本匹配的驱动（ChromeDriver），下载方法可自行搜索。

使用 selenium 登录之后，进入博客管理界面，逐篇打开文章，在编辑器中全选并复制正文内容，再保存到本地文件。

代码中的账号密码需要替换为自己的（对应下面两行）：
login_mobile.send_keys('xxx')
login_password.send_keys('xxx')

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time,os
import pyperclip

class MyCrawl():
    """Selenium-driven crawler that logs in to CSDN and saves the user's posts.

    Each post listed on a management page is opened in its editor window, the
    editor content is copied through the system clipboard, and the text is
    written to ``OUTPUT_DIR/<title>.txt``. Posts whose content cannot be
    copied are recorded in ``OUTPUT_DIR/error.txt`` instead.
    """

    # Directory (relative to the working directory) that receives all output.
    OUTPUT_DIR = 'csdn博客'

    # Path separators and characters rejected in file names on common file
    # systems; each one is replaced by an underscore when building a file name.
    _UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def __init__(self):
        """Start a maximized Chrome session with a 2-second explicit wait."""
        chrome_options = webdriver.ChromeOptions()
        # ``options=`` is accepted by both Selenium 3.8+ and Selenium 4, unlike
        # the older ``chrome_options=`` keyword.
        self.browser = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.browser, 2)
        self.browser.maximize_window()

    def get_one_page(self, my_url):
        """Load one post-list page and crawl every post linked from it.

        :param my_url: URL of the post-list page to open.
        """
        self.browser.get(my_url)
        self.parse_page()

    def parse_page(self):
        """Open each post on the current list page and save its content.

        For every title link the post is opened, the editor content is copied
        via Ctrl+A / Ctrl+C and read back from the clipboard, and the result is
        passed to :meth:`write_file`. Any failure while copying is logged
        through :meth:`write_error` and the crawl moves on to the next post.
        """
        main_handle = self.browser.current_window_handle
        titles = self.browser.find_elements(By.CSS_SELECTOR, '.article-list-item-txt a')
        for title in titles:
            my_title = title.text
            print("文本标题", my_title)
            title.click()
            time.sleep(1)

            new_handles = [handle for handle in self.browser.window_handles
                           if handle != main_handle]
            if not new_handles:
                # No new window appeared: closing the "current" window here
                # would close the list page itself, so only record the miss.
                print('未打开新窗口，跳过\t%s' % (my_title, ))
                self.write_error(my_title, self.browser.current_url)
                continue

            self.browser.switch_to.window(new_handles[-1])
            time.sleep(0.5)
            try:
                editor = self.browser.find_elements(By.CSS_SELECTOR, '.editor__inner')
                time.sleep(0.5)
                # Empty the clipboard first so a failed copy cannot leave the
                # previous post's text to be saved under this title.
                pyperclip.copy('')
                # An empty ``editor`` list raises IndexError here, which is
                # handled below like any other copy failure.
                editor[0].send_keys(Keys.CONTROL, 'a')
                editor[0].send_keys(Keys.CONTROL, 'c')
                my_content = pyperclip.paste()
                if not my_content:
                    raise ValueError('clipboard is empty after copy')
                print("获得了文本内容")
                print(my_content)
                self.write_file(my_title, my_content)
            except Exception:
                # Deliberately broad: one unreadable post must not stop the
                # crawl. KeyboardInterrupt/SystemExit still propagate.
                print('富文档编辑器放弃爬取\t%s' % (my_title, ))
                time.sleep(0.5)
                self.write_error(my_title, self.browser.current_url)

            self.browser.close()
            self.browser.switch_to.window(main_handle)
            print(self.browser.current_url)
            time.sleep(0.5)

    def _safe_file_name(self, title):
        """Return *title* with file-system-hostile characters replaced by '_'."""
        cleaned = title.translate(self._UNSAFE_FILENAME_CHARS).strip()
        return cleaned or 'untitled'

    def write_error(self, title, content):
        """Append a skipped post to ``error.txt`` in the output directory.

        One line is appended per call: the title, a tab, then *content*.

        :param title: title of the post that could not be saved.
        :param content: text recorded next to the title (the caller passes the
            post URL).
        :return: None
        """
        file_path = self.OUTPUT_DIR
        file_name = file_path + os.path.sep + 'error.txt'
        try:
            os.makedirs(file_path, exist_ok=True)
            # Explicit UTF-8 so Chinese titles do not depend on the locale.
            with open(file_name, 'a', encoding='utf-8') as f:
                print("写入错误文件%s" % (file_name,))
                f.write('%s\t%s\n' % (title, content))
        except OSError:
            print('Failed to Save File，item %s' % file_name)

    def write_file(self, title, content):
        """Write one post to ``<title>.txt`` unless that file already exists.

        The file holds the title on the first line followed by the content.
        Characters that cannot appear in a file name are replaced in the name
        only; the title written inside the file is unchanged.

        :param title: title of the post; also used to build the file name.
        :param content: body text of the post.
        :return: None
        """
        file_path = self.OUTPUT_DIR
        file_name = file_path + os.path.sep + '{file_name}.{file_suffix}'.format(
            file_name=self._safe_file_name(title),
            file_suffix='txt')
        try:
            os.makedirs(file_path, exist_ok=True)
            if os.path.exists(file_name):
                print('Already Downloaded', file_name)
                return
            with open(file_name, 'w', encoding='utf-8') as f:
                print("写入文件%s" % (file_name, ))
                f.write(title + '\n')
                f.write(content)
        except OSError:
            print('Failed to Save File，item %s' % file_name)

    def login_csdn(self, username='xxx', password='xxx'):
        """Log in with account and password, then open the blog management page.

        :param username: account name typed into the login form; the default
            is the placeholder used by the original script.
        :param password: password typed into the login form; the default is
            the placeholder used by the original script.
        :return: None
        """
        # NOTE(review): hosts are kept exactly as found in this file; confirm
        # they are the intended CSDN hosts before running.
        self.browser.get('https://passport.youkuaiyun.com/login')
        time.sleep(2)
        # The second link in the selector switches to the password login form.
        user_pass = self.browser.find_elements(By.CSS_SELECTOR, '.main-select li a')[1]
        user_pass.click()
        login_mobile = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#all')))
        login_password = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#password-number')))
        time.sleep(0.1)
        login_mobile.send_keys(username)
        login_password.send_keys(password)
        time.sleep(0.5)
        login_button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-primary')))
        login_button.click()
        time.sleep(5)
        # Waiting for this element raises on timeout, so the success message
        # is only printed once the post-login page has actually rendered it.
        area = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login_img')))
        print("登陆成功")
        ActionChains(self.browser).move_to_element(area).perform()
        self.browser.get('https://mp.youkuaiyun.com/')
        start_write = self.wait.until(EC.element_to_be_clickable((By.ID, 'btnStart')))
        start_write.click()
        print('进入了管理博客页面')

    def __del__(self):
        """End the browser session when the crawler is garbage-collected."""
        browser = getattr(self, 'browser', None)
        if browser is None:
            # __init__ failed before the driver was created.
            return
        try:
            # quit() ends the whole session; close() only closes one window.
            browser.quit()
        except Exception:
            # Best-effort cleanup: errors raised in a finalizer cannot be
            # handled by anyone and would only produce interpreter noise.
            pass


if __name__ == '__main__':
    # Log in once, then crawl the post-list pages 1 through LAST_PAGE in order.
    LAST_PAGE = 12
    crawler = MyCrawl()
    crawler.login_csdn()
    for page_number in range(1, LAST_PAGE + 1):
        page_url = 'https://mp.youkuaiyun.com/postlist/list/all/' + str(page_number)
        crawler.get_one_page(page_url)

在这里插入图片描述