爬取csdn的个人博客
全文构思:
本文使用账号密码进行登录,需要先安装以下依赖:
sudo pip3 install selenium
sudo pip3 install pyperclip
除此之外使用了chrome的浏览器,需要下载驱动,自行百度
使用selenium登录了之后,进行全选和复制(进入的是博客管理的界面)
代码中的账号密码:
login_mobile.send_keys('xxx')
login_password.send_keys('xxx')
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time,os
import pyperclip
class MyCrawl():
    """Selenium-driven crawler that saves blog posts from the web editor.

    Workflow as implemented by the methods below: ``login_csdn`` signs in
    with account and password and opens the post-management page,
    ``get_one_page`` loads one page of the post list, and ``parse_page``
    opens every listed post, copies the editor text through the system
    clipboard and stores it in the ``csdn博客`` directory.
    """

    # Directory (relative to the current working directory) that receives
    # the saved posts and the error log.
    _OUTPUT_DIR = 'csdn博客'

    # Post titles are used as file names; characters that are not allowed in
    # file names on Windows and/or POSIX are replaced by "_".
    _UNSAFE_FILE_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|\r\n\t', '_'))

    def __init__(self):
        """Start a maximized Chrome session and create a 2-second explicit wait."""
        chrome_options = webdriver.ChromeOptions()
        # Headless mode is deliberately left disabled:
        # chrome_options.add_argument('--headless')
        # ``options=`` replaces the ``chrome_options=`` keyword, which newer
        # Selenium releases no longer accept.
        self.browser = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.browser, 2)
        self.browser.maximize_window()

    def get_one_page(self, my_url):
        """Load one page of the post list and crawl every post on it.

        :param my_url: URL of the post-list page to open.
        """
        self.browser.get(my_url)
        self.parse_page()

    def parse_page(self):
        """Open each post linked on the current page and save its text.

        For every title link: click it, select the second window handle
        (presumably the tab opened by the click), send Ctrl+A / Ctrl+C to the
        first ``.editor__inner`` element and read the text back from the
        clipboard.  If any of that fails, the title and the current URL are
        appended to the error log instead.  Afterwards the post window is
        closed and the first window handle is selected again.
        """
        titles = self.browser.find_elements(By.CSS_SELECTOR, '.article-list-item-txt a')
        for title in titles:
            my_title = title.text
            print("文本标题", my_title)
            title.click()
            time.sleep(1)
            self.browser.switch_to.window(self.browser.window_handles[1])
            time.sleep(0.5)
            try:
                editor = self.browser.find_elements(By.CSS_SELECTOR, '.editor__inner')
                time.sleep(0.5)
                # An empty result makes editor[0] raise IndexError, which is
                # handled below as "this post cannot be copied".
                editor[0].send_keys(Keys.CONTROL, 'a')
                editor[0].send_keys(Keys.CONTROL, 'c')
                my_content = pyperclip.paste()
                print("获得了文本内容")
                print(my_content)
                self.write_file(my_title, my_content)
            except Exception:
                # Best effort: per the message below this path is meant for
                # posts written in the rich-text editor; they are skipped and
                # only recorded in the error log.
                print('富文档编辑器放弃爬取\t%s' % (my_title, ))
                time.sleep(0.5)
                self.write_error(my_title, self.browser.current_url)
            self.browser.close()
            self.browser.switch_to.window(self.browser.window_handles[0])
            print(self.browser.current_url)
            time.sleep(0.5)

    def write_error(self, title, content):
        """Append a skipped post to ``error.txt`` in the output directory.

        The record is ``title`` immediately followed by ``content`` and a
        newline.

        :param title: title of the post that could not be saved.
        :param content: text to record after the title (``parse_page`` passes
            the post URL).
        :return: None
        """
        file_path = self._OUTPUT_DIR
        os.makedirs(file_path, exist_ok=True)
        file_name = file_path + os.path.sep + 'error.txt'
        try:
            # Explicit UTF-8 so that Chinese titles do not depend on the
            # platform's default encoding.
            with open(file_name, 'a+', encoding='utf-8') as f:
                print("写入错误文件%s" % (file_name,))
                f.write(title)
                f.write(content+'\n')
        except OSError:
            print('Failed to Save File,item %s' % file_name)

    def write_file(self, title, content):
        """Save one post as ``<title>.txt`` in the output directory.

        The file contains ``title`` immediately followed by ``content``.  An
        existing file with the same name is left untouched.

        :param title: post title; also used as the file name after characters
            that are invalid in file names have been replaced by ``_``.
        :param content: post text.
        :return: None
        """
        file_path = self._OUTPUT_DIR
        os.makedirs(file_path, exist_ok=True)
        # A raw title such as "a/b" would point into a non-existent
        # sub-directory and the save would always fail.
        safe_title = title.translate(self._UNSAFE_FILE_CHARS)
        file_name = file_path + os.path.sep + '{file_name}.{file_suffix}'.format(
            file_name=safe_title,
            file_suffix='txt')
        try:
            if not os.path.exists(file_name):
                with open(file_name, 'w+', encoding='utf-8') as f:
                    print("写入文件%s" % (file_name, ))
                    f.write(title)
                    f.write(content)
            else:
                print('Already Downloaded', file_name)
        except OSError:
            print('Failed to Save File,item %s' % file_name)

    def login_csdn(self):
        """Log in with account and password and open the post-management page.

        The credentials are the two ``'xxx'`` placeholders passed to
        ``send_keys`` below; replace them before running.

        :return: None
        """
        # NOTE(review): the docstring and messages talk about CSDN, but the
        # host names used here are youkuaiyun.com -- confirm the URLs.
        self.browser.get('https://passport.youkuaiyun.com/login')
        time.sleep(2)
        # Second link of the login-method selector; presumably the
        # account/password tab -- verify against the live page.
        user_pass = self.browser.find_elements(By.CSS_SELECTOR, '.main-select li a')[1]
        user_pass.click()
        login_mobile = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#all')))
        login_password = self.wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#password-number')))
        time.sleep(0.1)
        login_mobile.send_keys('xxx')
        login_password.send_keys('xxx')
        time.sleep(0.5)
        login_button = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn-primary')))
        login_button.click()
        # NOTE: printed right after the click; nothing here verifies that the
        # login actually succeeded.
        print("登陆成功")
        time.sleep(5)
        # Hover over the element with class ``login_img`` before moving on.
        area = self.wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'login_img')))
        ActionChains(self.browser).move_to_element(area).perform()
        self.browser.get('https://mp.youkuaiyun.com/')
        start_write = self.wait.until(EC.element_to_be_clickable((By.ID, 'btnStart')))
        start_write.click()
        print('进入了管理博客页面')

    def __del__(self):
        """Shut the browser session down when the crawler is destroyed."""
        # ``browser`` is missing when __init__ failed before creating it.
        browser = getattr(self, 'browser', None)
        if browser is not None:
            # quit() ends the whole driver session; close() would only close
            # the current window and leave the driver process running.
            browser.quit()
if __name__ == '__main__':
    # Log in once, then crawl post-list pages 1 through last_page (inclusive).
    mycrawl = MyCrawl()
    mycrawl.login_csdn()
    last_page = 12  # number of post-list pages to walk; adjust per account
    for page_number in range(1, last_page + 1):
        mycrawl.get_one_page('https://mp.youkuaiyun.com/postlist/list/all/' + str(page_number))