1. A queue holds the usernames of the top 200 ranked authors; each username is taken off the queue and handled by its own thread, which fetches all of that author's original blog posts.
2. A thread lock ( with csv_writer_lock: ) guards writes to the CSV file, so concurrent threads cannot interleave their output (see the standalone sketch after this list).
3. At most 5 threads run at a time, crawling 5 authors' blogs in parallel.
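Point 2 is the core concurrency idiom: every worker thread shares one threading.Lock and acquires it before touching the file. A minimal standalone sketch of that pattern, before the full script (write_row here is a hypothetical helper, not part of the script below):

import csv
import threading

csv_writer_lock = threading.Lock()  # one shared lock for all writer threads

def write_row(path, row):
    # Only one thread at a time may open and append to the CSV; without the
    # lock, rows written by different threads could interleave mid-line.
    with csv_writer_lock:
        with open(path, 'a', encoding='utf-8-sig', newline='') as f:
            csv.writer(f).writerow(row)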
# coding=utf-8
import csv
import json
import threading
import time
import os

import jsonpath
import requests
from queue import Queue

queue_user = Queue()


def parse(username, res):
    """Extract original posts (type == 1) from one page of results and append them to the CSV."""
    blog_base_url = 'https://blog.youkuaiyun.com/'
    blogs = json.loads(res.text)['data']['articleList']
    for blog in blogs:
        if blog['type'] == 1:
            with csv_writer_lock:  # CSV write lock, a threading.Lock()
                with open('./files/csdn_blog.csv', 'a+', encoding='utf-8-sig', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([blog_base_url + username + blog['url'], blog['postTime'], blog['title']])
            # print(blog['postTime'], blog['url'], blog['title'])


def get_user_list():
    """Fetch the usernames of the top 200 ranked authors and put them on the queue."""
    rank_url = 'https://blog.youkuaiyun.com/phoenix/web/blog/allRank?pageSize=200&page='
    rank_list = json.loads(get_web(rank_url).text)
    user_list = jsonpath.jsonpath(rank_list['data']['allRankListItem'], '$..[userName]')
    for i in user_list:
        queue_user.put(i)


def get_web(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Mobile Safari/537.36'}
    res = requests.get(url, headers=header)
    time.sleep(1)  # throttle: one request per second
    return res


def get_blogs(username):
    """Page through one author's article list until an empty page is returned."""
    i = 0
    while True:
        i += 1
        blog_url = f'https://blog.youkuaiyun.com/phoenix/web/article/list?userName={username}&page={i}'
        print(blog_url)
        res = get_web(blog_url)
        blogs = json.loads(res.text)['data']['articleList']
        if blogs:
            parse(username, res)
        else:
            break


if __name__ == '__main__':
    get_user_list()  # fetch the top 200 authors' usernames and put them on the queue
    print(queue_user.queue)
    os.makedirs('./files', exist_ok=True)  # make sure the output directory exists
    if os.path.exists('./files/csdn_blog.csv'):
        os.remove('./files/csdn_blog.csv')
    with open('./files/csdn_blog.csv', 'a+', encoding='utf-8-sig', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['文章链接地址', '发布时间', '文章标题'])  # header row: article URL, post time, article title
    csv_writer_lock = threading.Lock()  # CSV write lock, so concurrent threads don't garble the file
    while True:
        threads = []
        for i in range(5):  # start up to 5 threads, each crawling one author's articles
            if not queue_user.empty():
                t = threading.Thread(target=get_blogs, args=(queue_user.get(),))
                t.start()
                threads.append(t)
        for thread in threads:  # wait for this batch of workers before starting the next
            thread.join()
        if queue_user.empty():
            break
    print('done')
Result:

This program uses a queue and worker threads to crawl the original blog posts of CSDN's top 200 ranked authors in parallel. Each thread fetches all of one author's posts and writes them to a CSV file, with a thread lock preventing concurrent writes from corrupting the output. At most 5 threads run at a time, until every author's posts have been crawled.
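One limitation of the batch loop in __main__ is that it joins all five threads before starting the next five, so one slow author stalls the whole batch. A thread pool keeps five workers busy continuously. A minimal sketch of the same cap-at-5 behaviour, assuming the get_blogs worker from the script above (crawl_all is a hypothetical wrapper, not part of the original):

from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_all(usernames):
    # max_workers=5 matches the original 5-thread cap, but a new task starts
    # as soon as any worker finishes, instead of waiting for a full batch.
    with ThreadPoolExecutor(max_workers=5) as pool:
        futures = {pool.submit(get_blogs, name): name for name in usernames}
        for future in as_completed(futures):
            try:
                future.result()  # surface any exception raised in the worker thread
            except Exception as exc:
                print(f'{futures[future]} failed: {exc}')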