1. Several "crawl threads" fetch the pages and put each page into dataQueue.
2. Several "parse threads" take page content out of dataQueue, extract the link of every post on that page, and then parse the content behind each post link.

pageQueue: holds the page numbers
dataQueue: holds the HTML content of the page for each page number
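Before the full script, here is a minimal sketch of that two-queue hand-off (an illustration only, not part of the original crawler): a producer drains pageQueue with a non-blocking get and pushes placeholder page content into dataQueue, and a consumer drains dataQueue the same way. The names producer/consumer and the fake page content are assumptions for the sketch; the real threads below do HTTP requests and XPath parsing instead.

import queue
import threading

pageQueue = queue.Queue()      # page numbers waiting to be crawled
dataQueue = queue.Queue()      # page content waiting to be parsed
for i in range(1, 4):
    pageQueue.put(i)

def producer():
    while True:
        try:
            page = pageQueue.get(False)            # non-blocking get
        except queue.Empty:                        # queue drained: stop
            break
        dataQueue.put("content of page %d" % page) # placeholder for the real HTML

def consumer():
    while True:
        try:
            print(dataQueue.get(False))
        except queue.Empty:
            break

t = threading.Thread(target=producer)
t.start()
t.join()        # wait for the producer before draining dataQueue
consumer()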
# Threading support
import threading
# Thread-safe queues
import queue
# HTML parsing
from lxml import etree
# HTTP requests
import requests
# JSON serialization
import json
import time
# Crawl thread: take one page number from pageQueue and put that page's HTML into dataQueue
class ThreadCrawl(threading.Thread):
    def __init__(self, threadName, pageQueue, dataQueue):
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of page numbers
        self.pageQueue = pageQueue
        # Queue of page content
        self.dataQueue = dataQueue
        # Request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

    def run(self):
        print("Starting " + self.threadName)
        while not CRAWL_EXIT:
            try:
                # Take one page number, FIFO order.
                # The optional block parameter defaults to True:
                # 1. If the queue is empty and block is True, get() does not return;
                #    it blocks until new data arrives in the queue.
                # 2. If the queue is empty and block is False, get() raises queue.Empty.
                page = self.pageQueue.get(False)  # fetch one page number
                url = "http://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
            except queue.Empty:
                pass
            except Exception as e:
                # Don't let one failed request kill the crawl thread
                print(self.threadName + " request error: " + str(e))
        print("Exiting " + self.threadName)
# Parse thread: take one element from dataQueue (the HTML of one page),
# extract the link of every post on that page,
# then open each post link and parse out the post's author and title.
class ThreadParse(threading.Thread):
    def __init__(self, threadName, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        # Thread name
        self.threadName = threadName
        # Queue of page content
        self.dataQueue = dataQueue
        # File object the parsed data is written to
        self.filename = filename
        # Request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}
        # Lock protecting writes to the shared file
        self.lock = lock

    def run(self):
        print("Starting " + self.threadName)
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except queue.Empty:
                pass
            except Exception as e:
                # A page that fails to parse should not kill the thread
                print(self.threadName + " parse error: " + str(e))
        print("Exiting " + self.threadName)

    def parse(self, html):
        # Parse the page into an HTML DOM
        html = etree.HTML(html)
        # Links of the posts listed on this page
        node_list = html.xpath('//div//a[@class="recmd-content"]/@href')
        for node in node_list:
            fullurl = "https://www.qiushibaike.com" + node
            # Fetch the HTML of the post itself
            html = requests.get(fullurl, headers=self.headers).text
            html = etree.HTML(html)
            # xpath returns a list; here it has a single element, so take it by index: the username
            username = html.xpath('//span[@class="side-user-name"]')[0].text
            print(username)
            # xpath: the post title
            article = html.xpath('//h1[@class="article-title"]')[0].text
            print(article)
            items = {
                "username": username,
                "article": article
            }
            # "with" guarantees that both __enter__ and __exit__ are executed:
            # the lock is acquired before the block and released afterwards,
            # no matter how the code inside the block turns out.
            # acquire the lock, write, release the lock
            with self.lock:
                # Write out one line of parsed data
                self.filename.write(json.dumps(items, ensure_ascii=False) + "\n")
# Exit flags: the main thread sets these to True to tell the worker threads to leave their loops
CRAWL_EXIT = False
PARSE_EXIT = False
def main():
    # Queue of page numbers, sized for 10 pages
    pageQueue = queue.Queue(10)
    # Put in the numbers 1 to 10, FIFO order
    for i in range(1, 11):
        pageQueue.put(i)
    # Queue for the crawl results (the HTML of each page); no size argument means unbounded
    dataQueue = queue.Queue()
    crawlList = ["Crawl thread 1", "Crawl thread 2", "Crawl thread 3"]  # names of the 3 crawl threads
    threadcrawl = []  # list holding the 3 crawl threads
    for threadName in crawlList:
        thread = ThreadCrawl(threadName, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)
    # Output file the parsed data is appended to
    filename = open("qiushibaike.json", "a", encoding="utf-8")
    # Lock shared by the parse threads so only one of them writes to the file at a time
    lock = threading.Lock()
    parseList = ["Parse thread 1", "Parse thread 2", "Parse thread 3"]  # names of the 3 parse threads
    threadparse = []  # list holding the 3 parse threads
    for threadName in parseList:
        thread = ThreadParse(threadName, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)
    # Wait until pageQueue is empty, i.e. every page number has been taken by a crawl thread
    while not pageQueue.empty():
        pass
    # pageQueue is empty, so let the crawl threads leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    print("pageQueue is empty")
    for thread in threadcrawl:
        thread.join()
    print("All crawl threads have finished")
    # Wait until dataQueue is empty, i.e. every page has been handed to a parse thread
    while not dataQueue.empty():
        pass
    global PARSE_EXIT
    PARSE_EXIT = True
    for thread in threadparse:
        thread.join()
    print("All parse threads have finished")
    with lock:
        # Close the output file
        filename.close()
    print("Done, thanks for using this!")

if __name__ == "__main__":
    main()
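The two "while not ...empty(): pass" loops in main() are busy-waits that burn CPU until the queues drain. A common alternative (not what the script above does, just a sketch under that assumption) is to pair Queue.task_done() with Queue.join(), so the main thread blocks until every queued item has actually been processed, and to shut the workers down with a sentinel value instead of a global flag. The names worker, run_with_join, and the None sentinel below are illustrative only.

import queue
import threading

def worker(pageQueue):
    while True:
        page = pageQueue.get()          # blocking get: sleeps until an item arrives
        if page is None:                # sentinel: time to shut down
            pageQueue.task_done()
            break
        # ... fetch and parse the page here ...
        pageQueue.task_done()           # tell the queue this item is fully processed

def run_with_join(num_threads=3, num_pages=10):
    pageQueue = queue.Queue()
    for i in range(1, num_pages + 1):
        pageQueue.put(i)
    threads = [threading.Thread(target=worker, args=(pageQueue,)) for _ in range(num_threads)]
    for t in threads:
        t.start()
    pageQueue.join()                    # blocks until task_done() was called for every put()
    for _ in threads:
        pageQueue.put(None)             # one sentinel per worker so every loop exits
    for t in threads:
        t.join()

if __name__ == "__main__":
    run_with_join()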