多线程爬虫（3）-多线程实战

最新推荐文章于 2022-02-08 18:33:15 发布

原创最新推荐文章于 2022-02-08 18:33:15 发布 · 173 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#python #爬虫

python 同时被 3 个专栏收录

5 篇文章

订阅专栏

线程

2 篇文章

订阅专栏

懒加载

2 篇文章

订阅专栏

本文详细介绍了一种使用Python多线程实现的高效网络爬虫技术。通过创建专门的采集线程和解析线程，利用队列进行数据传递，实现了网页内容的快速抓取和解析。文章深入讲解了线程的启动、运行和结束机制，以及如何通过队列和锁机制确保数据的安全共享。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

如果刚开始学、不太看得懂，可以先看我的前两篇文章，超级短。

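# Design: two kinds of threads -- 3 download threads and 3 parser threads.
# Download threads put page content onto a queue; parser threads get it from that queue.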


import json
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree

# Holds the crawl (download) thread objects.
g_crawl_list=[]

# Holds the parser thread objects.
g_parse_list=[]


# Completion signal shared between the two thread classes: the crawl threads
# set it to True, and the parser threads read it to decide when to stop.
Flag=False
class CrawlThread(threading.Thread):
    """Download worker: turns queued page numbers into queued HTML text.

    Each worker takes page numbers from ``page_queue``, requests the matching
    URL and puts the response text on ``data_queue``.  The module-level
    ``Flag`` is switched to ``True`` only when the last registered worker has
    left ``run``, so consumers never see the "done" signal while another
    worker is still downloading.

    NOTE: workers register themselves on construction, so create every crawl
    thread before starting any of them.
    """

    # Seconds to wait on the server before giving up on a page.
    request_timeout = 10

    # Count of constructed workers that have not finished ``run`` yet; shared
    # by all instances and guarded by ``_workers_lock``.
    _unfinished_workers = 0
    _workers_lock = threading.Lock()

    def __init__(self, name, page_queue, data_queue):
        """Store the queues and register this worker as unfinished.

        Args:
            name: Thread name used in the progress messages.
            page_queue: Queue of page numbers still to download.
            data_queue: Queue that receives each response body as text.
        """
        super().__init__()
        self.name = name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.url = 'http://www.fanjian.net/jiantu-{}'
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
        with CrawlThread._workers_lock:
            CrawlThread._unfinished_workers += 1

    def run(self):
        """Download pages until ``page_queue`` is drained, then sign off."""
        global Flag
        print('%s-----线程启动' % self.name)
        try:
            while True:
                # A non-blocking get makes "check and take" one atomic step;
                # empty() followed by a blocking get() could hang forever if
                # another worker grabbed the last page in between.
                try:
                    page = self.page_queue.get_nowait()
                except Empty:
                    break

                url = self.url.format(page)
                try:
                    r = requests.get(
                        url,
                        headers=self.headers,
                        timeout=self.request_timeout,
                    )
                except requests.RequestException as exc:
                    # One bad page should not take the whole worker down.
                    print('%s-----请求失败 %s: %s' % (self.name, url, exc))
                    continue

                # Hand the response body over to the parser threads.
                self.data_queue.put(r.text)
        finally:
            # Only the last worker to finish may raise the completion flag;
            # raising it earlier lets parsers quit while data is in flight.
            with CrawlThread._workers_lock:
                CrawlThread._unfinished_workers -= 1
                if CrawlThread._unfinished_workers == 0:
                    Flag = True
            print('%s-----线程结束' % self.name)

    
class ParserThread(threading.Thread):
    """Parser worker: turns queued HTML text into JSON lines in a shared file.

    The worker keeps taking page content from ``data_queue`` and stops once
    the module-level ``Flag`` is true and the queue has been drained.
    """

    # Seconds to block waiting for new data before re-checking ``Flag``.
    poll_interval = 1.0

    def __init__(self, name, data_queue, lock, fp):
        """Store the shared queue, lock and output file.

        Args:
            name: Thread name used in the progress messages.
            data_queue: Queue of HTML text to parse.
            lock: Lock serialising writes to ``fp`` across parser threads.
            fp: Open, writable text file shared by all parser threads.
        """
        super().__init__()
        self.name = name
        self.data_queue = data_queue
        self.lock = lock
        self.fp = fp

    def run(self):
        """Parse queued pages until the producers are done and none remain."""
        print('%s-----线程启动' % self.name)
        while True:
            # Sample the flag BEFORE reading the queue: if it was already set
            # and the queue is then found empty, nothing more can arrive.
            crawl_done = Flag
            try:
                # While producers are running, wait a little for data; once
                # they are done, just drain whatever is left without blocking
                # (timeout is ignored when block is False).
                data = self.data_queue.get(
                    block=not crawl_done,
                    timeout=self.poll_interval,
                )
            except Empty:
                if crawl_done:
                    break
                # Producers are merely slow; poll again instead of crashing.
                continue
            self.parse_content(data)
        print('%s-----线程结束' % self.name)

    def parse_content(self, data):
        """Extract title/image entries from one page and append them to ``fp``.

        Every ``<li>`` under ``ul.cont-list`` yields one dict holding the
        XPath results (lists) for its title text and image ``data-src``
        attribute; the page's entries are written as a single JSON line.

        Args:
            data: HTML text of one page.
        """
        tree = etree.HTML(data)
        items = [
            {
                '标题': oli.xpath('.//h2/a/text()'),
                '图片': oli.xpath('.//div[@class="cont-list-main"]//img/@data-src'),
            }
            for oli in tree.xpath('//ul[@class="cont-list"]/li')
        ]
        line = json.dumps(items, ensure_ascii=False) + '\n'
        # The context manager releases the lock even if the write fails.
        with self.lock:
            self.fp.write(line)
        
def create_queue():
    """Build the two work queues used by the crawler.

    Returns:
        A ``(page_queue, data_queue)`` tuple: the first queue is pre-filled
        with the page numbers 1 through 10, the second starts out empty and
        is meant to receive downloaded page content.
    """
    pages = Queue()
    contents = Queue()
    for number in range(1, 11):
        pages.put(number)
    return pages, contents

def create_crawl_thread(page_queue,data_queue):
    """Create the three crawl threads and register them in ``g_crawl_list``.

    The threads are only constructed here; starting them is left to the
    caller.
    """
    for label in ('采集线程一号','采集线程二号','采集线程三号'):
        worker=CrawlThread(label,page_queue,data_queue)
        g_crawl_list.append(worker)
        
def create_parse_thread(data_queue,lock,fp):
    """Create the three parser threads and register them in ``g_parse_list``.

    The threads are only constructed here; starting them is left to the
    caller.
    """
    for label in ('解析线程一号','解析线程二号','解析线程三号'):
        worker=ParserThread(label,data_queue,lock,fp)
        g_parse_list.append(worker)
        
        
def main():
    """Run the crawler: build queues and threads, start them, wait for all.

    Parsed results are appended to ``jian.json`` in the current directory.
    """
    # Step 1: the page-number queue and the page-content queue.
    page_queue, data_queue = create_queue()

    # Serialises writes to the shared output file across parser threads.
    lock = threading.Lock()

    # The context manager closes the file even if thread setup or a join
    # raises, which a trailing fp.close() would not.
    with open('jian.json', 'a', encoding='utf-8') as fp:
        # Construct every worker before starting any of them.
        create_crawl_thread(page_queue, data_queue)
        create_parse_thread(data_queue, lock, fp)

        # Start the crawl threads first, then the parser threads.
        for tcrawl in g_crawl_list:
            tcrawl.start()
        for tparse in g_parse_list:
            tparse.start()

        # The main thread waits for every worker; the file must stay open
        # until the last parser thread has finished writing.
        for tcrawl in g_crawl_list:
            tcrawl.join()
        for tparse in g_parse_list:
            tparse.join()

    print('子线程和主线程都结束')
    
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()