python支持多线程的爬虫

最新推荐文章于 2025-06-18 14:32:40 发布

原创最新推荐文章于 2025-06-18 14:32:40 发布 · 4k 阅读

4 ·

CC 4.0 BY-SA版权

文章标签：

#函数 #网络爬虫 #多线程 #爬虫 #算法

算法同时被 3 个专栏收录

133 篇文章

订阅专栏

移动开发

109 篇文章

订阅专栏

数据挖掘

70 篇文章

订阅专栏

本文分享了使用Python实现多线程网页爬虫的方法，采用广度优先算法，结合线程池与锁机制，实现高效稳定的网页抓取。

Python 是支持多线程的，主要通过 thread 和 threading 这两个模块来实现。本文主要给大家分享如何用 Python 实现多线程网页爬虫。

一般来说，使用线程有两种模式：一种是先创建线程要执行的函数，再把这个函数传递进 Thread 对象里，让它来执行；另一种是直接从 Thread 继承，创建一个新的 class，把线程执行的代码放到这个新的 class 里。

实现多线程网页爬虫，采用了多线程和锁机制，实现了广度优先算法的网页爬虫。

先给大家简单介绍下我的实现思路：

对于一个网络爬虫，如果要按广度遍历的方式下载，它是这样的：

1.从给定的入口网址把第一个网页下载下来

2.从第一个网页中提取出所有新的网页地址，放入下载列表中

3.按下载列表中的地址，下载所有新的网页

4.从所有新的网页中找出没有下载过的网页地址，更新下载列表

5.重复3、4两步，直到更新后的下载列表为空表时停止

python代码如下：

（说明：以下代码基于 Python 2 编写，使用了 raw_input 和 urllib.urlopen 等 Python 2 接口。）

         #!/usr/bin/env python
         #coding=utf-8
         # NOTE: this script relies on Python 2 APIs such as urllib.urlopen and
         # raw_input.

         import threading
         import urllib
         import re
         import time

         # Shared crawler state. Worker threads append to these lists while holding
         # g_mutex; the coordinating crawler reads and rebinds them between rounds.
         g_mutex = threading.Condition()  # only acquire()/release() are used, i.e. it acts as a lock
         g_pages = []        # downloaded page bodies from which URLs are extracted
         g_queueURL = []     # URLs waiting to be crawled
         g_existURL = []     # URLs already attempted (successful or failed)
         g_failedURL = []    # URLs whose download failed
         g_totalcount = 0    # running count of pages handed to download threads; also names the output files
        
         class Crawler:
             """Breadth-first crawler that downloads one depth level at a time.

             Each round downloads every URL in the module-level ``g_queueURL`` with
             ``CrawlerThread`` workers, then rebuilds the queue from the links found
             in the downloaded pages (``g_pages``) minus the URLs already attempted
             (``g_existURL``). Progress is written to ``log.txt``.
             """

             def __init__(self, crawlername, url, threadnum):
                 """Store the configuration and open ``log.txt`` for writing.

                 crawlername -- display name printed when the crawl starts
                 url         -- seed URL for the first round
                 threadnum   -- number of download threads per batch
                 """
                 self.crawlername = crawlername
                 self.url = url
                 self.threadnum = threadnum
                 self.threadpool = []
                 self.logfile = open("log.txt", 'w')

             def craw(self):
                 """Run the crawl until a round produces no new URLs."""
                 global g_queueURL
                 # Seed from the instance attribute; a module-level ``url`` only
                 # exists when the file is run as a script.
                 g_queueURL.append(self.url)
                 depth = 0
                 print(self.crawlername + " 启动...")
                 try:
                     while g_queueURL:
                         depth += 1
                         print('Searching depth  %d ...\n\n' % depth)
                         self.logfile.write("URL:" + g_queueURL[0] + "........")
                         self.downloadAll()
                         self.updateQueueURL()
                         self.logfile.write('\n>>>Depth ' + str(depth) + ':\n')
                         # Log the URLs queued for the next round, numbered after
                         # the pages downloaded so far.
                         for offset, queued in enumerate(g_queueURL):
                             self.logfile.write(
                                 str(g_totalcount + offset) + '->' + queued + '\n')
                 finally:
                     # Make sure the log reaches disk even if a round raises.
                     self.logfile.flush()

             def downloadAll(self):
                 """Download every queued URL in batches of ``threadnum`` threads.

                 Each batch is joined (30 s per thread) before the next one starts,
                 so ``threadnum`` bounds the number of threads started together.
                 The queue is emptied afterwards.
                 """
                 global g_queueURL
                 global g_totalcount
                 # A non-positive thread count would never advance through the queue.
                 batch_size = max(1, self.threadnum)
                 pending = list(g_queueURL)
                 for start in range(0, len(pending), batch_size):
                     self.threadpool = []
                     for slot, target in enumerate(pending[start:start + batch_size]):
                         g_totalcount += 1
                         threadresult = self.download(
                             target, str(g_totalcount) + '.html', slot)
                         if threadresult is not None:
                             print('Thread started: %d --File number = %d'
                                   % (start + slot, g_totalcount))
                     for thread in self.threadpool:
                         thread.join(30)
                 # Drop references to finished workers so the pool does not grow.
                 self.threadpool = []
                 g_queueURL = []

             def download(self, url, filename, tid):
                 """Start a ``CrawlerThread`` for *url* and return the started thread."""
                 crawthread = CrawlerThread(url, filename, tid)
                 self.threadpool.append(crawthread)
                 crawthread.start()
                 return crawthread

             def updateQueueURL(self):
                 """Rebuild ``g_queueURL`` from links in ``g_pages`` not yet attempted."""
                 global g_queueURL
                 global g_existURL
                 # join() above uses a timeout, so a slow worker may still be
                 # appending; take the snapshots under the shared lock.
                 g_mutex.acquire()
                 try:
                     pages = list(g_pages)
                     seen = set(g_existURL)
                 finally:
                     g_mutex.release()
                 newUrlList = []
                 for content in pages:
                     newUrlList += self.getUrl(content)
                 g_queueURL = list(set(newUrlList) - seen)

             def getUrl(self, content):
                 """Return every double-quoted ``http://`` URL found in *content*.

                 The match is non-greedy and uses DOTALL, so a URL runs up to the
                 next double quote even across line breaks.
                 """
                 regob = re.compile(r'"(http://.+?)"', re.DOTALL)
                 return regob.findall(content)
        
         class CrawlerThread(threading.Thread):
             """Worker thread that downloads one URL and saves it to a file.

             The outcome is recorded in the module-level lists ``g_pages``,
             ``g_existURL`` and ``g_failedURL`` while holding ``g_mutex``.
             """

             def __init__(self, url, filename, tid):
                 """Remember the target *url*, the output *filename* and the slot *tid*."""
                 threading.Thread.__init__(self)
                 self.url = url
                 self.filename = filename
                 self.tid = tid

             def run(self):
                 """Fetch ``self.url``, write the body to ``self.filename``, record the result.

                 On any failure the URL is added to both ``g_existURL`` and
                 ``g_failedURL`` and the error is printed; on success the page
                 body is added to ``g_pages`` and the URL to ``g_existURL``.
                 """
                 try:
                     page = urllib.urlopen(self.url)
                     try:
                         html = page.read()
                     finally:
                         page.close()
                     # Binary mode keeps the downloaded bytes unchanged on every
                     # platform; the with-block closes the file even if the
                     # write fails.
                     with open(self.filename, 'wb') as fout:
                         fout.write(html)
                 except Exception as e:
                     with g_mutex:
                         g_existURL.append(self.url)
                         g_failedURL.append(self.url)
                     print('Failed downloading and saving %s' % self.url)
                     print(e)
                     return None
                 with g_mutex:
                     g_pages.append(html)
                     g_existURL.append(self.url)
        
         if __name__ == "__main__":
             # Interactive entry point: ask for the seed URL and the thread count,
             # then run the crawl to completion.
             # Stray whitespace around the typed URL would make the first request fail.
             url = raw_input("请输入url入口:\n").strip()
             # A thread count below 1 cannot make progress, so clamp it to 1.
             threadnum = max(1, int(raw_input("设置线程数:")))
             crawlername = "小小爬虫"
             crawler = Crawler(crawlername, url, threadnum)
             crawler.craw()