Python breadth-first multi-threaded crawler

This post describes a way to fetch web pages in parallel with Python. Using multiple threads and the BeautifulSoup parsing library, it crawls a given list of URLs efficiently and at scale, then does some basic processing and storage of the collected data.


#coding:utf-8

from threading import Thread, RLock
import requests
import time
from bs4 import BeautifulSoup
class Screap(Thread):
   """Breadth-style crawler: repeatedly fans batches of URLs out to worker threads."""
   def __init__(self,urlList,targetnum):
      super(Screap,self).__init__()
      self.num=len(urlList)       # size of the next batch of URLs to dispatch
      self.lock=RLock()           # guards taskurlList, which worker threads append to
      self.header={"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3427.400 QQBrowser/9.6.12513.400"}
      self.taskurlList=urlList    # frontier: URLs still waiting to be crawled
      self.threads=[]             # worker threads of the current round
      self.okurl=set()            # URLs that were fetched without raising an exception
      self.visitedUrl=[]          # URLs that have already been attempted, success or not
      self._targetNum=targetnum   # stop once this many URLs have been written to a.txt
      self._totalnum=0            # line count of a.txt, refreshed by checkUrl()
   def now(self):
      # Print the current wall-clock time so each crawl round can be timed from the console.
      print(time.strftime("%H:%M:%S",time.localtime()))
   def checkThread(self):
      # Busy-wait until every worker thread started in the current round has finished.
      while True:
         time.sleep(1)
         if all(not td.is_alive() for td in self.threads):
            break
   def craw(self):
      i=0
      while i<self._targetNum:
         self.now()
         # Fan one batch of URLs out to worker threads, never indexing past the frontier.
         batch=min(self.num,len(self.taskurlList))
         for j in range(batch):
            threadson=Thread(target=self.download,args=(self.taskurlList[j],self.header,self.lock))
            threadson.start()
            self.threads.append(threadson)
         for son in self.threads:
            son.join(2)
         self.checkThread()
         # Grow the batch size each round, cycling back to 20 once it reaches 100.
         self.num+=1
         if self.num>=100:
            self.num=20
         self.threads=[]
         # Drop URLs that were already attempted, then persist and count the results.
         self.taskurlList=list(set(self.taskurlList)-set(self.visitedUrl))
         self._totalnum=0
         self.writeUrl(self.okurl)
         i=self.checkUrl()
         print("total url:%s"%len(self.taskurlList))
         print("total visitedurl:%s"%len(self.visitedUrl))
         print("collect url:%s"%i)
   def writeUrl(self,okurl):
      # Overwrite a.txt with every URL fetched successfully so far.
      with open("a.txt","w") as f:
         for i in okurl:
            f.write(i.strip()+"\n")
   def checkUrl(self):
      # Count the lines of a.txt, i.e. how many URLs have been collected so far.
      with open("a.txt") as f:
         for line in f:
            self._totalnum+=1
      return self._totalnum
   def download(self,url,header,lock):
      # Worker body: fetch one URL, mark it visited, and harvest new links from it.
      try:
         response=requests.get(url,headers=header,timeout=20)
         self.visitedUrl.append(url)
      except Exception:
         self.visitedUrl.append(url)
      else:
         self.okurl.add(url)
         if response.status_code==200:
            context=response.content
            soup=BeautifulSoup(context,"html.parser")
            allTag=soup.find_all(self.has_href)
            # taskurlList is shared by every worker, so only append while holding the lock.
            with lock:
               for tag in allTag:
                  hrefvalue=tag.get("href",None)
                  if hrefvalue and "http" in hrefvalue and "www" in hrefvalue:
                     self.taskurlList.append(hrefvalue)
   def has_href(self,tag):
      # Filter passed to find_all(): keep only tags that carry an href attribute.
      return tag.has_attr("href")
if __name__ == '__main__':
   targetUrl=["http://www.sina.com","http://www.sohu.com","https://daohang.qq.com/",
               "http://www.163.com"]
   a=Screap(targetUrl,100000)
   a.craw()
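
For comparison, the same per-round fan-out can be written with the standard library's concurrent.futures.ThreadPoolExecutor, which takes care of the joining and polling that craw() and checkThread() do by hand. The sketch below is a minimal illustration and not part of the original script; fetch_links, crawl_round, and the shortened User-Agent string are placeholder names and values.

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup

HEADER = {"User-Agent": "Mozilla/5.0"}  # placeholder UA, not the one used above

def fetch_links(url):
    # Fetch one page and return (url, list of absolute links found on it).
    try:
        resp = requests.get(url, headers=HEADER, timeout=20)
    except Exception:
        return url, []
    if resp.status_code != 200:
        return url, []
    soup = BeautifulSoup(resp.content, "html.parser")
    return url, [a["href"] for a in soup.find_all("a", href=True)
                 if a["href"].startswith("http")]

def crawl_round(urls, max_workers=20):
    # Crawl one batch of URLs concurrently and return the newly discovered links.
    discovered = set()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(fetch_links, u) for u in urls]
        for fut in as_completed(futures):
            _, links = fut.result()
            discovered.update(links)
    return discovered

crawl_round() returns the next frontier, so a small driver loop can keep feeding its output back in until enough URLs have accumulated, much as craw() does above.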




