Python breadth-first multithreaded crawler

A breadth-first, multithreaded crawler: each round fetches a batch of URLs concurrently, records which ones succeeded, and feeds the links found on those pages back into the frontier for the next round.

```python
# coding: utf-8
import time
from threading import Thread, RLock

import requests
from bs4 import BeautifulSoup


class Screap:
    """Breadth-first crawler: each round downloads a batch of URLs in
    parallel worker threads and queues the links found on those pages."""

    def __init__(self, urlList, targetnum):
        self.num = len(urlList)      # how many URLs to fetch per round
        self.lock = RLock()          # guards the shared URL containers
        self.header = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/53.0.2785.104 Safari/537.36 "
                       "Core/1.53.3427.400 QQBrowser/9.6.12513.400"}
        self.taskurlList = urlList   # frontier: URLs waiting to be fetched
        self.threads = []
        self.okurl = set()           # URLs fetched without an exception
        self.visitedUrl = []         # URLs already attempted
        self._targetNum = targetnum  # stop once this many URLs are collected
        self._totalnum = 0

    def now(self):
        print(time.strftime("%H:%M:%S", time.localtime()))

    def checkThread(self):
        # Poll until every worker thread of the current round has finished.
        while any(td.is_alive() for td in self.threads):
            time.sleep(1)

    def craw(self):
        i = 0
        while i < self._targetNum:
            self.now()
            # Never index past the end of the frontier.
            batch = min(self.num, len(self.taskurlList))
            if batch == 0:
                print("frontier exhausted, stopping")
                break
            for j in range(batch):
                threadson = Thread(target=self.download,
                                   args=(self.taskurlList[j], self.header, self.lock))
                threadson.start()
                self.threads.append(threadson)
            for son in self.threads:
                son.join(2)
            self.checkThread()
            # Grow the batch size each round; reset it once it reaches 100.
            self.num += 1
            if self.num >= 100:
                self.num = 20
            self.threads = []
            # Drop every URL that has already been attempted.
            self.taskurlList = list(set(self.taskurlList) - set(self.visitedUrl))
            self._totalnum = 0
            self.writeUrl(self.okurl)
            i = self.checkUrl()
            print("total url:%s" % len(self.taskurlList))
            print("total visitedurl:%s" % len(self.visitedUrl))
            print("collect url:%s" % i)

    def writeUrl(self, okurl):
        # Rewrite the result file with every URL fetched so far.
        with open("a.txt", "w") as f:
            for url in okurl:
                f.write(url.strip() + "\n")

    def checkUrl(self):
        # Count the lines of the result file, i.e. the URLs collected so far.
        with open("a.txt") as f:
            for _ in f:
                self._totalnum += 1
        return self._totalnum

    def download(self, url, header, lock):
        try:
            response = requests.get(url, headers=header, timeout=20)
        except requests.RequestException:
            with lock:
                self.visitedUrl.append(url)
            return
        with lock:
            self.visitedUrl.append(url)
            self.okurl.add(url)
        if response.status_code == 200:
            # An explicit parser avoids BeautifulSoup's "no parser" warning.
            soup = BeautifulSoup(response.content, "html.parser")
            allTag = soup.find_all(self.has_href)
            with lock:
                for tag in allTag:
                    hrefvalue = tag.get("href")
                    if hrefvalue and "http" in hrefvalue and "www" in hrefvalue:
                        self.taskurlList.append(hrefvalue)

    def has_href(self, tag):
        return tag.has_attr("href")


if __name__ == '__main__':
    targetUrl = ["http://www.sina.com", "http://www.sohu.com",
                 "https://daohang.qq.com/", "http://www.163.com"]
    a = Screap(targetUrl, 100000)
    a.craw()
```
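The original script imports `concurrent.futures.ThreadPoolExecutor` but never uses it. Below is a minimal sketch of how one round's fan-out could be handed to a pool instead of hand-managed `Thread` objects; the helper name `craw_round`, the pool size of 20, and the reuse of `download()` unchanged are assumptions for illustration, not part of the script above.

```python
# Hypothetical sketch: replace the per-round Thread start/join/checkThread
# bookkeeping with the ThreadPoolExecutor the original script imports but
# never uses. The helper name and max_workers=20 are assumptions.
from concurrent.futures import ThreadPoolExecutor


def craw_round(crawler, batch_urls):
    # Each download() call still records visited/ok URLs and extends the
    # frontier under crawler.lock, exactly as in the threaded version.
    with ThreadPoolExecutor(max_workers=20) as pool:
        for url in batch_urls:
            pool.submit(crawler.download, url, crawler.header, crawler.lock)
    # Leaving the `with` block waits for every submitted download to finish,
    # so no separate join/poll step is needed.
```

A pool also caps the number of live threads per round, whereas the version above starts one thread per URL, up to 100 at a time.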