# Multithreaded crawler demo (threads / queue-based task distribution)
import requests
import threading
import json
from queue import Queue
import time
## Subclass threading.Thread to implement a crawler worker.
class Thread_crawl(threading.Thread):
    """Worker thread that pulls page numbers from a shared queue and crawls
    the Tencent careers search API (Python keyword) for each page,
    appending the results to job.txt.
    """

    def __init__(self, name, page_queue):
        """
        :param name: thread label used in log output
        :param page_queue: queue.Queue of page indices to crawl
        """
        threading.Thread.__init__(self)
        # Shared task queue holding the page numbers still to be crawled.
        self.page_queue = page_queue
        self.name = name

    def run(self):
        # Keep taking pages until the queue is drained.
        # NOTE(review): empty()-then-get() is racy in general; it is safe here
        # only because all pages are enqueued before any worker starts.
        while True:
            if self.page_queue.empty():
                break
            else:
                print(self.name, '将要从队列中去任务')
                page = self.page_queue.get()
                print(self.name, '取出的任务是:', page)
                url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1557294422185&keyword=Python&pageIndex={}&pageSize=10&language=zh-cn&area=cn'.format(page)
                self.get_content(url=url)
                print(self.name, '完成任务的页码是:', page)

    def get_content(self, url):
        """Fetch one API page and hand the response to get_data."""
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'
        }
        response = requests.get(url=url, headers=headers)
        self.get_data(response)

    def get_data(self, response):
        """Parse the API JSON payload and append name/country pairs to job.txt.

        :param response: requests.Response from get_content
        """
        # BUG FIX: json.loads() needs str/bytes, not a Response object —
        # the original json.loads(response) raised TypeError.
        data = json.loads(response.text)
        # List of job postings in the API payload.
        data_list = data['Data']['Posts']
        for post in data_list:
            # Job title.
            name = post['RecruitPostName']
            # BUG FIX: original key 'conutryname' is a typo; the API field is
            # 'CountryName' — TODO confirm against a live API response.
            countryname = post['CountryName']
            infor = 'name:' + name + '--' + 'countryname:' + countryname
            with open('job.txt', 'a', encoding='utf-8') as fp:
                # BUG FIX: add a newline so records do not run together.
                fp.write(infor + '\n')
if __name__ == '__main__':
    # BUG FIX: record the start time so the elapsed time can be reported
    # (the original comment promised this but never stored it).
    t_start = time.time()
    # Build the task queue of page numbers 1..60.
    page_queue = Queue()
    for page in range(1, 61):
        page_queue.put(page)
    # Spawn three named crawler threads that share the queue.
    crawl_name = ['c1', 'c2', 'c3']
    crawl_tread = []  # keep references so the main thread can join them all
    for name in crawl_name:
        crawl = Thread_crawl(name, page_queue)
        crawl.start()
        crawl_tread.append(crawl)
    # Block the main thread until every worker has drained the queue.
    for thread in crawl_tread:
        thread.join()
    t_end = time.time()
    print(t_end)
    # BUG FIX: the original print passed no value after the label.
    print('完成的时间:', t_end - t_start)
# 爬虫多线程 (multithreaded crawler) — stray blog title pasted in; commented out so the file parses
# 最新推荐文章于 2024-05-02 20:05:43 发布 — blog publish-timestamp residue, not code