Thread模块
threading 模块是 Python 标准库中的多线程模块
考拉海购小爬虫
import json
import re
from queue import Empty, Queue
from threading import Thread
from urllib.parse import urljoin

import requests
from lxml import etree
# 1、获取所有分类列表页
# 2、解析列表页,获取详情页url
# 3、解析详情页,获取信息
class ThreadDetailUrl(Thread):
    """Worker thread that turns category list pages into detail-page URLs.

    Takes list-page URLs from the module-level ``queue_all_urls`` queue,
    downloads each page with the module-level ``headers`` and puts every
    product link found on it into the module-level ``queue_detail_page_url``
    queue.
    """

    # Seconds to wait for a list page before giving up on it.
    request_timeout = 10

    @staticmethod
    def _absolute_url(page_url, href):
        """Resolve *href* found on *page_url* into an absolute URL.

        Protocol-relative links (``//host/path``) inherit the scheme of
        *page_url*; links that are already absolute are returned unchanged.
        """
        return urljoin(page_url, href)

    def run(self):
        """Process list pages until ``queue_all_urls`` is empty.

        The queue is read without blocking, so it must be filled before the
        thread is started. A falsy item is treated as an explicit stop
        signal. A list page that cannot be downloaded is reported and
        skipped instead of terminating the thread.
        """
        while True:
            try:
                # A blocking get() on an empty queue never returns, which
                # would keep this thread (and any join() on it) alive forever.
                page_url = queue_all_urls.get_nowait()
            except Empty:
                break
            if not page_url:
                break

            try:
                response = requests.get(
                    url=page_url,
                    headers=headers,
                    timeout=self.request_timeout,
                )
            except requests.RequestException as exc:
                print('failed to fetch list page %s: %s' % (page_url, exc))
                continue

            content = etree.HTML(response.text)
            hrefs = content.xpath('''//li[@class='goods']//a[@class='title']//@href''')
            for href in hrefs:
                queue_detail_page_url.put(self._absolute_url(page_url, href))
class ThreadDetailInfo(Thread):
    """Worker thread that downloads product detail pages and prints titles.

    Takes detail-page URLs from the module-level ``queue_detail_page_url``
    queue, downloads each page with the module-level ``headers`` and prints
    the list of text nodes found under the ``product-title`` ``<dt>``
    element.
    """

    # Seconds to wait for a detail page before giving up on it.
    request_timeout = 10
    # Seconds to wait for a new URL before assuming no more will arrive.
    idle_timeout = 30

    def run(self):
        """Process detail pages until the queue stays empty.

        The thread stops when no URL arrives within ``idle_timeout`` seconds
        or when a falsy item is received. A detail page that cannot be
        downloaded is reported and skipped instead of terminating the thread.
        """
        while True:
            try:
                # An unbounded get() would block forever once the URLs run
                # out, so join() on this thread could never return.
                detail_url = queue_detail_page_url.get(timeout=self.idle_timeout)
            except Empty:
                break
            if not detail_url:
                break

            try:
                response = requests.get(
                    url=detail_url,
                    headers=headers,
                    timeout=self.request_timeout,
                )
            except requests.RequestException as exc:
                print('failed to fetch detail page %s: %s' % (detail_url, exc))
                continue

            content = etree.HTML(response.text)
            title = content.xpath('''//dt[@class='product-title']//text()''')
            print(title)
def main():
    """Seed ``queue_all_urls`` with one list-page URL per product category.

    Downloads the front-category API response using the module-level
    ``headers``, extracts every ``categoryId`` value from the response text
    and puts the matching category page URL into the module-level
    ``queue_all_urls`` queue.

    Raises:
        requests.RequestException: if the category API cannot be reached
            within the timeout or answers with an HTTP error status.
    """
    api_url = 'https://search.kaola.com/api/getFrontCategory.html'
    # Without a timeout a stalled connection would block the script forever.
    response = requests.get(url=api_url, headers=headers, timeout=10)
    # Fail loudly on an error page instead of silently finding no categories.
    response.raise_for_status()

    category_ids = re.findall(r'[\"\']categoryId[\"\']\:(\d+)\,', response.text)
    for category_id in category_ids:
        queue_all_urls.put('https://search.kaola.com/category/%s.html' % category_id)
if __name__ == '__main__':
    # Shared state that main() and both worker classes read as module globals.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    }
    queue_detail_page_url = Queue()
    queue_all_urls = Queue()

    # Fill the list-page queue before any worker is started.
    main()

    # One thread per stage: list pages -> detail URLs -> printed titles.
    t1, t2 = ThreadDetailUrl(), ThreadDetailInfo()
    for worker in (t1, t2):
        worker.start()
    for worker in (t1, t2):
        worker.join()