Python多线程编程

threading 模块

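threading 模块是 Python 标准库中的多线程模块(Python 2 中底层的 thread 模块在 Python 3 中已更名为 _thread);下面的示例通过继承 threading.Thread 并重写 run() 方法来创建线程。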

考拉海购小爬虫

from threading import Thread
from queue import Queue
from lxml import etree
import re
import json
import requests


# 1、获取所有分类列表页
# 2、解析列表页,获取详情页url
# 3、解析详情页,获取信息

class ThreadDetailUrl(Thread):
    """Worker that turns category list pages into product detail-page URLs.

    Reads list-page URLs from the module-level ``queue_all_urls`` and puts
    every product link found on each page onto the module-level
    ``queue_detail_page_url``. Requests are sent with the module-level
    ``headers``.

    The input queue is expected to be filled before ``start()`` is called:
    the worker exits as soon as the queue is empty or a falsy item (for
    example ``None``) is received.
    """

    # Seconds to wait for a list page before giving up on it.
    request_timeout = 10

    def run(self):
        """Process queued list pages until the queue is drained."""
        # Local import keeps this class self-contained with respect to the
        # file's import block, which only imports ``Queue``.
        from queue import Empty

        while True:
            try:
                # A blocking get() would wait forever once the queue is empty,
                # so the thread could never be joined.
                list_url = queue_all_urls.get_nowait()
            except Empty:
                break
            if not list_url:
                # Falsy item is the explicit stop signal.
                break

            try:
                html = requests.get(
                    url=list_url,
                    headers=headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One unreachable page should not kill the whole worker.
                print('failed to fetch %s: %s' % (list_url, exc))
                continue

            content = etree.HTML(html)
            detail_page_urls = content.xpath('''//li[@class='goods']//a[@class='title']//@href''')
            # Distinct loop variable: the original rebound ``url`` here.
            for href in detail_page_urls:
                # The scheme is prepended unconditionally, as in the original
                # code (presumably the hrefs are protocol-relative - confirm).
                queue_detail_page_url.put('https:' + href)


class ThreadDetailInfo(Thread):
    """Worker that downloads product detail pages and prints their titles.

    Consumes URLs from the module-level ``queue_detail_page_url`` and sends
    requests with the module-level ``headers``.

    The worker stops when it receives a falsy item (for example ``None``) or
    when no new URL has arrived for ``idle_timeout`` seconds.
    """

    # Seconds to wait for a new URL before assuming no more work is coming.
    # NOTE(review): this is a heuristic; raise it if the producer can be
    # slower than this between two URLs.
    idle_timeout = 30

    # Seconds to wait for a detail page before giving up on it.
    request_timeout = 10

    def run(self):
        """Fetch each queued detail page and print the extracted title nodes."""
        # Local import keeps this class self-contained with respect to the
        # file's import block, which only imports ``Queue``.
        from queue import Empty

        while True:
            try:
                # Bounded wait: an unbounded get() in an endless loop means
                # the thread can never finish and join() blocks forever.
                detail_url = queue_detail_page_url.get(timeout=self.idle_timeout)
            except Empty:
                break
            if not detail_url:
                # Falsy item is the explicit stop signal.
                break

            try:
                html = requests.get(
                    url=detail_url,
                    headers=headers,
                    timeout=self.request_timeout,
                ).text
            except requests.RequestException as exc:
                # One unreachable page should not kill the whole worker.
                print('failed to fetch %s: %s' % (detail_url, exc))
                continue

            content = etree.HTML(html)
            # xpath() returns a list of text nodes; it is printed as-is.
            title = content.xpath('''//dt[@class='product-title']//text()''')
            print(title)


def main():
    """Seed ``queue_all_urls`` with one list-page URL per category.

    Downloads the front-category API response, extracts every ``categoryId``
    value with a regular expression and enqueues the matching category page
    URL. Uses the module-level ``queue_all_urls`` and ``headers``.

    Raises:
        requests.RequestException: if the category request fails or does not
            answer within the timeout.
    """
    # No ``global`` statements are needed: both module-level names are only
    # read here, never rebound.
    api_url = 'https://search.kaola.com/api/getFrontCategory.html'

    # Timeout so a stalled connection cannot hang start-up forever.
    data = requests.get(url=api_url, headers=headers, timeout=10).text

    # The response is scanned as plain text rather than parsed as JSON.
    category_ids = re.findall(r'[\"\']categoryId[\"\']\:(\d+)\,', data)
    # ``category_id`` instead of ``id`` to avoid shadowing the builtin.
    for category_id in category_ids:
        queue_all_urls.put('https://search.kaola.com/category/%s.html' % category_id)


if __name__ == '__main__':
    # Shared state used by main() and by both worker classes.
    queue_all_urls = Queue()          # category list-page URLs
    queue_detail_page_url = Queue()   # product detail-page URLs
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3970.5 Safari/537.36',
    }

    # Fill the list-page queue before any worker starts.
    main()
    # ThreadDetailUrl.run() treats a falsy item as its stop signal, so one is
    # enqueued after the real work to let that worker finish.
    queue_all_urls.put(None)

    t1 = ThreadDetailUrl()
    t2 = ThreadDetailInfo()
    t1.start()
    t2.start()
    # Wait for both workers.
    t1.join()
    t2.join()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值