import datetime
import time
import pymysql
import requests
import mysqlConcent
import os.path
import re
from scrapy import Selector
from PIL import Image
from threading import Thread
productlist = []      # listing-page URLs waiting to be crawled
productdetail = []    # detail-page tasks waiting to be processed
pics = []             # image paths waiting to be downloaded
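# These lists act as simple shared work queues between the threads defined below;
# list.append() and list.pop() are each atomic under CPython's GIL, so this works in
# practice, but the standard library's queue.Queue would be the more robust choice
# for this producer/consumer setup.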
url = 'http://www.ddddd.com/'
mysqldb = mysqlConcent.MysqlLocal()
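# mysqlConcent is the author's own helper module and is not shown here. Judging from how
# it is used below, MysqlLocal is assumed to expose getSysTime(), returning a timestamp
# string, and addOneDataToTable(table_name, data, flags=...), which inserts a dict as one
# row and returns the new row id. A sketch of such a module appears at the end of this listing.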
class GetProductList(Thread):
    def run(self):
        while True:
            try:
                url = productlist.pop()
                # time.sleep(0.5)
            except IndexError:
                # time.sleep(0.5)
                continue
            # category id taken from the listing URL
            classify_id = re.search(r'\d+', url).group()
            list_info = requests.get(url).text
            sel = Selector(text=list_info)
            getgooditem = sel.xpath("//div[@class='goodsItem']").extract()
            if getgooditem:
                for j in getgooditem:
                    sell = Selector(text=j)
                    urls = sell.xpath("//a/@href").extract()[0:1]
                    title = sell.xpath("//div/p/a/@title").extract()[0]
                    face_img = sell.css('img::attr(src)').extract()[0]
                    idstr = sell.xpath("//div/a/@href").extract()
                    product_id = re.search(r'\d+', idstr[0]).group()
                    price_str = sell.xpath("//div/font/text()").extract()
                    price = re.search('[0-9]{1,}', price_str[0]).group()
                    detail_url = url + urls[0]
                    param = {}
                    param['id'] = product_id
                    param['title'] = title
                    param['price'] = price
                    param['face_pic'] = face_img
                    param['classify_id'] = classify_id
                    param['status'] = 'Y'
                    param['date_create'] = mysqldb.getSysTime()
                    # queue the cover image for local download
                    pics.append(face_img)
                    # insert the scraped record into the product table
                    flg = mysqldb.addOneDataToTable(table_name='product', data=param, flags='1')
                    if flg:
                        print("Insert succeeded, new row ID: {}".format(flg))
                        # hand the detail page over to the SaveProductDetail thread
                        detail = {}
                        detail['url'] = detail_url
                        detail['product_id'] = classify_id
                        productdetail.append(detail)
                        continue
            # fetch the next page of the listing
            try:
                pagebar = sel.css(".pagebar .next::attr('href')").extract()
                if pagebar:
                    productlist.append(url + pagebar[0])
            except Exception as e:
                print('Failed to fetch the next page: {}'.format(e))
class SaveImageToLocal(Thread):
    def run(self):
        while True:
            try:
                img_url = pics.pop()
                if os.path.exists(img_url):
                    print('Target file already exists: %s' % img_url)
                    continue
                # time.sleep(0.5)
            except Exception:
                # time.sleep(0.5)
                continue
            picUrl = url + img_url
            # download the image content
            res = requests.get(picUrl).content
            # directory part of the image path
            getPicDir = os.path.dirname(img_url)
            # create the target directory if it does not exist yet
            if getPicDir and not os.path.isdir(getPicDir):
                os.makedirs(getPicDir)
            # write the downloaded bytes to the local path
            print('Creating file %s' % img_url)
            with open(img_url, 'wb') as fp:
                fp.write(res)
class SaveProductDetail(Thread):
    def run(self):
        while True:
            try:
                param = productdetail.pop()
                # time.sleep(0.5)
            except Exception:
                # time.sleep(0.5)
                continue
            url = param['url']
            product_id = param['product_id']
            print('product url: %s, id: %s' % (url, product_id))
            detail = requests.get(url).text
            sel = Selector(text=detail)
            btnul = sel.css(".bnt_ul").extract()[0]
            # flatten the <label> texts into one string for colour/size parsing
            aaa = sel.xpath("//label/text()").extract()
            bbb = str(aaa).replace(' ', '').replace('\\n', '').strip('[\n]')
            color = ''
            size = ''
            try:
                # extract the colour (first run of Chinese characters)
                if bbb:
                    color = re.findall('[\u4e00-\u9fa5]{1,}', bbb)[0]
                # extract the size
                if 'CM' in bbb or 'cm' in bbb:
                    # size = re.findall('\d+[a-zA-Z]{1,}', bbb)[0]\*\d+\S+[a-zA-Z]{1,}\S
                    size = re.findall(r'(\d+[a-zA-Z]{1,}|\d+\*\d+\S+[a-zA-Z]{1,}\S|\d+\/\d+\S+\))', bbb)[0]
            except Exception:
                print(url)
            # detail ID taken from the URL
            product_detail_id = re.search(r'\d+', url).group()
            product_detail = sel.xpath("//div[@id='com_h']/blockquote/p").extract()
            product_detail_imgs = sel.xpath("//div[@id='com_h']/blockquote/p//img/@src").extract()
            content = ''.join(product_detail)
            # queue the detail images for local download
            if product_detail_imgs:
                for img in product_detail_imgs:
                    detail_img = img
                    if img[0] == '/':
                        detail_img = str(img).strip('/')
                    print(detail_img)
                    pics.append(detail_img)
            # for testing
            # continue
            param = {}
            param['product_id'] = product_id
            param['product_detail_id'] = product_detail_id
            param['size'] = size
            param['color'] = color
            param['content'] = content.replace('\'', '\"')
            param['date_create'] = mysqldb.getSysTime()
            new_id = mysqldb.addOneDataToTable(table_name='product_detail', data=param)
            print(new_id)
if __name__ == '__main__':
    result = requests.get(url)
    sel = Selector(text=result.text)
    # category links taken from the site menu
    tag = sel.xpath("//div[@class='menu']/a/@href").extract()[1:7]
    for i in tag:
        productlist.append(url + i)
    # thread that crawls the product listing pages
    t1 = GetProductList()
    # thread that downloads product images
    t2 = SaveImageToLocal()
    # thread that crawls and stores product details
    t3 = SaveProductDetail()
    t1.start()
    t2.start()
    t3.start()
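
# ------------------------------------------------------------------------------
# The script above imports a small helper module, mysqlConcent, which is not part
# of the listing. Below is a minimal sketch of what such a module might look like,
# assuming pymysql and a local MySQL database. The connection parameters, database
# name and table layout are illustrative assumptions, not the original code.
# ------------------------------------------------------------------------------

# mysqlConcent.py (sketch)
import datetime

import pymysql


class MysqlLocal:
    def __init__(self):
        # assumed connection settings; adjust to the real environment
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                    database='shop', charset='utf8mb4')

    def getSysTime(self):
        # current time as a string, used for the date_create columns above
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def addOneDataToTable(self, table_name, data, flags=None):
        # build an INSERT from the dict keys/values and return the new row id;
        # the flags argument is accepted only to match how the scraper calls it
        keys = ', '.join('`{}`'.format(k) for k in data)
        placeholders = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO `{}` ({}) VALUES ({})'.format(table_name, keys, placeholders)
        with self.conn.cursor() as cursor:
            cursor.execute(sql, list(data.values()))
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id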