import datetime
import time
import pymysql
import requests
import mysqlConcent
import os.path
import re
from scrapy import Selector
from PIL import Image
from threading import Thread
productlist = []      # listing-page URLs waiting to be crawled
productdetail = []    # detail-page tasks waiting to be processed
pics = []             # image paths waiting to be downloaded
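# These lists act as simple shared work queues between the threads defined below;
# list.append() and list.pop() are each atomic under CPython's GIL, so this works in
# practice, but the standard library's queue.Queue would be the more robust choice
# for this producer/consumer setup.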
url = 'http://www.ddddd.com/'
mysqldb = mysqlConcent.MysqlLocal()
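# mysqlConcent is the author's own helper module and is not shown here. Judging from how
# it is used below, MysqlLocal is assumed to expose getSysTime(), returning a timestamp
# string, and addOneDataToTable(table_name, data, flags=...), which inserts a dict as one
# row and returns the new row id. A sketch of such a module appears at the end of this listing.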
class GetProductList(Thread):
    def run(self):
        while True:
            try:
                url = productlist.pop()
                # time.sleep(0.5)
            except IndexError:
                # time.sleep(0.5)
                continue
            # category id taken from the listing URL
            classify_id = re.search(r'\d+', url).group()
            list_info = requests.get(url).text
            sel = Selector(text=list_info)
            getgooditem = sel.xpath("//div[@class='goodsItem']").extract()
            if getgooditem:
                for j in getgooditem:
                    sell = Selector(text=j)
                    urls = sell.xpath("//a/@href").extract()[0:1]
                    title = sell.xpath("//div/p/a/@title").extract()[0]
                    face_img = sell.css('img::attr(src)').extract()[0]
                    idstr = sell.xpath("//div/a/@href").extract()
                    product_id = re.search(r'\d+', idstr[0]).group()
                    price_str = sell.xpath("//div/font/text()").extract()
                    price = re.search('[0-9]{1,}', price_str[0]).group()
                    detail_url = url + urls[0]
                    param = {}
                    param['id'] = product_id
                    param['title'] = title
                    param['price'] = price
                    param['face_pic'] = face_img
                    param['classify_id'] = classify_id
                    param['status'] = 'Y'
                    param['date_create'] = mysqldb.getSysTime()
                    # queue the cover image for local download
                    pics.append(face_img)
                    # insert the scraped record into the product table
                    flg = mysqldb.addOneDataToTable(table_name='product', data=param, flags='1')
                    if flg:
                        print("Insert succeeded, new row ID: {}".format(flg))
                        # hand the detail page over to the SaveProductDetail thread
                        detail = {}
                        detail['url'] = detail_url
                        detail['product_id'] = classify_id
                        productdetail.append(detail)
                        continue
            # fetch the next page of the listing
            try:
                pagebar = sel.css(".pagebar .next::attr('href')").extract()
                if pagebar:
                    productlist.append(url + pagebar[0])
            except Exception as e:
                print('Failed to fetch the next page: {}'.format(e))
class SaveImageToLocal(Thread):
    def run(self):
        while True:
            try:
                img_url = pics.pop()
                if os.path.exists(img_url):
                    print('Target file already exists: %s' % img_url)
                    continue
                # time.sleep(0.5)
            except Exception:
                # time.sleep(0.5)
                continue
            picUrl = url + img_url
            # download the image content
            res = requests.get(picUrl).content
            # directory part of the image path
            getPicDir = os.path.dirname(img_url)
            # create the target directory if it does not exist yet
            if getPicDir and not os.path.isdir(getPicDir):
                os.makedirs(getPicDir)
            # write the downloaded bytes to the local path
            print('Creating file %s' % img_url)
            with open(img_url, 'wb') as fp:
                fp.write(res)
class SaveProductDetail(Thread):
    def run(self):
        while True:
            try:
                param = productdetail.pop()
                # time.sleep(0.5)
            except Exception:
                # time.sleep(0.5)
                continue
            url = param['url']
            product_id = param['product_id']
            print('product url: %s, id: %s' % (url, product_id))
            detail = requests.get(url).text
            sel = Selector(text=detail)
            btnul = sel.css(".bnt_ul").extract()[0]
            # flatten the <label> texts into one string for colour/size parsing
            aaa = sel.xpath("//label/text()").extract()
            bbb = str(aaa).replace(' ', '').replace('\\n', '').strip('[\n]')
            color = ''
            size = ''
            try:
                # extract the colour (first run of Chinese characters)
                if bbb:
                    color = re.findall('[\u4e00-\u9fa5]{1,}', bbb)[0]
                # extract the size
                if 'CM' in bbb or 'cm' in bbb:
                    # size = re.findall('\d+[a-zA-Z]{1,}', bbb)[0]\*\d+\S+[a-zA-Z]{1,}\S
                    size = re.findall(r'(\d+[a-zA-Z]{1,}|\d+\*\d+\S+[a-zA-Z]{1,}\S|\d+\/\d+\S+\))', bbb)[0]
            except Exception:
                print(url)
            # detail ID taken from the URL
            product_detail_id = re.search(r'\d+', url).group()
            product_detail = sel.xpath("//div[@id='com_h']/blockquote/p").extract()
            product_detail_imgs = sel.xpath("//div[@id='com_h']/blockquote/p//img/@src").extract()
            content = ''.join(product_detail)
            # queue the detail images for local download
            if product_detail_imgs:
                for img in product_detail_imgs:
                    detail_img = img
                    if img[0] == '/':
                        detail_img = str(img).strip('/')
                    print(detail_img)
                    pics.append(detail_img)
            # for testing
            # continue
            param = {}
            param['product_id'] = product_id
            param['product_detail_id'] = product_detail_id
            param['size'] = size
            param['color'] = color
            param['content'] = content.replace('\'', '\"')
            param['date_create'] = mysqldb.getSysTime()
            new_id = mysqldb.addOneDataToTable(table_name='product_detail', data=param)
            print(new_id)
if __name__ == '__main__':
    result = requests.get(url)
    sel = Selector(text=result.text)
    # category links taken from the site menu
    tag = sel.xpath("//div[@class='menu']/a/@href").extract()[1:7]
    for i in tag:
        productlist.append(url + i)
    # thread that crawls the product listing pages
    t1 = GetProductList()
    # thread that downloads product images
    t2 = SaveImageToLocal()
    # thread that crawls and stores product details
    t3 = SaveProductDetail()
    t1.start()
    t2.start()
    t3.start()
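
# ------------------------------------------------------------------------------
# The script above imports a small helper module, mysqlConcent, which is not part
# of the listing. Below is a minimal sketch of what such a module might look like,
# assuming pymysql and a local MySQL database. The connection parameters, database
# name and table layout are illustrative assumptions, not the original code.
# ------------------------------------------------------------------------------

# mysqlConcent.py (sketch)
import datetime

import pymysql


class MysqlLocal:
    def __init__(self):
        # assumed connection settings; adjust to the real environment
        self.conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                                    database='shop', charset='utf8mb4')

    def getSysTime(self):
        # current time as a string, used for the date_create columns above
        return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def addOneDataToTable(self, table_name, data, flags=None):
        # build an INSERT from the dict keys/values and return the new row id;
        # the flags argument is accepted only to match how the scraper calls it
        keys = ', '.join('`{}`'.format(k) for k in data)
        placeholders = ', '.join(['%s'] * len(data))
        sql = 'INSERT INTO `{}` ({}) VALUES ({})'.format(table_name, keys, placeholders)
        with self.conn.cursor() as cursor:
            cursor.execute(sql, list(data.values()))
            new_id = cursor.lastrowid
        self.conn.commit()
        return new_id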