This crawler captures each product's SKU, title, price, main images, detail images, parameters, and specifications, which covers the attributes of most product listings.
1. Proxy IPs
Because the crawl volume is fairly large, a proxy IP pool is used: proxies are scraped from https://cn-proxy.com/ and stored in a database for easy reuse.
This part uses the lxml, pymysql, and requests modules.
Each proxy is verified by requesting Baidu through it before it is saved:
from lxml import etree
import pymysql
import requests

def get_new_ip():
    # Refresh the proxy table with a new batch of IPs
    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='ip',
        charset='utf8'
    )
    # Get a cursor
    cursor = connect.cursor()
    # Empty the table before inserting the new batch
    cursor.execute('TRUNCATE `proxy`')
    connect.commit()
    ip_url = 'https://cn-proxy.com/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}
    req = requests.get(ip_url, headers=header)
    req.encoding = 'utf-8'
    sku_first = etree.HTML(req.text)
    search_ip = sku_first.xpath('//table[@class="sortable"]/tbody/tr/td[1]/text()')
    search_port = sku_first.xpath('//table[@class="sortable"]/tbody/tr/td[2]/text()')
    print(search_ip)
    print(search_port)
    for each_ip, each_port in zip(search_ip, search_port):
        ip_proxy = each_ip + ':' + each_port
        print(ip_proxy)
        try:
            # Route both schemes through the proxy and fail fast on dead IPs
            requests.get('https://www.baidu.com/',
                         proxies={'http': ip_proxy, 'https': ip_proxy},
                         timeout=5)
        except requests.RequestException:
            print('Failed')
        else:
            print('OK!')
            insert_http = 'INSERT IGNORE INTO `proxy` (`ip`, `port`) VALUES (%s, %s)'
            cursor.execute(insert_http, (each_ip, each_port))
            connect.commit()

if __name__ == '__main__':
    get_new_ip()
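Part 2 below imports a helper module, get_ip_SQL, whose get_ip() reads the saved proxies back out of the table. That module is not shown in the original; here is a minimal sketch, assuming the same `ip` database and `proxy` table as above, and returning 'ip:port' strings as the main script expects:

import pymysql

def get_ip():
    # Read every stored proxy and return a list of 'ip:port' strings
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='ip',
        charset='utf8'
    )
    cursor = connect.cursor()
    cursor.execute('SELECT `ip`, `port` FROM `proxy`')
    rows = cursor.fetchall()
    cursor.close()
    connect.close()
    return ['%s:%s' % (ip, port) for ip, port in rows]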
2. Main Crawler
from lxml import etree
import requests
import time
import pymysql
import urllib.parse
import re
import random
import get_ip_SQL
import get_new_ip

# Refresh the proxy table first
get_new_ip.get_new_ip()
# Load the stored proxies as a list of 'ip:port' strings
ip_list = get_ip_SQL.get_ip()

def random_proxy():
    # Pick one proxy and route both schemes through it; with only an
    # 'http' key, HTTPS requests would bypass the proxy entirely
    pick = random.choice(ip_list)
    return {'http': pick, 'https': pick}

# keyword = input('Enter a keyword to search for: ')
# Crawl every top-level category instead
keyword_list = ['家用电器', '手机', '运营商', '数码', '电脑', '办公', '家居', '家具', '家装', '厨具', '男装', '女装', '童装', '内衣', '美妆',
                '个护清洁', '宠物', '女鞋', '箱包', '钟表', '珠宝', '男鞋', '运动', '户外', '房产', '汽车', '汽车用品', '母婴', '玩具乐器', '食品',
                '酒类', '生鲜', '特产', '艺术', '礼品鲜花', '农资绿植', '医药保健', '计生情趣', '图书', '文娱', '电子书', '机票', '酒店', '旅游',
                '生活', '理财', '众筹', '白条', '保险', '安装', '维修', '清洗保养', '工业品']
for keyword in keyword_list:
    # URL-encode the keyword (e.g. '手机' becomes '%E6%89%8B%E6%9C%BA')
    keyword_unicode = urllib.parse.quote(keyword)
    print(keyword_unicode)
    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='jd',
        charset='utf8mb4'
    )
    # Get a cursor
    cursor = connect.cursor()
    create_word = '''CREATE TABLE IF NOT EXISTS `%s` (
        `ID` bigint(255) NOT NULL AUTO_INCREMENT,
        `sku` bigint(255) UNSIGNED NOT NULL,
        `p_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
        `p_price` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
        `p_price_plus` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
        `img_list` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
        `parameter` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
        `details_img` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
        `spec_info` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
        PRIMARY KEY (`ID`) USING BTREE,
        UNIQUE INDEX `sku`(`sku`) USING BTREE
    ) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
    '''
    # Create one table per category, named after the keyword
    cursor.execute(create_word % str(keyword))
    connect.commit()
    # Rotate cookies so a single cookie does not get blocked
    # (omitted for brevity -- fill in your own cookies)
    cookie_list = ['<your cookie string here>']
    # Rotate user-agents for the same reason
    user_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36']
    img_header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': random.choice(user_list)
    }
    # The second half of each results page (30 items) is loaded dynamically,
    # so every page is fetched in two steps.
    # First half: the 30 statically rendered items
    def get_sku_first(n):
        global url_first
        global random_ip
        random_ip = random_proxy()
        print('Proxy IP:', random_ip)
        url_first = 'https://search.jd.com/Search?keyword=' + keyword_unicode + '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=' + str(2 * n - 1) + '&s=59'
        header = {
            'user-agent': random.choice(user_list),
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'cookie': cookie_list[0]
        }
        simple_header = {'user-agent': random.choice(user_list),
                         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                         'upgrade-insecure-requests': '1',
                         'cache-control': 'max-age=0',
                         'cookie': random.choice(cookie_list)}
        print(url_first)
        # Fetch the search page to get each item's skuid
        while True:
            try:
                req_first = requests.get(url_first, headers=header, proxies=random_proxy(), timeout=800)
                break
            except requests.RequestException:
                print('Connection failed, retrying')
                time.sleep(2)
                continue
        req_first.encoding = 'utf-8'
        sku_first = etree.HTML(req_first.text)
        data_first = sku_first.xpath('//li[@class="gl-item"]')
        print(data_first)
        for i in data_first:
            # Item SKU
            p_sku_first = i.xpath('@data-sku')
            # Item detail-page link
            link = 'https://item.jd.com/' + p_sku_first[0] + '.html'
            print(link)
            cursor.execute('SELECT sku from `%s`' % keyword)
            sku_t_f = cursor.fetchall()
            # Skip items already stored in the database
            if p_sku_first[0] not in str(sku_t_f):
                # Fetch the item's detail page
                while True:
                    try:
                        details = requests.get(link, headers=header, proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                details.encoding = 'GBK'
                link_data = etree.HTML(details.text)
                # Title
                details_title = link_data.xpath('//div[@class="sku-name"]/text()')
                for name in details_title:
                    if not name.strip() == '':
                        details_name = name.strip()
                # Main image links
                details_img = link_data.xpath('//ul[@class="lh"]/li/img/@src')
                save_view_img_url_list = []
                # Rewrite each thumbnail URL into its full-size counterpart
                for img_each in details_img:
                    get_img_url = 'https:' + img_each.strip()
                    # First swap n5 (thumbnail) for n1 (large)
                    img_url_replace_1 = get_img_url.replace('/n5/', '/n1/', 1)
                    img_replace_word = r'/s\d{2}x\d{2}_jfs/'
                    search = re.findall(img_replace_word, img_url_replace_1)
                    # If the URL carries a resolution segment, upscale it
                    if search:
                        save_img_url = img_url_replace_1.replace(search[0], '/s546x546_jfs/', 1)
                    else:
                        save_img_url = img_url_replace_1
                    print(save_img_url)
                    save_view_img_url_list.append(save_img_url)
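                # Illustrative example (hypothetical path): a thumbnail such as
                #   //img10.360buyimg.com/n5/s54x54_jfs/t1/100/sample.jpg
                # would come out of the loop above as
                #   https://img10.360buyimg.com/n1/s546x546_jfs/t1/100/sample.jpg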
                # Parameters
                parameter_info = link_data.xpath('//ul[@class="parameter2 p-parameter-list"]/li/text()')
                if len(parameter_info) == 0:
                    parameter_info = link_data.xpath('//ul[@class="parameter2"]/li/text()')
                # Specification and packaging
                spec_info_list = link_data.xpath('//div[@class="Ptable-item"]/dl/dl/dt/text()')
                spec_info_item = link_data.xpath('//div[@class="Ptable-item"]/dl/dl/dd[last()]/text()')
                if len(spec_info_list) == 0 and len(spec_info_item) == 0:
                    spec_info_dic = {}
                else:
                    spec_info_dic = dict(zip(spec_info_list, spec_info_item))
                # Detail images may be lazy-loaded, so try several sources:
                # first the page itself, then the dx.3.cn description endpoint
                save_details_img_url_list = []
                details_pic_1 = link_data.xpath('//div[@id="J-detail-content"]/style/text()')
                # Some items key their description on a main SKU ID, so check for one first
                skuid = re.findall(r"mainSkuId:'(.*?)'", details.text)
                # Use the main SKU ID if present, otherwise the item's own SKU
                if skuid:
                    details_link = 'https://dx.3.cn/desc/' + skuid[0] + '?cdn=2&callback=showdesc'
                else:
                    details_link = 'https://dx.3.cn/desc/' + p_sku_first[0] + '?cdn=2&callback=showdesc'
                # Read the description from dx.3.cn
                while True:
                    try:
                        goods_details_link = requests.get(details_link, headers=simple_header, proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection error, retrying in 5s')
                        time.sleep(5)
                        continue
                data = goods_details_link.text
                # Extract image URLs from the description
                info_image = r'background-image:url\((.*?)\)'
                info_image_url = re.findall(info_image, data)
                # The response embeds escaped HTML, so fall back to the
                # data-lazyload attribute, and then to the inline style block,
                # if the first pattern finds nothing
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'data-lazyload=\\"(.*?)\\"', data)
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'background-image:url\((.*?)\)', str(details_pic_1))
                for each_details_img in info_image_url:
                    each_details_img = each_details_img.strip('\\"')
                    if not each_details_img:
                        continue
                    # Make the URL absolute and normalize escaped slashes
                    if 'http' not in each_details_img:
                        each_details_img = 'https:' + each_details_img
                    if 'http:////' in each_details_img:
                        each_details_img = each_details_img.replace('http:////', 'https://')
                    print('Detail image --->', each_details_img)
                    save_details_img_url_list.append(each_details_img)
                # Prices on some pages are also lazy-loaded; the p.3.cn endpoint
                # (spotted in the browser's F12 Sources panel) returns them directly
                price_url = 'https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_' + p_sku_first[0]
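                # The endpoint answers with JSONP; judging from the regexes below,
                # the payload is roughly (illustrative, not a captured response):
                #   jQuery6775278([{"id":"J_123456","p":"59.90","tpp":"56.90", ...}])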
                while True:
                    try:
                        price_page = requests.get(price_url, headers=simple_header, proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                price_page.encoding = 'utf8'
                price = '"p":"(.*?)"'
                plus_price = '"tpp":"(.*?)"'
                price_data = re.findall(price, price_page.text)
                plus_price_data = re.findall(plus_price, price_page.text)
                # Keep the price only if present; the PLUS price is optional
                if len(price_data) == 0:
                    price_data = ''
                else:
                    price_data = price_data[0]
                if len(plus_price_data) == 0:
                    plus_price_data = ''
                else:
                    plus_price_data = plus_price_data[0]
                print('Price:', price_data)
                print('PLUS price:', plus_price_data)
                insert_first = '''INSERT IGNORE INTO `%s` (`sku`, `p_title`, `p_price`, `p_price_plus`, `img_list`, `parameter`, `details_img`, `spec_info`) VALUES ( "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")'''
                # Some parameter values contain single quotes, which would break
                # the SQL statement, hence pymysql.escape_string()
                preview_data = (str(keyword),
                                p_sku_first[0],
                                pymysql.escape_string(details_name),
                                price_data,
                                plus_price_data,
                                pymysql.escape_string(str(save_view_img_url_list)),
                                pymysql.escape_string(str(parameter_info)),
                                pymysql.escape_string(str(save_details_img_url_list)),
                                pymysql.escape_string(str(spec_info_dic))
                                )
                cursor.execute(insert_first % preview_data)
                connect.commit()
            else:
                print('Item already in the database')
    # Second half of the page: 30 dynamically loaded items, fetched the same way
    def get_sku_end(n):
        log_id = '%.5f' % time.time()
        url_end = 'https://search.jd.com/s_new.php?keyword=' + keyword_unicode + '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=' + str(2 * n) + '&s=' + str(48 * n - 20) + '&scrolling=y&log_id=' + str(log_id)
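        # Offsets the author observed for the lazily loaded half: for n=1 this
        # requests page=2 with s=28, paired with get_sku_first's page=1 request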
        header = {
            'user-agent': random.choice(user_list),
            'accept-language': 'zh-CN,zh;q=0.9',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'x-requested-with': 'XMLHttpRequest',
            'referer': url_first,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'cookie': cookie_list[0]
        }
        simple_header = {'user-agent': random.choice(user_list),
                         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                         'upgrade-insecure-requests': '1',
                         'cache-control': 'max-age=0',
                         'cookie': random.choice(cookie_list)}
        while True:
            try:
                req_end = requests.get(url_end, headers=header, proxies=random_proxy(), timeout=800)
                break
            except requests.RequestException:
                print('Connection failed, retrying')
                time.sleep(2)
                continue
        req_end.encoding = 'utf-8'
        sku_end = etree.HTML(req_end.text)
        data_end = sku_end.xpath('//li[@class="gl-item"]')
        for i in data_end:
            # Item SKU
            p_sku_end = i.xpath('@data-sku')
            # Item detail-page link
            link = 'https://item.jd.com/' + p_sku_end[0] + '.html'
            print(link)
            cursor.execute('SELECT sku from `%s`' % keyword)
            sku_t_f = cursor.fetchall()
            # Skip items already stored in the database
            if p_sku_end[0] not in str(sku_t_f):
                # Fetch the item's detail page
                while True:
                    try:
                        details = requests.get(link, headers=header, proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                details.encoding = 'GBK'
                link_data = etree.HTML(details.text)
                # Title
                details_title = link_data.xpath('//div[@class="sku-name"]/text()')
                for name in details_title:
                    if not name.strip() == '':
                        details_name = name.strip()
                # Main image links
                details_img = link_data.xpath('//ul[@class="lh"]/li/img/@src')
                save_view_img_url_list = []
                # Rewrite each thumbnail URL into its full-size counterpart
                for img_each in details_img:
                    get_img_url = 'https:' + img_each.strip()
                    # First swap n5 (thumbnail) for n1 (large)
                    img_url_replace_1 = get_img_url.replace('/n5/', '/n1/', 1)
                    img_replace_word = r'/s\d{2}x\d{2}_jfs/'
                    search = re.findall(img_replace_word, img_url_replace_1)
                    # If the URL carries a resolution segment, upscale it
                    if search:
                        save_img_url = img_url_replace_1.replace(search[0], '/s546x546_jfs/', 1)
                    else:
                        save_img_url = img_url_replace_1
                    print(save_img_url)
                    save_view_img_url_list.append(save_img_url)
                # Parameters
                parameter_info = link_data.xpath('//ul[@class="parameter2 p-parameter-list"]/li/text()')
                if len(parameter_info) == 0:
                    parameter_info = link_data.xpath('//ul[@class="parameter2"]/li/text()')
                # Specification and packaging
                spec_info_list = link_data.xpath('//div[@class="Ptable-item"]/dl/dl/dt/text()')
                spec_info_item = link_data.xpath('//div[@class="Ptable-item"]/dl/dl/dd[last()]/text()')
                if len(spec_info_list) == 0 and len(spec_info_item) == 0:
                    spec_info_dic = {}
                else:
                    spec_info_dic = dict(zip(spec_info_list, spec_info_item))
                # Check the page itself for detail-image links first, otherwise
                # fall back to the dx.3.cn description endpoint
                save_details_img_url_list = []
                details_pic_1 = link_data.xpath('//div[@id="J-detail-content"]/style/text()')
                # Some items key their description on a main SKU ID, so check for one first
                skuid = re.findall(r"mainSkuId:'(.*?)'", details.text)
                # Use the main SKU ID if present, otherwise the item's own SKU
                if skuid:
                    details_link = 'https://dx.3.cn/desc/' + skuid[0] + '?cdn=2&callback=showdesc'
                else:
                    details_link = 'https://dx.3.cn/desc/' + p_sku_end[0] + '?cdn=2&callback=showdesc'
                # Read the description from dx.3.cn
                while True:
                    try:
                        goods_details_link = requests.get(details_link, headers=simple_header,
                                                          proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection error, retrying in 5s')
                        time.sleep(5)
                        continue
                data = goods_details_link.text
                # Extract image URLs from the description
                info_image = r'background-image:url\((.*?)\)'
                info_image_url = re.findall(info_image, data)
                # Fall back to the other patterns if nothing matched
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'data-lazyload=\\"(.*?)\\"', data)
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'background-image:url\((.*?)\)', str(details_pic_1))
                for each_details_img in info_image_url:
                    each_details_img = each_details_img.strip('\\"')
                    if not each_details_img:
                        continue
                    # Make the URL absolute and normalize escaped slashes
                    if 'http' not in each_details_img:
                        each_details_img = 'https:' + each_details_img
                    if 'http:////' in each_details_img:
                        each_details_img = each_details_img.replace('http:////', 'https://')
                    print('Detail image --->', each_details_img)
                    save_details_img_url_list.append(each_details_img)
                # Fetch the price from p.3.cn
                price_url = 'https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_' + p_sku_end[0]
                while True:
                    try:
                        price_page = requests.get(price_url, headers=header, proxies=random_proxy(), timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                price_page.encoding = 'utf8'
                price = '"p":"(.*?)"'
                plus_price = '"tpp":"(.*?)"'
                price_data = re.findall(price, price_page.text)
                plus_price_data = re.findall(plus_price, price_page.text)
                # Keep the price only if present; the PLUS price is optional
                if len(price_data) == 0:
                    price_data = ''
                else:
                    price_data = price_data[0]
                if len(plus_price_data) == 0:
                    plus_price_data = ''
                else:
                    plus_price_data = plus_price_data[0]
                insert_first = '''INSERT IGNORE INTO `%s` (`sku`, `p_title`, `p_price`, `p_price_plus`, `img_list`, `parameter`, `details_img`, `spec_info`) VALUES ( "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")'''
                preview_data = (str(keyword),
                                p_sku_end[0],
                                pymysql.escape_string(details_name),
                                price_data,
                                plus_price_data,
                                pymysql.escape_string(str(save_view_img_url_list)),
                                pymysql.escape_string(str(parameter_info)),
                                pymysql.escape_string(str(save_details_img_url_list)),
                                pymysql.escape_string(str(spec_info_dic))
                                )
                cursor.execute(insert_first % preview_data)
                connect.commit()
            else:
                print('Item already in the database')
    # Crawl the first 50 result pages of each keyword
    if __name__ == '__main__':
        for page_num in range(1, 51):
            # Log the current page
            print('***************************************************')
            print(' First_Page: ' + str(page_num))
            get_sku_first(page_num)
            print(' Last_Page: ' + str(page_num))
            get_sku_end(page_num)
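After a run, each keyword in keyword_list has its own table in the `jd` database, one row per SKU. A quick way to spot-check the results (illustrative; uses the 手机 table created by the schema above):

import pymysql

connect = pymysql.Connect(host='localhost', port=3306, user='root',
                          passwd='root', db='jd', charset='utf8mb4')
cursor = connect.cursor()
# Sample a few rows from one keyword table
cursor.execute('SELECT `sku`, `p_title`, `p_price` FROM `手机` LIMIT 10')
for row in cursor.fetchall():
    print(row)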