Notes on Writing a JD Crawler

This crawler collects the product SKU, title, price, main images, detail images, parameters, and specifications, which covers the attributes of most listings.

1. Proxy IPs

Because the crawl volume is fairly large, a proxy IP pool is used: proxies are fetched from https://cn-proxy.com/ and stored in a database for reuse.
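
The script below truncates and refills a `proxy` table whose schema the post never shows. A minimal sketch that matches the INSERT statement used later (only the `ip` and `port` column names come from the code; everything else is an assumption):

-- Assumed schema for the `proxy` table in the `ip` database (a sketch)
CREATE TABLE IF NOT EXISTS `proxy` (
  `ip`   varchar(64) NOT NULL,
  `port` varchar(16) NOT NULL,
  UNIQUE KEY `ip_port` (`ip`, `port`)  -- lets INSERT IGNORE skip duplicates
) ENGINE = InnoDB DEFAULT CHARSET = utf8;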

This part uses the lxml, pymysql, and requests modules.

Each proxy is validated by making a test request to Baidu before it is saved.

from lxml import etree
import pymysql
import requests


def get_new_ip():
    # Fetch fresh proxies and rebuild the table
    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='ip',
        charset='utf8'
    )

    # Get a cursor
    cursor = connect.cursor()

    # Drop the stale proxies
    cursor.execute('TRUNCATE `proxy`')
    connect.commit()

    ip_url = 'https://cn-proxy.com/'

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'}

    req = requests.get(ip_url, headers=header)
    req.encoding = 'utf-8'
    sku_first = etree.HTML(req.text)
    search_ip = sku_first.xpath('//table[(@class="sortable")]/tbody/tr/td[1]/text()')
    search_port = sku_first.xpath('//table[(@class="sortable")]/tbody/tr/td[2]/text()')
    print(search_ip)
    print(search_port)

    for (each_ip, each_port) in zip(search_ip, search_port):
        ip_proxy = each_ip + ':' + each_port
        print(ip_proxy)

        try:
            # Validate the proxy against Baidu. Cover both schemes and use a
            # short timeout; with only an 'http' entry, the https:// URL would
            # bypass the proxy entirely and every address would look usable.
            requests.get('https://www.baidu.com/',
                         proxies={'http': 'http://' + ip_proxy,
                                  'https': 'http://' + ip_proxy},
                         timeout=5)
        except requests.RequestException:
            print('Failed')
        else:
            print('OK!')
            insert_http = 'INSERT IGNORE INTO `proxy` (`ip`, `port`) VALUES (%s, %s)'
            cursor.execute(insert_http, (each_ip, each_port))
            connect.commit()


if __name__ == '__main__':
    get_new_ip()
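
The main crawler in section 2 also imports a helper module, get_ip_SQL, whose source isn't included in the post. A minimal sketch of what it presumably does — read the saved proxies back out as 'ip:port' strings (the module name and get_ip() come from the import below; the body is my assumption):

# get_ip_SQL.py -- hypothetical sketch; reads back the proxies saved by get_new_ip()
import pymysql


def get_ip():
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='ip',
        charset='utf8'
    )
    cursor = connect.cursor()
    cursor.execute('SELECT `ip`, `port` FROM `proxy`')
    ip_list = [ip + ':' + port for (ip, port) in cursor.fetchall()]
    connect.close()
    return ip_list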


2. Main Crawler

from lxml import etree
import requests
import time
import pymysql
import urllib.parse
import re
import random
import get_ip_SQL
import get_new_ip


# Refresh the proxy table, then load the proxies as a list of 'ip:port' strings
get_new_ip.get_new_ip()
ip_list = get_ip_SQL.get_ip()



# keyword = input('Enter a keyword to search for: ')
# Iterate over all top-level categories
keyword_list = ['家用电器', '手机', '运营商', '数码', '电脑', '办公', '家居', '家具', '家装', '厨具', '男装', '女装', '童装', '内衣', '美妆',
                '个护清洁', '宠物', '女鞋', '箱包', '钟表', '珠宝', '男鞋', '运动', '户外', '房产', '汽车', '汽车用品', '母婴', '玩具乐器', '食品',
                '酒类', '生鲜', '特产', '艺术', '礼品鲜花', '农资绿植', '医药保健', '计生情趣', '图书', '文娱', '电子书', '机票', '酒店', '旅游',
                '生活', '理财', '众筹', '白条', '保险', '安装', '维修', '清洗保养', '工业品']

for keyword in keyword_list:
    # Percent-encode the keyword for use in the search URL
    keyword_unicode = urllib.parse.quote(keyword)
    print(keyword_unicode)


    # Connect to the database
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='root',
        db='jd',
        charset='utf8mb4'
    )

    # Get a cursor
    cursor = connect.cursor()

    create_word = '''CREATE TABLE IF NOT EXISTS `%s`  (\
      `ID` bigint(255) NOT NULL AUTO_INCREMENT,\
      `sku` bigint(255) UNSIGNED NOT NULL,\
      `p_title` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,\
      `p_price` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,\
      `p_price_plus` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,\
      `img_list` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,\
      `parameter` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,\
      `details_img` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,\
      `spec_info` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,\
      PRIMARY KEY (`ID`) USING BTREE,\
      UNIQUE INDEX `sku`(`sku`) USING BTREE\
    ) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_general_ci ROW_FORMAT = Dynamic;
    '''
    # Create a table named after the category, if it doesn't exist yet
    cursor.execute(create_word % str(keyword))
    connect.commit()


    # Cookie pool, so that no single cookie gets blocked
    cookie_list = ['']  # omitted for brevity; add your own cookies here
    # User-Agent pool, also to avoid blocking
    user_list = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0',
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
                 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36']

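    # Headers for image requests (img_header is defined here but never used in
    # the code shown; presumably intended for downloading the collected images)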
    img_header = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'max-age=0',
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'user-agent': random.choice(user_list)
    }

    # The back 30 items of each results page are loaded dynamically, so the
    # two halves are fetched in separate steps.
    # First half: 30 server-rendered items
    def get_sku_first(n):
        global url_first
        global random_ip
        random_ip = {'http': random.choice(ip_list)}
        print('Proxy IP:', random_ip)
        url_first = 'https://search.jd.com/Search?keyword=' + keyword_unicode + '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=' + str(2*n-1) + '&s=59'
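        # Page numbering: the server-rendered half uses odd page values
        # (page = 2n-1); the AJAX half uses even ones (page = 2n, see
        # get_sku_end below). 's' appears to be a result offset.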

        header = {
            'user-agent': random.choice(user_list),
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'cookie': cookie_list[0]
        }

        simple_header = {'user-agent': random.choice(user_list),
                         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                         'upgrade-insecure-requests': '1',
                         'cache-control': 'max-age=0',
                         'cookie': random.choice(cookie_list)}
        print(url_first)
        # First collect each product's skuid from the search results page
        while True:
            try:
                req_first = requests.get(url_first, headers=header, proxies={'http': random.choice(ip_list)}, timeout=800)
                break
            except requests.RequestException:
                print('Connection failed, retrying')
                time.sleep(2)
                continue
        req_first.encoding = 'utf-8'
        sku_first = etree.HTML(req_first.text)
        data_first = sku_first.xpath('//li[(@class="gl-item")]')
        print(data_first)
        for i in data_first:
            # Product SKU
            p_sku_first = i.xpath('@data-sku')
            # Build the product page link
            link = 'https://item.jd.com/' + p_sku_first[0] + '.html'
            print(link)
            cursor.execute('SELECT sku from `%s`' % keyword)
            sku_t_f = cursor.fetchall()
            # Skip products that are already in the database
            if p_sku_first[0] not in str(sku_t_f):
                # Fetch the product detail page
                while True:
                    try:
                        details = requests.get(link, headers=header, proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                details.encoding = 'GBK'
                link_data = etree.HTML(details.text)

                # Product title
                details_title = link_data.xpath('//div[(@class="sku-name")]/text()')
                for name in details_title:
                    if not name.strip() == '':
                        details_name = name.strip()
                        # print(details_name)

                # Main image links
                details_img = link_data.xpath('//ul[(@class="lh")]/li/img/@src')
                save_img_base64_list = []
                save_view_img_url_list = []
                # Rewrite each thumbnail URL into its full-size version
                for img_each in details_img:
                    get_img_url = 'https:' + img_each.strip()
                    # First swap the /n5/ path segment for /n1/
                    img_url_replace_1 = get_img_url.replace('/n5/', '/n1/', 1)
                    # Then upsize the resolution segment, if the URL has one
                    img_replace_word = r'/s\d{2}x\d{2}_jfs/'
                    search = re.findall(img_replace_word, img_url_replace_1)
                    if not len(search) == 0:
                        save_img_url = img_url_replace_1.replace(search[0], '/s546x546_jfs/', 1)
                    else:
                        save_img_url = img_url_replace_1
                    print(save_img_url)
                    save_view_img_url_list.append(save_img_url)

                # Parameters
                parameter_info = link_data.xpath('//ul[(@class="parameter2 p-parameter-list")]/li/text()')
                if len(parameter_info) == 0:
                    parameter_info = link_data.xpath('//ul[(@class="parameter2")]/li/text()')
                # "Specs & packaging" section
                spec_info_list = link_data.xpath('//div[(@class="Ptable-item")]/dl/dl/dt/text()')
                spec_info_item = link_data.xpath('//div[(@class="Ptable-item")]/dl/dl/dd[last()]/text()')
                if len(spec_info_list) == 0 and len(spec_info_item) == 0:
                    spec_info_dic = {}
                else:
                    spec_info_dic = dict(zip(spec_info_list, spec_info_item))

                # Detail images may be lazy-loaded, so try several sources for
                # their URLs: this page's inline <style> first, then the
                # dx.3.cn description endpoint.
                save_img_base64_list = []
                save_details_img_url_list = []
                details_pic_1 = link_data.xpath('//div[(@id="J-detail-content")]/style/text()')
                # Some products key their description on a main SKU ID, so check
                # whether one is present first
                skuid = re.findall("mainSkuId:'(.*?)'", details.text)
                # If there is a main ID, build the description URL from it;
                # otherwise fall back to the product's own SKU
                if not len(skuid) == 0:
                    details_link = 'https://dx.3.cn/desc/' + skuid[0] + '?cdn=2&callback=showdesc'
                else:
                    details_link = 'https://dx.3.cn/desc/' + p_sku_first[0] + '?cdn=2&callback=showdesc'
                # Fetch the description payload from dx.3.cn
                while True:
                    try:
                        goods_details_link = requests.get(details_link, headers=simple_header, proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection error, retrying in 5s')
                        time.sleep(5)
                        continue
                data = goods_details_link.text
                # Look for image URLs in the payload
                info_image = r'background-image:url\((.*?)\)'
                info_image_url = re.findall(info_image, data)
                # If nothing matched, fall back to the other known patterns
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'data-lazyload=\\"(.*?)\\"', data)
                    if len(info_image_url) == 0:
                        info_image_url = re.findall(r'background-image:url\((.*?)\)', str(details_pic_1))
                for each_details_img in info_image_url:
                    # Strip escaped quotes, make the URL absolute, and force https
                    each_details_img = each_details_img.strip('\\"')
                    if each_details_img != '' and len(re.findall('http', each_details_img)) == 0:
                        each_details_img = 'https:' + each_details_img
                    if not len(re.findall('http:////', each_details_img)) == 0:
                        each_details_img = each_details_img.replace('http:////', 'https://')
                    print('Detail image --->', each_details_img)
                    save_details_img_url_list.append(each_details_img)

                # Some pages lazy-load the price; watching Sources in the
                # browser's dev tools shows that p.3.cn serves it
                price_url = 'https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_' + p_sku_first[0]
                while True:
                    try:
                        price_page = requests.get(price_url, headers=simple_header, proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                price_page.encoding = 'utf8'
                price = '"p":"(.*?)"'
                plus_price = '"tpp":"(.*?)"'
                price_data = re.findall(price, price_page.text)
                plus_price_data = re.findall(plus_price, price_page.text)
                # Check that the price is present, and whether there is a PLUS price
                if len(price_data) == 0:
                    price_data = ''
                else:
                    price_data = price_data[0]

                if len(plus_price_data) == 0:
                    plus_price_data = ''
                else:
                    plus_price_data = plus_price_data[0]

                print('Price:', price_data)
                print('PLUS price:', plus_price_data)

                insert_first = '''INSERT IGNORE INTO `%s` (`sku`, `p_title`, `p_price`, `p_price_plus`, `img_list`, `parameter`, `details_img`, `spec_info`) VALUES ( "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")'''
                # Some values contain single quotes that would break the interpolated
                # SQL statement, hence pymysql.escape_string() on each text field
                preview_data = (str(keyword),
                                p_sku_first[0],
                                pymysql.escape_string(details_name),
                                price_data,
                                plus_price_data,
                                pymysql.escape_string(str(save_view_img_url_list)),
                                pymysql.escape_string(str(parameter_info)),
                                pymysql.escape_string(str(save_details_img_url_list)),
                                pymysql.escape_string(str(spec_info_dic))
                                )
                cursor.execute(insert_first % preview_data)
                connect.commit()
            else:
                print('Product already in the database')


    # Second half of the page: 30 dynamically loaded items, fetched the same way
    def get_sku_end(n):
        log_id = '%.5f' % time.time()
        url_end = 'https://search.jd.com/s_new.php?keyword=' + keyword_unicode + '&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&stock=1&page=' + str(2*n) + '&s=' + str(48*n-20) + '&scrolling=y&log_id=' + str(log_id)
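        # page = 2n selects the even (AJAX-loaded) half-page; the 's' value is
        # presumably the result offset the browser itself sends when scrolling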
        header = {
            'user-agent': random.choice(user_list),
            'accept-language': 'zh-CN,zh;q=0.9',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'x-requested-with': 'XMLHttpRequest',
            'referer': url_first,
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'cookie': cookie_list[0]
        }
        simple_header = {'user-agent': random.choice(user_list),
                         'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                         'upgrade-insecure-requests': '1',
                         'cache-control': 'max-age=0',
                         'cookie': random.choice(cookie_list)}

        while True:
            try:
                req_end = requests.get(url_end, headers=header, proxies={'http': random.choice(ip_list)}, timeout=800)
                break
            except requests.RequestException:
                print('Connection failed, retrying')
                time.sleep(2)
                continue
        req_end.encoding = 'utf-8'
        sku_end = etree.HTML(req_end.text)
        data_end = sku_end.xpath('//li[(@class="gl-item")]')

        for i in data_end:
            # Product SKU
            p_sku_end = i.xpath('@data-sku')
            # Build the product page link
            link = 'https://item.jd.com/' + p_sku_end[0] + '.html'
            print(link)
            cursor.execute('SELECT sku from `%s`' % keyword)
            sku_t_f = cursor.fetchall()
            # Skip products that are already in the database
            if p_sku_end[0] not in str(sku_t_f):
                # Fetch the product detail page
                while True:
                    try:
                        details = requests.get(link, headers=header, proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                details.encoding = 'GBK'
                link_data = etree.HTML(details.text)
                # Product title
                details_title = link_data.xpath('//div[(@class="sku-name")]/text()')
                for name in details_title:
                    if not name.strip() == '':
                        details_name = name.strip()

                # Main image links
                details_img = link_data.xpath('//ul[(@class="lh")]/li/img/@src')
                save_img_base64_list = []
                save_view_img_url_list = []
                # Rewrite each thumbnail URL into its full-size version
                for img_each in details_img:
                    get_img_url = 'https:' + img_each.strip()
                    # First swap the /n5/ path segment for /n1/
                    img_url_replace_1 = get_img_url.replace('/n5/', '/n1/', 1)
                    # Then upsize the resolution segment, if the URL has one
                    img_replace_word = r'/s\d{2}x\d{2}_jfs/'
                    search = re.findall(img_replace_word, img_url_replace_1)
                    if not len(search) == 0:
                        save_img_url = img_url_replace_1.replace(search[0], '/s546x546_jfs/', 1)
                    else:
                        save_img_url = img_url_replace_1
                    print(save_img_url)
                    save_view_img_url_list.append(save_img_url)

                # Parameters
                parameter_info = link_data.xpath('//ul[(@class="parameter2 p-parameter-list")]/li/text()')
                if len(parameter_info) == 0:
                    parameter_info = link_data.xpath('//ul[(@class="parameter2")]/li/text()')
                # "Specs & packaging" section
                spec_info_list = link_data.xpath('//div[(@class="Ptable-item")]/dl/dl/dt/text()')
                spec_info_item = link_data.xpath('//div[(@class="Ptable-item")]/dl/dl/dd[last()]/text()')
                if len(spec_info_list) == 0 and len(spec_info_item) == 0:
                    spec_info_dic = {}
                else:
                    spec_info_dic = dict(zip(spec_info_list, spec_info_item))

                # Look on this page for detail-image links first; otherwise fall
                # back to the dx.3.cn description endpoint
                save_img_base64_list = []
                save_details_img_url_list = []
                details_pic_1 = link_data.xpath('//div[(@id="J-detail-content")]/style/text()')
                # Some products key their description on a main SKU ID, so check
                # whether one is present first
                skuid = re.findall("mainSkuId:'(.*?)'", details.text)
                # If there is a main ID, build the description URL from it;
                # otherwise fall back to the product's own SKU
                if not len(skuid) == 0:
                    details_link = 'https://dx.3.cn/desc/' + skuid[0] + '?cdn=2&callback=showdesc'
                else:
                    details_link = 'https://dx.3.cn/desc/' + p_sku_end[0] + '?cdn=2&callback=showdesc'
                # Fetch the description payload from dx.3.cn
                while True:
                    try:
                        goods_details_link = requests.get(details_link, headers=simple_header,
                                                          proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection error, retrying in 5s')
                        time.sleep(5)
                        continue
                data = goods_details_link.text
                # Look for image URLs in the payload
                info_image = r'background-image:url\((.*?)\)'
                info_image_url = re.findall(info_image, data)
                # If nothing matched, fall back to the other known patterns
                if len(info_image_url) == 0:
                    info_image_url = re.findall(r'data-lazyload=\\"(.*?)\\"', data)
                    if len(info_image_url) == 0:
                        info_image_url = re.findall(r'background-image:url\((.*?)\)', str(details_pic_1))
                for each_details_img in info_image_url:
                    # Strip escaped quotes, make the URL absolute, and force https
                    each_details_img = each_details_img.strip('\\"')
                    if each_details_img != '' and len(re.findall('http', each_details_img)) == 0:
                        each_details_img = 'https:' + each_details_img
                    if not len(re.findall('http:////', each_details_img)) == 0:
                        each_details_img = each_details_img.replace('http:////', 'https://')
                    print('Detail image --->', each_details_img)
                    save_details_img_url_list.append(each_details_img)

                # Get the price from p.3.cn
                price_url = 'https://p.3.cn/prices/mgets?callback=jQuery6775278&skuids=J_' + p_sku_end[0]
                while True:
                    try:
                        price_page = requests.get(price_url, headers=header, proxies={'http': random.choice(ip_list)}, timeout=800)
                        break
                    except requests.RequestException:
                        print('Connection failed, retrying')
                        time.sleep(2)
                        continue
                price_page.encoding = 'utf8'
                price = '"p":"(.*?)"'
                plus_price = '"tpp":"(.*?)"'
                price_data = re.findall(price, price_page.text)
                plus_price_data = re.findall(plus_price, price_page.text)
                # Check that the price is present, and whether there is a PLUS price
                if len(price_data) == 0:
                    price_data = ''
                else:
                    price_data = price_data[0]

                if len(plus_price_data) == 0:
                    plus_price_data = ''
                else:
                    plus_price_data = plus_price_data[0]

                insert_first = '''INSERT IGNORE INTO `%s` (`sku`, `p_title`, `p_price`, `p_price_plus`, `img_list`, `parameter`, `details_img`, `spec_info`) VALUES ( "%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")'''
                preview_data = (str(keyword),
                                p_sku_end[0],
                                pymysql.escape_string(details_name),
                                price_data,
                                plus_price_data,
                                pymysql.escape_string(str(save_view_img_url_list)),
                                pymysql.escape_string(str(parameter_info)),
                                pymysql.escape_string(str(save_details_img_url_list)),
                                pymysql.escape_string(str(spec_info_dic))
                                )
                cursor.execute(insert_first % preview_data)
                connect.commit()
            else:
                print('Product already in the database')


    # Crawl the first 50 result pages for this keyword
    if __name__ == '__main__':
        for page_num in range(1, 51):
            print('***************************************************')
            print('   First_Page:   ' + str(page_num))
            get_sku_first(page_num)
            print('   Last_Page:   ' + str(page_num))
            get_sku_end(page_num)
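
Side note: both functions pull the price out of the p.3.cn response with regular expressions. That response is just a JSON array wrapped in a JSONP callback, so it can also be decoded with the json module. A minimal sketch, assuming the response carries the same "p" (price) and "tpp" (PLUS price) fields matched above (parse_price_jsonp is a made-up name):

import json
import re


def parse_price_jsonp(text):
    # Strip the JSONP wrapper, e.g. jQuery6775278([{...}]);
    match = re.search(r'\((\[.*\])\)', text, re.S)
    if not match:
        return '', ''
    items = json.loads(match.group(1))
    if not items:
        return '', ''
    # "p" is the regular price; "tpp" (the PLUS price) is not always present
    return items[0].get('p', ''), items[0].get('tpp', '')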

I'm a complete beginner and these are just my notes; please go easy on me.
