Scraping JD.com Product Detail Pages

I previously wrote about scraping JD.com's product navigation data; here is the follow-up that scrapes the product detail pages.
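
The script below targets Python 3. requests and BeautifulSoup are third-party packages, and lxml is the parser BeautifulSoup is told to use, so all three need to be installed (assuming pip is available):

pip install requests beautifulsoup4 lxml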

# Scrape JD.com product detail page information
# 2017/7/30

import requests
from bs4 import BeautifulSoup
import os
import csv
import re
import json
import time

# Fetch one search results page
def make_a_link(keyword, page):
    try:
        r = requests.get('https://search.jd.com/Search?keyword=' + keyword + '&enc=utf-8&page=' + str(2 * page - 1))
        r.raise_for_status()  # was missing the call parentheses, so it never actually raised
        print('Scraping page {}...'.format(page))
        print('---' * 45)
        r.encoding = 'utf-8'  # the request asks for utf-8 (enc=utf-8), so decode it as utf-8
        return r.text
    except requests.RequestException:
        print('Bad link!!!')
        return ''
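
# Note on the page parameter: JD's search renders each visible results page in two
# halves (the second half loads as you scroll), so visible page n corresponds to the
# internal page parameter 2n-1 -- hence str(2 * page - 1) above.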


# Extract the product detail links from a results page
def find_only_link(html):
    soup = BeautifulSoup(html, 'lxml')
    links = soup.find_all('div', class_='gl-i-wrap')
    return (link.find('div', class_='p-name p-name-type-2').a['href'] for link in links)  # generator expression over the detail links
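
# The hrefs extracted here are protocol-relative ('//item.jd.com/<sku>.html'), which
# is why the code below prepends 'https:' and pulls the SKU id out with a regex.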

# Fetch a single page by URL
def link_to_url(link):
    try:
        r = requests.get(link)
        r.raise_for_status()  # same fix: call the method
        r.encoding = 'gbk'  # the price/comment endpoints below respond in GBK
        return r.text
    except requests.RequestException:
        print('Cannot reach this page!!!')
        return ''


# Scrape the product price
def getprice(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)  # SKU id: the first digit run in the detail URL
    content = link_to_url('https://p.3.cn/prices/mgets?skuIds=J_' + uid)
    jd = json.loads(content.lstrip('[').rstrip(']\n'))  # the API wraps the object in a one-element array; strip the brackets
    return jd['p']
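
# For reference, mgets typically answers with a one-element JSON array such as
# [{"id": "J_<sku>", "p": "<current price>", "m": "<list price>"}]; only 'p' is read here.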

# Scrape the product review summary
def getcomment(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://club.jd.com/comment/productCommentSummaries.action?referenceIds=' + uid)
    jd = json.loads(content)
    comment = []
    jds = jd['CommentsCount'][0]
    comment.append(jds['CommentCountStr'])  # number of reviews
    comment.append(jds['GoodCountStr'])     # number of positive reviews
    comment.append(jds['GoodRate'])         # positive-review rate
    return comment
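
# The summary endpoint wraps its payload as {"CommentsCount": [{...}]}; the three keys
# read above (CommentCountStr, GoodCountStr, GoodRate) sit in that first array element.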

# Scrape the product name
def getname(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://c.3.cn/recommend?&methods=accessories&sku=' + uid + '&cat=9987%2C653%2C655')
    try:
        jd = json.loads(content)
        return jd['accessories']['data']['wName']
    except (ValueError, KeyError, TypeError):
        return ''

# Scrape the seller
def getseller(purl):
    uid = re.match(r'.+?(\d+).+', purl).group(1)
    content = link_to_url('https://chat1.jd.com/api/checkChat?pid=' + uid + '&returnCharset=utf-8')
    try:
        # the endpoint answers as JSONP (null({...});); peel off the wrapper before parsing
        jd = json.loads(content.lstrip('null(').rstrip(');'))
        return jd['seller']
    except (ValueError, KeyError, TypeError):
        return ''  # was a bare '' expression, so the function silently returned None
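
# Caveat: str.lstrip/rstrip strip a *set* of characters rather than a literal
# prefix/suffix, so 'null(' only works because the JSON body itself starts with '{'.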

# Save the results to CSV
def save_to_csv(ulist, keyword):
    path = 'D:/数据/'
    if not os.path.exists(path):
        os.mkdir(path)
    # newline='' stops csv from inserting blank rows on Windows; utf-8-sig lets Excel open it cleanly
    with open(path + '京东' + keyword + '数据.csv', 'w+', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['Product', 'Price', 'Shop', 'Link', 'Reviews', 'Positive reviews', 'Positive rate'])
        for row in ulist:
            if row and row[0]:  # skip products whose name lookup failed
                writer.writerow(row)


# Main function
def relmain(keyword):  # higher-order function: binds the keyword and returns the crawling closure
    def main(page):
        r = re.compile(r'.*?html')
        ulist = []
        for p in range(1, page + 1):
            text = make_a_link(keyword, p)
            for url in find_only_link(text):
                ul = []
                if r.match(url):
                    name = getname(url)  # fetch the name once instead of three times
                    if name:
                        ul.append(name)             # product name
                        print(name)
                        ul.append(getprice(url))    # price
                        ul.append(getseller(url))   # shop
                        ul.append('https:' + url)   # link
                        print('https:' + url)
                        ul.extend(getcomment(url))  # review summary
                        print('*-*' * 45)
                ulist.append(ul)
        save_to_csv(ulist, keyword)
    return main

if __name__ == '__main__':
    keyword = input('Enter the product keyword to scrape: ')
    pages = int(input('Enter the number of pages to scrape: '))
    time_start = time.time()
    relmain(keyword)(pages)
    print('Took {} seconds.'.format(time.time() - time_start))  # total crawl time
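
One practical caveat: JD is quick to throttle clients that look like bare scripts, and requests.get without a timeout can hang indefinitely. A minimal hardening sketch, kept separate from the script above (the User-Agent string is an arbitrary example and the retry/delay numbers are assumptions, not tuned values):

import time
import requests

session = requests.Session()
session.headers.update({
    # an example browser UA string; any reasonably current one will do
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.jd.com/',
})

def fetch(url, retries=3, delay=2):
    # fetch with a timeout plus a simple linear-backoff retry loop
    for attempt in range(retries):
        try:
            r = session.get(url, timeout=10)
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            time.sleep(delay * (attempt + 1))
    return ''

Swapping fetch in for the requests.get calls in make_a_link and link_to_url would leave the rest of the script unchanged.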

