- Target URL
https://search.jd.com/Search?keyword=%E7%A7%91%E5%B9%BB%E5%B0%8F%E8%AF%B4&enc=utf-8&suggest=1.def.0.V16--featuredump,&wq=%E7%A7%91%E5%B9%BB&pvid=814262d98b22410fbd624ce0cf1a19fa
- Page analysis
  - The page lazy-loads items as you scroll toward the bottom, so take special care: the spider has to scroll before the full item list and the next-page control become visible.
- We use `lua` scripts to drive the rendering, which keeps the crawler simple and configurable.
- Setting up the `scrapy` project itself won't be rehashed here; apart from the `settings.py` wiring sketched below, we go straight to the spider code.
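For `scrapy-splash` to work at all, its middlewares and dupefilter have to be enabled in `settings.py`. A minimal sketch, following the scrapy-splash README and assuming a Splash instance listening on localhost:8050:

```python
# settings.py -- minimal scrapy-splash wiring (names from the scrapy-splash README)
SPLASH_URL = 'http://localhost:8050'  # assumption: a local Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'
```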
`jd.py` code:
```python
# -*- coding: utf-8 -*-
import re

import scrapy
from scrapy import Request
from scrapy_splash import SplashRequest

from jdsplash.items import JdsplashItem

# Lua script that scrolls the page so every lazy-loaded item renders
with open('jd_loadall.lua') as f:
    lua_loadall = f.read()


def get_lua_next(page_num):
    # next.lua carries a %s placeholder for the page index passed to SEARCH.page()
    with open('next.lua') as f:
        return f.read() % str(page_num)


class JdSpider(scrapy.Spider):
    name = 'jd'
    allowed_domains = ['jd.com']
    start_urls = ['https://search.jd.com/Search?keyword=%E7%A7%91%E5%B9%BB%E5%B0%8F%E8%AF%B4&enc=utf-8&suggest=1.def.0.V16--featuredump,&wq=%E7%A7%91%E5%B9%BB&pvid=814262d98b22410fbd624ce0cf1a19fa']

    def start_requests(self):
        for url in self.start_urls:
            # kick off with the visual page counter at 1
            yield Request(url, callback=self.parse_url, meta={'page': 1}, dont_filter=True)

    def parse_url(self, response):
        # render the page through Splash, scrolling so all items load
        yield SplashRequest(response.url, meta={'page': response.meta['page']},
                            endpoint='execute',
                            args={'lua_source': lua_loadall}, cache_args=['lua_source'])

    def parse_url2(self, response):
        # next.lua returns splash:url(), so the response body is the URL of the
        # page reached after paging forward
        url = response.text
        # render that page with jd_loadall.lua to load the remaining items
        yield SplashRequest(url, meta={'page': response.meta['page']},
                            endpoint='execute',
                            args={'lua_source': lua_loadall}, cache_args=['lua_source'])

    def parse(self, response):
        # process the data
        pagenum = int(response.meta['page'])
        re_tags = re.compile(r'<.*?>')  # strips the highlight markup inside the title <em>
        for book in response.xpath('.//li[@class="gl-item"]'):  # every result item
            url = book.xpath('.//div[@class="p-name"]/a/@href').extract_first()  # item link
            # the title <em> is laced with markup, so take its raw HTML here
            # and strip the tags with the regex above
            bookname = book.xpath('.//div[@class="p-name"]/a/em').extract_first()
            price = book.xpath('.//div[@class="p-price"]/strong/i/text()').extract_first()
            if 'https' not in url:
                # self-operated listings use protocol-relative links; add the scheme
                url = response.urljoin(url)
            item = JdsplashItem()
            item['BookName'] = re_tags.sub('', bookname)  # drop the markup, keep the title
            item['Price'] = price
            item['BuyLink'] = url
            yield item
        # pagination: stop once the "next" button is marked disabled (last page)
        if len(response.xpath('.//div[@class="pn-next disabled"]/em/b/text()').extract()) <= 0:
            # JD serves each visible page as two internal half-pages, so the next
            # visible page starts at internal index 2 * pagenum + 1
            yield SplashRequest(response.url, meta={'page': pagenum + 1},
                                callback=self.parse_url2, endpoint='execute',
                                args={'lua_source': get_lua_next(2 * pagenum + 1)},
                                cache_args=['lua_source'], dont_filter=True)
```
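The spider imports `JdsplashItem` from `jdsplash.items`. The item class isn't shown in the original post, but the three fields used above pin it down; a minimal sketch:

```python
# items.py -- minimal item matching the fields jd.py fills in
import scrapy


class JdsplashItem(scrapy.Item):
    BookName = scrapy.Field()  # title with the highlight markup stripped
    Price = scrapy.Field()     # price text from the search result card
    BuyLink = scrapy.Field()   # absolute product URL
```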
`jd_loadall.lua` script code
- Fetches everything on the current page: scrolls to the bottom, which makes the next-page control appear and renders the full item list.
```lua
function main(splash)
  splash:go(splash.args.url)
  splash:wait(5)
  -- scroll the bottom toolbar into view so every lazy-loaded item renders
  splash:runjs('document.getElementById("J-global-toolbar").scrollIntoView()')
  splash:wait(5)
  -- return html/url keys so scrapy-splash's magic response can rebuild
  -- response.body and response.url
  return {html = splash:html(), url = splash:url()}
end
```
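Before wiring a script into the spider, it helps to exercise it against Splash's documented `/execute` HTTP endpoint directly: `lua_source` carries the script, and any extra parameter (here `url`) shows up in `splash.args`. A quick sketch, assuming Splash on localhost:8050 and using a trimmed version of the search URL above:

```python
import requests

with open('jd_loadall.lua') as f:
    lua = f.read()

# extra query params (like url) are exposed to the script as splash.args
resp = requests.get('http://localhost:8050/execute', params={
    'lua_source': lua,
    'url': 'https://search.jd.com/Search?keyword=%E7%A7%91%E5%B9%BB%E5%B0%8F%E8%AF%B4&enc=utf-8',
})
data = resp.json()
print(data['url'])        # final URL after the scroll
print(len(data['html']))  # size of the fully rendered HTML
```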
`next.lua` script code
- Calls the page's own next-page `js` routine to turn the page.
```lua
function main(splash)
  splash:go(splash.args.url)
  splash:wait(2)
  -- SEARCH.page() is the search page's own pager; the placeholder below is
  -- filled in by get_lua_next() before the script is sent to Splash
  splash:runjs("SEARCH.page(%s, true)")
  splash:wait(2)
  -- hand back only the new URL; the spider re-renders it with jd_loadall.lua
  return splash:url()
end
```
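To make the substitution concrete: from visual page 1 the spider calls `get_lua_next(2 * 1 + 1)`, so the script Splash receives ends up calling `SEARCH.page(3, true)`. A tiny check:

```python
with open('next.lua') as f:
    template = f.read()

# visual page 1 -> the next visible page starts at internal index 2*1 + 1 = 3
print(template % '3')  # the runjs line now reads: SEARCH.page(3, true)
```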
Notes:
The two script files `jd_loadall.lua` and `next.lua` must sit in the root directory of the crawler project, at the same level as the parent directory of `spiders`, or the spider won't find them: `jd.py` opens them with bare relative paths, which are resolved against the directory `scrapy crawl` is run from, and that is normally the project root.
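Concretely, the layout looks like this (assuming the default Scrapy project skeleton; the package name `jdsplash` comes from the import in `jd.py`):

```
jdsplash/               <- project root; run `scrapy crawl jd` from here
├── scrapy.cfg
├── jd_loadall.lua      <- the two Lua scripts live here
├── next.lua
└── jdsplash/           <- package directory, the parent of spiders/
    ├── items.py
    ├── settings.py
    └── spiders/
        └── jd.py
```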