最近在自己学习python和研究网络爬虫,自己用scrapy实现了简单的爬虫,爬取京东手机信息,只爬了第一页的手机名称、价格、手机店名、成交量等信息。不是很完整,后续继续研究,一步一步完善。
1. 使用的IDE是pycharm,环境比较难折腾
2. 由于是动态网页,开始的时候怎么也获取不到价格等信息,所以用了PyQt5加载网页,然后再获取信息。
3. 把信息导出到xlsx文件
Jdspider.py
import sys

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

from PyQt5 import QtWebEngine, QtWidgets
from PyQt5.QtCore import QEventLoop, QTimer, QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView

from jdcrawl.items import JdcrawlItem
class JdSpider(CrawlSpider):
    """Spider for the first page of JD.com mobile-phone listings.

    Prices on the listing page are filled in by JavaScript, so the raw HTTP
    response does not contain them.  The spider therefore loads the page in a
    PyQt5 ``QWebEngineView``, waits for rendering to finish, captures the
    rendered HTML, and only then extracts the item fields with Scrapy
    selectors.
    """

    name = "jdSpider"
    view = None            # QWebEngineView that renders the listing page
    app = None             # QApplication driving the Qt event loop
    html = None            # rendered HTML captured from the web view
    isloadFinish = False   # True once the rendered HTML has been captured
    # Was ["jd"]: a bare "jd" never matches hosts such as "list.jd.com", so
    # any follow-up request would be dropped by the offsite middleware.
    allowed_domains = ["jd.com"]
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=6#J_main"
    ]

    def _callable(self, html):
        """Receive the rendered HTML from ``QWebEnginePage.toHtml()``.

        Stores the HTML, stops the Qt event loop so ``parse()`` can resume,
        and dumps the page to ``response.html`` for offline inspection.
        """
        self.html = html
        print("app quit")
        # quit() is equivalent to exit(0); the original called both.
        self.app.quit()
        self.isloadFinish = True
        print("_callable")
        # Keep a copy of the rendered page on disk for debugging.
        with open('response.html', 'w', encoding='utf-8') as fp:
            fp.write(html)

    def parserHtml(self):
        """Extract listing fields from ``self.html`` (debug helper; items
        are not yielded here — see ``parse()`` for the real extraction)."""
        print("parserHtml..")
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            print(name)
            # yield item

    def _timer_for_html(self):
        """Ask the web view for its rendered HTML; result goes to _callable."""
        print("_timer_for_html")
        self.view.page().toHtml(self._callable)

    def _loadFinished(self, result):
        """``loadFinished`` slot.

        The page keeps filling in prices after ``loadFinished`` fires, so
        wait two more seconds before snapshotting the DOM.
        """
        print("load finish.....")
        QTimer.singleShot(2 * 1000, self._timer_for_html)

    def parse(self, response):
        """Render ``response.url`` in a web view and yield one item per phone."""
        print("parse")
        self.app = QtWidgets.QApplication(sys.argv)
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self._loadFinished)
        self.view.load(QUrl(response.url))
        # Blocks until _callable() calls app.quit() with the rendered HTML.
        self.app.exec()
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            # NOTE(review): "p-namep-name-type3" looks like a mangled
            # "p-name p-name-type3" — confirm against the live page markup.
            name = sel.xpath('div/div[@class="p-namep-name-type3"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            # extract() returns a list; keep only the first match, stripped.
            if name:
                name = name[0].strip()
            if shop:
                shop = shop[0].strip()
            if price:
                price = price[0].strip()
            if comment:
                comment = comment[0].strip()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            yield item
items.py
import scrapy


class JdcrawlItem(scrapy.Item):
    """Container for one JD.com phone listing."""

    # define the fields for your item here like:
    phoneName = scrapy.Field()  # phone title
    phoneShop = scrapy.Field()  # shop name
    price = scrapy.Field()      # listed price
    comments = scrapy.Field()   # transaction / comment count
pipelines.py
from openpyxl import Workbook


class JdcrawlPipeline(object):
    """Append every scraped item as a row of ``phoneinfo.xlsx``."""

    # One workbook shared by the whole crawl.
    wb = Workbook()
    ws = wb.active
    ws.append(['手机名称', '店名', '价格', '成交量'])  # header row

    def process_item(self, item, spider):
        """Write one item to the sheet, save the workbook, pass the item on."""
        print("process item")
        row = [item['phoneName'], item['phoneShop'], item['price'], item['comments']]
        print(row)
        self.ws.append(row)              # append the values as a new row
        self.wb.save('phoneinfo.xlsx')   # re-saved after every item
        return item
settings.py新增以下代码:
# Route scraped items through the xlsx-export pipeline (priority 300).
ITEM_PIPELINES = {
    'jdcrawl.pipelines.JdcrawlPipeline': 300,
}
虽然之前用的是qt,但由于对PyQt5不熟悉,所以运行的时候会弹出错误弹窗。这个后面再完善。