JD Phone Information Crawler

This post describes using Python's Scrapy framework together with the PyQt5 library to crawl the first page of JD's phone listings for product name, price, shop, and sales volume. Because the page is dynamic, PyQt5 is used to render it before extracting data, which is then saved to an xlsx file. Only part of the functionality is implemented so far; more to come.


I've recently been teaching myself Python and studying web crawlers. I built a simple crawler with Scrapy that scrapes JD phone listings; so far it only crawls the first page for the phone name, price, shop name, and sales volume. It's not complete yet, and I'll keep improving it step by step.

1. The IDE is PyCharm; getting the environment set up was the fiddly part.

2. Because the listing page is dynamic, I couldn't get the price and other fields at first, so I use PyQt5 to load and render the page before extracting anything (see the minimal sketch after this list).

3. The results are exported to an xlsx file.
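
The core of step 2, stripped of the Scrapy plumbing, looks roughly like this. A minimal sketch, assuming PyQt5 with QtWebEngine is installed; the render() helper is my name for it, not part of the project:

import sys
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtCore import QUrl

def render(url):
    # Load the page in an off-screen browser and return the HTML
    # after its JavaScript has run.
    app = QApplication(sys.argv)
    view = QWebEngineView()
    result = []

    def store_html(text):
        result.append(text)
        app.quit()  # stop the event loop once we have the source

    # toHtml() is asynchronous: it delivers the page source to a callback
    view.loadFinished.connect(lambda ok: view.page().toHtml(store_html))
    view.load(QUrl(url))
    app.exec_()  # blocks until store_html() calls app.quit()
    return result[0] if result else None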

 

Jdspider.py

import sys
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider
from PyQt5 import QtWidgets
from PyQt5.QtWebEngineWidgets import QWebEngineView
from PyQt5.QtCore import QUrl, QTimer
from jdcrawl.items import JdcrawlItem

class JdSpider(CrawlSpider):
    name = "jdSpider"
    view = None           # QWebEngineView used to render the page
    app = None            # QApplication instance
    html = None           # rendered page source
    isloadFinish = False
    allowed_domains = ["jd.com"]
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=6#J_main"
    ]

    def _callable(self, html):
        # receives the rendered page source from toHtml()
        self.html = html
        self.isloadFinish = True
        print("app quit")
        self.app.quit()  # stop the Qt event loop; parse() resumes after exec_()
        print("_callable")
        # dump the rendered page for debugging
        with open('response.html', 'w', encoding='utf-8') as fp:
            fp.write(html)


    def parserHtml(self):
        """Alternative extraction helper; currently unused (parse() does the extraction)."""
        print("parserHtml..")
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            print(name)
            # yield item

   
    def _timer_for_html(self):
        print("_timer_for_html")
        self.view.page().toHtml(self._callable)

    def _loadFinished(self, result):
        print("load finish.....")
        # wait 2 seconds so the page's JavaScript can fill in prices, then grab the HTML
        QTimer.singleShot(2 * 1000, self._timer_for_html)

    def parse(self, response):
        print("parse")
        # spin up a Qt application to render the dynamic page
        self.app = QtWidgets.QApplication(sys.argv)
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self._loadFinished)
        self.view.load(QUrl(response.url))
        self.app.exec_()  # blocks until _callable() calls app.quit()
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name p-name-type3"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            if len(name) > 0:
                name = name[0].strip()
            if len(shop) > 0:
                shop = shop[0].strip()
            if len(price) > 0:
                price = price[0].strip()
            if len(comment) > 0:
                comment = comment[0].strip()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            yield item
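
With the items and pipeline files below in place, the spider runs from the project root with scrapy crawl jdSpider. One caveat with this design: parse() creates a QApplication, and Qt generally tolerates only one application object per process, so this really only works for a single start URL; that fragility is likely related to the error dialog mentioned at the end of the post.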

items.py

import scrapy


class JdcrawlItem(scrapy.Item):
    # define the fields for your item here like:
    phoneName = scrapy.Field()
    phoneShop = scrapy.Field()
    price = scrapy.Field()
    comments = scrapy.Field()
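
For what it's worth, a scrapy.Item behaves like a dict, which is exactly how the pipeline below reads the fields. A toy illustration, not project code:

from jdcrawl.items import JdcrawlItem

item = JdcrawlItem()
item['phoneName'] = 'Example Phone'   # fields are set dict-style
print(item['phoneName'])              # ...and read the same way
print(dict(item))                     # an Item converts to a plain dict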

 

pipelines.py

from openpyxl import Workbook

class JdcrawlPipeline(object):
    wb = Workbook()
    ws = wb.active
    ws.append(['手机名称', '店名', '价格', '成交量'])  # header row: phone name, shop, price, sales volume

    def process_item(self, item, spider):
        print("process item")
        line = [item['phoneName'], item['phoneShop'], item['price'], item['comments']]  # collect the fields
        print(line)
        self.ws.append(line)  # append one row to the worksheet
        self.wb.save('phoneinfo.xlsx')  # save the xlsx file
        return item
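One design note: saving the workbook inside process_item rewrites phoneinfo.xlsx for every single item. A variant using Scrapy's open_spider/close_spider hooks writes the file once when the crawl finishes; a sketch under the same field names, not the original code:

from openpyxl import Workbook

class JdcrawlXlsxPipeline(object):
    def open_spider(self, spider):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.append(['手机名称', '店名', '价格', '成交量'])  # header row

    def process_item(self, item, spider):
        # one row per scraped phone
        self.ws.append([item['phoneName'], item['phoneShop'], item['price'], item['comments']])
        return item

    def close_spider(self, spider):
        self.wb.save('phoneinfo.xlsx')  # write the file once at the end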

 

Add the following to settings.py:

ITEM_PIPELINES = {
   'jdcrawl.pipelines.JdcrawlPipeline': 300,
}
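
Two more settings are often worth checking in newer Scrapy projects; this is my note, not from the original post. The generated settings.py enables robots.txt handling by default, which can silently block the start request, and a download delay keeps the crawl polite:

ROBOTSTXT_OBEY = False  # newer project templates default this to True
DOWNLOAD_DELAY = 1      # throttle requests between downloads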

 

Although I've used Qt before, I'm not familiar with PyQt5, so an error dialog pops up when the spider runs. I'll sort that out later.
