最近在自己学习python和研究网络爬虫,自己用scrapy实现了简单的爬虫,爬取京东手机信息,只爬了第一页的手机名称、价格、手机店名、成交量等信息。不是很完整,后续继续研究,一步一步完善。
1. 使用的IDE是pycharm,环境比较难折腾
2. 由于是动态网页,开始的时候怎么也获取不到价格等信息,所以用了PyQt5加载网页,然后再获取信息。
3. 把信息导出到xlsx文件
Jdspider.py
import sys

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector

from PyQt5 import QtWebEngine, QtWidgets
from PyQt5.QtCore import QEventLoop, QTimer, QUrl
from PyQt5.QtWebEngineWidgets import QWebEngineView

from jdcrawl.items import JdcrawlItem
class JdSpider(CrawlSpider):
    """Spider for the first page of JD.com mobile-phone listings.

    Prices on the listing page are filled in by JavaScript, so the raw HTTP
    response does not contain them.  The spider therefore loads the page in a
    PyQt5 ``QWebEngineView``, waits for rendering to finish, captures the
    rendered HTML, and only then extracts the item fields with Scrapy
    selectors.
    """

    name = "jdSpider"
    view = None            # QWebEngineView that renders the listing page
    app = None             # QApplication driving the Qt event loop
    html = None            # rendered HTML captured from the web view
    isloadFinish = False   # True once the rendered HTML has been captured
    # Was ["jd"]: a bare "jd" never matches hosts such as "list.jd.com", so
    # any follow-up request would be dropped by the offsite middleware.
    allowed_domains = ["jd.com"]
    start_urls = [
        "https://list.jd.com/list.html?cat=9987,653,655&page=1&sort=sort_rank_asc&trans=1&JL=6_0_0&ms=6#J_main"
    ]

    def _callable(self, html):
        """Receive the rendered HTML from ``QWebEnginePage.toHtml()``.

        Stores the HTML, stops the Qt event loop so ``parse()`` can resume,
        and dumps the page to ``response.html`` for offline inspection.
        """
        self.html = html
        print("app quit")
        # quit() is equivalent to exit(0); the original called both.
        self.app.quit()
        self.isloadFinish = True
        print("_callable")
        # Keep a copy of the rendered page on disk for debugging.
        with open('response.html', 'w', encoding='utf-8') as fp:
            fp.write(html)

    def parserHtml(self):
        """Extract listing fields from ``self.html`` (debug helper; items
        are not yielded here — see ``parse()`` for the real extraction)."""
        print("parserHtml..")
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            name = sel.xpath('div/div[@class="p-name"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            print(name)
            # yield item

    def _timer_for_html(self):
        """Ask the web view for its rendered HTML; result goes to _callable."""
        print("_timer_for_html")
        self.view.page().toHtml(self._callable)

    def _loadFinished(self, result):
        """``loadFinished`` slot.

        The page keeps filling in prices after ``loadFinished`` fires, so
        wait two more seconds before snapshotting the DOM.
        """
        print("load finish.....")
        QTimer.singleShot(2 * 1000, self._timer_for_html)

    def parse(self, response):
        """Render ``response.url`` in a web view and yield one item per phone."""
        print("parse")
        self.app = QtWidgets.QApplication(sys.argv)
        self.view = QWebEngineView()
        self.view.loadFinished.connect(self._loadFinished)
        self.view.load(QUrl(response.url))
        # Blocks until _callable() calls app.quit() with the rendered HTML.
        self.app.exec()
        select = Selector(text=self.html)
        for sel in select.xpath('//*[@id="plist"]/ul/li[@class="gl-item"]'):
            item = JdcrawlItem()
            # NOTE(review): "p-namep-name-type3" looks like a mangled
            # "p-name p-name-type3" — confirm against the live page markup.
            name = sel.xpath('div/div[@class="p-namep-name-type3"]/a/em/text()').extract()
            shop = sel.xpath('div/div[@class="p-shop"]/span/a[@title]/text()').extract()
            price = sel.xpath('div/div[@class="p-price"]/strong[@class="J_price"]/i/text()').extract()
            comment = sel.xpath('div/div[@class="p-commit"]/strong/a/text()').extract()
            # extract() returns a list; keep only the first match, stripped.
            if name:
                name = name[0].strip()
            if shop:
                shop = shop[0].strip()
            if price:
                price = price[0].strip()
            if comment:
                comment = comment[0].strip()
            item["phoneName"] = name
            item["phoneShop"] = shop
            item["price"] = price
            item["comments"] = comment
            yield item
items.py
import scrapy


class JdcrawlItem(scrapy.Item):
    """Container for one JD.com phone listing."""

    # define the fields for your item here like:
    phoneName = scrapy.Field()  # phone title
    phoneShop = scrapy.Field()  # shop name
    price = scrapy.Field()      # listed price
    comments = scrapy.Field()   # transaction / comment count
pipelines.py
from openpyxl import Workbook


class JdcrawlPipeline(object):
    """Append every scraped item as a row of ``phoneinfo.xlsx``."""

    # One workbook shared by the whole crawl.
    wb = Workbook()
    ws = wb.active
    ws.append(['手机名称', '店名', '价格', '成交量'])  # header row

    def process_item(self, item, spider):
        """Write one item to the sheet, save the workbook, pass the item on."""
        print("process item")
        row = [item['phoneName'], item['phoneShop'], item['price'], item['comments']]
        print(row)
        self.ws.append(row)              # append the values as a new row
        self.wb.save('phoneinfo.xlsx')   # re-saved after every item
        return item
settings.py新增以下代码:
# Route scraped items through the xlsx-export pipeline (priority 300).
ITEM_PIPELINES = {
    'jdcrawl.pipelines.JdcrawlPipeline': 300,
}
虽然之前用的是qt,但由于对PyQt5不熟悉,所以运行的时候会弹出错误弹窗。这个后面再完善。