1. Install: pip install scrapy
2. Create a project: scrapy startproject JDdog
3. Create the spider file (it goes under the project's spiders/ directory; see the layout sketch below)
4. Part of the code is attached below
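For reference, scrapy startproject JDdog generates roughly the following layout (jd_spider.py is a made-up name for the spider file from step 3; any name works):

    JDdog/
        scrapy.cfg            # deploy configuration
        JDdog/
            __init__.py
            items.py
            middlewares.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                jd_spider.py  # the code below lives here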
#coding=utf-8
import scrapy, os, time
from openpyxl import Workbook
import xlsxwriter

cwd = os.getcwd()  # current working directory
name = "jdtest"  # spider name, also used by the `scrapy crawl` command at the bottom
Screentime = time.strftime('%Y%m%d%H%M%S', time.localtime(time.time()))  # timestamp used in the output file name
# write a list into an Excel file, one value per row (column A), via xlsxwriter
def save_excel_row_data(_list, address, filename):
    wb = xlsxwriter.Workbook(address)
    ws = wb.add_worksheet(filename)
    for row, value in enumerate(_list):
        ws.write(row, 0, value)
    wb.close()
# write a list into an Excel file as a single row, via openpyxl (ws.append fills one row)
def save_excel_data(_list, address, filename):
    wb = Workbook()
    ws = wb.create_sheet(filename)
    ws.append(_list)
    wb.save(address)
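# Example usage of the two helpers (a sketch; demo.xlsx / demo2.xlsx are made-up names):
#   save_excel_row_data(['a', 'b'], os.path.join(cwd, 'demo.xlsx'), 'Sheet1')   # one value per row
#   save_excel_data(['a', 'b'], os.path.join(cwd, 'demo2.xlsx'), 'Sheet1')      # one row of values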
class JDdog(scrapy.Spider):  # a spider must subclass scrapy.Spider
    name = "jdtest"  # spider name

    def start_requests(self):  # issues the initial requests for the URLs below
        # the links to crawl
        urls = [
            'https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BAmate30',
            # 'https://try.jd.com/',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)  # each downloaded page is handed to parse()
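        # (sketch; the page parameter is an unverified assumption about JD's
        # URL scheme) to crawl more result pages, append e.g. &page=3 to the
        # search URL and list those URLs in `urls`; every response is still
        # routed through parse()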
    # def parse(self, response):
    #     '''
    #     start_requests has downloaded the pages; how do we pull out the content
    #     we want? That is what this method is for. Here nothing is extracted yet:
    #     the page is simply saved to disk, and extraction of the data we care
    #     about (with XPath, regex, or CSS selectors) comes later. This example
    #     just shows how Scrapy runs:
    #     1. define the links;
    #     2. crawl (download) the pages via those links;
    #     3. define rules, then extract the data.
    #     That is the whole flow; simple, isn't it?
    #     '''
    #     page = response.url.split("/")[-2]  # take the page number from a URL like /page/1/, which gives 1
    #     filename = 'mingyan-%s.html' % page  # build the file name; for page 1 it becomes mingyan-1.html
    #     with open(filename, 'wb') as f:  # ordinary Python file handling
    #         f.write(response.body)  # response.body holds the page that was just downloaded
    #     self.log('Saved file: %s' % filename)  # log it
    # extract product names and prices, then write each list into its own Excel column
    def parse(self, response):
        _nlist = []
        _plist = []
        selector = scrapy.Selector(response)
        n_list = selector.xpath("//div[@class='p-name p-name-type-2']/a/em")
        for i in n_list:
            # the search keyword inside the product name is highlighted with
            # <font class="skcolor_ljg">, so pull it out and stitch the name
            # back together from the surrounding text fragments
            name2 = i.xpath("./font[@class='skcolor_ljg']/text()").extract_first(default='')
            parts = i.xpath('./text()').extract()
            _name = str(parts[0]) + str(name2) + str(parts[-1]) if parts else name2
            _nlist.append(_name)
        p_list = selector.xpath("//div[@class='p-price']/strong/i")
        for sel in p_list:
            _plist.append(sel.xpath('./text()').extract_first())
        ## the same price extraction written with CSS selectors
        # _pricelist = response.css('div.p-price')
        # _plist = []
        # for var in _pricelist:
        #     # ::text extracts the text node directly, so no string surgery
        #     # on the Selector repr is needed
        #     price = var.css('strong i::text').extract_first()
        #     _plist.append(price)
        sheetName = '华为mate30价格'  # worksheet title ("Huawei mate30 prices")
        # output path: "JD mall Huawei mate30 prices" + timestamp, in the working directory
        address = os.path.join(cwd, '京东商城华为mate30价格' + Screentime + '.xlsx')
        print(address)
        wb = xlsxwriter.Workbook(address)
        ws = wb.add_worksheet(sheetName)
        for row, value in enumerate(_nlist):
            ws.write(row, 0, value)  # column A: product names
        for row, value in enumerate(_plist):
            ws.write(row, 1, value)  # column B: prices
        wb.close()
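        # A more idiomatic alternative (a sketch, not the code above): yield one
        # dict per product from parse() and let Scrapy's feed exports write the
        # output, e.g. `scrapy crawl jdtest -o prices.csv`:
        #
        #     for _name, price in zip(_nlist, _plist):
        #         yield {'name': _name, 'price': price}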
if __name__ == "__main__":
    # each os.system() call spawns its own shell, so chaining separate `cd`
    # commands has no effect; run the crawl directly from the project directory
    os.system('scrapy crawl ' + name)
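# Alternative (a sketch): start the crawl from Python without going through
# the shell, using Scrapy's command-line entry point:
#
#     from scrapy.cmdline import execute
#     execute(['scrapy', 'crawl', name])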
# if __name__ == "__main__":
#     # using the scrapy shell as a debugging tool
#     import os
#     cwd = os.getcwd()  # current working directory
#     command1 = 'scrapy shell https://search.jd.com/Search?keyword=%E5%8D%8E%E4%B8%BAmate30'
#     os.system(command1)
# In the shell prompt, response.css('title') should show <title>京东(JD.COM)-正品低价、品质保障、配送及时、轻松购物...
# response.css('title').extract()
# response.css('title::text').extract_first()  # appending ::text extracts the text inside the tag; with that, the data we wanted has been successfully extracted
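# Likewise, the XPath used in parse() can be sanity-checked in the same shell
# (a sketch):
#   response.xpath("//div[@class='p-price']/strong/i/text()").extract_first()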