1.命令行创建项目
scrapy startproject itcast
2.编写items.py
import scrapy
class ItcastItem(scrapy.Item):
    """Item holding the fields collected for one teacher entry."""

    # define the fields for your item here like:
    # Teacher's name
    name = scrapy.Field()
    # Teacher's job title
    title = scrapy.Field()
    # Teacher's descriptive information
    info = scrapy.Field()
3.spiders目录下创建itspid.py文件
scrapy genspider itspid "itcast.cn"
4.编写itspid.py文件
import scrapy
# item文件的类
from itcast.items import ItcastItem
class ItspidSpider(scrapy.Spider):
# 爬虫名,启动爬虫时需要的参数(必须)
name = 'itspid'
# 爬取域范围,允许爬虫在这个域名下进行爬取(可选)
allowed_domains = ['itcast.cn']
# 起始url,爬虫执行后第一批请求,将从这个列表里获取
start_urls = ['http://www.itcast.cn/channel/teacher.shtml']
def parse(self, response):
node_list = response.xpath("//div[@class='li_txt']