win+r 输入cmd 打开 终端输入
cd desktop
scrapy startproject TXmovies
cd TXmovies
scrapy genspider txms v.qq.com
修改 settings.py
# Scrapy settings overrides for the TXmovies project.

# Do not honor robots.txt, so the listing API pages can be fetched.
ROBOTSTXT_OBEY = False

# Wait one second between requests to avoid hammering the server.
DOWNLOAD_DELAY = 1

# Default HTTP headers for every request.
# Fix: the original line was missing '=' (a syntax error) and spelled the
# 'User-Agent' header as 'UserAgent', so no valid UA header was sent.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0',
}

# Route scraped items through TxmoviesPipeline (priority 300).
ITEM_PIPELINES = {'TXmovies.pipelines.TxmoviesPipeline': 300}
提取数据
import scrapy


class TxmoviesItem(scrapy.Item):
    """Item holding the fields scraped for one video.

    Fix: the pasted snippet had lost all indentation (the class body sat
    at column 0, a syntax error); restored standard 4-space indentation.
    """
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()         # video title
    description = scrapy.Field()  # short description text
程序代码
import scrapy

from ..items import TxmoviesItem


class TxmsSpider(scrapy.Spider):
    """Spider that pages through a Tencent Video (v.qq.com) listing API.

    Fixes applied: restored the flattened indentation (syntax error);
    removed the duplicate ``start_urls`` assignment (the first value was
    immediately overwritten and never used); create a fresh item per node
    instead of mutating one shared instance across yields.
    """

    name = 'txms'
    allowed_domains = ['v.qq.com']
    # First page of the paginated listing (offset=0, 30 entries per page).
    # NOTE(review): 'channel=carroon' looks like a typo for 'cartoon' —
    # confirm against the live API before changing; kept byte-for-byte here.
    start_urls = ['https://v.qq.com/x/bu/pagesheet/list?append=1&channel=carroon&iarea=1&listpage=2&offset=0&pagesize=30']
    # Pagination offset, advanced by 30 after each parsed page.
    offset = 0

    def parse(self, response):
        # Each video is rendered as a <div class="list_item"> element.
        for node in response.xpath('//div[@class="list_item"]'):
            # Build a new item per node: the original reused one instance,
            # so every yielded reference pointed at the last node's data.
            item = TxmoviesItem()
            item['name'] = node.xpath('./a/@title').get()
            item['description'] = node.xpath('./div/div/@title').get()
            yield item
        # Request the next page until offset reaches 120 (pages 0..120).
        if self.offset < 120:
            self.offset += 30
            url = 'https://v.qq.com/x/bu/pagesheet/list?append=1&channel=carroon&iarea=1&listpage=2&offset={}&pagesize=30'.format(str(self.offset))
            yield scrapy.Request(url=url, callback=self.parse)
管道输出
class TxmoviesPipeline:
    """Pipeline that echoes each scraped item to stdout.

    Fix: restored the flattened indentation — in the pasted snippet the
    method body sat at column 0, which is a syntax error.
    """

    def process_item(self, item, spider):
        # Print for quick debugging, then pass the item on unchanged so
        # any later pipeline stage still receives it.
        print(item)
        return item
执行代码
from scrapy import cmdline

# Launch the spider from a script instead of the shell.
# Fixes: the original command string 'scrapy craw ltxms' had the space in
# the wrong place (the subcommand is 'crawl' and the spider name is 'txms'),
# and the call was missing its closing parenthesis (syntax error).
cmdline.execute('scrapy crawl txms'.split())