1. 定义图片管道
import scrapy
class MyItem(scrapy.Item):
    """Item carrying the two fields required by Scrapy's ImagesPipeline.

    The pipeline reads ``image_urls`` (a list of URLs to download) and
    fills ``images`` with the download results.

    NOTE(review): the spider below imports ``ImagesItem`` from items.py —
    confirm this class name matches what the spider actually uses.
    """
    # ... other item fields ...
    image_urls = scrapy.Field()  # input: list of image URLs to fetch
    images = scrapy.Field()      # output: populated by ImagesPipeline
2. 开启图片管道
在settings中ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}(旧版路径 scrapy.contrib.pipeline.images.ImagesPipeline 已在新版 Scrapy 中移除)
并将IMAGES_STORE设置为一个有效的文件夹,用来存储下载的图片。否则管道将保持禁用状态
import os

# Absolute path of the directory that contains this settings module.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Follow HTTP redirects when downloading media files.
MEDIA_ALLOW_REDIRECTS = True

# Folder where downloaded images are stored; the ImagesPipeline stays
# disabled unless this setting points at a valid directory.
IMAGES_STORE = os.path.join(BASE_DIR, "images")
3. 最后定义爬取图片的spider
import scrapy
from ..items import ImagesItem
class ImageSpider(scrapy.Spider):
    """Crawl wallpaper listing pages on jj20.com and yield items whose
    ``image_urls`` field feeds Scrapy's ImagesPipeline."""

    name = 'image'
    allowed_domains = ['jj20.com']
    start_urls = ['http://www.jj20.com/bz/zrfg/']

    def parse(self, response):
        """Parse a listing page: follow each gallery link, then paginate.

        Yields Requests for every gallery detail page and for the next
        listing page (if any).
        """
        for entry in response.xpath('//ul[@class="pic2 vvi fix"]/li'):
            detail_url = entry.xpath("a/@href").extract_first()
            if detail_url:  # guard: skip <li> entries without a link
                yield response.follow(detail_url, callback=self.parse_image)
        # BUG FIX: extract the href string first — the original passed the
        # whole SelectorList to response.follow(), which raises TypeError.
        next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
        if next_page:
            yield response.follow(next_page, self.parse)

    def parse_image(self, response):
        """Parse one gallery page: yield the big image's URL for the
        ImagesPipeline, then follow the in-gallery "next" link."""
        image_url = response.xpath('//img[@id="bigImg"]/@src').extract_first()
        if image_url:  # guard: page may lack the big-image element
            image_item = ImagesItem()
            image_item["image_urls"] = [image_url]
            yield image_item
        next_image_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
        if next_image_url:
            yield response.follow(next_image_url, self.parse_image)
4. 如若爬取图片不能用,尝试设置防盗链(Referer)及UA
# Default headers sent with every request. The Referer imitates in-site
# navigation so the host's anti-hotlinking check accepts image downloads
# (see the note above about 防盗链).
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    "Referer": "http://www.jj20.com/bz/zrfg/rcrl/25689_2.html",
}

# Browser-like User-Agent so requests are not rejected as bot traffic.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0"