spider.py
import scrapy
from meinv.items import MeinvItem


class MyScrapy(scrapy.Spider):
    name = "my"
    allowed_domains = ["moko.cc"]
    start_urls = [
        "http://www.moko.cc/post/aaronsky/list.html"
    ]

    def parse(self, response):
        item = MeinvItem()
        item['image_urls'] = response.xpath('//img//@src2').extract()  # extract the image links
        yield item
        new_urls = response.xpath('//a[@class="coverBg wC"]//@href').extract()  # links to further pages
        for url in new_urls:
            print(url)
            yield scrapy.Request("http://www.moko.cc" + url, callback=self.parse)
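The //img//@src2 XPath suggests the site lazy-loads its images, keeping the real URL in a src2 attribute while src holds a placeholder; that reading is an inference from the selector, so it is worth verifying both selectors interactively with scrapy shell before running a full crawl:

scrapy shell "http://www.moko.cc/post/aaronsky/list.html"
>>> response.xpath('//img//@src2').extract()                      # should print a list of image URLs
>>> response.xpath('//a[@class="coverBg wC"]//@href').extract()   # relative links the spider will follow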
items.py
import scrapy


class MeinvItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    image_urls = scrapy.Field()
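The field name image_urls is not arbitrary: it is the field that Scrapy's built-in ImagesPipeline reads by default, which makes the alternative sketched after pipelines.py below a near drop-in change.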
pipelines.py
import os
import urllib.request

from meinv import settings


class MeinvPipeline(object):
    def process_item(self, item, spider):
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)  # storage directory
        print('dir_path: ' + dir_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        for image_url in item['image_urls']:
            # build a flat file name by joining the URL's path segments
            list_name = image_url.split('/')
            file_name = "".join(list_name) + ".jpg"
            file_path = '%s/%s' % (dir_path, file_name)
            if os.path.exists(file_path):  # skip images that were already downloaded
                continue
            with open(file_path, 'wb') as file_writer:
                conn = urllib.request.urlopen(image_url)  # download the image
                file_writer.write(conn.read())
        return item
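Since the item already uses the conventional image_urls field, Scrapy's built-in ImagesPipeline is almost a drop-in replacement for this hand-rolled pipeline, and it adds deduplication, expiry checks, and concurrent downloads through the normal downloader. A minimal sketch (assuming Pillow is installed, and assuming an extra images field on MeinvItem, which the built-in pipeline fills with download results):

# settings.py — swap in the built-in pipeline instead of MeinvPipeline
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 300,
}
IMAGES_STORE = '/Users/zlinsun/Desktop/store'

# items.py — one extra field for the pipeline's results
class MeinvItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()  # populated by ImagesPipeline after download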
settings.py
BOT_NAME = 'meinv'

SPIDER_MODULES = ['meinv.spiders']
NEWSPIDER_MODULE = 'meinv.spiders'

IMAGES_STORE = '/Users/zlinsun/Desktop/store'
DOWNLOAD_DELAY = 0.25
ROBOTSTXT_OBEY = True
DEPTH_LIMIT = 10  # maximum crawl depth
ITEM_PIPELINES = {
    'meinv.pipelines.MeinvPipeline': 300,
}
Run main.py
from scrapy import cmdline

cmdline.execute("scrapy crawl my".split())
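main.py is just a convenience wrapper for launching the spider from an IDE; it should sit in the project root (next to scrapy.cfg), and running it is equivalent to running the following from a terminal in that same directory:

scrapy crawl my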