I have a Scrapy spider that uses file:/// URLs as start URLs to read static HTML files from disk, but I can't get it to load gzip files or loop over my directory of 150,000 files, all of which have the .html.gz suffix. I have tried several different approaches (commented out below), but so far my code looks like this:

from scrapy.spiders import CrawlSpider
from Scrapy_new.items import Scrapy_newTestItem
import gzip
import glob
import os.path

class Scrapy_newSpider(CrawlSpider):
    name = "info_extract"
    source_dir = '/path/to/file/'
    allowed_domains = []
    start_urls = ['file:/path/to/files/.*html.gz']

    def parse_item(self, response):
        item = Scrapy_newTestItem()
        item['user'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div[2]/div/div[2]/div[1]/h1/span[2]/text()').extract()
        item['list_of_links'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div[2]/div/div[3]/div[3]/a/@href').extract()
        item['list_of_text'] = response.xpath('//*[@id="page-user"]/div[1]/div/div/div/div/div/div/a/text()').extract()
Running this gives an error.
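From what I understand, Scrapy takes each start_urls entry literally and never expands wildcards, which would explain why the pattern above fails. Here is a minimal sketch of the kind of expansion I think is needed (the LocalHtmlSpider name, the directory, and the callback wiring are my own placeholders):

import glob
import os

import scrapy

class LocalHtmlSpider(scrapy.Spider):
    name = "local_html"
    source_dir = '/path/to/files/'  # placeholder: same directory as above

    def start_requests(self):
        # start_urls is never glob-expanded, so build one concrete
        # file:// URL per file on disk and request each of them.
        for path in glob.glob(os.path.join(self.source_dir, '*.html.gz')):
            yield scrapy.Request('file://' + os.path.abspath(path),
                                 callback=self.parse_item)

    def parse_item(self, response):
        pass  # same XPath extraction as in the spider above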
I then changed my code so that the files are decompressed first and passed in, as follows:

source_dir = 'path/to/files/'
for src_name in glob.glob(os.path.join(source_dir, '*.gz')):
    base = os.path.basename(src_name)
    with gzip.open(src_name, 'rb') as infile:
        #start_urls = ['/path/to/files*.html']#
        file_cont = infile.read()
        start_urls = file_cont#['file:file_cont']
This gives the following error:

Traceback (most recent call last):
File "/usr/local/lib/python2.7/site-packages/scrapy/core/engine.py", line 127, in _next_request
request = next(slot.start_requests)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 70, in start_requests
yield self.make_requests_from_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/spiders/__init__.py", line 73, in make_requests_from_url
return Request(url, dont_filter=True)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/usr/local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: %3C
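Reading the traceback, it looks like start_urls ends up holding the decompressed HTML itself, so Scrapy treats the file contents as a URL: the leading < gets percent-encoded to %3C, there is no scheme, and Request raises the ValueError. Here is a sketch of the alternative I am considering, where each .html.gz is requested as a file:// URL and gunzipped inside the callback (the class name, directory, encoding, and the abbreviated XPath are placeholder assumptions):

import glob
import gzip
import os
from io import BytesIO

import scrapy
from scrapy.http import HtmlResponse

class LocalGzipSpider(scrapy.Spider):
    name = "local_gzip"
    source_dir = '/path/to/files/'  # placeholder

    def start_requests(self):
        for path in glob.glob(os.path.join(self.source_dir, '*.html.gz')):
            yield scrapy.Request('file://' + os.path.abspath(path),
                                 callback=self.parse_item)

    def parse_item(self, response):
        # file:// responses arrive raw: the compression middleware only
        # decodes based on Content-Encoding headers, so gunzip by hand.
        html = gzip.GzipFile(fileobj=BytesIO(response.body)).read()
        page = HtmlResponse(url=response.url, body=html, encoding='utf-8')
        yield {
            # abbreviated; the full expressions from the spider above go here
            'user': page.xpath('//*[@id="page-user"]//h1/span[2]/text()').extract(),
        }

Is this the right way to feed 150,000 local .html.gz files through a Scrapy spider, or is there a better approach?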