The error here comes from Scrapy's allowed_domains restriction: requests whose URLs fall outside the listed domains are filtered out by the offsite middleware before they are ever downloaded. Below is a Scrapy spider that crawls the URLs and detail content of four sections of NetEase News (news.163.com).
import scrapy
from scrapy import cmdline
from items import LBItem, NewSpiderItem
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


class SpiderSpider(scrapy.Spider):
    name = 'spider'
    # allowed_domains updated to include all relevant 163.com subdomains
    allowed_domains = ['163.com', 'news.163.com', 'www.163.com']
    start_urls = ['https://news.163.com/domestic/']

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # List for collecting the section links found on the index page
        self.all_urls = []
        # Headless Chrome instance for rendering JavaScript-heavy pages
        # (consumed by a downloader middleware, not shown here)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.browse = webdriver.Chrome(options=chrome_options)

    def closed(self, reason):
        # Release the browser when the spider finishes
        self.browse.quit()

    def parse(self, response):
        # The index page lists the news sections; pick the four we want by position
        li_list = response.xpath('//div[@class="ns_area list"]/ul/li/a')
        index = [4, 2, 1, 5]
        for i in index:
            new_name = li_list[i].xpath('./text()')[0].extract()
            new_url = li_list[i].xpath('./@href')[0].extract()
            print('News section info:', new_name, new_url)
            lbitem = LBItem()
            lbitem['name'] = new_name
            lbitem['link'] = new_url
            print('Final category info:', lbitem)
            yield lbitem
            self.all_urls.append(new_url)
        # Crawl each section's list page; dont_filter=True keeps these
        # requests from being dropped by the duplicate/offsite filters
        for url in self.all_urls:
            yield scrapy.Request(url=url, dont_filter=True, callback=self.list_parse)

    def list_parse(self, response):
        # Extract the title and detail URL of every article on a section page
        title = response.xpath('//div[@class="news_title"]/h3/a/text()').extract()
        content_url = response.xpath('//div[@class="news_title"]/h3/a/@href').extract()
        for titles, content_urls in zip(title, content_url):
            item = NewSpiderItem()
            item['title'] = titles
            item['content_url'] = content_urls
            print('title', titles)
            print('News detail link', content_urls)
            # Pass the partially filled item along to the detail-page callback
            yield scrapy.Request(url=content_urls, callback=self.get_data,
                                 meta={'item': item}, dont_filter=True)

    def get_data(self, response):
        # Fill in the headline and body text from the article detail page
        item = response.meta['item']
        biaoqian = response.xpath('//div[@class="post_main"]/h1/text()').extract()
        item['biaoqian'] = biaoqian
        content = response.xpath('//div[@class="post_body"]/p/text()').extract()
        item['content'] = ''.join(content)
        for biaoqians, contents in zip(biaoqian, content):
            print('News text content', biaoqians)
            print('Final parsing result', contents)
        yield item


if __name__ == '__main__':
    cmdline.execute(['scrapy', 'crawl', 'spider'])
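The spider imports LBItem and NewSpiderItem from an items module that is not shown above. A minimal items.py consistent with the fields the spider assigns would look like this (the field names are taken from the code; the module itself is a sketch):

import scrapy

class LBItem(scrapy.Item):
    # News section (category) name and link
    name = scrapy.Field()
    link = scrapy.Field()

class NewSpiderItem(scrapy.Item):
    title = scrapy.Field()        # article title from the section list page
    content_url = scrapy.Field()  # article detail-page URL
    biaoqian = scrapy.Field()     # headline text from the detail page's <h1>
    content = scrapy.Field()      # joined body paragraphs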
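Note that the headless Chrome created in __init__ is never used inside the spider itself; presumably it is consumed by a downloader middleware that renders JavaScript before Scrapy parses the page. A minimal sketch of what such a middleware could look like, assuming the hypothetical name SeleniumMiddleware and registration in DOWNLOADER_MIDDLEWARES:

from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    # Hypothetical middleware: render section list pages with the spider's
    # headless Chrome so dynamically loaded articles are visible to XPath
    def process_request(self, request, spider):
        if request.callback == spider.list_parse:
            spider.browse.get(request.url)
            return HtmlResponse(url=request.url,
                                body=spider.browse.page_source,
                                encoding='utf-8',
                                request=request)
        # Return None to let Scrapy's default downloader handle other requests
        return None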
Solution:
- Updated allowed_domains to include all relevant 163.com subdomains: allowed_domains = ['163.com', 'news.163.com', 'www.163.com']. Note that Scrapy's offsite filter already treats subdomains of a listed domain as allowed, so '163.com' by itself covers news.163.com and www.163.com; the extra entries are harmless but redundant (see the check after this list).
- Added dont_filter=True to the requests yielded in parse and list_parse so the URLs are not dropped by the duplicate or offsite filters: yield scrapy.Request(url=content_urls, callback=self.get_data, meta={'item': item}, dont_filter=True)