import scrapy
import re
import json
import hashlib
import logging
from datetime import datetime
from urllib.parse import urlparse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from nepu_spider.items import NepuSpiderItem
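# For reference: the imported item class is assumed to declare one scrapy.Field()
# per attribute assigned in parse_item() below. A minimal sketch of what
# nepu_spider/items.py would look like under that assumption (not the actual file):
#
#     import scrapy
#
#     class NepuSpiderItem(scrapy.Item):
#         url = scrapy.Field()
#         source = scrapy.Field()
#         depth = scrapy.Field()
#         response_status = scrapy.Field()
#         title = scrapy.Field()
#         publish_date = scrapy.Field()
#         content = scrapy.Field()
#         content_hash = scrapy.Field()
#         category = scrapy.Field()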
class InfoSpider(CrawlSpider):
    name = 'info'
    allowed_domains = ['nepu.edu.cn', 'www.nepu.edu.cn']
    start_urls = [
        'https://www.nepu.edu.cn/tzgg.htm',
        'https://www.nepu.edu.cn/xxgk/xxjj.htm',
        'https://www.nepu.edu.cn/jgsz/dzglbm.htm',
        'https://www.nepu.edu.cn/jgsz/xysz.htm',
        'https://www.nepu.edu.cn/rcpy1.htm',
        'https://www.nepu.edu.cn/kxyj/kygk1.htm',
        'https://www.nepu.edu.cn/szdw/szgk.htm',
        'https://www.nepu.edu.cn/zsjy1.htm',
        'https://www.nepu.edu.cn/xtgz1.htm',
        'https://www.nepu.edu.cn/gjjl/xjjl.htm',
        'https://www.nepu.edu.cn/xyfg.htm',
        'https://www.nepu.edu.cn/xsc/xsgz.htm',
        'https://www.nepu.edu.cn/xxgk/xrld.htm',
        'https://www.nepu.edu.cn/jxky/jwtz.htm',
    ]
    rules = (
        # Rule 1: article/detail pages are parsed into items and followed further.
        Rule(LinkExtractor(
            allow=(r'/info/\d+/\d+\.htm', r'\.htm$'),
            restrict_xpaths=(
                '//div[@class="list_right_con"]',
                '//div[@class="v_news_content"]',
                '//div[contains(@class, "main_con")]',
                '//div[@id="vsb_content"]'
            )
        ), callback='parse_item', follow=True),
        # Rule 2: section/navigation pages are only followed, not parsed.
        Rule(LinkExtractor(
            allow=(r'/(tzgg|rcpy|kxyj|zsjy|szdw|gjjl|xyfg|xxgk|jgsz)/',),
            restrict_xpaths=(
                '//div[@class="menu"]',
                '//div[contains(@class, "channel")]',
                '//div[@class="list_right_con"]/ul/li'
            )
        ), follow=True),
    )
    custom_settings = {
        'DEPTH_LIMIT': 4,
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 0.5,
        'AUTOTHROTTLE_ENABLED': True,
        'LOG_LEVEL': 'INFO',
        'LOG_FILE': 'nepu_spider.log'
    }
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.visited_urls = set()

    def start_requests(self):
        self.logger.info("🚀 Starting crawl of the Northeast Petroleum University website...")
        self.logger.info(f"Number of start URLs: {len(self.start_urls)}")
        for url in self.start_urls:
            # Leave the callback at its default so CrawlSpider applies the rules above.
            yield scrapy.Request(url, meta={'start_url': True})
    def parse_item(self, response):
        # Parse a single content page into a NepuSpiderItem; skip URLs already seen.
        if response.url in self.visited_urls:
            self.logger.debug(f"URL already visited: {response.url}")
            return
        self.visited_urls.add(response.url)

        item = NepuSpiderItem()
        item['url'] = response.url
        item['source'] = '东北石油大学官网'
        item['depth'] = response.meta.get('depth', 0)
        item['response_status'] = response.status
        try:
            # Title extraction: try common selectors and strip site-name suffixes.
            title_selectors = ['h1::text', '.title::text', 'h2::text', 'title::text']
            title = None
            for selector in title_selectors:
                title_candidate = response.css(selector).get()
                if title_candidate:
                    clean_title = title_candidate.split('|')[0].split('_')[0].split('-')[0].strip()
                    if clean_title and len(clean_title) > 3:
                        title = clean_title
                        break
            item['title'] = title or '无标题'
            # Publish-date extraction: all selectors here are CSS, as required by response.css().
            date_selectors = ['.time::text', '.date::text', '.publish-time::text', 'span#publish-time::text']
            publish_date = None
            for selector in date_selectors:
                date_text = response.css(selector).get()
                if date_text:
                    match = re.search(r'(\d{4})[年/-](\d{1,2})[月/-](\d{1,2})', date_text)
                    if match:
                        year, month, day = match.groups()
                        publish_date = f"{year}-{month.zfill(2)}-{day.zfill(2)}"
                        break
            item['publish_date'] = publish_date or ''
            # Content extraction: try the most specific containers first, fall back to <body>.
            content_selectors = [
                '//div[@id="vsb_content"]',
                '//div[@class="v_news_content"]',
                '//div[contains(@class, "article-content")]',
                '//div[contains(@class, "content")]',
                '//div[@class="TRS_Editor"]',
                '//div[@class="con"]',
                '//div[@id="zoom"]',
                '//div[@class="content_main"]',
                '//body'
            ]
            content_html = None
            for selector in content_selectors:
                content_div = response.xpath(selector)
                if content_div:
                    content_html = content_div.get()
                    if content_html and len(content_html) > 500:
                        break
            if not content_html:
                self.logger.warning(f"⚠️ Could not extract content: {response.url}")
                content_html = ""
            # Strip scripts, styles, comments, iframes and collapse whitespace.
            content_html = re.sub(r'<script[^>]*>.*?</script>', '', content_html, flags=re.DOTALL)
            content_html = re.sub(r'<style[^>]*>.*?</style>', '', content_html, flags=re.DOTALL)
            content_html = re.sub(r'<!--.*?-->', '', content_html, flags=re.DOTALL)
            content_html = re.sub(r'<iframe[^>]*>.*?</iframe>', '', content_html, flags=re.DOTALL)
            content_html = re.sub(r'相关附件', '', content_html)
            content_html = re.sub(r'\s+', ' ', content_html)
            item['content'] = content_html
            # Category extraction: prefer breadcrumbs, otherwise map from the URL path.
            breadcrumbs = response.css('.breadcrumb a::text, .weizhi a::text').getall()
            if breadcrumbs and len(breadcrumbs) > 1:
                item['category'] = ' > '.join([b.strip() for b in breadcrumbs[1:] if b.strip()])
            else:
                path_parts = urlparse(response.url).path.split('/')
                category_map = {
                    'tzgg': '通知公告',
                    'xxgk': '学校概况',
                    'jgsz': '机构设置',
                    'rcpy': '人才培养',
                    'kxyj': '科学研究',
                    'szdw': '师资队伍',
                    'zsjy': '招生就业',
                    'xtgz': '团体组织',
                    'gjjl': '国际交流',
                    'xyfg': '校园法规',
                    'xsc': '学生工作',
                    'jxky': '教学科研',
                    'info': '信息发布',
                    'news': '新闻中心'
                }
                for part in path_parts:
                    if part in category_map:
                        item['category'] = category_map[part]
                        break
                else:
                    # for-else: no path segment matched the map.
                    item['category'] = path_parts[1] if len(path_parts) > 1 else '其他'
            item['content_hash'] = hashlib.sha1(item['content'].encode()).hexdigest()
            self.logger.info(f"✅ Extracted: {item['title']} ({response.url})")
            yield item
        except Exception as e:
            self.logger.error(f"❌ Failed to parse: {response.url} | error: {str(e)}")
            # Append the failure details to a JSON-lines error log.
            with open('spider_errors.log', 'a', encoding='utf-8') as f:
                error_info = {
                    'url': response.url,
                    'error': str(e),
                    'timestamp': datetime.now().isoformat(),
                    'html': response.text[:1000]
                }
                f.write(json.dumps(error_info, ensure_ascii=False) + '\n')
    def closed(self, reason):
        self.logger.info(f"🛑 Spider finished, reason: {reason}")
        self.logger.info(f"Total pages crawled: {len(self.visited_urls)}")
        stats = {
            'total_urls': len(self.visited_urls),
            'finish_time': datetime.now().isoformat(),
            'finish_reason': reason
        }
        with open('spider_stats.json', 'w', encoding='utf-8') as f:
            json.dump(stats, f, ensure_ascii=False, indent=2)
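

# Minimal sketch for running the spider as a standalone script. This assumes the
# standard Scrapy project layout so that get_project_settings() can locate
# nepu_spider/settings.py; the usual alternative from the project root is
# `scrapy crawl info -o nepu_data.json`.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(InfoSpider)
    process.start()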