架构安装、搭建环境、使用
更新 python -m pip install --upgrade pip
安装 pip install scrapy
pip install pandas
pip install pymysql --user
pip install mysql-connector-python
查看twisted版本
pip show twisted
pip uninstall twisted
pip install twisted==22.10.0
使用
1. 创建项目:`scrapy startproject cuc_news`
2. 更换目录:`cd cuc_news`
3. 创建 spider:`scrapy genspider example(别名/spider名) example.com(目标网址)`
   例如:`scrapy genspider cuc_news cuc.edu.cn`
4. 运行命令:`scrapy crawl cuc_news`
cucnews
import scrapy
import re
from cuc_news.items import CUCNewsItem
class NewsSpider(scrapy.Spider):
    """Spider that crawls news articles from the cuc.edu.cn news list pages."""
    # Spider name: run with `scrapy crawl news`.
    name = 'news'
    # Off-site links are filtered out by Scrapy's OffsiteMiddleware.
    allowed_domains = ['cuc.edu.cn']
    # First news list page to fetch.
    start_urls = ['https://www.cuc.edu.cn/news/1901/list.htm']
    # Number of list pages followed so far.
    # NOTE(review): this is a class-level counter, shared by all instances of
    # the spider — fine for a single crawl, but state persists across instances.
    page_count = 0
def parse(self, response):
    """Parse one news list page.

    Yields one request per news entry (handled by ``parse_news``) and, while
    fewer than five list pages have been crawled in total, a request for the
    next list page (handled by this method again).

    :param response: Scrapy ``Response`` for a news list page.
    """
    # Each <li> under the list container is one news entry.
    for entry in response.xpath('//*[@id="l-container"]/div/div/div[2]/div/div[2]/div/div/ul/li'):
        news_url = entry.xpath('a/@href').get()
        if news_url:
            # Follow into the article detail page.
            yield response.follow(news_url, self.parse_news)
    # Limit the crawl to the first five list pages.  The start page is page 1
    # and is not counted in page_count, so at most FOUR next-page links may be
    # followed.  (The original `< 5` followed five links and therefore crawled
    # six pages, contradicting the stated "first five pages" intent.)
    if self.page_count < 4:
        # XPath of the "next page" link in the pager widget.
        next_page = response.xpath('//*[@id="wp_paging_w6"]/ul/li[2]/a[3]/@href').get()
        if next_page:
            self.page_count += 1  # one more list page scheduled
            yield response.follow(next_page, self.parse)
def parse_news(self, response):
# 解析新闻详情页
title = response.xpath('//*[&