Chapter 2: Using the Basic Libraries
The libraries used are requests, multiprocessing, re, and logging; this case study ties together the material covered in the previous sections.
# -*- coding: utf-8 -*-
# @Time : 2023/4/1 21:43
# @Author :liuw
# @File : ScrapeSsrl.py
# @Software: PyCharm
'''
A hands-on basic crawler example:
1. Use requests to scrape the movie list on every page of the site, then follow the list to scrape each movie's detail page.
2. Use regular expressions to extract each movie's name, cover image, categories, release date, score, and plot summary.
3. Save the scraped content as JSON text files.
4. Use multiprocessing to speed up the scraping.
From the div structure of the pages:
Detail page of a movie: https://ssr1.scrape.center/detail/1
List page: https://ssr1.scrape.center/page/2
'''
import multiprocessing
import requests
import logging
import re
import json
from urllib.parse import urljoin
from os import makedirs
from os.path import exists
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10
'''
Define a method that scrapes a single page and returns its HTML text.
'''
def scrape_page(url):
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scraping %s',
                      response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
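# Note: scrape_page falls through and returns None both when the status code is not 200
# and when a RequestException is raised, so callers should be prepared to receive None.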
'''
Define the list-page scraping method.
'''
def scrape_index(page):
    index_url = f'{BASE_URL}/page/{page}'
    return scrape_page(index_url)
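# e.g. scrape_index(2) requests https://ssr1.scrape.center/page/2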
'''
Parse the list page to get each movie's detail-page URL. The links look like:
<a data-v-7f856186="" href="/detail/1" class="name">
<h2 data-v-7f856186="" class="m-b-sm">霸王别姬 - Farewell My Concubine</h2>
</a>
'''
def parse_index(html):
    pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
    items = re.findall(pattern, html)
    if not items:
        return []
    for item in items:
        # item is the relative path mentioned above, e.g. /detail/1
        detail_url = urljoin(BASE_URL, item)
        logging.info('get detail url %s', detail_url)  # e.g. https://ssr1.scrape.center/detail/1
        yield detail_url  # yield each URL, so parse_index is a generator
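# A quick, illustrative check of parse_index against the snippet shown in the docstring above
# (the sample string below is hypothetical glue for demonstration, not part of the crawler):
# sample = '<a data-v-7f856186="" href="/detail/1" class="name">'
# list(parse_index(sample))  # -> ['https://ssr1.scrape.center/detail/1']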
'''
With the detail-page URLs obtained, define a method that scrapes a detail page.
Keeping scrape_detail as a separate method makes the logic clearer, and any future logging output
or preprocessing specific to detail pages can be added inside scrape_detail.
'''
def scrape_detail(url):
    return scrape_page(url)
'''
Parse the detail page for:
cover          cover image
name           movie name
categories     categories
published_at   release date
drama          plot summary
score          score
'''
def parse_detail(html):
    # <img data-v-63864230 src="https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c" class="cover">
    cover_pattern = re.compile('class="item.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    name_pattern = re.compile('<h2.*?>(.*?)</h2>')
    categories_pattern = re.compile('<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    published_at_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s?上映')
    drama_pattern = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)
    cover = re.search(cover_pattern, html).group(1).strip() \
        if re.search(cover_pattern, html) else None
    name = re.search(name_pattern, html).group(1).strip() \
        if re.search(name_pattern, html) else None
    categories = re.findall(categories_pattern, html) \
        if re.findall(categories_pattern, html) else []
    published_at = re.search(published_at_pattern, html).group(1).strip() \
        if re.search(published_at_pattern, html) else None
    drama = re.search(drama_pattern, html).group(1).strip() \
        if re.search(drama_pattern, html) else None
    score = re.search(score_pattern, html).group(1).strip() \
        if re.search(score_pattern, html) else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }
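# For illustration, the dict returned for a detail page has roughly the shape sketched below
# (the values are placeholders based on the snippets above, not guaranteed real data):
# {
#     'cover': 'https://p0.meituan.net/movie/ce4da3e03e655b5b88ed31b5cd7896cf62472.jpg@464w_644h_1e_1c',
#     'name': '霸王别姬 - Farewell My Concubine',
#     'categories': ['剧情', '爱情'],
#     'published_at': '1993-07-26',
#     'drama': '...',
#     'score': '9.5'
# }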
'''
Define a method that saves the parsed data.
'''
RESULTS_DIR = 'results'
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)
def save_data(data):
    name = data.get('name')
    data_path = f'{RESULTS_DIR}/{name}.json'
    with open(data_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
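# For example, the data parsed from the detail page shown above would be written to
# 'results/霸王别姬 - Farewell My Concubine.json', i.e. one JSON file per movie, named after the movie.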
def main(page):
    # for page in range(1, TOTAL_PAGE + 1):
    index_html = scrape_index(page)
    detail_urls = parse_index(index_html)
    for detail_url in detail_urls:
        detail_html = scrape_detail(detail_url)
        data = parse_detail(detail_html)
        logging.info('get detail data %s', data)
        logging.info('saving detail data to json file')
        save_data(data)
        logging.info('data saved successfully')
    # logging.info('detail urls %s', list(detail_urls))  # call list() to materialize the generator before logging
if __name__ == '__main__':
    pool = multiprocessing.Pool()
    pages = range(1, TOTAL_PAGE + 1)  # all page numbers to scrape, i.e. 1-10
    pool.map(main, pages)  # each call to main(page) becomes a task in the process pool; the pool sizes itself to the machine, e.g. 8 workers running in parallel on an 8-core machine
    pool.close()
    pool.join()
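    # If you would rather cap the number of worker processes than let the pool size itself
    # from the CPU count, pass it explicitly (a minimal sketch, not required for this example):
    # pool = multiprocessing.Pool(processes=4)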