import asyncio
import json
import logging
from os import makedirs
from os.path import exists

from pyppeteer import launch
from pyppeteer.errors import TimeoutError

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s: %(message)s")

INDEX_URL = "https://dynamic2.scrape.center/page/{page}"
TIMEOUT = 10          # seconds to wait for a selector to appear
TOTAL_PAGE = 10       # number of list pages to crawl
WINDOW_WIDTH, WINDOW_HEIGHT = 1366, 768
HEADLESS = False      # set to True to run Chromium without a visible window
RESULTS_DIR = "results"

# Create the output directory if it does not exist yet.
exists(RESULTS_DIR) or makedirs(RESULTS_DIR)

browser, tab = None, None

async def init():
    """Launch the browser and open a tab with the configured viewport."""
    global browser, tab
    browser = await launch(headless=HEADLESS,
                           args=['--disable-infobars',
                                 f'--window-size={WINDOW_WIDTH},{WINDOW_HEIGHT}'])
    tab = await browser.newPage()
    await tab.setViewport({"width": WINDOW_WIDTH, "height": WINDOW_HEIGHT})

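# Note: on the first run pyppeteer downloads a bundled Chromium build before the
# browser can launch; with HEADLESS = False the browser window stays visible
# while the script scrapes.
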
async def scrape_page(url, selector):
    """
    Generic scraping helper.
    :param url: URL to open
    :param selector: CSS selector of the node that must be rendered before returning
    :return: None; this only loads the page and waits for it to render, no parsing is done here
    """
    logging.info("scraping %s", url)
    try:
        await tab.goto(url)
        await tab.waitForSelector(selector, options={
            "timeout": TIMEOUT * 1000
        })
    except TimeoutError:
        # On timeout only log the error; the caller continues with whatever
        # content the tab currently holds.
        logging.error("error occurred while scraping %s", url, exc_info=True)

async def scrape_index(page):
    """
    Scrape one list (index) page.
    :param page: page number to load
    :return: None
    """
    url = INDEX_URL.format(page=page)
    await scrape_page(url, ".item .name")

async def parse_index():
    """
    Parse the current list page and extract the detail-page URLs.
    querySelectorAll returns a list of nodes;
    querySelectorAllEval(selector, pageFunction) takes a CSS selector and a piece of
    JavaScript to run in the page. Here the pageFunction receives the matched nodes,
    maps over them and reads each node's href attribute, so the result is the list of
    detail-page URLs of all movies on the current list page.
    :return: list of detail-page URLs
    """
    return await tab.querySelectorAllEval(".item .name", "nodes => nodes.map(node => node.href)")

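# For reference, the pageFunction above runs inside the browser, so each node is a
# DOM element; reading .href on an <a class="name"> anchor yields an absolute URL.
# The call therefore resolves to something like (values illustrative only):
#   ["https://dynamic2.scrape.center/detail/...",
#    "https://dynamic2.scrape.center/detail/...", ...]
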
async def scrape_detail(url):
    """
    Scrape one detail page.
    :param url: URL of the detail page
    :return: None
    """
    await scrape_page(url, "h2")

async def parse_detail():
    """
    Parse the currently loaded detail page.
    :return: dict with the data extracted from the page
    """
    url = tab.url
    name = await tab.querySelectorEval("h2", "node => node.innerText")
    categories = await tab.querySelectorAllEval(".categories button span",
                                                "nodes => nodes.map(node => node.innerText)")
    cover = await tab.querySelectorEval(".cover", "node => node.src")
    score = await tab.querySelectorEval(".score", "node => node.innerText")
    drama = await tab.querySelectorEval(".drama p", "node => node.innerText")
    return {
        "url": url,
        "name": name,
        "categories": categories,
        "cover": cover,
        "score": score,
        "drama": drama,
    }

async def save_data(data):
    """Save one movie's data as JSON under RESULTS_DIR, named after the movie."""
    name = data.get("name")
    data_path = f"{RESULTS_DIR}/{name}.json"
    with open(data_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

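# Caveat: the file name comes straight from the scraped title, so a name containing
# a character such as "/" would produce an invalid path; sanitizing the name
# (for example, replacing path separators) is left out to keep the script short.
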
async def main():
    await init()
    try:
        for page in range(1, TOTAL_PAGE + 1):
            # Load one list page, collect its detail URLs, then visit each one.
            await scrape_index(page)
            detail_urls = await parse_index()
            for detail_url in detail_urls:
                await scrape_detail(detail_url)
                detail_data = await parse_detail()
                await save_data(detail_data)
                logging.info("data %s", detail_data)
    finally:
        await browser.close()

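# A minimal sketch (not part of the original script) for spot-checking a single
# detail page: the helper name check_single is an assumption made for illustration.
# It reuses init, scrape_detail and parse_detail defined above and expects one of
# the detail URLs returned by parse_index.
async def check_single(detail_url):
    await init()
    try:
        await scrape_detail(detail_url)
        data = await parse_detail()
        logging.info("single page data %s", data)
        return data
    finally:
        await browser.close()

# Example (commented out; replace some_detail_url with a real URL from parse_index):
# asyncio.get_event_loop().run_until_complete(check_single(some_detail_url))
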
if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(main())