【Goal】
Scrape the site https://www.pcac.org.cn/eportal/ui?pageId=595055 with the aim of fetching the document data listed on its article pages.
First, read the total number of pages from the pager at the bottom of the list (it shows something like "85篇文章 当前页:1/6").
Then page through the list step by step and collect the link of every entry.
Some links point directly to a .pdf document, which can be downloaded as-is; some point to a plain html article, which is saved as html; some detail pages carry a single attachment, which is downloaded directly; and some carry several attachments, which are saved into a dedicated per-article directory. A condensed sketch of this decision flow is given below; the full script follows under 【Sample code】.
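Before the full script, here is a minimal sketch of that decision flow. It assumes driver is an already-initialised Selenium WebDriver and reuses the article-body XPath ('//div[@class="xl-main"]') from the sample code below; classify_article_link is only an illustrative helper name.

from selenium.webdriver.common.by import By

def classify_article_link(driver, link):
    # Case 1: the list entry links straight to a PDF, so that URL can be downloaded directly.
    if '.pdf' in link:
        return 'pdf', [link]
    # Otherwise open the detail page and look for attachment links inside the article body.
    driver.get(link)
    body = driver.find_element(By.XPATH, '//div[@class="xl-main"]')
    attachments = body.find_elements(By.TAG_NAME, 'a')
    if not attachments:
        # Case 2: a plain html article, saved via driver.page_source as an .html file.
        return 'html', []
    if len(attachments) == 1:
        # Case 3: one attachment, downloaded into the main download directory.
        return 'single', [attachments[0].get_attribute('href')]
    # Case 4: several attachments, saved into a directory named after the article.
    return 'multi', [a.get_attribute('href') for a in attachments]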
【Optimization details】
If a file has already been downloaded, it is skipped automatically;
if downloading a file fails with an error, that file is skipped automatically and the script moves on to the next download. (Both rules are folded into the download sketch after the next paragraph.)
【Downloading attachments】The requests library is recommended for the downloads themselves. The advantages: it returns an HTTP status code for error handling, there is no need to guess when the browser has finished writing a file, and the file can be renamed when it is saved. For these reasons driver.get(url) is not used to download files.
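As a minimal sketch of that download pattern, with the two skip rules from 【Optimization details】 folded in; download_file and save_path are illustrative names, and headers is assumed to be the same User-Agent dict defined in the sample code.

import os
import requests

def download_file(url, save_path, headers):
    # Skip rule 1: the file has already been downloaded.
    if os.path.exists(save_path):
        print(f'skip, already downloaded: {save_path}')
        return True
    # Skip rule 2: any network or write error is reported and the file is skipped.
    try:
        r = requests.get(url, headers=headers, timeout=30)
        print(url, '->', r.status_code)  # the status code is available for error handling
        if r.status_code != 200:
            return False
        with open(save_path, 'wb') as f:  # save_path carries the new (renamed) filename
            f.write(r.content)
        return True
    except Exception as e:
        print(f'download failed, skipping {url}: {e}')
        return False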
【Sample code】
import os
import time
from urllib import parse

import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"}


def mk_dir(dir1):
    if not os.path.exists(dir1):
        os.mkdir(dir1)


def file_exist(base_dir, pure_filename):
    if os.path.exists(os.path.join(base_dir, pure_filename)):
        return True
    return False


base_dir = os.path.join(os.getcwd(), 'download')
mk_dir(base_dir)
def write_html_page(browser, filename):
    # Switch to the most recently opened window and save its page source as an .html file.
    browser.switch_to.window(browser.window_handles[-1])
    page = browser.page_source
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(page)
    print('-' * 70)


chrome_options = Options()
chrome_options.add_argument("--headless")  # run Chrome in the background
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": base_dir,
})
driver = webdriver.Chrome(options=chrome_options, service=Service('./chromedriver.exe'))
base_url = 'https://www.pcac.org.cn/eportal/ui?pageId=595055'
driver.get(base_url)
time.sleep(1)
total_page = driver.find_element(By.XPATH, '//div[@class="easysite-total-page fy-div"]/span[1]/b[2]').text
# total_page = driver.find_element(By.XPATH, '//div[@class="easysite-total-page fy-div"]')
# The pager text looks like: 85篇文章 当前页:1/6 123456 下一页 (85 articles, current page 1/6, next page)
print(f'Total pages: {total_page}')
for i in range(1, int(total_page) + 1):
    url2 = f'https://www.pcac.org.cn/eportal/ui?pageId=595055&currentPage={i}&moduleId=b18e5af5da4f400fac1fd7ebf4a8bdd1&staticRequest=yes'
    driver.get(url2)
    li_list = driver.find_elements(By.XPATH, '//*[@id="b18e5af5da4f400fac1fd7ebf4a8bdd1"]/div[2]/ul/li')
    item_list = []
    for li in li_list:
        tag_a = li.find_element(By.TAG_NAME, 'a')
        title1 = tag_a.text
        link1 = tag_a.get_attribute('href')
        item_list.append((title1, link1))
    print('=' * 20 + f'Page {i}: {len(item_list)} records in total' + '=' * 15)
    k = 1
    for item in item_list:
        print('>>' + '-' * 60)
        print(f'[page {i}-{k}] fetching {item[1]}')
        if '.pdf' in item[1]:
            print(f'[direct download] pdf file: {item[1]}')
            # The list entry itself is the file URL, so download it straight away.
            old_filename = item[1].split('/')[-1]
            old_filename = parse.unquote(old_filename)
            new_filename = item[0] + '.pdf'
            full_name = os.path.join(base_dir, new_filename)
            print(old_filename, new_filename)
            if file_exist(base_dir, new_filename):
                print(f'------skip--> file <{new_filename}> already downloaded...')
                k += 1
                continue
            try:
                r = requests.get(item[1], headers=headers)
                print(r.status_code)
                with open(full_name, 'wb') as f:
                    f.write(r.content)
            except Exception:
                print(f'>>>except: download error, skipping {item[1]}')
            k += 1
        else:
            # Open the detail (second-level) page first.
            driver.get(item[1])
            content = driver.find_element(By.XPATH, '//div[@class="xl-main"]')
            attach_list = content.find_elements(By.TAG_NAME, 'a')
            if not attach_list:  # a plain html article with no attachments to download
                print('[single html] page content, saved directly')
                fname = os.path.join(base_dir, driver.title + ".html")
                if os.path.exists(fname):
                    print(f'------skip--> file <{driver.title + ".html"}> already downloaded...')
                    k += 1
                    continue
                write_html_page(driver, fname)
                k += 1
                continue
            if len(attach_list) == 1:  # a single attachment, saved in the main download directory
                a_tag = attach_list[0]
                item_dir = base_dir
                link3 = a_tag.get_attribute('href')
                old_filename = a_tag.get_attribute('href').split('/')[-1]
                old_filename = parse.unquote(old_filename)
                new_filename = a_tag.text
                new_full_filename = os.path.join(base_dir, new_filename)
                print('[single attachment] download:', link3, old_filename, new_filename, new_full_filename)
                # if os.path.exists(os.path.join(item_dir, new_filename)):
                if file_exist(base_dir, new_filename):
                    print(f'------skip--> file <{new_filename}> already downloaded...')
                    k += 1
                    continue
                try:
                    r = requests.get(link3, headers=headers)
                    print(r.status_code)
                    with open(new_full_filename, 'wb') as f:
                        f.write(r.content)
                except Exception:
                    print(f'>>>except: download error, skipping {link3}')
                k += 1
            else:
                # len(attach_list) > 1: several attachments, so create a per-article directory first
                item_dir = os.path.join(base_dir, item[0])
                mk_dir(item_dir)
                print(f'[multiple attachments] attachment count: {len(attach_list)}')
                print('item_dir', item_dir)
                files = []
                for a_tag in attach_list:
                    # element = a_tag.get_attribute('outerHTML')
                    # print(element)
                    link3 = a_tag.get_attribute('href')
                    old_filename = a_tag.get_attribute('href').split('/')[-1]
                    old_filename = parse.unquote(old_filename)
                    new_filename = a_tag.text
                    full_filename = os.path.join(item_dir, new_filename)
                    print(link3, old_filename, new_filename, full_filename)
                    if os.path.exists(full_filename):
                        print(f'------skip--> file <{full_filename}> already downloaded...')
                        continue
                    files.append((old_filename, new_filename, full_filename, link3))
                for f1 in files:
                    try:
                        r = requests.get(f1[3], headers=headers)
                        print(r.status_code)
                        with open(f1[2], 'wb') as f:
                            f.write(r.content)
                    except Exception:
                        print(f'>>>except: download error, skipping {f1[3]}')
                        continue
                k += 1
input('Press Enter to finish......')
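One caveat on the renaming: the script uses a_tag.text and driver.title directly as filenames, and on Windows these may contain characters (slashes, colons, asterisks, question marks and so on) that are not allowed in file names. A small sanitising helper along the following lines could be applied before building each save path; sanitize_filename is an illustrative name and not part of the original script.

import re

def sanitize_filename(name, max_len=100):
    # Replace characters that are illegal in Windows filenames and trim the length.
    name = re.sub(r'[\\/:*?"<>|\r\n]', '_', name).strip()
    return name[:max_len] if name else 'unnamed'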