BlackBoard, the teaching platform CUHK-Shenzhen relies on, has fairly strong anti-scraping defenses. With plain requests you can't even reach the homepage: the TLS handshake fails outright TT. But we can use the more powerful selenium + ChromeDriver to simulate a real user and sidestep the SSL errors entirely, yay ✌️
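If you want to see the failure for yourself, here is a minimal probe (my own sketch; the URL is the same Blackboard homepage the scripts below use):

import requests

# Minimal reproduction of the problem described above: a plain requests call
# never gets past the TLS handshake, so header spoofing alone won't help.
try:
    requests.get("https://bb.cuhk.edu.cn/", timeout=10)
except requests.exceptions.SSLError as e:
    print("Handshake failed, as expected:", e)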
Below is a quick demo video, using STA2002 as the example: a course stuffed with files (lecture PDFs, R walkthrough PDFs, and lots of CSV data). Just run the code ➡️ and all of the course content downloads neatly into the output folder. (The username/password input could be hardcoded in advance, but I left it interactive so everyone can choose which account to log in with and which course to download.)
(Demo video: Bibo🌦 2025-04-04 18.07.04)
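For reference, if you do want to hardcode the login, it would look roughly like the sketch below. The element IDs USERNAME_FIELD_ID, PASSWORD_FIELD_ID, and LOGIN_BUTTON_ID are placeholders I made up; inspect the actual login form to fill in the real ones:

from selenium.webdriver.common.by import By

def auto_login(driver, username, password):
    # Hypothetical element IDs; replace with the real ones from the login page
    driver.find_element(By.ID, "USERNAME_FIELD_ID").send_keys(username)
    driver.find_element(By.ID, "PASSWORD_FIELD_ID").send_keys(password)
    driver.find_element(By.ID, "LOGIN_BUTTON_ID").click()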
This part is downloader.py. It handles downloading from each folder page: it visits the file links one by one and triggers the browser download for each.
import os
import time

from bs4 import BeautifulSoup


def download_files_from_folder(driver, folder_name, folder_url, download_root="output"):
    print(f"\n📂 Visiting folder: {folder_name}")
    # Create a matching local folder (the downloads themselves go to the browser's default dir)
    folder_path = os.path.join(download_root, folder_name.replace(" ", "_"))
    os.makedirs(folder_path, exist_ok=True)

    # Open the target folder page
    driver.get(folder_url)
    time.sleep(1)

    # Collect the file links on the page
    soup = BeautifulSoup(driver.page_source, "html.parser")
    file_links = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True)
        # Blackboard serves files through bbcswebdav paths or file_id= links
        if "bbcswebdav" in href or "file_id=" in href:
            if not href.startswith("http"):
                href = "https://bb.cuhk.edu.cn" + href
            file_links.append((text, href))
    print(f"🔗 Found {len(file_links)} file links")

    # Open each link so the browser auto-downloads it, then return to the main window
    for name, link in file_links:
        print(f"Downloading {name} ...")
        main_window = driver.current_window_handle  # remember the current window
        driver.get(link)
        time.sleep(0.5)
        driver.switch_to.window(main_window)
        time.sleep(0.5)
        print(f"📁 {name} download triggered; file saved to the default download directory.")
    print("Yay, all downloads triggered (check the browser's default download folder)")
The next part is start_browser.py. It launches the browser, handles login, scrapes the page for folder links, and kicks off the downloads.
import os

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

from downloader import download_files_from_folder

# Adjust to wherever your chromedriver lives
CHROMEDRIVER_PATH = "/Users/hexiaoxuan/driver/chromedriver"


def launch_browser(download_dir="output"):
    options = Options()
    options.add_argument("--start-maximized")
    prefs = {
        "plugins.always_open_pdf_externally": True,  # download PDFs instead of previewing them
        "download.default_directory": os.path.abspath(download_dir),
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True,
    }
    options.add_experimental_option("prefs", prefs)

    service = Service(CHROMEDRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    driver.get("https://bb.cuhk.edu.cn/")

    input("Log in in the browser window, then press Enter here to continue...")
    print("Login complete")
    print("Current page URL:", driver.current_url)
    return driver


def parse_content_links(driver):
    soup = BeautifulSoup(driver.page_source, "html.parser")
    folder_links = []
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.get_text(strip=True)
        # Course content folders all carry a content_id= query parameter
        if "content_id=" in href:
            folder_links.append((text, "https://bb.cuhk.edu.cn" + href))
    print("Wow, found these sub-page links:")
    for name, url in folder_links:
        print(f"🔗 {name} ➡️ {url}")
    return folder_links


if __name__ == "__main__":
    driver = launch_browser()
    # Collect the folder links from the current page
    links = parse_content_links(driver)
    for name, url in links:
        print(f"Visiting folder: {name}")
        # download_files_from_folder creates output/<folder_name> itself,
        # so there is no need to pre-create (and double-nest) the directory here
        download_files_from_folder(driver, name, url)
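Since everything lands in one default download directory, an optional refinement (my own sketch, not part of the original flow) is to retarget Chrome's download path per folder through the DevTools protocol, so each folder's files land in their own subdirectory. Selenium's ChromeDriver exposes this via execute_cdp_cmd; note that Page.setDownloadBehavior is deprecated in newer Chrome versions, so this may need adjusting:

# Inside the main loop, before calling download_files_from_folder:
folder_dir = os.path.abspath(os.path.join("output", name.replace(" ", "_")))
os.makedirs(folder_dir, exist_ok=True)
driver.execute_cdp_cmd(
    "Page.setDownloadBehavior",
    {"behavior": "allow", "downloadPath": folder_dir},
)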
