import argparse
import re
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import List, Optional, Tuple
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Optional import (if yt-dlp is not installed, just run without --use-ytdlp)
try:
import yt_dlp # type: ignore
HAS_YTDLP = True
except Exception: # noqa: BLE001
HAS_YTDLP = False
MEDIA_EXTENSIONS = (
".mp4",
".webm",
".mkv",
".mov",
".mp3",
".m4a",
".aac",
".wav",
".flac",
)
def sanitize_filename(name: str) -> str:
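    """Replace characters that are illegal in filenames and collapse whitespace."""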
name = re.sub(r"[\\/:*?\"<>|]", "_", name)
name = re.sub(r"\s+", " ", name).strip()
return name or "media"
def request_page(url: str, timeout: int = 20) -> Tuple[str, requests.Response]:
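    """Fetch a page with a desktop-browser User-Agent and return (html, response)."""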
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
),
}
resp = requests.get(url, headers=headers, timeout=timeout)
resp.raise_for_status()
return resp.text, resp
def discover_media_links(html: str, base_url: str) -> List[str]:
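    """Collect candidate media URLs from <video>/<audio>/<source> tags, from
    <a href> links with a media extension, and from URLs embedded in inline
    scripts, all resolved against base_url and deduplicated in order."""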
soup = BeautifulSoup(html, "lxml")
links: List[str] = []
# <video>/<audio>/source
for tag in soup.find_all(["video", "audio"]):
src = tag.get("src")
if src:
links.append(requests.compat.urljoin(base_url, src))
for source in tag.find_all("source"):
src2 = source.get("src")
if src2:
links.append(requests.compat.urljoin(base_url, src2))
# <a href="*.mp4|*.mp3|...">
for a in soup.find_all("a", href=True):
href = a["href"]
url = requests.compat.urljoin(base_url, href)
if any(url.lower().split("?")[0].endswith(ext) for ext in MEDIA_EXTENSIONS):
links.append(url)
    # Direct media URLs embedded in inline <script> text
pattern = re.compile(r"https?://[^'\"\s>]+", re.I)
for script in soup.find_all("script"):
text = script.string or script.get_text() or ""
for m in pattern.findall(text):
if any(m.lower().split("?")[0].endswith(ext) for ext in MEDIA_EXTENSIONS):
links.append(m)
    # Deduplicate while preserving order
seen = set()
uniq: List[str] = []
for u in links:
if u not in seen:
uniq.append(u)
seen.add(u)
return uniq
def human_size(num: Optional[int]) -> str:
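    """Format a byte count as a human-readable string, e.g. 1536 -> '1.50 KB'."""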
if num is None:
return "?"
units = ["B", "KB", "MB", "GB", "TB"]
size = float(num)
for unit in units:
if size < 1024.0:
return f"{size:.2f} {unit}"
size /= 1024.0
return f"{size:.2f} PB"
def is_probably_stream(url: str) -> bool:
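    """Heuristic check for HLS/DASH manifests and playlists that need a stream downloader."""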
low = url.lower()
return any(x in low for x in [".m3u8", "manifest", "dash.mpd", "playlist"]) or "?m3u8" in low
def stream_download(url: str, outfile: Path, chunk_size: int = 1 << 20, resume: bool = True) -> None:
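    """Stream url to outfile in chunks, resuming from the existing partial file
    via an HTTP Range request when resume is True."""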
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
),
}
start = 0
mode = "wb"
if resume and outfile.exists():
start = outfile.stat().st_size
if start > 0:
headers["Range"] = f"bytes={start}-"
mode = "ab"
    with requests.get(url, headers=headers, stream=True, timeout=30) as r:
        r.raise_for_status()
        # If the server ignored the Range request, restart instead of appending to a partial file.
        if start > 0 and r.status_code != 206:
            start = 0
            mode = "wb"
        total: Optional[int] = None
        if "Content-Length" in r.headers:
            try:
                # When resuming, Content-Length covers only the remaining bytes, so add the offset back.
                total = int(r.headers["Content-Length"]) + start
            except Exception:  # noqa: BLE001
                total = None
pbar = tqdm(total=total, initial=start, unit="B", unit_scale=True, desc=outfile.name)
with open(outfile, mode) as f:
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
f.write(chunk)
pbar.update(len(chunk))
pbar.close()
def download_via_ytdlp(url: str, outdir: Path, filename: Optional[str]) -> None:
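    """Hand the URL to yt-dlp (used for streams and pages that plain requests cannot handle)."""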
if not HAS_YTDLP:
raise RuntimeError("yt-dlp 不可用,请先安装或关闭 --use-ytdlp")
    ydl_opts = {
        # When a fixed name is given, let yt-dlp still pick the extension.
        "outtmpl": str(outdir / (f"{filename}.%(ext)s" if filename else "%(title)s.%(ext)s")),
        # Prefer merged video+audio; fall back to the single best stream.
        "format": "bestvideo+bestaudio/best",
        "merge_output_format": "mp4",
        "concurrent_fragment_downloads": 8,
        "retries": 10,
    }
with yt_dlp.YoutubeDL(ydl_opts) as ydl: # type: ignore[attr-defined]
ydl.download([url])
def download_with_retry(func, *args, retries: int = 3, base_delay: float = 1.0, **kwargs) -> None:
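    """Call func(*args, **kwargs), retrying up to `retries` times with
    exponential backoff (base_delay doubles after each failure)."""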
attempt = 0
while True:
try:
return func(*args, **kwargs)
except Exception as e: # noqa: BLE001
attempt += 1
if attempt > retries:
raise e
delay = base_delay * (2 ** (attempt - 1))
print(f"重试第 {attempt} 次,等待 {delay:.1f}s:{e}")
time.sleep(delay)
def ensure_outdir(path: Path) -> None:
path.mkdir(parents=True, exist_ok=True)
def parse_args() -> argparse.Namespace:
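    """Define and parse the command-line options."""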
    p = argparse.ArgumentParser(description="Extract and download video/audio from a web page")
    p.add_argument("url", help="Target page URL or direct media URL")
    p.add_argument("-o", "--output", default="downloads", help="Output directory")
    p.add_argument("-n", "--name", default=None, help="Filename prefix or fixed filename (without extension)")
    p.add_argument("--no-resume", action="store_true", help="Disable resuming of partial downloads")
    p.add_argument("--use-ytdlp", action="store_true", help="Use yt-dlp for streams and complex pages")
    p.add_argument("--max", type=int, default=0, help="Download at most the first N links (0 = all)")
    p.add_argument("--workers", type=int, default=4, help="Number of concurrent download threads")
    p.add_argument("--retries", type=int, default=3, help="Number of retries on failure")
return p.parse_args()
def main() -> None:
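    """Download a direct media URL or stream, or scrape a page for media links
    and download each of them concurrently."""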
args = parse_args()
outdir = Path(args.output)
ensure_outdir(outdir)
    # A direct media link: download it as a single file
if any(args.url.lower().split("?")[0].endswith(ext) for ext in MEDIA_EXTENSIONS) or is_probably_stream(args.url):
filename_base = sanitize_filename(args.name or Path(args.url).stem)
if args.use_ytdlp or is_probably_stream(args.url):
download_with_retry(download_via_ytdlp, args.url, outdir, filename_base, retries=args.retries)
print(f"完成: {args.url}")
return
outfile = outdir / f"{filename_base}{Path(args.url).suffix.split('?')[0]}"
download_with_retry(stream_download, args.url, outfile, resume=not args.no_resume, retries=args.retries)
print(f"完成: {outfile} ({human_size(outfile.stat().st_size)})")
return
    # Page mode: fetch the page and extract media links
html, resp = request_page(args.url)
soup = BeautifulSoup(html, "lxml")
    title = sanitize_filename(soup.title.get_text(strip=True) if soup.title else "page")
media_links = discover_media_links(html, resp.url)
if args.max and len(media_links) > args.max:
media_links = media_links[: args.max]
if not media_links:
print("未发现可下载的媒体链接。可尝试 --use-ytdlp。")
return
print(f"发现 {len(media_links)} 个媒体链接。")
def process_link(idx: int, link: str) -> str:
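        """Download one discovered link and return a one-line status message."""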
base = sanitize_filename(args.name or f"{title}_{idx:02d}")
if args.use_ytdlp or is_probably_stream(link):
try:
download_with_retry(download_via_ytdlp, link, outdir, base, retries=args.retries)
return f"完成: {link}"
except Exception as e: # noqa: BLE001
return f"yt-dlp 下载失败: {link} -> {e}"
else:
ext = Path(link.split("?")[0]).suffix or ".bin"
outfile = outdir / f"{base}{ext}"
try:
download_with_retry(stream_download, link, outfile, resume=not args.no_resume, retries=args.retries)
return f"完成: {outfile} ({human_size(outfile.stat().st_size)})"
except Exception as e: # noqa: BLE001
return f"直链下载失败: {link} -> {e}"
with ThreadPoolExecutor(max_workers=max(1, args.workers)) as executor:
futures = [executor.submit(process_link, idx, link) for idx, link in enumerate(media_links, 1)]
for fut in as_completed(futures):
print(fut.result())
if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
print("已取消")
sys.exit(1)
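
# Example invocations (the script name "grab_media.py" is illustrative, not part of the source):
#   python grab_media.py https://example.com/article -o downloads --workers 8 --max 5
#   python grab_media.py https://example.com/clip.mp4 -n my_clip
#   python grab_media.py https://example.com/live/master.m3u8 --use-ytdlp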