Fixing tqdm's repeated line breaks on Windows

This post describes a problem the author hit on Windows 10 where tqdm progress bars rendered incorrectly, printing a new line on every update instead of refreshing in place. Setting ascii=True did not solve it, but manually installing the colorama library did. The tqdm developers responded quickly on GitHub and planned to install colorama automatically in the upcoming v4.61.2 release so this class of problem goes away. The minimal reproduction uses nested progress bars:
import time
from tqdm import tqdm

for i in tqdm(range(3), desc='level_1'):
    for j in tqdm(range(5), desc='level_2'):
        time.sleep(0.1)
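The fix described above is purely environmental: install colorama (pip install colorama) and tqdm will use it to emulate ANSI cursor movement in the Windows console, so the bars can update in place rather than spilling onto new lines. Below is a minimal sketch of the same nested-bar example with that dependency made explicit; the colorama.init() call is my own addition for illustration, not something the original post requires.

import sys
import time

import colorama  # the workaround package; raises ImportError if it is missing
from tqdm import tqdm

if sys.platform == "win32":
    # Wrap stdout/stderr so ANSI escape sequences work in the legacy console.
    colorama.init()

for i in tqdm(range(3), desc='level_1'):
    for j in tqdm(range(5), desc='level_2'):
        time.sleep(0.1)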