Chapter 5: Using Sequential Containers and Analyzing Strings

This post takes a closer look at iterators: what they are, the operations they support, the iterator types containers provide, and their role in container code. It highlights the difference between const_iterator and iterator, explains how erase() works, and stresses the question of iterator validity after an element has been removed.
  1. Iterators

    1. An iterator is a value that

      (1) identifies a container and an element within that container

      (2) lets you examine the value stored in that element

      (3) provides operations for moving between the elements of the container

      (4) restricts the available operations to ones that the container can process efficiently

    2. Iterator types

      (1) container-type::const_iterator – a read-only iterator

      (2) container-type::iterator – a read-write iterator

      An iterator object can be converted to a const_iterator, but not the other way around (see the first sketch after this outline).

    3. The erase() method

      (1) Returns an iterator that points to the element immediately after the one that was erased.

      (2) After the call, every iterator that refers to the erased element or to any position after it becomes invalid (including the one obtained from .end(); .end() denotes the position one past the last element). —> Hence the return value of erase() should generally be used (see the second sketch after this outline).
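
A minimal sketch of points 1 and 2 above, assuming a std::vector<std::string> with made-up contents: an iterator gives read-write access, a const_iterator gives read-only access, and the conversion only works in the iterator-to-const_iterator direction.

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    std::vector<std::string> words = {"use", "sequential", "containers"};

    // iterator: read-write access, so the stored value can be modified
    for (std::vector<std::string>::iterator it = words.begin(); it != words.end(); ++it) {
        *it += "!";                       // examine and change the element
    }

    // const_iterator: read-only access to the same elements
    for (std::vector<std::string>::const_iterator cit = words.begin(); cit != words.end(); ++cit) {
        std::cout << *cit << '\n';
        // *cit = "x";                    // error: cannot modify through a const_iterator
    }

    // iterator converts to const_iterator, but not the other way around
    std::vector<std::string>::iterator it = words.begin();
    std::vector<std::string>::const_iterator cit = it;   // OK
    // it = cit;                          // error: no conversion from const_iterator to iterator
    (void)cit;
    return 0;
}
```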
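
A minimal sketch of the erase() idiom in point 3, with made-up values: because erase() invalidates iterators at and after the erased position, the loop continues from the iterator that erase() returns instead of incrementing the stale one.

```cpp
#include <iostream>
#include <vector>

int main() {
    std::vector<int> v = {1, 2, 3, 4, 5, 6};

    // Remove the even values while walking the vector.
    std::vector<int>::iterator it = v.begin();
    while (it != v.end()) {        // v.end() is re-evaluated every pass, so it is never stale
        if (*it % 2 == 0) {
            it = v.erase(it);      // erase invalidates it; continue from the returned iterator
        } else {
            ++it;                  // only advance when nothing was erased
        }
    }

    for (int x : v) std::cout << x << ' ';   // prints: 1 3 5
    std::cout << '\n';
    return 0;
}
```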
