Chapter 15. Logging

This chapter describes how to configure rsyslog and walks through its core configuration file, rsyslog.conf, covering how to install rsyslog, load logging modules, set global directives, and control the ownership and permissions of log files.

15.1. rsyslog

www.rsyslog.com

rsyslog is now the standard logging daemon on Linux and is installed by default on most distributions. If it is missing, install it with the following command.

yum install rsyslog
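
The yum command above targets RHEL/CentOS-style systems. The sample configuration in the next section follows the Debian/Ubuntu layout (note the reference to /etc/rsyslog.d/50-default.conf), where the package is installed with apt instead. A minimal sketch, assuming the standard package and service names:

apt-get install rsyslog

# verify the daemon is running
service rsyslog status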
	    

15.1.1. rsyslog.conf

$ cat /etc/rsyslog.conf 
#  /etc/rsyslog.conf	Configuration file for rsyslog.
#
#			For more information see
#			/usr/share/doc/rsyslog-doc/html/rsyslog_conf.html
#
#  Default logging rules can be found in /etc/rsyslog.d/50-default.conf


#################
#### MODULES ####
#################

$ModLoad imuxsock # provides support for local system logging
$ModLoad imklog   # provides kernel logging support
#$ModLoad immark  # provides --MARK-- message capability

# provides UDP syslog reception
#$ModLoad imudp
#$UDPServerRun 514

# provides TCP syslog reception
#$ModLoad imtcp
#$InputTCPServerRun 514

# Enable non-kernel facility klog messages
$KLogPermitNonKernelFacility on

###########################
#### GLOBAL DIRECTIVES ####
###########################

#
# Use traditional timestamp format.
# To enable high precision timestamps, comment out the following line.
#
$ActionFileDefaultTemplate RSYSLOG_TraditionalFileFormat

# Filter duplicated messages
$RepeatedMsgReduction on

#
# Set the default permissions for all log files.
#
$FileOwner syslog
$FileGroup adm
$FileCreateMode 0640
$DirCreateMode 0755
$Umask 0022
$PrivDropToUser syslog
$PrivDropToGroup syslog

#
# Where to place spool and state files
#
$WorkDirectory /var/spool/rsyslog

#
# Include all config files in /etc/rsyslog.d/
#
$IncludeConfig /etc/rsyslog.d/*.conf

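The MODULES section above ships with network reception disabled. To accept syslog messages from other hosts, uncomment the imudp and/or imtcp input lines so the modules are loaded and bound to a port. A minimal sketch using the directives already present in the file, assuming the conventional port 514 (adjust the port and any firewall rules for your environment):

# accept remote syslog messages over UDP
$ModLoad imudp
$UDPServerRun 514

# accept remote syslog messages over TCP
$ModLoad imtcp
$InputTCPServerRun 514

Restart the daemon afterwards, for example with service rsyslog restart, so the change takes effect.
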
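Because of the $IncludeConfig directive, every *.conf file placed in /etc/rsyslog.d/ is read in addition to the default rules in 50-default.conf. A hypothetical drop-in rule file (the name 60-cron.conf and the target path are illustrative only) that writes all cron facility messages to their own file:

# /etc/rsyslog.d/60-cron.conf -- hypothetical example
# selector is facility.priority; the action here is a plain file path
cron.*    /var/log/cron.log

Files created by such rules inherit the ownership and mode set by the $FileOwner, $FileGroup and $FileCreateMode directives above.
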
Original source: Netkiller 系列 手札
Author: 陈景峯
For reprints, please contact the author and always credit the original source, the author information, and this notice.
