import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin
# Define the keyword list (an empty string is a placeholder that matches every link)
KEYWORDS = [""]
def sanitize_text(text):
    """Enhanced text cleaning."""
    # Strip HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Strip URLs
    text = re.sub(r'https?://\S+', '', text)
    # Replace common HTML entities with their literal characters
    replacements = {
        '&nbsp;': ' ', '&amp;': '&', '&quot;': '"',
        '&lt;': '<', '&gt;': '>'
    }
    for k, v in replacements.items():
        text = text.replace(k, v)
    # Remove decorative symbols
    text = re.sub(r'[■◆▼©®™●【】]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def save_content(url, desktop_path):
    """Fetch a page safely and save its cleaned text."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Prefer an <article>/<div> whose class suggests it holds the main content
            main_content = soup.find(['article', 'div'], class_=re.compile(r'content|main'))
            clean_text = sanitize_text(main_content.get_text() if main_content else soup.get_text())
            # Build a filesystem-safe filename from the last URL segment
            filename = re.sub(r'[\\/*?:"<>|]', '', url.split('/')[-1])[:50] + '.txt'
            filepath = os.path.join(desktop_path, filename)
            with open(filepath, 'w', encoding='utf-8') as f:
                f.write(clean_text)
            print(f'Saved: {filename}')
        else:
            print(f'Request failed: {url}, status code: {response.status_code}')
    except Exception as e:
        print(f'Error while processing {url}: {e}')
def main():
    """Main program."""
    # Desktop path (cross-platform)
    desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
    # Simulated seed page (replace with the real target site only with proper authorization)
    seed_url = "http://www.81.cn/"  # example address
    try:
        res = requests.get(seed_url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Collect links whose anchor text contains any keyword
        links = []
        for a in soup.find_all('a', href=True):
            text = a.get_text().strip()
            if any(keyword in text for keyword in KEYWORDS):
                absolute_url = urljoin(seed_url, a['href'])
                links.append(absolute_url)
        # Remove duplicates
        unique_links = list(set(links))
        # Save content (keep the request rate low)
        for link in unique_links[:5]:  # example: process only the first 5 links
            save_content(link, desktop)
    except Exception as e:
        print(f'Program terminated: {e}')

if __name__ == "__main__":
    main()
Building on the code above, save all of the output .txt files into a new folder on the desktop, and give the complete code.
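A minimal sketch of that change, reusing save_content() and KEYWORDS from the script above; the folder name web_articles is an assumption and can be anything. Only main() needs to change: create a subfolder on the desktop with os.makedirs(..., exist_ok=True) and pass it to save_content() in place of the desktop path.

def main():
    """Main program; identical to the version above except for the output folder."""
    desktop = os.path.join(os.path.expanduser('~'), 'Desktop')
    # New: dedicated output folder on the desktop ("web_articles" is an assumed, illustrative name)
    output_dir = os.path.join(desktop, 'web_articles')
    os.makedirs(output_dir, exist_ok=True)  # create the folder if it does not already exist
    seed_url = "http://www.81.cn/"
    try:
        res = requests.get(seed_url, timeout=10)
        soup = BeautifulSoup(res.text, 'html.parser')
        # Collect links whose anchor text contains any keyword (unchanged)
        links = []
        for a in soup.find_all('a', href=True):
            if any(keyword in a.get_text().strip() for keyword in KEYWORDS):
                links.append(urljoin(seed_url, a['href']))
        unique_links = list(set(links))
        for link in unique_links[:5]:
            save_content(link, output_dir)  # write into the new folder instead of the desktop
    except Exception as e:
        print(f'Program terminated: {e}')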