import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from datetime import datetime
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def _request_page(url, headers, max_retries, timeout):
    """Download *url* and return the ``requests.Response``, or ``None`` on failure.

    Requests answered with 500/502/503/504 are retried up to *max_retries*
    times with exponential backoff. Any request error is reported on stdout
    and turned into a ``None`` return instead of propagating.
    """
    retry_policy = Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504],
    )
    # The context manager releases the session's pooled connections on exit;
    # the body has already been read by then because stream=False is the default.
    with requests.Session() as session:
        session.mount('http://', HTTPAdapter(max_retries=retry_policy))
        session.mount('https://', HTTPAdapter(max_retries=retry_policy))
        try:
            response = session.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # raises HTTPError for 4xx/5xx responses
        except requests.exceptions.ConnectionError:
            print("网络连接失败,请检查目标主机是否可达或 DNS 是否配置正确。")
        except requests.exceptions.Timeout:
            print("请求超时,请尝试增加超时时间或检查网络状况。")
        except requests.exceptions.HTTPError as e:
            # Read the status from the exception so the handler does not depend
            # on a local that is only bound when session.get() succeeded.
            print(f"HTTP 错误:{e.response.status_code},请确认 URL 是否正确。")
        except requests.exceptions.RequestException as e:
            print(f"发生未知请求错误:{e}")
        else:
            return response
    return None


def _declared_encoding(response):
    """Return the charset to suggest to the HTML parser, or ``None`` to let it sniff.

    requests falls back to ISO-8859-1 for text responses whose Content-Type
    header carries no charset; trusting that fallback is what turns Chinese
    pages into mojibake. Only a charset that the server actually declared is
    passed on; otherwise the parser detects it from the bytes / <meta> tag.
    """
    content_type = response.headers.get('Content-Type', '')
    if 'charset' not in content_type.lower():
        return None
    encoding = (response.encoding or '').lower()
    # GB18030 is a superset of GBK and GB2312, so it also decodes pages that
    # declare the narrower charset but contain characters outside of it.
    if encoding in ('gb2312', 'gbk'):
        return 'gb18030'
    return encoding or None


def fetch_website_content(url, headers=None, max_retries=3, timeout=10):
    """Fetch *url* and return the text of its main content element.

    Args:
        url: Page address to download.
        headers: Optional request headers; a browser-like User-Agent is used
            when omitted.
        max_retries: Retry budget for 500/502/503/504 responses.
        timeout: Per-request timeout in seconds.

    Returns:
        The text of the first <div> whose class matches
        ``content|article|post-body``, falling back to <body>. An empty string
        is returned when the download fails or no such element exists.
    """
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0'}

    response = _request_page(url, headers, max_retries, timeout)
    if response is None:
        return ''

    # Parse the raw bytes rather than response.text so that a wrong guess by
    # requests cannot corrupt the text before the parser sees it.
    soup = BeautifulSoup(
        response.content,
        'html.parser',
        from_encoding=_declared_encoding(response),
    )
    # Generic content lookup (could be swapped for a tool such as newspaper)
    content_div = soup.find('div', class_=re.compile(r'content|article|post-body'))
    if content_div is None:
        content_div = soup.find('body')  # fallback when no content container exists
    return content_div.get_text(strip=True) if content_div is not None else ''
# C0 control characters that are not whitespace, DEL, and zero-width marks
# (zero width space, word joiner, BOM / zero width no-break space).
_INVISIBLE_RE = re.compile(r'[\x00-\x08\x0e-\x1b\x7f\u200b\u2060\ufeff]')


def clean_content(text):
    """Return *text* with invisible characters removed and whitespace collapsed.

    Invisible characters are dropped before whitespace is collapsed, so a
    zero-width mark sitting between two spaces does not leave a double space.
    The removed control characters are also ones that openpyxl refuses to
    write into a worksheet cell.

    Args:
        text: Raw text; ``None`` and ``''`` are both accepted.

    Returns:
        The cleaned string, with every whitespace run replaced by one space
        and no leading or trailing whitespace. Empty input yields ``''``.
    """
    if not text:
        return ''
    visible_text = _INVISIBLE_RE.sub('', text)
    return re.sub(r'\s+', ' ', visible_text).strip()
# YYYY-MM-DD or YYYY/MM/DD; the backreference keeps both separators identical.
_NUMERIC_DATE_RE = re.compile(r'(\d{4})([-/])(\d{2})\2(\d{2})')
# YYYY年M月D日, with one- or two-digit month and day.
_CJK_DATE_RE = re.compile(r'(\d{4})年(\d{1,2})月(\d{1,2})日')


def _first_valid_date(parts):
    """Return the first real calendar date among (year, month, day) string triples."""
    for year, month, day in parts:
        try:
            return datetime(int(year), int(month), int(day)).date()
        except ValueError:
            # Looks like a date but is not one (e.g. month 13); try the next hit.
            continue
    return None


def extract_date(text):
    """Try to extract a publication date from *text*.

    Numeric dates (``YYYY-MM-DD`` / ``YYYY/MM/DD``) are scanned first, in text
    order, and the first one that is a valid calendar date wins. Only when no
    numeric date is usable is the ``YYYY年M月D日`` form considered, so any text
    that already yielded a date keeps yielding the same one.

    Args:
        text: Text to search; ``None`` and ``''`` are both accepted.

    Returns:
        A ``datetime.date``, or ``None`` when no valid date is present.
    """
    if not text:
        return None
    numeric_hits = (
        (m.group(1), m.group(3), m.group(4))
        for m in _NUMERIC_DATE_RE.finditer(text)
    )
    found = _first_valid_date(numeric_hits)
    if found is not None:
        return found
    return _first_valid_date(m.groups() for m in _CJK_DATE_RE.finditer(text))
# Example list of article pages to scrape
_URL_TEMPLATE = 'https://e.cdsb.com/html/2025-02/20/content_{}.htm'
# These three ids head the list; they also fall inside the numeric range below.
_LEADING_IDS = (789172, 789141, 789177)
# dict.fromkeys keeps first-seen order while dropping ids that occur twice,
# so no page is fetched (and no spreadsheet row written) more than once.
_ARTICLE_IDS = dict.fromkeys((*_LEADING_IDS, *range(789100, 789200)))
urls = [_URL_TEMPLATE.format(article_id) for article_id in _ARTICLE_IDS]
# Scrape every page in `urls` and collect one row per page.
results = []
for url in urls:
    # One unreachable page must not abort the batch and discard the rows
    # already collected, so a failed request is recorded as empty content.
    try:
        raw_content = fetch_website_content(url)
    except requests.exceptions.RequestException as e:
        print(f"抓取失败:{url},原因:{e}")
        raw_content = ''
    # `or ''` guards against a fetcher that signals failure with None.
    cleaned_content = clean_content(raw_content or '')
    publish_date = extract_date(cleaned_content)
    results.append({
        '发布日期': publish_date or '未知日期',
        '网址': url,
        '内容': cleaned_content,
    })

# Write the collected rows to an Excel workbook; the explicit column list
# keeps the header row present even when no rows were collected.
df = pd.DataFrame(results, columns=['发布日期', '网址', '内容'])
df.to_excel('context.xlsx', index=False)
对上述代码进行分析，补充处理采集内容乱码问题的代码，并输出完整代码。
最新发布