import requests
from bs4 import BeautifulSoup
import re
import os
import json
from datetime import datetime
import time
import urllib.parse
# Create storage directories
if not os.path.exists('./ipo_documents'):
    os.makedirs('./ipo_documents')
if not os.path.exists('./ipo_documents/pdf_files'):
    os.makedirs('./ipo_documents/pdf_files')
# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://listing.szse.cn/disclosure/ipo/index.html',
    'Accept-Encoding': 'gzip, deflate, br, zstd'
}
def extract_document_links(html_content):
    """Extract document links from the HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    documents = []
    # Find all table rows
    rows = soup.find_all('tr')
    for row in rows:
        # Skip header rows
        if row.find('th'):
            continue
        # Get the document name from the first cell
        doc_name_td = row.find('td')
        if not doc_name_td:
            continue
        doc_name = doc_name_td.get_text(strip=True)
        if not doc_name:
            continue
        # Find all links in the remaining cells
        link_tds = row.find_all('td')[1:]  # Skip the first td (document name)
        for link_td in link_tds:
            links = link_td.find_all('a', href=True)
            for link in links:
                href = link.get('href', '').strip()
                date_text = link.get_text(strip=True)
                # Strip whitespace and stray quote characters from the URL
                href = re.sub(r'[\s"\'”]', '', href)
                # Skip non-PDF links
                if not href.lower().endswith('.pdf'):
                    continue
                # Repair common URL corruptions
                if href.startswith('httns:'):
                    href = href.replace('httns:', 'https:')
                if '//reDortdocs.' in href:
                    href = href.replace('reDortdocs.', 'reportdocs.')
                if '//reP0rtdocs.' in href:
                    href = href.replace('reP0rtdocs.', 'reportdocs.')
                # Make sure the URL is absolute
                if not href.startswith('http'):
                    href = f"https://reportdocs.static.szse.cn{href}"
                # Add to the document list
                documents.append({
                    'document_name': doc_name,
                    'publish_date': date_text,
                    'file_url': href
                })
    return documents
def download_pdf_file(url, file_path, max_retries=3):
    """Download a PDF file."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36',
        'Referer': 'https://listing.szse.cn/',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Connection': 'keep-alive'
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, stream=True, timeout=30)
            response.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            # Check that the file looks valid
            if os.path.getsize(file_path) < 1024:  # Under 1 KB is probably an error page
                os.remove(file_path)
                raise Exception("Downloaded file is too small; it is probably an error page")
            return True
        except requests.exceptions.RequestException as e:
            print(f"Download failed (attempt {attempt + 1}/{max_retries}): {str(e)}")
            time.sleep(2)  # Wait before retrying
        except Exception as e:
            print(f"Download failed: {str(e)}")
            return False
    return False
def sanitize_filename(name):
    """Remove characters that are not allowed in file names."""
    # Strip illegal characters
    name = re.sub(r'[\\/*?:"<>|]', '', name)
    # Replace whitespace with underscores
    name = re.sub(r'\s+', '_', name)
    # Truncate overly long names
    if len(name) > 150:
        name = name[:150]
    return name
def generate_pdf_links(base_url, years, months):
    """Generate candidate PDF links from the observed URL pattern."""
    generated_links = []
    for year in years:
        for month in months:
            # Build the URL path; the '*' stands in for the per-file identifier,
            # so these URLs are patterns rather than directly downloadable links
            path = f"{year}{month:02d}"
            url = f"{base_url}/UpFiles/rasinfodisc1/{path}/RAS_{path}_*.pdf"
            # Add to the list
            generated_links.append({
                'document_name': 'auto-generated',
                'publish_date': f"{year}-{month:02d}",
                'file_url': url
            })
    return generated_links
def main():
    target_url = "https://listing.szse.cn/projectdynamic/ipo/detail/index.html?id=1003423"
    print(f"Crawling IPO project detail page: {target_url}")
    # PDF links specified by the user (added directly)
    custom_links = [
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202306/RAS_202306_EFF55B536A0445029A7FC26A9AFDC151.pdf",
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202506/RAS_202506_1021050A714A3ED75944E597BBAED9456B6037.pdf",
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202507/RAS_202507_041725E36EB4D843004F0997B334B64E5003BA.pdf"
    ]
    try:
        # Fetch the page content
        response = requests.get(target_url, headers=headers)
        response.raise_for_status()  # Raise if the request failed
        # Parse the document links
        documents = extract_document_links(response.text)
        # Add the user-specified links
        for url in custom_links:
            # Skip links that are already in the document list
            if not any(doc['file_url'] == url for doc in documents):
                documents.append({
                    'document_name': 'custom link',
                    'publish_date': 'unknown date',
                    'file_url': url
                })
        # Analyse the PDF link pattern
        print("\nAnalysing PDF link patterns...")
        pattern = re.compile(
            r'https://reportdocs\.static\.szse\.cn/UpFiles/rasinfodisc1/(\d{6})/RAS_\d{6}_[A-Z0-9]+\.pdf')
        # Count the year/month distribution
        year_month_counts = {}
        for doc in documents:
            match = pattern.match(doc['file_url'])
            if match:
                ym = match.group(1)  # e.g. 202306
                year_month_counts[ym] = year_month_counts.get(ym, 0) + 1
        # Print the link pattern statistics
        print("PDF link year/month distribution:")
        for ym, count in sorted(year_month_counts.items()):
            print(f"  {ym[:4]}-{ym[4:6]}: {count} file(s)")
        # Generate additional PDF links if requested
        if year_month_counts:
            years = sorted(set(ym[:4] for ym in year_month_counts.keys()))
            months = sorted(set(int(ym[4:6]) for ym in year_month_counts.keys()))
            print(f"\nExpandable years: {years}, months: {months}")
            # Ask the user whether to generate more links
            generate_more = input("\nGenerate more PDF links? (y/n): ").lower() == 'y'
            if generate_more:
                # Generate additional PDF links
                generated_docs = generate_pdf_links(
                    "https://reportdocs.static.szse.cn",
                    years,
                    months
                )
                # Add the generated links to the document list
                documents.extend(generated_docs)
                print(f"Generated {len(generated_docs)} possible PDF links")
        if not documents:
            print("No document links found")
            return
        print(f"Found {len(documents)} PDF document links in total")
        # Download the PDF files
        download_count = 0
        for i, doc in enumerate(documents):
            # Build a safe file name
            safe_name = sanitize_filename(doc['document_name'])
            safe_date = sanitize_filename(doc['publish_date'])
            filename = f"{safe_name}_{safe_date}.pdf"
            file_path = os.path.join('./ipo_documents/pdf_files', filename)
            # Avoid file name collisions
            counter = 1
            while os.path.exists(file_path):
                filename = f"{safe_name}_{safe_date}_{counter}.pdf"
                file_path = os.path.join('./ipo_documents/pdf_files', filename)
                counter += 1
            print(f"Downloading ({i + 1}/{len(documents)}): {doc['document_name']} - {doc['publish_date']}")
            print(f"URL: {doc['file_url']}")
            if download_pdf_file(doc['file_url'], file_path):
                doc['local_path'] = file_path
                download_count += 1
                print(f"Saved to: {file_path}\n")
            else:
                print("Download failed\n")
        # Build the output file name
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"./ipo_documents/project_1003423_{timestamp}.json"
        # Save the results to a JSON file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump({
                'source_url': target_url,
                'download_time': timestamp,
                'documents': documents
            }, f, ensure_ascii=False, indent=2)
        print(f"\nDone! Successfully downloaded {download_count}/{len(documents)} files")
        print(f"Metadata saved to: {output_file}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {str(e)}")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
if __name__ == '__main__':
    main()

Please modify the code so that it only downloads the three files at the specified URLs; it should not download all seven when run.
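A minimal sketch of the requested change, assuming the three URLs in custom_links are the only download targets: it drops the page scraping, pattern analysis, and link generation, and reuses download_pdf_file() and sanitize_filename() plus the imports already at the top of the script. The file-naming scheme (last URL segment) and the metadata file name custom_links_*.json are illustrative assumptions, not part of the original script.

# Sketch: simplified main() that downloads only the three user-specified URLs.
# Assumes download_pdf_file() and sanitize_filename() from the script above are in scope.
def main():
    custom_links = [
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202306/RAS_202306_EFF55B536A0445029A7FC26A9AFDC151.pdf",
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202506/RAS_202506_1021050A714A3ED75944E597BBAED9456B6037.pdf",
        "https://reportdocs.static.szse.cn/UpFiles/rasinfodisc1/202507/RAS_202507_041725E36EB4D843004F0997B334B64E5003BA.pdf"
    ]
    os.makedirs('./ipo_documents/pdf_files', exist_ok=True)
    download_count = 0
    results = []
    for i, url in enumerate(custom_links, start=1):
        # Derive the local file name from the last URL segment (assumed naming scheme)
        filename = sanitize_filename(url.rsplit('/', 1)[-1])
        file_path = os.path.join('./ipo_documents/pdf_files', filename)
        print(f"Downloading ({i}/{len(custom_links)}): {url}")
        if download_pdf_file(url, file_path):
            download_count += 1
            results.append({'file_url': url, 'local_path': file_path})
            print(f"Saved to: {file_path}\n")
        else:
            print("Download failed\n")
    # Save minimal metadata alongside the PDFs
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f"./ipo_documents/custom_links_{timestamp}.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'download_time': timestamp, 'documents': results}, f,
                  ensure_ascii=False, indent=2)
    print(f"Done! Successfully downloaded {download_count}/{len(custom_links)} files")
    print(f"Metadata saved to: {output_file}")

if __name__ == '__main__':
    main()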