Traceback (most recent call last):
  File "D:/HuaweiMoveData/Users/35658/Desktop/31 2.py", line 786, in <module>
    from fake_useragent import UserAgent
  File "D:\python3.8\lib\site-packages\fake_useragent\__init__.py", line 4, in <module>
    from fake_useragent.fake import FakeUserAgent, UserAgent
  File "D:\python3.8\lib\site-packages\fake_useragent\fake.py", line 8, in <module>
    from fake_useragent.utils import BrowserUserAgentData, load
  File "D:\python3.8\lib\site-packages\fake_useragent\utils.py", line 42, in <module>
    def load() -> list[BrowserUserAgentData]:
TypeError: 'type' object is not subscriptable

The code below still fails with this error when run. Why does it happen, and how do I fix it?
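For background: the annotation list[BrowserUserAgentData] in fake_useragent/utils.py uses PEP 585 built-in generics, which are only subscriptable at runtime on Python 3.9+. Under Python 3.8 that def line is evaluated at import time, inside the library itself, so the failure happens before any of the code below runs. A minimal sketch reproducing the issue, with the usual workarounds (the pinned version is an assumption, not verified against this script):

# Reproduces the failure on Python 3.8; imports fine on 3.9+.
def load() -> list[str]:  # list[str] is evaluated when the def is executed
    ...
# TypeError: 'type' object is not subscriptable

# Workarounds (assumptions; adjust to your environment):
#   1) run the script under Python 3.9 or newer, or
#   2) pin an older fake-useragent release that predates these
#      annotations, e.g. pip install "fake-useragent==0.1.11"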
import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
import jieba.posseg as pseg
import time
import random
import re
from fake_useragent import UserAgent
from datetime import datetime
import os
import json
from urllib.parse import quote
# Initialize the user-agent generator
ua = UserAgent()
# Configure the jieba tokenizer
jieba.initialize()
# Fintech-domain terms and company names
tech_keywords = ['科技', '技术', '数字', '智能', '数据', '信息', '云', 'AI', '区块链', '金融科技', '创新', '研发']
company_names = ['腾讯', '阿里', '百度', '京东', '字节跳动', '华为', '小米', '蚂蚁集团', '商汤科技', '旷视科技', '科大讯飞']
# Add custom dictionary entries
for name in company_names:
    jieba.add_word(name, freq=1000, tag='nt')
jieba.add_word('北京银行', freq=1000, tag='nt')
jieba.add_word('北银', freq=1000, tag='nt')
jieba.add_word('BNK', freq=1000, tag='nt')
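As a quick sanity check (the sentence is my own illustrative example, not from the original), the custom entries should now come back from jieba.posseg as single tokens tagged 'nt':

# '北京银行' and '商汤科技' should each appear as one token with flag 'nt'
for word, flag in pseg.cut("北京银行与商汤科技达成战略合作"):
    print(word, flag)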
# Search keywords
search_query = "北京银行 科技公司 合作"
encoded_query = quote(search_query, safe='')  # URL-encode the query (note: currently unused; requests encodes params itself)
def get_dynamic_headers():
    """Generate dynamic request headers."""
    return {
        "User-Agent": ua.random,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Connection": "keep-alive",
        "Referer": "https://www.ringdata.com/",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1"
    }
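Not part of the original, but since the headers are regenerated per call anyway, a hedged sketch of pairing them with a shared requests.Session plus automatic retries (retry counts are placeholders):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# One shared session: connection reuse plus 3 retries with backoff
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=Retry(total=3, backoff_factor=1)))
resp = session.get("https://www.ringdata.com/news",
                   headers=get_dynamic_headers(), timeout=30)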
def fetch_news_list(page_num):
    """Fetch the news list for the given page."""
    base_url = "https://www.ringdata.com/news"
    params = {
        "keywords": search_query,
        "page": page_num  # pagination parameter
    }
    try:
        # Use freshly generated headers for each request
        headers = get_dynamic_headers()
        response = requests.get(
            base_url,
            params=params,
            headers=headers,
            timeout=30
        )
        response.raise_for_status()
        # Guard against redirects / anti-bot pages
        # (note: raise_for_status() above already raises for 4xx/5xx)
        if response.status_code != 200:
            print(f"Unexpected status code: {response.status_code}")
            return None
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch page {page_num}: {str(e)}")
        return None
def parse_news_list(html_content):
    """Parse the news list page."""
    soup = BeautifulSoup(html_content, 'html.parser')
    news_list = []
    # Generic selectors for news items
    items = soup.select('.news-list .item, .info-list li, .article-list li, .result-list li')
    if not items:
        # Fall back to alternative selectors
        items = soup.select('div.item, li.item, div.news-item, li.news-item')
    for item in items:
        try:
            # Extract title and link
            title_elem = item.select_one('a[href]')
            if not title_elem:
                continue
            title = title_elem.get_text(strip=True)
            relative_url = title_elem.get('href', '')
            # Build the full URL
            if relative_url.startswith('/'):
                full_url = f"https://www.ringdata.com{relative_url}"
            elif relative_url.startswith('http'):
                full_url = relative_url
            else:
                full_url = f"https://www.ringdata.com/{relative_url}"
            # Extract source and date
            source = "unknown source"
            date = "unknown date"
            # Improved meta-information extraction
            meta_container = item.select_one('.source, .date, .info, .meta, .time, .pub-time')
            if meta_container:
                meta_text = meta_container.get_text(strip=True)
                # Extract the date (prefer YYYY-MM-DD)
                date_match = re.search(r'(\d{4}-\d{1,2}-\d{1,2})', meta_text)
                if date_match:
                    date = date_match.group(0)
                else:
                    # Try other date formats (Chinese 年/月/日 variants)
                    date_match = re.search(r'(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?|\d{4}[-/年]\d{1,2}月|\d{4}年)', meta_text)
                    if date_match:
                        date = date_match.group()
                # Extract the source (matches a Chinese "来源:" label)
                source_match = re.search(r'来源[::]?\s*([^|\s]+)', meta_text)
                if source_match:
                    source = source_match.group(1)
                elif len(meta_text) < 20 and not re.search(r'\d', meta_text):
                    source = meta_text
            news_list.append({
                "title": title,
                "url": full_url,
                "source": source,
                "publish_date": date
            })
        except Exception as e:
            print(f"Error parsing a news item: {str(e)}")
    return news_list
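To make the meta-text parsing concrete, a small sketch running the same patterns over made-up sample strings (both samples are assumptions about the site's markup, not scraped data):

import re

samples = ["来源:新华网 2023-05-12", "北京商报 | 2023年6月1日"]
for meta in samples:
    date_m = re.search(r'(\d{4}-\d{1,2}-\d{1,2})', meta)   # ISO date branch
    src_m = re.search(r'来源[::]?\s*([^|\s]+)', meta)        # source label branch
    print(date_m.group(0) if date_m else "no ISO date",
          "|", src_m.group(1) if src_m else "no source label")
# First sample hits both branches; the second hits neither,
# which is exactly why the fallback date patterns exist.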
def extract_news_content(url):
    """Extract the article body and publish time."""
    try:
        headers = get_dynamic_headers()
        response = requests.get(url, headers=headers, timeout=30)
        html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # Extract the article body
        content_elem = soup.select_one('.article-content, .content, .main-content, .news-content')
        content = content_elem.get_text(strip=True) if content_elem else ""
        # Extract the publish time
        time_elem = soup.select_one('.pub-time, .publish-date, .time')
        publish_time = time_elem.get_text(strip=True) if time_elem else ""
        # Normalize the time format
        if publish_time:
            try:
                # Try the standard format first
                dt = datetime.strptime(publish_time, '%Y-%m-%d %H:%M:%S')
                return content, dt.strftime('%Y-%m-%d')
            except ValueError:
                # Fall back to a date regex
                date_match = re.search(r'(\d{4}-\d{1,2}-\d{1,2})', publish_time)
                if date_match:
                    return content, date_match.group(0)
                return content, publish_time
        return content, ""
    except Exception as e:
        print(f"Failed to extract content: {str(e)}")
        return "", ""
# The remaining functions are unchanged (extract_tech_companies, analyze_cooperation, extract_cooperation_date, scrape_all_news, save_results, generate_report)
if __name__ == "__main__":
    # Start scraping
    start_time = time.time()
    results = scrape_all_news(max_pages=10)  # fewer pages for testing
    # Save the results
    df = save_results(results)
    # Generate the report
    if df is not None:
        generate_report(df)
    # Summary statistics
    print(f"\nCollected {len(results) if results else 0} cooperation records")
    print(f"Total time: {time.time() - start_time:.2f} seconds")