Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\35658\AppData\Local\Temp\jieba.cache
Loading model cost 0.524 seconds.
Prefix dict has been built successfully.
开始爬取北京银行与科技公司合作新闻...
正在处理第 1 页...
第 1 页无新闻,停止爬取
未找到符合条件的合作信息
共获取 0 条合作记录
总耗时: 0.43 秒

This code produces no results:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import jieba
import jieba.posseg as pseg
import json  # needed by json.dump() in save_results()
import time
import random
import re
from gne import GeneralNewsExtractor
from datetime import datetime
import os
import urllib.parse

# Configure the jieba tokenizer
jieba.initialize()

# Fintech-domain keywords and well-known tech company names
tech_keywords = ['科技', '技术', '数字', '智能', '数据', '信息', '云', 'AI', '区块链', '金融科技', '创新', '研发']
company_names = ['腾讯', '阿里', '百度', '京东', '字节跳动', '华为', '小米', '蚂蚁集团', '商汤科技', '旷视科技', '科大讯飞']

# Add custom dictionary entries so these names are kept as single tokens
for name in company_names:
    jieba.add_word(name, freq=1000, tag='nt')
jieba.add_word('北京银行', freq=1000, tag='nt')
jieba.add_word('北银', freq=1000, tag='nt')
jieba.add_word('BNK', freq=1000, tag='nt')

# Initialize the general-purpose news extractor
extractor = GeneralNewsExtractor()

# Search keywords
search_query = "北京银行 科技公司 合作"
encoded_query = urllib.parse.quote(search_query)  # URL-encoded copy of the query (currently unused)

# Request headers
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1"
}

# Cookies are handled separately from the headers
def get_cookies():
    """Parse the raw cookie string into a dict, percent-encoding values where needed."""
    cookie_str = "x-hng=lang=zh-CN&domain=www.ringdata.com; tokenWebRefresh=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJ1c2VySW5mbyI6eyJ1c2VySWQiOjM4MTQ2LCJuYW1lIjoi5Yav6YC45L2zIiwidGVsUGhvbmUiOm51bGwsImVtYWlsIjoiZmVuZzEyM2ZveEBmb3htYWlsLmNvbSIsImlmTG9naW5ObHAiOmZhbHNlLCJubHBSb2xlIjpudWxsLCJubHBQZXJtaXNzaW9uIjpudWxsLCJpZkxvZ2luU3BpZGVyIjpmYWxzZSwic3BpZGVyUm9sZSI6bnVsbCwic3BpZGVyUGVybWlzc2lvbiI6bnVsbCwiaWZMb2dpbkNuZCI6ZmFsc2UsInBlcm1pc3Npb24iOm51bGwsInR5cGUiOjEsIndzS2V5IjoiMzgxNDZfMTc1NjU3ODk5MTAwMCIsInRva2VuIjpudWxsfSwidXNlcl9uYW1lIjoi5Yav6YC45L2zIiwic2NvcGUiOlsiYWxsIl0sImF0aSI6IjFjYzNjZGFjLWIwZmEtNDQ0Yi05M2ExLWM2ZWIzZTgxNzZjOSIsImV4cCI6MTgyODU4MjU5MSwianRpIjoiODVhZDljZGQtOWIwMC00ZmY5LTgyODItNGY0ZjRhYjFmZDY5IiwiY2xpZW50X2lkIjoibW9uZXR3YXJlIn0.zOAr-8CgRuNHWHnR1P6EHeUV7-xK9s71VCJh1h36isM"
    # Split "key=value" pairs; values that are not already percent-encoded get encoded
    cookies = {}
    for cookie in cookie_str.split(';'):
        if '=' in cookie:
            key, value = cookie.strip().split('=', 1)
            cookies[key] = urllib.parse.quote(value, safe='') if "%" not in value else value
    return cookies

cookies = get_cookies()  # cookie dict passed with every request

def fetch_news_list(page_num):
    """Fetch the news list page for the given page number."""
    base_url = "https://www.ringdata.com/news/result"
    params = {
        "keywords": search_query,
        "position": page_num
    }
    try:
        response = requests.get(
            base_url,
            params=params,
            headers=headers,
            cookies=cookies,  # cookies passed separately from the headers
            timeout=30
        )
        response.raise_for_status()
        # Detect a redirect to the login page (anti-bot or expired cookie)
        if response.url != base_url and "login" in response.url:
            print("需要登录或Cookie已过期")
            return None
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"获取第 {page_num} 页失败: {str(e)}")
        return None

def parse_news_list(html_content):
    """Parse the news list page into a list of dicts."""
    soup = BeautifulSoup(html_content, 'html.parser')
    news_list = []
    # Try several likely selectors for a news item
    items = soup.select('.info-item, .news-item, .item, .list-item')
    for item in items:
        try:
            # Title and link
            title_elem = item.select_one('a[href]')
            if not title_elem:
                continue
            title = title_elem.get_text(strip=True)
            relative_url = title_elem.get('href', '')
            # Build the absolute URL
            if relative_url.startswith('/'):
                full_url = f"https://www.ringdata.com{relative_url}"
            elif relative_url.startswith('http'):
                full_url = relative_url
            else:
                full_url = f"https://www.ringdata.com/{relative_url}"
            # Source and publish date (best effort)
            source = "未知来源"
            date = "未知日期"
            meta_container = item.select_one('.source, .date, .info, .meta, .time')
            if meta_container:
                meta_text = meta_container.get_text(strip=True)
                # Extract the date with a regex
                date_match = re.search(r'(\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?|\d{4}[-/年]\d{1,2}月|\d{4}年)', meta_text)
                if date_match:
                    date = date_match.group()
                # Extract the source
                source_match = re.search(r'来源[::]?\s*([^|\s]+)', meta_text)
                if source_match:
                    source = source_match.group(1)
                elif len(meta_text) < 20 and not re.search(r'\d', meta_text):
                    source = meta_text
            news_list.append({
                "title": title,
                "url": full_url,
                "source": source,
                "publish_date": date
            })
        except Exception as e:
            print(f"解析新闻项时出错: {str(e)}")
    return news_list

def extract_news_content(url):
    """Extract the article body and publish time with gne."""
    try:
        response = requests.get(url, headers=headers, cookies=cookies, timeout=30)
        html = response.text
        # Run the gne extractor on the article page
        result = extractor.extract(html, with_body_html=False)
        # Normalize the publish time to YYYY-MM-DD when possible
        publish_time = result.get('publish_time', '')
        if publish_time:
            try:
                dt = datetime.strptime(publish_time, '%Y-%m-%d %H:%M:%S')
                return result['content'], dt.strftime('%Y-%m-%d')
            except ValueError:
                # gne returned the time in some other format; keep it as-is
                return result['content'], publish_time
        return result['content'], ''
    except Exception as e:
        print(f"使用gne提取内容失败: {str(e)}")
        return "", ""

def extract_tech_companies(text):
    """Extract technology-company entities from the text."""
    words = pseg.cut(text)
    tech_companies = set()
    current_entity = []
    bank_keywords = {'北京银行', '北银', 'BNK'}
    for word, flag in words:
        # Organization names and other proper nouns
        if flag in ['nt', 'nz', 'j', 'x'] and word not in bank_keywords:
            # Is the word itself tech-related?
            if any(kw in word for kw in tech_keywords) or word in company_names:
                current_entity.append(word)
            elif current_entity:
                # Close the entity being built and start a new one
                entity = ''.join(current_entity)
                tech_companies.add(entity)
                current_entity = [word]
            else:
                current_entity.append(word)
        elif current_entity:
            # A non-organization word ends the current entity
            entity = ''.join(current_entity)
            if any(kw in entity for kw in tech_keywords) or entity in company_names:
                tech_companies.add(entity)
            current_entity = []
    # Flush the last entity
    if current_entity:
        entity = ''.join(current_entity)
        if any(kw in entity for kw in tech_keywords) or entity in company_names:
            tech_companies.add(entity)
    # Drop entities that are too short
    return {c for c in tech_companies if len(c) >= 2}

def analyze_cooperation(content, tech_companies):
    """Find which tech companies appear in cooperation-related sentences."""
    # Cooperation-related keywords
    coop_keywords = {'合作', '签约', '战略', '联手', '协作', '共同', '携手', '联盟', '协议', '合作项目', '签署', '签约仪式', '战略合作'}
    coop_companies = set()
    # Split into sentences and keep those containing a cooperation keyword
    sentences = re.split(r'[。!?;\n]', content)
    coop_sentences = [s for s in sentences if any(kw in s for kw in coop_keywords)]
    # Keep companies that appear in at least one such sentence
    for company in tech_companies:
        if any(company in s for s in coop_sentences):
            coop_companies.add(company)
    return coop_companies

def extract_cooperation_date(content, publish_date):
    """Extract the cooperation date from the article body, falling back to the publish date."""
    # Look for an explicit date in the body, from most to least specific
    date_patterns = [
        r'(\d{4})[-年](\d{1,2})[-月](\d{1,2})日?',
        r'(\d{4})年(\d{1,2})月',
        r'(\d{4})年'
    ]
    for pattern in date_patterns:
        match = re.search(pattern, content)
        if match:
            groups = match.groups()
            if len(groups) == 3:
                return f"{groups[0]}-{groups[1].zfill(2)}-{groups[2].zfill(2)}"
            elif len(groups) == 2:
                return f"{groups[0]}-{groups[1].zfill(2)}-01"
            else:
                return f"{groups[0]}-01-01"
    # Fall back to the news publish date
    return publish_date

def scrape_all_news(max_pages=50):
    """Crawl all news pages and collect cooperation records."""
    all_results = []
    page_num = 1
    print("开始爬取北京银行与科技公司合作新闻...")
    while page_num <= max_pages:
        print(f"正在处理第 {page_num} 页...")
        # Fetch the list page
        list_html = fetch_news_list(page_num)
        if not list_html:
            break
        # Parse the list page
        news_list = parse_news_list(list_html)
        if not news_list:
            print(f"第 {page_num} 页无新闻,停止爬取")
            break
        print(f"找到 {len(news_list)} 条新闻")
        # Process each news item
        for news in news_list:
            print(f" 分析新闻: {news['title'][:40]}...")
            # Fetch the article details
            content, detailed_date = extract_news_content(news['url'])
            publish_date = detailed_date or news['publish_date']
            # Extract tech companies from title + body
            full_text = f"{news['title']}。{content}"
            tech_companies = extract_tech_companies(full_text)
            if not tech_companies:
                print(" 未识别到科技公司")
                continue
            # Look for cooperation relationships
            coop_companies = analyze_cooperation(content, tech_companies)
            if not coop_companies:
                print(" 未识别到合作关系")
                continue
            # Extract the cooperation date
            coop_date = extract_cooperation_date(content, publish_date)
            # Record the result
            all_results.append({
                "银行": "北京银行",
                "合作公司": ", ".join(coop_companies),
                "合作时间": coop_date,
                "新闻标题": news['title'],
                "新闻发布时间": publish_date,
                "新闻来源": news['source'],
                "新闻链接": news['url']
            })
            print(f" 发现合作: {', '.join(coop_companies)}")
            # Wait 1-3 seconds between articles
            time.sleep(random.uniform(1, 3))
        # Wait 3-6 seconds between pages
        time.sleep(random.uniform(3, 6))
        page_num += 1
    return all_results

def save_results(results):
    """Save the results to CSV, Excel, and JSON files."""
    if not results:
        print("未找到符合条件的合作信息")
        return None
    # Create the output directory
    data_dir = "北京银行合作数据"
    os.makedirs(data_dir, exist_ok=True)
    # Convert to a DataFrame
    df = pd.DataFrame(results)
    # CSV
    csv_path = os.path.join(data_dir, "北京银行_科技公司合作.csv")
    df.to_csv(csv_path, index=False, encoding='utf-8-sig')
    # Excel
    excel_path = os.path.join(data_dir, "北京银行_科技公司合作.xlsx")
    df.to_excel(excel_path, index=False)
    # Raw data as JSON
    json_path = os.path.join(data_dir, "原始数据.json")
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    print(f"保存成功: {csv_path}, {excel_path}, {json_path}")
    return df

def generate_report(df):
    """Generate simple analysis reports."""
    if df is None or df.empty:
        return
    report_dir = "分析报告"
    os.makedirs(report_dir, exist_ok=True)
    # Per-company statistics: explode the comma-separated company list
    df['合作年份'] = df['合作时间'].str.extract(r'(\d{4})')[0]
    company_stats = df.assign(合作公司=df['合作公司'].str.split(', ')).explode('合作公司')
    # Ranking by number of cooperations per company
    company_count = company_stats['合作公司'].value_counts().reset_index()
    company_count.columns = ['公司名称', '合作次数']
    company_count.to_csv(os.path.join(report_dir, '公司合作次数排名.csv'), index=False, encoding='utf-8-sig')
    # Yearly cooperation trend
    year_count = company_stats['合作年份'].value_counts().sort_index().reset_index()
    year_count.columns = ['年份', '合作次数']
    year_count.to_csv(os.path.join(report_dir, '年度合作趋势.csv'), index=False, encoding='utf-8-sig')
    # Top 10 partner companies
    top_companies = company_stats['合作公司'].value_counts().head(10)
    print("\n热门合作科技公司TOP10:")
    print(top_companies)
    print("\n分析报告已生成在 '分析报告' 目录中")

if __name__ == "__main__":
    # Run the crawler
    start_time = time.time()
    results = scrape_all_news(max_pages=100)
    # Save the results
    df = save_results(results)
    # Generate the report
    if df is not None:
        generate_report(df)
    # Summary
    print(f"\n共获取 {len(results) if results else 0} 条合作记录")
    print(f"总耗时: {time.time() - start_time:.2f} 秒")

Why does it find nothing, and how can I fix it?
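
From the log, the crawl stops because parse_news_list matches nothing on page 1, so the first thing to establish is whether the HTML that fetch_news_list receives contains the news items at all. Below is a minimal diagnostic sketch under that assumption; the standalone headers dict and the file name debug_page.html are illustrative choices, not part of the original script:

import requests
from bs4 import BeautifulSoup

base_url = "https://www.ringdata.com/news/result"
params = {"keywords": "北京银行 科技公司 合作", "position": 1}
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36"}

# cookies=get_cookies() from the script can be added here to test the logged-in case
resp = requests.get(base_url, params=params, headers=headers, timeout=30)
print("final URL :", resp.url)        # a redirect to a login page means a valid session is required
print("status    :", resp.status_code)
print("HTML size :", len(resp.text))  # a very small body often indicates an anti-bot or placeholder page

# Check what each selector used by parse_news_list actually matches
soup = BeautifulSoup(resp.text, "html.parser")
for sel in ('.info-item', '.news-item', '.item', '.list-item'):
    print(sel, "->", len(soup.select(sel)), "matches")

# Dump the raw HTML for manual inspection
with open("debug_page.html", "w", encoding="utf-8") as f:
    f.write(resp.text)

If every selector reports 0 matches and the dumped HTML contains no news list, the results are rendered client-side or sit behind the login, so requests plus BeautifulSoup will always come back empty; in that case browser automation (Selenium/Playwright) or the site's underlying data API, if one exists, would be needed instead of parsing the static HTML.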