Using local.xml and the addLink method to customize top links

I used to customize the top links by brute force, editing the HTML in links.phtml directly. The links are actually generated by the Mage_Page_Block_Template_Links class, with each link stored as a Varien_Object. Recently I decided to stop doing it the brute-force way.

First, we should know that local.xml can override layout behaviour. You can see how around line 418 of app/code/core/Mage/Core/Model/Layout/Update.php (I'm on Magento 1.5.1.0).
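From memory, the relevant part of getFileLayoutUpdatesXml() in that file looks roughly like the trimmed excerpt below (treat the details as approximate): local.xml is appended after all of the module layout files, so whatever you put in app/design/frontend/<package>/<theme>/layout/local.xml is merged last and wins.

// app/code/core/Mage/Core/Model/Layout/Update.php -- trimmed excerpt, from memory
public function getFileLayoutUpdatesXml($area, $package, $theme, $storeId = null)
{
    $updateFiles = array();
    // ... collect the layout update files declared by each module's config.xml ...

    // custom local layout updates file - load always last
    $updateFiles[] = 'local.xml';

    // ... each file is then read from the theme's layout/ directory and
    // merged into one <layouts> tree, so updates in local.xml override the rest.
}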

<?xml version="1.0"?>
<layout version="0.1.0">
	<default>
		<reference name="root">
			<reference name="top.links">
				<action method="addLink" translate="label title">
					<label>About Us</label>
					<url>about</url>
					<title>About Us</title>
					<prepare>true</prepare>
					<position>999</position>
					<liParams/>
					<aParams>class="top-link-about-us"</aParams>
					<beforeText>id="alink"</beforeText>
					<afterText>Text</afterText>
				</action>
			</reference>
		</reference>
	</default>
</layout>

The local.xml above produces a result like this:


Something is odd, though... beforeText and afterText don't behave the way their names suggest... the output doesn't match their literal meaning... looks like I'll have to dig deeper into the core to work this out...
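My guess, after reading the block class (so treat this as an assumption rather than a confirmed fact): the layout engine passes the children of an <action> node to the method positionally, in order of appearance, not by element name, and in Magento 1.5 addLink() has roughly the signature below. Because my first local.xml omits the <urlParams> element, every argument after <prepare> shifts one slot: <position> lands in $urlParams, <liParams> in $position, <aParams> in $liParams, <beforeText> in $aParams and <afterText> in $beforeText, which would explain why the rendered beforeText/afterText look nothing like what the element names promise.

// app/code/core/Mage/Page/Block/Template/Links.php -- signature roughly as I remember it
public function addLink($label, $url = '', $title = '', $prepare = false, $urlParams = array(),
    $position = null, $liParams = null, $aParams = null, $beforeText = '', $afterText = '')
{
    // Each link ends up as a Varien_Object carrying keys such as
    // 'label', 'url', 'li_params', 'a_params', 'before_text' and 'after_text',
    // which links.phtml then renders in order of 'position'.
}

If that is right, including an empty <urlParams/> element before <position>, as the next example does, should line the arguments up again.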

If instead you use XML like this:

				<action method="removeLinkByUrl"><url helper="customer/getAccountUrl"/></action>
				
				<action method="addLink" translate="label title" module="customer">
					<label>My Custom Account</label>
					<url helper="customer/getAccountUrl"/>
					<title>My Account</title>
					<prepare/>
					<urlParams/>
					<position>10</position>
				</action>
the result turns into this:


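A quick note on the helper attribute, based on my reading of how the layout engine resolves action arguments: when an argument element carries helper="module/method", Magento calls that helper method and substitutes its return value for the argument, rather than passing the literal text. So <url helper="customer/getAccountUrl"/> effectively boils down to:

// Roughly what the layout engine does with <url helper="customer/getAccountUrl"/>:
// resolve the helper alias, call the method, and pass its return value on as
// the $url argument of addLink()/removeLinkByUrl().
$url = Mage::helper('customer')->getAccountUrl();   // e.g. http://example.com/customer/account/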
To remove a link you can either use <remove name="..."/> or the removeLinkByUrl action:

				<action method="removeLinkByUrl"><url helper="customer/getAccountUrl"/></action>
				
				<remove name="checkout_cart_link"/>
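For reference, removeLinkByUrl() in Mage_Page_Block_Template_Links is, as far as I remember, just a linear scan over the registered links, so the URL you pass in (here the one returned by the customer/getAccountUrl helper) has to match the stored URL exactly. A rough sketch of what it does:

// Mage_Page_Block_Template_Links::removeLinkByUrl() -- roughly, from memory
public function removeLinkByUrl($url)
{
    foreach ($this->_links as $key => $link) {
        if ($link->getUrl() == $url) {   // compare against each stored Varien_Object's url
            unset($this->_links[$key]);
        }
    }
    return $this;
}

<remove name="checkout_cart_link"/>, by contrast, drops the whole child block by its layout name, so it works even when you don't know (or can't easily compute) the link's URL.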

This makes things about as flexible as they can be for modules that rely on the top links. Right then, time to start on the AJAX-based login/logout; this should come in handy there...