import argparse
import csv
import hashlib
import logging
import random
from collections import Counter, namedtuple
from functools import lru_cache
from urllib.parse import urlparse

import requests
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Basic logging configuration: write INFO-level messages to search_log.log.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='search_log.log',
    filemode='w',
)

# Baidu Translate API settings; replace with your own credentials.
BAIDU_TRANSLATE_APPID = "YOUR_APPID"
BAIDU_TRANSLATE_SECRET_KEY = "YOUR_SECRET_KEY"
BAIDU_TRANSLATE_URL = "https://fanyi-api.baidu.com/api/trans/vip/translate"

# Placeholder keyword-expansion API settings (replace with a real service).
KEYWORD_EXPAND_API_URL = "https://example.com/keyword_expand"
KEYWORD_EXPAND_API_KEY = "YOUR_KEYWORD_EXPAND_API_KEY"

# A few common browser User-Agent strings.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Firefox/89.0.2 Safari/537.36",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Mobile/15E148 Safari/604.1",
]

# Supported content-type categories for filtering.
CONTENT_TYPES = ["news", "blog", "forum"]

# Container for one search result, similar to a GoogleResult record.
SearchResult = namedtuple('SearchResult', ['title', 'text', 'url', 'source_type'])


@lru_cache(maxsize=128)
def search(keywords, num_results=10, filter_domain=None, engine="google",
           target_lang="en", filter_content_type=None):
    """
    Query a search engine through SerpApi, translate each result's title and
    snippet into the target language, and optionally filter by domain and
    content type.

    :param keywords: tuple of search keywords (original or expanded); must be
        a tuple, not a list, so the call is hashable and lru_cache can memoize it
    :param num_results: maximum number of results to return, default 10
    :param filter_domain: only keep results whose URL contains this domain;
        None disables domain filtering
    :param engine: search engine name, e.g. "google", "baidu", "bing"
        (must be supported by SerpApi)
    :param target_lang: target translation language code, e.g. "en" for English
        (must be a language code Baidu Translate supports)
    :param filter_content_type: only keep results of this content type,
        e.g. "news"; None disables content-type filtering
    :return: deduplicated, relevance-sorted list of SearchResult records with
        translated titles and snippets
    """
    api_key = "YOUR_API_KEY"  # replace with your own SerpApi API key
    combined_keyword = " ".join(keywords)
    params = {
        "api_key": api_key,
        "engine": engine,
        "q": combined_keyword,
    }
    headers = {
        "User-Agent": random.choice(USER_AGENTS)  # rotate User-Agents across requests
    }
    logger = logging.getLogger(__name__)
    logger.info(
        f"Search request - keywords: {combined_keyword}, limit: {num_results}, "
        f"domain filter: {filter_domain}, engine: {engine}, "
        f"target language: {target_lang}, content-type filter: {filter_content_type}"
    )
    try:
        response = requests.get("https://serpapi.com/search", params=params,
                                timeout=5, headers=headers)
        response.raise_for_status()  # raise on HTTP error status
        data = response.json()
        results = []
        seen_urls = set()  # URLs already collected, for deduplication
        if "organic_results" in data:
            # Assumes each result carries a 'relevance_score' field expressing
            # relevance; adjust to match the real response schema.
            sorted_results = sorted(data["organic_results"],
                                    key=lambda x: x.get("relevance_score", 0),
                                    reverse=True)
            for result in sorted_results:
                title = result.get("title", "")
                text = result.get("snippet", "")
                source_type = result.get("source_type", "")  # source-type field
                url = result.get("link", "")
                # Apply the optional domain and content-type filters, then
                # deduplicate by URL, before paying for any translation calls.
                if filter_domain and filter_domain not in url:
                    continue
                if filter_content_type and source_type != filter_content_type:
                    continue
                if url in seen_urls:
                    continue
                # Translate the title and snippet into the target language.
                translated_title = translate_text(title, target_lang)
                translated_text = translate_text(text, target_lang)
                results.append(SearchResult(translated_title, translated_text,
                                            url, source_type))
                seen_urls.add(url)
                if len(results) >= num_results:  # stop once we have enough kept results
                    break
        return results
    except requests.exceptions.Timeout:
        logger.error("Request timed out; check your network connection and retry")
        return []
    except requests.RequestException as e:
        logger.error(f"Request failed: {e}")
        return []
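
# A minimal usage sketch for search() (hypothetical keywords; assumes a valid
# SerpApi key has been filled in above). Note the tuple argument: lru_cache
# requires every argument to be hashable.
#
#   hits = search(("python", "tutorial"), num_results=5, engine="google")
#   for hit in hits:
#       print(hit.title, hit.url)
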
def translate_text(text, target_lang):
    """
    Translate text into the target language via the Baidu Translate API.

    :param text: text to translate
    :param target_lang: target language code, e.g. "en" for English
    :return: translated text, or the original text if the request fails
    """
    salt = str(random.randint(32768, 65536))  # random salt, as the signing scheme expects
    sign = generate_sign(BAIDU_TRANSLATE_APPID, text, salt, BAIDU_TRANSLATE_SECRET_KEY)
    params = {
        "q": text,
        "from": "auto",
        "to": target_lang,
        "appid": BAIDU_TRANSLATE_APPID,
        "salt": salt,
        "sign": sign,
    }
    logger = logging.getLogger(__name__)
    try:
        response = requests.get(BAIDU_TRANSLATE_URL, params=params, timeout=5)
        response.raise_for_status()
        data = response.json()
        if "trans_result" not in data:  # the API reports errors in the response body
            logger.error(f"Translation failed: {data}")
            return text
        return "".join(trans['dst'] for trans in data['trans_result'])
    except requests.RequestException as e:
        logger.error(f"Translation request failed: {e}")
        return text


def generate_sign(appid, query, salt, secret_key):
    """
    Build the request signature for the Baidu Translate API.

    :param appid: Baidu Translate APPID
    :param query: text to translate
    :param salt: salt value (must match the 'salt' request parameter)
    :param secret_key: Baidu Translate secret key
    :return: the generated signature
    """
    sign_str = appid + query + salt + secret_key
    md5 = hashlib.md5()
    md5.update(sign_str.encode('utf-8'))
    return md5.hexdigest()
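
# Example of the signing scheme (hypothetical values, not real credentials):
#
#   generate_sign("2015063000000001", "apple", "1435660288", "12345678")
#
# returns the 32-character lowercase MD5 hex digest of the concatenation
# appid + query + salt + secret_key, which Baidu recomputes server-side to
# authenticate the request.
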
def expand_keywords(keywords):
    """
    Call a keyword-expansion API to get an expanded keyword list
    (replace the placeholder URL and key with a real, working service).

    :param keywords: original keyword list
    :return: expanded keyword list, or the original list if the request fails
    """
    params = {
        "api_key": KEYWORD_EXPAND_API_KEY,
        "keywords": keywords,
    }
    try:
        response = requests.get(KEYWORD_EXPAND_API_URL, params=params, timeout=5)
        response.raise_for_status()
        data = response.json()
        # Assumes the response carries an 'expanded_keywords' field; fall back
        # to the original keywords if it does not.
        return data.get("expanded_keywords", keywords)
    except requests.RequestException as e:
        logger = logging.getLogger(__name__)
        logger.error(f"Keyword-expansion request failed: {e}")
        return keywords


def generate_wordcloud(text):
    """
    Render a word cloud from the given text.

    :param text: text combining titles, snippets, etc.
    """
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()


def main():
    parser = argparse.ArgumentParser(
        description='Search using keywords on different search engines and translate results')
    parser.add_argument('-k', '--keywords', nargs='+', type=str, required=True,
                        help='specify search keywords')
    parser.add_argument('-n', '--num_results', type=int, default=10,
                        help='specify the number of results to retrieve')
    parser.add_argument('-d', '--domain', type=str,
                        help='specify the domain to filter results')
    parser.add_argument('-e', '--engine', type=str, default="google",
                        choices=["google", "baidu", "bing"],
                        help='specify the search engine')
    parser.add_argument('-l', '--target_lang', type=str, default="en",
                        help='specify the target language for translation')
    parser.add_argument('-t', '--content_type', type=str, choices=CONTENT_TYPES,
                        help='specify the content type to filter results')
    parser.add_argument('-f', '--file', type=str, default='results.txt',
                        help='specify the file name to save results')
    args = parser.parse_args()

    keywords = args.keywords
    expanded_keywords = expand_keywords(keywords)  # fetch the expanded keywords
    # Pass a tuple so the lru_cache-decorated search() can hash its arguments.
    results = search(tuple(expanded_keywords), args.num_results, args.domain,
                     args.engine, args.target_lang, args.content_type)

    # Simple result statistics: count how often each domain appears.
    domain_counter = Counter(urlparse(result.url).netloc
                             for result in results if result.url)
    logger = logging.getLogger(__name__)
    for domain, count in domain_counter.items():
        logger.info(f"Domain {domain} appeared {count} time(s)")

    # Combine all titles and snippets to build the word cloud
    # (WordCloud raises ValueError on empty input, so guard against it).
    all_text = " ".join(f"{result.title} {result.text}" for result in results)
    if all_text.strip():
        generate_wordcloud(all_text)

    # Save as CSV if the file name asks for it; otherwise use the plain-text format.
    file_name = args.file
    if file_name.endswith('.csv'):
        save_results_as_csv(results, file_name)
    else:
        with open(file_name, 'w', encoding='utf-8') as f:
            for result in results:
                highlighted_title = highlight_keywords(result.title, keywords)
                highlighted_text = highlight_keywords(result.text, keywords)
                f.write(f"Title: {highlighted_title}\n")
                f.write(f"Text: {highlighted_text}\n")
                f.write(f"URL: {result.url}\n")
                f.write("-" * 50 + "\n")
    print(f"Results have been saved to {file_name}")


def highlight_keywords(text, keywords):
    """
    Highlight keywords in the text (simulated here by wrapping them in <b> tags).

    :param text: text to process
    :param keywords: keyword list
    :return: processed text
    """
    for keyword in keywords:
        text = text.replace(keyword, f"<b>{keyword}</b>")
    return text


def save_results_as_csv(results, file_name):
    """
    Save the search results as a CSV file.

    :param results: list of SearchResult records
    :param file_name: output file name
    """
    fieldnames = ['Title', 'Text', 'URL', 'Source Type']
    with open(file_name, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'Title': result.title,
                'Text': result.text,
                'URL': result.url,
                'Source Type': result.source_type,
            })


if __name__ == "__main__":
    main()
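
# Example invocations (the script name "search_tool.py" is hypothetical;
# assumes the API keys above have been filled in):
#
#   python search_tool.py -k python tutorial -n 5 -e google -l en
#   python search_tool.py -k "machine learning" -d example.com -t news -f results.csv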