import json
import requests
import os
import logging
from datetime import datetime, timedelta
import pytz
from urllib.parse import quote
import time
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
# Configure logging: every record is written to data_fetcher.log and echoed
# to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        # The log messages are Chinese; pin the file to UTF-8 so writing them
        # does not depend on the platform's locale encoding.
        logging.FileHandler("data_fetcher.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
def _split_song_text(text):
    """Split a "<song name> <artist>" string at its first space.

    Surrounding whitespace is trimmed first so a stray leading or doubled
    space does not end up as an empty song name or a padded artist.

    Returns:
        tuple: ``(name, artist)``; either part is ``""`` when absent.
    """
    name, _, artist = (text or "").strip().partition(' ')
    return name, artist.strip()


def _build_broadcast_record(fields):
    """Convert the ``fields`` dict of one Bitable record into a flat record dict."""
    time_segment = extract_time_segment(fields, '时间段')
    song1_text = extract_text_field(fields, '需更新文案')
    song2_text = extract_text_field(fields, '需更新文案2')

    if time_segment == "08:10-08:15":
        # For this slot the copy is not split into "song artist", so no song
        # fields are derived from it.
        song1_name = song1_artist = song2_name = song2_artist = ""
    else:
        song1_name, song1_artist = _split_song_text(song1_text)
        song2_name, song2_artist = _split_song_text(song2_text)

    return {
        "播音日期": extract_broadcast_date(fields, '播音日期'),
        "时间段": time_segment,
        "开播音乐file_token": extract_file_token(fields, '开播音乐'),
        "开场白-播报file_token": extract_file_token(fields, '开场白-播报'),
        "需更新文案": song1_text,
        "需更新文案2": song2_text,
        "壹歌名": song1_name,
        "壹歌手": song1_artist,
        "贰歌名": song2_name,
        "贰歌手": song2_artist,
        "结束语-播报file_token": extract_file_token(fields, '结束语-播报'),
        "结束音乐file_token": extract_file_token(fields, '结束音乐')
    }


def get_broadcast_data_with_token(tenant_access_token):
    """Fetch all broadcast records from the Feishu Bitable table.

    Args:
        tenant_access_token: Bearer token returned by ``get_auth_token``.

    Returns:
        list: One dict per record (broadcast date, time segment, attachment
        file tokens, the two copy texts and the song name/artist split out of
        them). An empty list is returned when the request fails, the API
        reports a non-zero ``code``, or any other error occurs; the error is
        logged rather than raised.
    """
    url = 'https://open.feishu.cn/open-apis/bitable/v1/apps/E1zybPqiqa0TaesZjKKch5ZcnJd/tables/tblwFY4k3pmrV5WK/records/search'
    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {tenant_access_token}'
    }
    search_body = {}  # filter / sort conditions would go here
    # The search endpoint is paginated: ask for the largest page and keep
    # following page_token until the API says there is nothing more.
    query = {'page_size': 500}
    items = []

    try:
        logger.info(f"正在请求飞书数据,URL: {url}")
        while True:
            response = requests.post(url, headers=headers, params=query, json=search_body, timeout=30)
            response.raise_for_status()
            response_dict = response.json()

            # A business-level failure is reported through "code" in the body.
            if response_dict.get("code", 0) != 0:
                logger.error(f"请求失败:{response_dict.get('msg')}(错误码:{response_dict.get('code')})")
                return []

            page = response_dict.get("data") or {}
            items.extend(page.get("items") or [])

            next_token = page.get("page_token")
            if not page.get("has_more") or not next_token:
                break
            query['page_token'] = next_token

        logger.info(f"成功获取飞书数据,共 {len(items)} 条记录")
        return [_build_broadcast_record(item.get("fields") or {}) for item in items]
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP 错误发生: {http_err}")
    except requests.exceptions.Timeout:
        logger.error("请求超时,服务器响应时间过长")
    except requests.exceptions.ConnectionError:
        logger.error("连接错误,无法连接到服务器")
    except Exception as err:
        logger.error(f"其他错误发生: {err}", exc_info=True)
    return []
def extract_file_token(fields, field_name):
    """Return the ``file_token`` of the first entry stored under *field_name*.

    Args:
        fields: Mapping of field name to field value for one record.
        field_name: Name of the attachment field to read.

    Returns:
        str: The first entry's ``file_token``, or ``''`` when the field is
        missing, is not a non-empty list, or its first entry is not a dict.
    """
    entries = fields.get(field_name, [])
    if not isinstance(entries, list) or not entries:
        return ''
    first_entry = entries[0]
    if not isinstance(first_entry, dict):
        return ''
    return first_entry.get("file_token", "")
def extract_text_field(fields, field_name):
    """Return the full text stored under *field_name*.

    A text field is delivered as a list of segment dicts, each carrying a
    ``text`` key. All segments are concatenated so that text split across
    several segments is not truncated to its first piece. A plain string
    value is returned unchanged.

    Args:
        fields: Mapping of field name to field value for one record.
        field_name: Name of the text field to read.

    Returns:
        str: The concatenated text, or ``''`` when the field is missing,
        empty, or holds no usable text segment.
    """
    field_data = fields.get(field_name, [])
    if isinstance(field_data, str):
        return field_data
    if not isinstance(field_data, list):
        return ''
    pieces = [
        segment["text"]
        for segment in field_data
        if isinstance(segment, dict) and isinstance(segment.get("text"), str)
    ]
    return ''.join(pieces)
def extract_time_segment(fields, field_name):
    """Return the time-segment text stored under *field_name*.

    Args:
        fields: Mapping of field name to field value for one record.
        field_name: Name of the time-segment field to read.

    Returns:
        The ``text`` of the first list entry (``""`` if that entry has no
        ``text`` key), or ``None`` when the field is missing, is not a
        non-empty list, or its first entry is not a dict.
    """
    segments = fields.get(field_name, [])
    has_entries = isinstance(segments, list) and len(segments) > 0
    if has_entries and isinstance(segments[0], dict):
        return segments[0].get("text", "")
    return None
def extract_broadcast_date(fields, field_name):
    """Return the broadcast date under *field_name* as ``"YYYY-MM-DD"``.

    The value is read as a millisecond Unix timestamp and rendered in the
    Asia/Shanghai time zone.

    Args:
        fields: Mapping of field name to field value for one record.
        field_name: Name of the date field to read.

    Returns:
        str | None: The formatted date, or ``None`` when the field is
        missing, is not a number, or cannot be converted.
    """
    field_data = fields.get(field_name)
    # A missing field must yield None rather than the epoch date, and bool is
    # a subclass of int, so it has to be rejected explicitly.
    if isinstance(field_data, bool) or not isinstance(field_data, (int, float)):
        return None
    try:
        timestamp = field_data / 1000  # milliseconds -> seconds
        parsed_date = datetime.fromtimestamp(timestamp, tz=pytz.utc).astimezone(pytz.timezone('Asia/Shanghai'))
    except (ValueError, OverflowError, OSError):
        # fromtimestamp raises one of these for out-of-range values.
        return None
    return parsed_date.strftime("%Y-%m-%d")
def get_auth_token():
    """Request a Feishu ``tenant_access_token`` for the internal app.

    The app credentials are taken from the ``FEISHU_APP_ID`` and
    ``FEISHU_APP_SECRET`` environment variables when they are set.

    Returns:
        str | None: The token on success; ``None`` when the request fails or
        the API answers with a non-zero ``code`` (the error is logged).
    """
    url = "https://open.feishu.cn/open-apis/auth/v3/tenant_access_token/internal"
    headers = {"Content-Type": "application/json; charset=utf-8"}
    # NOTE(security): the literal fallbacks below are credentials committed to
    # source. They are kept only so existing setups keep working; rotate the
    # secret and supply both values through the environment instead.
    payload = {
        "app_id": os.environ.get("FEISHU_APP_ID", "cli_a882683e8779d00c"),
        "app_secret": os.environ.get("FEISHU_APP_SECRET", "3NKkALA7vyMRVnpKJinmrb1LJ7YuK4H0"),
    }
    try:
        logger.info("正在获取认证token")
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        data = response.json()
        if data.get("code") == 0:
            logger.info("成功获取认证token")
            return data.get("tenant_access_token")
        logger.error(f"请求失败:{data.get('msg')}(错误码:{data.get('code')})")
    except requests.exceptions.HTTPError as http_err:
        logger.error(f"HTTP 错误发生: {http_err}")
    except requests.exceptions.Timeout:
        logger.error("获取token超时")
    except requests.exceptions.ConnectionError:
        logger.error("连接错误,无法获取token")
    except Exception as e:
        logger.error(f"获取token异常:{e}", exc_info=True)
    return None
def display_data(data):
    """Print every fetched broadcast record to stdout.

    Args:
        data: List of record dicts. When it is empty or ``None`` a single
            "no data" line is printed instead.

    Non-empty values whose key ends in ``file_token`` are shortened to their
    first 10 characters followed by ``...`` to keep the output readable.
    """
    if not data:
        print("没有获取到任何数据")
        return

    print(f"共获取到 {len(data)} 条广播数据记录:")
    for index, record in enumerate(data, start=1):
        print(f"\n--- 记录 {index} ---")
        for field, content in record.items():
            shorten = field.endswith('file_token') and content
            shown = f"{content[:10]}..." if shorten else content
            print(f"{field}: {shown}")
def _write_page_source(driver, file_name):
    """Dump the browser's current page source to *file_name* for debugging.

    Returns True when the file was written. A failure is reported and
    swallowed so that a diagnostic dump cannot mask the original problem.
    """
    try:
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
    except Exception as dump_err:
        print(f"保存页面源码失败: {dump_err}")
        return False
    return True


def download_song(song_name, artist=None, save_path="./downloads"):
    """Search gequbao.com for a song and download its MP3 with Edge.

    The first search hit is used. Its detail page is opened, the href of the
    MP3 download button is read and visited directly, and the function then
    waits a fixed time for the browser to finish the download. If that fails,
    the ``/api/play/<id>`` URL is tried as a fallback.

    Args:
        song_name (str): Song title to search for.
        artist (str, optional): Artist name appended to the search query.
        save_path (str, optional): Directory the browser downloads into;
            created if missing. Defaults to "./downloads".

    Returns:
        None. Progress and errors are printed; failures are not raised.
    """
    # Imported here so this function does not rely on the file-level imports.
    from selenium.common.exceptions import TimeoutException

    # The browser's download-directory preference is expected to be an
    # absolute path, so resolve it before it is created and handed over.
    save_path = os.path.abspath(save_path)
    os.makedirs(save_path, exist_ok=True)

    search_query = song_name
    if artist:
        search_query += f" {artist}"
    print(f"正在搜索歌曲: {search_query}")

    edge_options = Options()
    edge_options.add_argument('--disable-gpu')
    edge_options.add_argument('--no-sandbox')
    edge_options.add_argument('--disable-dev-shm-usage')
    prefs = {
        "download.default_directory": save_path,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    }
    edge_options.add_experimental_option("prefs", prefs)

    edge_driver_path = r"C:\Users\shaopeng.qi\Downloads\edgedriver_win64 (1)\msedgedriver.exe"
    if not os.path.exists(edge_driver_path):
        print(f"错误: EdgeDriver路径不存在 - {edge_driver_path}")
        print("请下载与您Edge浏览器版本匹配的EdgeDriver,并更新路径")
        print("下载地址: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
        return

    service = Service(executable_path=edge_driver_path)
    service.log_path = "edgedriver.log"
    driver = webdriver.Edge(service=service, options=edge_options)

    try:
        search_url = f"https://www.gequbao.com/s/{quote(search_query)}.html"
        print(f"正在访问搜索结果页面: {search_url}")
        driver.get(search_url)
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".card-body"))
        )

        # Probe for the "no results" banner; timing out means there are hits.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(text(), '没有找到相关歌曲')]"))
            )
        except TimeoutException:
            pass
        else:
            print("未找到匹配的歌曲")
            return

        # Scroll to the bottom so that lazily loaded entries are present.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        print("正在查找歌曲链接...")
        song_links = WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".card a.music-link"))
        )
        if not song_links:
            print("未找到歌曲链接")
            if _write_page_source(driver, "error_page.html"):
                print("已保存当前页面源码到error_page.html,可用于分析问题")
            return

        first_song_link = song_links[0].get_attribute("href")
        print(f"找到歌曲链接: {first_song_link}")

        match = re.search(r'/music/(\d+)', first_song_link)
        if not match:
            print("无法从链接中提取歌曲ID")
            if _write_page_source(driver, "error_page.html"):
                print("已保存当前页面源码到error_page.html,可用于分析问题")
            return
        song_id = match.group(1)
        print(f"提取的歌曲ID: {song_id}")

        song_detail_url = f"https://www.gequbao.com/music/{song_id}"
        print(f"正在访问歌曲详情页: {song_detail_url}")
        driver.get(song_detail_url)

        print("等待页面加载完成...")
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".song-title"))
        )

        # Remember what is already in the folder so that only a file created
        # by this download is reported as the result.
        files_before = set(os.listdir(save_path))

        print("正在查找并点击下载按钮...")
        try:
            download_btn = WebDriverWait(driver, 15).until(
                EC.element_to_be_clickable((By.ID, "btn-download-mp3"))
            )
            print("找到下载按钮")

            download_url = download_btn.get_attribute("href")
            print(f"下载链接: {download_url}")

            # Visit the link directly instead of clicking, which skips the popup.
            print("正在直接访问下载链接...")
            driver.get(download_url)

            print("等待下载完成...")
            time.sleep(15)

            print("检查下载目录...")
            files = os.listdir(save_path)
            print(f"目录内容: {files}")

            downloaded_files = [
                f for f in files
                if f.endswith('.mp3') and f not in files_before
            ]
            if downloaded_files:
                print(f"成功下载: {downloaded_files[0]}")
            else:
                print("未找到MP3文件,但下载可能已完成")
        except Exception as e:
            print(f"点击下载按钮时出错: {e}")
            # Fallback: request the play API URL for this song directly.
            print("尝试直接访问下载链接...")
            direct_download_url = f"https://www.gequbao.com/api/play/{song_id}"
            print(f"直接下载URL: {direct_download_url}")
            driver.get(direct_download_url)
            time.sleep(15)
    except Exception as e:
        print(f"操作过程中发生错误: {e}")
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        error_file = f"error_page_{timestamp}.html"
        if _write_page_source(driver, error_file):
            print(f"已保存当前页面源码到 {error_file},可用于分析问题")
    finally:
        driver.quit()
        print("浏览器已关闭")
# Characters Windows does not allow in file names, each mapped to "_".
_WINDOWS_FORBIDDEN_CHARS = str.maketrans({ch: '_' for ch in ':\\/*?"<>|'})


def sanitize_filename(filename):
    """Replace the characters Windows forbids in file names with underscores.

    Args:
        filename (str): Original file name. ``None`` is treated as an empty
            name and any other non-string value is converted with ``str``.

    Returns:
        str: The name with each of ``: \\ / * ? " < > |`` replaced by ``_``.
    """
    if filename is None:
        return ""
    # One translate pass replaces the whole character set at once.
    return str(filename).translate(_WINDOWS_FORBIDDEN_CHARS)
def download_songs_from_feishu(data, base_download_path="C:\\Users\\shaopeng.qi\\bobao"):
    """Download the two songs of every Feishu broadcast record.

    Each song is saved into ``<base_download_path>/<date>_<time segment>``.
    A song is skipped when its name or its artist is empty.

    Args:
        data (list): Record dicts as produced by
            ``get_broadcast_data_with_token``.
        base_download_path (str): Root directory for the per-record folders;
            created if missing.

    Returns:
        None.
    """
    if not data:
        print("没有数据可用于下载歌曲")
        return

    os.makedirs(base_download_path, exist_ok=True)

    # (label used in messages, song-name key, artist key) for both songs.
    song_slots = (
        ("第一首歌", '壹歌名', '壹歌手'),
        ("第二首歌", '贰歌名', '贰歌手'),
    )

    for i, record in enumerate(data, 1):
        print(f"\n--- 处理记录 {i}/{len(data)} ---")
        print(f"播音日期: {record['播音日期']}, 时间段: {record['时间段']}")

        for label, name_key, artist_key in song_slots:
            song_name = record[name_key]
            song_artist = record[artist_key]
            if not (song_name and song_artist):
                print(f"\n{label}信息不完整,跳过下载")
                continue

            print(f"\n下载{label}: {song_name} - {song_artist}")
            # The date or time segment can be None when the field was missing;
            # use a placeholder instead of failing on the whole batch.
            safe_date = sanitize_filename(record['播音日期'] or "unknown")
            safe_time = sanitize_filename(record['时间段'] or "unknown")
            save_dir = os.path.join(base_download_path, f"{safe_date}_{safe_time}")
            download_song(song_name, song_artist, save_dir)
def main():
    """Run the pipeline: authenticate, fetch records, show and save them, download songs.

    Stops early when no access token can be obtained. The fetched records are
    written to ``broadcast_data.json``; a failure to write that file is logged
    and does not stop the download step.
    """
    logger.info("===== 飞书数据获取与歌曲下载程序启动 =====")

    token = get_auth_token()
    if not token:
        logger.error("获取认证token失败,程序退出")
        return

    data = get_broadcast_data_with_token(token)
    display_data(data)

    # Echo the songs of the first record as a quick sanity check.
    if data:
        head = data[0]
        print("\n歌曲信息:")
        for label, name_key, artist_key in (
            ("第一首歌", '壹歌名', '壹歌手'),
            ("第二首歌", '贰歌名', '贰歌手'),
        ):
            print(f"{label}: {head[name_key]} - {head[artist_key]}")

    try:
        with open('broadcast_data.json', 'w', encoding='utf-8') as out_file:
            json.dump(data, out_file, ensure_ascii=False, indent=2)
        print("\n数据已保存到 broadcast_data.json 文件")
    except Exception as save_err:
        logger.error(f"保存数据失败: {save_err}")

    print("\n开始下载歌曲...")
    download_songs_from_feishu(data)
    logger.info("===== 飞书数据获取与歌曲下载程序结束 =====")


if __name__ == "__main__":
    main()
上面代码中的“壹歌名”和“贰歌名”对应下面代码中 song_name 的取值,“壹歌手”和“贰歌手”对应下面代码中 artist 的取值。请用尽量少的改动把上下两段代码合并在一起,并将最后下载的第一首歌命名为 file_4,第二首歌命名为 file_6。
import os
import requests
from urllib.parse import quote
import time
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import re
def _write_page_source(driver, file_name):
    """Dump the browser's current page source to *file_name* for debugging.

    Returns True when the file was written. A failure is reported and
    swallowed so that a diagnostic dump cannot mask the original problem.
    """
    try:
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(driver.page_source)
    except Exception as dump_err:
        print(f"保存页面源码失败: {dump_err}")
        return False
    return True


def download_song(song_name, artist=None, save_path="./downloads"):
    """Search gequbao.com for a song and download its low-quality MP3 with Edge.

    The first search hit is used. Its detail page is opened in a new window
    (or in the same one if that fails), the download button is clicked, and
    the second entry of the quality dialog is chosen. The function then waits
    a fixed time for the browser to download. The browser is always closed
    before returning.

    Args:
        song_name (str): Song title to search for.
        artist (str, optional): Artist name appended to the search query.
        save_path (str, optional): Directory the browser downloads into;
            created if missing. Defaults to "./downloads".

    Returns:
        None. Progress and errors are printed; failures are not raised.
    """
    # Imported here so this function does not rely on the file-level imports.
    from selenium.common.exceptions import TimeoutException

    # The browser's download-directory preference is expected to be an
    # absolute path, so resolve it before it is created and handed over.
    save_path = os.path.abspath(save_path)
    os.makedirs(save_path, exist_ok=True)

    search_query = song_name
    if artist:
        search_query += f" {artist}"
    print(f"正在搜索歌曲: {search_query}")

    edge_options = Options()
    edge_options.add_argument('--disable-gpu')
    edge_options.add_argument('--no-sandbox')
    edge_options.add_argument('--disable-dev-shm-usage')
    # Without these preferences the browser ignores save_path and uses its
    # own default download folder.
    edge_options.add_experimental_option("prefs", {
        "download.default_directory": save_path,
        "download.prompt_for_download": False,
        "download.directory_upgrade": True,
        "safebrowsing.enabled": True
    })

    edge_driver_path = r"C:\Users\shaopeng.qi\Downloads\edgedriver_win64 (1)\msedgedriver.exe"
    if not os.path.exists(edge_driver_path):
        print(f"错误: EdgeDriver路径不存在 - {edge_driver_path}")
        print("请下载与您Edge浏览器版本匹配的EdgeDriver,并更新路径")
        print("下载地址: https://developer.microsoft.com/en-us/microsoft-edge/tools/webdriver/")
        return

    service = Service(executable_path=edge_driver_path)
    service.log_path = "edgedriver.log"
    driver = webdriver.Edge(service=service, options=edge_options)

    try:
        search_url = f"https://www.gequbao.com/s/{quote(search_query)}.html"
        print(f"正在访问搜索结果页面: {search_url}")
        driver.get(search_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".card-body"))
        )

        # Probe for the "no results" banner; timing out means there are hits.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(text(), '没有找到相关歌曲')]"))
            )
        except TimeoutException:
            pass
        else:
            print("未找到匹配的歌曲")
            return

        # Scroll to the bottom so that lazily loaded entries are present.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)

        print("正在查找歌曲链接...")
        song_links = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".music-link"))
        )
        if not song_links:
            print("未找到歌曲链接")
            if _write_page_source(driver, "error_page.html"):
                print("已保存当前页面源码到error_page.html,可用于分析问题")
            return

        first_song_link = song_links[0].get_attribute("href")
        print(f"找到歌曲链接: {first_song_link}")

        match = re.search(r'/music/(\d+)', first_song_link)
        if not match:
            print("无法从链接中提取歌曲ID")
            if _write_page_source(driver, "error_page.html"):
                print("已保存当前页面源码到error_page.html,可用于分析问题")
            return
        song_id = match.group(1)
        print(f"提取的歌曲ID: {song_id}")

        song_detail_url = f"https://www.gequbao.com/music/{song_id}"
        print(f"正在访问歌曲详情页: {song_detail_url}")

        # Try to open the detail page in a new window and switch to it.
        driver.execute_script(f"window.open('{song_detail_url}');")
        time.sleep(2)
        if len(driver.window_handles) > 1:
            for window in driver.window_handles:
                if window != driver.current_window_handle:
                    driver.switch_to.window(window)
                    print("已切换到歌曲详情页")
                    break
        else:
            print("未成功打开歌曲详情页,尝试直接访问")
            driver.get(song_detail_url)
            print("已直接访问歌曲详情页")

        print("等待页面加载完成...")
        time.sleep(5)

        # Confirm the detail page: look for the title first, then fall back
        # to any link whose text contains "下载".
        try:
            song_title = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".song-title"))
            )
            print(f"已进入歌曲详情页,歌曲标题: {song_title.text}")
        except Exception:
            try:
                WebDriverWait(driver, 3).until(
                    EC.presence_of_element_located((By.XPATH, "//a[contains(text(), '下载')]"))
                )
                print("已进入歌曲详情页,找到下载按钮")
            except Exception:
                print("无法确认是否在歌曲详情页")
                if _write_page_source(driver, "error_page.html"):
                    print("已保存当前页面源码到error_page.html,可用于分析问题")
                return

        print("正在查找并点击下载按钮...")
        try:
            download_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//*[@id=\"btn-download-mp3\"]"))
            )
            print("找到下载按钮")
            download_btn.click()
            print("已点击下载按钮")

            print("等待下载对话框出现...")
            time.sleep(3)

            print("正在查找并点击低品质MP3按钮...")
            try:
                # The dialog id has a dynamic suffix, hence the prefix match.
                low_quality_btn = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, "//*[starts-with(@id, 'jconfirm-box')]/div/ul/li[2]/a"))
                )
                print("找到低品质MP3按钮")
                low_quality_btn.click()
                print("已点击低品质MP3按钮,开始下载歌曲")

                print("程序将暂停10秒,等待下载开始...")
                time.sleep(10)
                print("程序已完成,浏览器将自动关闭...")
                print("等待下载完成...")
                time.sleep(5)
                return
            except Exception as e:
                print(f"点击低品质MP3按钮时出错: {e}")
                if _write_page_source(driver, "error_page.html"):
                    print("已保存当前页面源码到error_page.html,可用于分析问题")
                return
        except Exception as e:
            print(f"点击下载按钮时出错: {e}")
            if _write_page_source(driver, "error_page.html"):
                print("已保存当前页面源码到error_page.html,可用于分析问题")
            return
    except Exception as e:
        print(f"操作过程中发生错误: {e}")
        if _write_page_source(driver, "error_page.html"):
            print("已保存当前页面源码到error_page.html,可用于分析问题")
    finally:
        # Close the browser on every exit path, not only after a successful
        # download, so failed runs do not leave Edge processes behind.
        driver.quit()
        print("浏览器已关闭")
if __name__ == "__main__":
    # Prompt for the song; an empty artist answer means "no artist given".
    song_name = input("请输入歌曲名称: ")
    artist = input("请输入歌手名称 (可选,直接回车跳过): ") or None
    download_song(song_name, artist)
最新发布