import datetime
import io
import logging
import os

import pandas as pd

# Note: ParserTools and app_sycm_download are project-local modules and are
# assumed to be imported from the surrounding package.


class SYCNMSpider:
def __init__(self, gradio_app, mapping_path):
        self.__gr = gradio_app  # handle to the gradio module
        self.mapping_flows_data = self.init_mapping_flows(mapping_path)
        self.__parser_tools = ParserTools(gradio_app)
@staticmethod
def init_mapping_flows(flows_path):
"""
初始化映射表
"""
df = pd.read_excel(flows_path, header=[0, 1, 2])
# 填充空值
df.fillna(method='ffill', inplace=True)
new_flows = df['新旧版流量对照']['新版']
old_flows = df['新旧版流量对照']['旧版']
mapping_flows = []
for x, y in zip(new_flows.values, old_flows.values):
# 映射数据
new_flows = {"level_one_flows": x[0], "level_two_flows": x[1], "level_three_flows": x[2],
"leaf_flows": x[3]}
old_flows = {"level_one_flows": y[0], "level_two_flows": y[1], "level_three_flows": y[2],
"leaf_flows": y[3]}
# 添加到映射表
mapping_flows.append({"new_flows": new_flows, "old_flows": old_flows})
return mapping_flows
@staticmethod
    def search_mapping_flows(mapping_flows, res_return=False, leaf_flows_name="", mapping_new_or_old="new_flows"):
        """
        Look up mapping entries by leaf traffic-source name.

        :param mapping_flows: mapping table built by init_mapping_flows
        :param res_return: if True, return the opposite (mapped) version instead of the queried one
        :param leaf_flows_name: leaf traffic-source name to match
        :param mapping_new_or_old: which side to match against, 'new_flows' or 'old_flows'
        """
        mapping_new_or_old_all = {"new_flows": "old_flows", "old_flows": "new_flows"}
        if mapping_new_or_old not in mapping_new_or_old_all:
            raise ValueError("mapping_new_or_old must be 'new_flows' or 'old_flows'")
        return_flows = mapping_new_or_old
        if res_return:
            return_flows = mapping_new_or_old_all[mapping_new_or_old]
        result = []
        for mapping in mapping_flows:
            if mapping[mapping_new_or_old]["leaf_flows"] == leaf_flows_name:
                result.append(mapping[return_flows])
        # Deduplicate while preserving order
        seen = set()
        unique_list = []
        for d in result:
            # frozenset makes the dict's items hashable for the seen-set
            tuple_d = frozenset(d.items())
            if tuple_d not in seen:
                seen.add(tuple_d)
                unique_list.append(d)
        return unique_list
def update_file_component(self, choice):
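        """Switch the gradio File component between multi-file and directory upload mode."""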
if choice == "多文件模式":
return self.__gr.File(file_count="multiple", file_types=['.xlsx', '.xls'])
elif choice == "文件夹模式":
return self.__gr.File(file_count="directory")
def select_level(self, x, level="level_one_flows"):
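        """Return the given level of the first new-version mapping entry for a leaf source name."""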
        result_list = self.search_mapping_flows(self.mapping_flows_data, res_return=False, leaf_flows_name=x,
                                                mapping_new_or_old="new_flows")
return result_list[0][level]
def start_app(self, input_paths: list):
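        """Parse uploaded 生意参谋 traffic exports, normalize them, and write one merged Excel file."""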
check_prefix = ["商品", "店铺", "直播", "内容"]
all_data = []
for file in input_paths:
filename = os.path.basename(file)
try:
x = filename.split("_")
y = x[1].split("-")
prefix_name = x[0]
if prefix_name not in check_prefix:
                    raise self.__gr.Error("文件名错误无法判断来源!请添加前缀(例如:商品_)")
                prefix_date = "-".join(y[1:])
excel_file = pd.ExcelFile(file)
sheet_name_list = ['无线流量来源', '经营优势来源渠道']
one_level = '经营优势'
sheet_names = excel_file.sheet_names
                # -----------------------------------------------------------------
                # Read every sheet in the workbook
                dfs = {sheet_name: pd.read_excel(excel_file, sheet_name=sheet_name, skiprows=5)
                       for sheet_name in sheet_names}
                flow_df = dfs[sheet_name_list[0]]  # the 无线流量来源 sheet
                # Optionally drop the 经营优势 level-one source:
                # flow_df = flow_df[flow_df["一级来源"] != one_level]
                flow_df.insert(flow_df.shape[1], '日期', prefix_date)
                if len(sheet_names) > 1:
                    # Non-全站推 export: merge in the 经营优势来源渠道 sheet
                    management_df = dfs[sheet_name_list[1]]
                    # Add the derived columns
                    management_df.loc[:, '日期'] = prefix_date
                    management_df.loc[:, '来源'] = prefix_name + "-非全站推"
                    management_df.loc[:, '一级来源'] = one_level
                    management_df.loc[:, '二级来源'] = one_level
                    management_df['三级来源'] = management_df['来源名称']
                    flow_df.insert(flow_df.shape[1], '来源', prefix_name + "-非全站推")
                    # Drop the original 来源名称 column
                    management_df.drop(columns=['来源名称'], inplace=True)
                    concat_result = pd.concat([flow_df, management_df], axis=0, ignore_index=True)
                else:
                    # 全站推 export: re-level the 汇总 rows via the mapping table
                    flow_df.insert(flow_df.shape[1], '来源', prefix_name + "-全站推")
                    flow_df_new = flow_df[flow_df["二级来源"] != "汇总"]
                    # Copy so the re-leveling below does not write into a view of flow_df
                    need_parser_df = flow_df[flow_df["二级来源"] == "汇总"].copy()
                    need_parser_df['三级来源'] = need_parser_df['一级来源']
                    need_parser_df['一级来源'] = need_parser_df['三级来源'].apply(lambda x: self.select_level(x))
                    need_parser_df['二级来源'] = need_parser_df['三级来源'].apply(
                        lambda x: self.select_level(x, level="level_two_flows"))
                    concat_result = pd.concat([flow_df_new, need_parser_df], axis=0, ignore_index=True)
                # -----------------------------------------------------------------
                # Move the 日期 and 来源 columns to the front
                cols = ['日期', '来源'] + [col for col in concat_result.columns if col not in ('日期', '来源')]
all_data.append(concat_result[cols])
self.__gr.Info(f"解析成功文件名:{filename}")
            except self.__gr.exceptions.Error:
                # Re-raise gradio errors (such as the prefix check above) unchanged
                raise
            except Exception as e:
                raise self.__gr.Error(f"解析失败,文件名:{filename},错误:{e}")
        filename = self.__parser_tools.get_save_filename("生意参谋_流量", "sycm_flows")
data = pd.concat(all_data, axis=0, ignore_index=True)
data.to_excel(filename, index=False, sheet_name="流量来源")
return filename
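

# A minimal usage sketch (assumptions: gradio is installed, and the mapping
# file path and the input file name below are hypothetical; the name must
# follow the 前缀_名称-日期 pattern parsed in start_app):
#
#     import gradio as gr
#     spider = SYCNMSpider(gr, "mapping.xlsx")
#     saved_path = spider.start_app(["商品_demo-2024-01-01.xlsx"])
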
class SYCMDownload:
def __init__(self, gradio_app, mapping_path):
        self.__gr = gradio_app  # handle to the gradio module
        self.__parser_tools = ParserTools(gradio_app)
        self.__downloader = app_sycm_download.SYCMDownloadSpider()
        self.__date_map = {"日": "day", "月": "month", "周": "week", "近7天": "recent7", "近30天": "recent30"}
        self.mapping_flows_data = self.init_mapping_flows(mapping_path)
def start_app(self, cookies, output_type, flows_dim: list, date_dim, start_date, end_date):
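        """Crawl 生意参谋 traffic data for every requested date range and export one or more Excel files."""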
all_data = []
all_path = []
        logging.info(
            f"output_type: {output_type} flows_dim: {flows_dim} date_dim: {date_dim} "
            f"start_date: {start_date} end_date: {end_date}")
        # Resolve the downloader's date-type parameter from the UI label
        dateType = self.__date_map[date_dim]
        # Normalize the raw cookie string via ParserTools
        cookies = self.__parser_tools.parse_cookie(cookies)
        # Build the list of date ranges to crawl
        if date_dim in ["日", "周", "月"]:
            dateRanges = self.parse_date_ranges(dateType, start_date, end_date)
        else:
            # recent7/recent30: ranges are expanded to single days
            dateRanges = self.parse_date_ranges(dateType)
            dateType = "day"
logging.info(f"dateRanges: {dateRanges}")
progress = self.__gr.Progress(track_tqdm=True)
for dateRange in progress.tqdm(dateRanges, desc="下载进度", unit="number"):
logging.info(f"dateType: {dateType} dateRange: {dateRange}")
status, data = self.__downloader.crawl(cookies=cookies, dateType=dateType, dateRange=dateRange)
if not status:
raise self.__gr.Error("采集失败!")
byte_stream = io.BytesIO(data)
excel_file = pd.ExcelFile(byte_stream)
            data = self.parse_flows_dim(excel_file, flows_dim, date_dim, dateRange)
all_data.append(data)
if output_type == "多个文件":
name = "~".join(dateRange.split("|"))
                filename_m = self.__parser_tools.get_save_filename(f"生意参谋_流量_{name}", "sycm_flows_download")
data.to_excel(filename_m, index=False, sheet_name="流量来源")
all_path.append(filename_m)
if output_type == "多个文件":
return all_path
        filename = self.__parser_tools.get_save_filename("生意参谋_流量_下载", "sycm_flows_download")
        result = pd.concat(all_data, axis=0, ignore_index=True)
        result.to_excel(filename, index=False, sheet_name="流量来源")
        return [filename]
    def parse_flows_dim(self, excel_file_data, flows_dim, date_dim, dateRange):
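        """Re-level one downloaded workbook and keep only the selected 流量载体 dimensions."""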
        sheet_name_list = ['店铺渠道', '经营优势来源渠道']
        one_level = '经营优势'
        sheet_names = excel_file_data.sheet_names
        # Read every sheet in the workbook
        dfs = {sheet_name: pd.read_excel(excel_file_data, sheet_name=sheet_name, skiprows=5)
               for sheet_name in sheet_names}
        # Date label: a single day, or "start~end" for week/month ranges
        prefix_date = dateRange.split("|")[0]
        if date_dim in ("周", "月"):
            prefix_date = "~".join(dateRange.split("|"))
flow_df = dfs[sheet_name_list[0]]
flow_df.insert(flow_df.shape[1], '日期', prefix_date)
if "全站推广期" in list(flow_df.columns):
flow_df_new = flow_df[(flow_df["二级来源"] != "汇总") & (flow_df["全站推广期"] == "全站推广期")]
flow_df_not = flow_df[flow_df["全站推广期"] != "全站推广期"]
# 需要处理数据
need_parser_df = flow_df[(flow_df["二级来源"] == "汇总") & (flow_df["全站推广期"] == "全站推广期")]
need_parser_df['三级来源'] = need_parser_df['一级来源']
need_parser_df['一级来源'] = need_parser_df['三级来源'].apply(lambda x: self.select_level(x))
need_parser_df['二级来源'] = need_parser_df['三级来源'].apply(
lambda x: self.select_level(x, level="level_two_flows"))
result_data = pd.concat([flow_df_new, need_parser_df, flow_df_not], axis=0, ignore_index=True)
else:
flow_df.loc[:, '全站推广期'] = "整体"
result_data = flow_df
if len(sheet_names) > 1:
            # Merge in the 经营优势来源渠道 sheet
management_df = dfs[sheet_name_list[1]]
management_df.loc[:, '全站推广期'] = "整体"
management_df.loc[:, '日期'] = prefix_date
management_df.loc[:, '一级来源'] = one_level
management_df.loc[:, '二级来源'] = one_level
management_df['三级来源'] = management_df['来源名称']
management_df.drop(columns=['来源名称'], inplace=True)
result = pd.concat([result_data, management_df], axis=0, ignore_index=True)
else:
result = result_data
        # Move the 日期 and 流量载体 columns to the front
        cols = ['日期', '流量载体'] + [col for col in result.columns if col not in ('日期', '流量载体')]
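        # Keep only rows whose 流量载体 matches a selected dimension ("流量" suffix added)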
new_flow_dim = [dim + "流量" for dim in flows_dim]
filter_cols = result[cols]
return filter_cols[filter_cols['流量载体'].isin(new_flow_dim)]
def select_level(self, x, level="level_one_flows"):
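        """Return the given level of the first new-version mapping entry for a leaf source name."""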
        result_list = self.search_mapping_flows(self.mapping_flows_data, res_return=False, leaf_flows_name=x,
                                                mapping_new_or_old="new_flows")
return result_list[0][level]
@staticmethod
    def parse_date_ranges(time_type, start_date=None, end_date=None):
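        """
        Expand a date window into a list of "start|end" range strings.

        Example (assuming valid inputs; 2024-01-01 is a Monday):
            parse_date_ranges("day", "2024-01-01", "2024-01-02")
            -> ["2024-01-01|2024-01-01", "2024-01-02|2024-01-02"]
            parse_date_ranges("week", "2024-01-01", "2024-01-14")
            -> ["2024-01-01|2024-01-07", "2024-01-08|2024-01-14"]
        For "recent7"/"recent30", no dates are passed and one single-day
        range is produced for each of the last 7/30 days, ending yesterday.
        """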
today = datetime.datetime.today()
end_day = today - datetime.timedelta(days=1)
date_format = "%Y-%m-%d"
def parse_date(date_str):
return datetime.datetime.strptime(date_str, date_format)
if time_type in ["day", "week", "month"]:
if not start_date or not end_date:
raise ValueError("Error: Start date and end date must be provided for day, week, and month types.")
start_dt = parse_date(start_date)
end_dt = parse_date(end_date)
if end_dt > end_day:
raise ValueError("Error: End date cannot be later than the day before the current date.")
if time_type == "week":
if start_dt.weekday() != 0 or end_dt.weekday() != 6:
raise ValueError("Error: For 'week' type, start date must be Monday and end date must be Sunday.")
elif time_type == "month":
if start_dt.day != 1 or (end_dt + datetime.timedelta(days=1)).day != 1:
raise ValueError(
"Error: For 'month' type, start date must be the first day of the month and end date must be the last day of the month.")
elif time_type in ["recent7", "recent30"]:
if start_date or end_date:
raise ValueError(
"Error: Start date and end date should not be provided for last_7_days and last_30_days types.")
start_dt = end_day - datetime.timedelta(days=6 if time_type == "recent7" else 29)
end_dt = end_day
else:
raise ValueError("Error: Invalid type. Allowed values are: day, week, month, recent7, recent30.")
result = []
if time_type == "day":
curr_date = start_dt
while curr_date <= end_dt:
date_str = curr_date.strftime(date_format)
result.append(f"{date_str}|{date_str}")
curr_date += datetime.timedelta(days=1)
elif time_type == "week":
curr_date = start_dt
while curr_date <= end_dt:
start_of_week = curr_date - datetime.timedelta(days=curr_date.weekday())
end_of_week = start_of_week + datetime.timedelta(days=6)
if end_of_week > end_dt:
end_of_week = end_dt
result.append(f"{start_of_week.strftime(date_format)}|{end_of_week.strftime(date_format)}")
curr_date = end_of_week + datetime.timedelta(days=1)
elif time_type == "month":
curr_date = start_dt
while curr_date <= end_dt:
start_of_month = curr_date.replace(day=1)
next_month_start = (start_of_month + datetime.timedelta(days=32)).replace(day=1)
end_of_month = next_month_start - datetime.timedelta(days=1)
if end_of_month > end_dt:
end_of_month = end_dt
result.append(f"{start_of_month.strftime(date_format)}|{end_of_month.strftime(date_format)}")
curr_date = next_month_start
elif time_type == "recent7" or time_type == "recent30":
curr_date = start_dt
while curr_date <= end_dt:
date_str = curr_date.strftime(date_format)
result.append(f"{date_str}|{date_str}")
curr_date += datetime.timedelta(days=1)
return result
@staticmethod
def init_mapping_flows(flows_path):
"""
初始化映射表
"""
df = pd.read_excel(flows_path, header=[0, 1, 2])
# 填充空值
df.fillna(method='ffill', inplace=True)
new_flows = df['新旧版流量对照']['新版']
old_flows = df['新旧版流量对照']['旧版']
mapping_flows = []
for x, y in zip(new_flows.values, old_flows.values):
# 映射数据
new_flows = {"level_one_flows": x[0], "level_two_flows": x[1], "level_three_flows": x[2],
"leaf_flows": x[3]}
old_flows = {"level_one_flows": y[0], "level_two_flows": y[1], "level_three_flows": y[2],
"leaf_flows": y[3]}
# 添加到映射表
mapping_flows.append({"new_flows": new_flows, "old_flows": old_flows})
return mapping_flows
@staticmethod
    def search_mapping_flows(mapping_flows, res_return=False, leaf_flows_name="", mapping_new_or_old="new_flows"):
        """
        Look up mapping entries by leaf traffic-source name.

        :param mapping_flows: mapping table built by init_mapping_flows
        :param res_return: if True, return the opposite (mapped) version instead of the queried one
        :param leaf_flows_name: leaf traffic-source name to match
        :param mapping_new_or_old: which side to match against, 'new_flows' or 'old_flows'
        """
        mapping_new_or_old_all = {"new_flows": "old_flows", "old_flows": "new_flows"}
        if mapping_new_or_old not in mapping_new_or_old_all:
            raise ValueError("mapping_new_or_old must be 'new_flows' or 'old_flows'")
        return_flows = mapping_new_or_old
        if res_return:
            return_flows = mapping_new_or_old_all[mapping_new_or_old]
        result = []
        for mapping in mapping_flows:
            if mapping[mapping_new_or_old]["leaf_flows"] == leaf_flows_name:
                result.append(mapping[return_flows])
        # Deduplicate while preserving order
        seen = set()
        unique_list = []
        for d in result:
            # frozenset makes the dict's items hashable for the seen-set
            tuple_d = frozenset(d.items())
            if tuple_d not in seen:
                seen.add(tuple_d)
                unique_list.append(d)
        return unique_list
@staticmethod
def day_yesterday():
        # Yesterday's date formatted as YYYY-MM-DD
        now = datetime.datetime.now()
        day_yesterday = now - datetime.timedelta(days=1)
        yesterday = day_yesterday.strftime("%Y-%m-%d")
return yesterday
def max_day_check(self, input_date_str):
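        """Return a DateTime component whose value is clamped so it is no later than yesterday."""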
        input_date = datetime.datetime.strptime(input_date_str, "%Y-%m-%d")
        # Yesterday is the latest date a user may pick
        yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
        # Clamp the chosen date to yesterday
        if input_date > yesterday:
return self.__gr.DateTime(include_time=False, min_width=10, type='string', value=self.day_yesterday())
else:
return self.__gr.DateTime(include_time=False, min_width=10, type='string', value=input_date_str)
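

# A minimal usage sketch for the downloader (assumptions: gradio is installed;
# the cookie string, the output_type label "单个文件", and the flows_dim value
# "店铺" are hypothetical placeholders; only "多个文件" triggers per-range files
# in start_app):
#
#     import gradio as gr
#     downloader = SYCMDownload(gr, "mapping.xlsx")
#     paths = downloader.start_app(cookies="<cookie string>", output_type="单个文件",
#                                  flows_dim=["店铺"], date_dim="日",
#                                  start_date="2024-01-01", end_date="2024-01-02")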