"""
四级类目匹配代码
25年06月05日编写
将不同店铺中的数据与我司的四级类目进行匹配
"""
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.styles import Font
import re
import os
from tkinter import Tk, filedialog
def select_file(title):
"""打开文件选择对话框并返回选择的文件路径"""
print(f"请选择{title}文件...")
root = Tk()
root.withdraw() # 隐藏主窗口
file_path = filedialog.askopenfilename(
title=f"选择{title}文件",
filetypes=[("Excel files", "*.xlsx *.xls")]
)
root.destroy()
if not file_path:
print("未选择文件,程序退出。")
exit()
return file_path
def read_level1_file(file_path):
"""读取level-1文件并返回数据框"""
try:
df = pd.read_excel(file_path)
required_columns = ['first', 'second', 'three', 'four']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
raise ValueError(f"Level-1文件缺少必要的列: {', '.join(missing_columns)}")
return df
except Exception as e:
print(f"读取Level-1文件时出错: {e}")
exit()
def read_jx_file(file_path):
"""读取JX标准货盘文件并返回数据框"""
try:
df = pd.read_excel(file_path)
required_columns = ['美团渠道一级类目', '美团渠道二级类目', '美团渠道三级类目',
'商品名称', '成本(进参考)', 'SPU编码']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
raise ValueError(f"JX标准货盘文件缺少必要的列: {', '.join(missing_columns)}")
return df
except Exception as e:
print(f"读取JX标准货盘文件时出错: {e}")
exit()
def clean_keyword(keyword):
"""清理关键词,去除特殊字符并转为小写"""
if not isinstance(keyword, str):
return ''
cleaned = re.sub(r'[^\w\s]', '', keyword)
return cleaned.lower().strip()
def match_products(level1_df, jx_df):
"""根据条件匹配商品并返回匹配和未匹配的数据"""
matched_data = []
unmatched_data = []
category_map = {}
# 构建类目到商品的映射
for _, row in jx_df.iterrows():
category_key = (row['美团渠道一级类目'], row['美团渠道二级类目'], row['美团渠道三级类目'])
if category_key not in category_map:
category_map[category_key] = []
category_map[category_key].append(row)
# 遍历Level-1数据进行匹配
for _, level_row in level1_df.iterrows():
category_key = (level_row['first'], level_row['second'], level_row['three'])
keyword = level_row['four'] if not pd.isna(level_row['four']) else level_row['three']
cleaned_keyword = clean_keyword(keyword)
if category_key in category_map:
products = category_map[category_key]
found = False
for product in products:
product_name = clean_keyword(product['商品名称'])
if cleaned_keyword in product_name:
# 关键处理:提前将SPU编码转为字符串(避免后续科学计数法源头)
spu_str = str(product['SPU编码'])
matched_data.append({
'first': level_row['first'],
'second': level_row['second'],
'three': level_row['three'],
'four': keyword,
'商品名称': product['商品名称'],
'价格': product['成本(进参考)'],
'SPU编码': spu_str # 直接存储字符串
})
found = True
if not found:
unmatched_data.append({k: level_row[k] for k in ['first', 'second', 'three', 'four']})
else:
unmatched_data.append({k: level_row[k] for k in ['first', 'second', 'three', 'four']})
return matched_data, unmatched_data
def calculate_pit_count(matched_df):
"""计算每个关键词的坑位数(SPU编码去重统计)"""
if matched_df.empty:
return {}
return matched_df.groupby('four')['SPU编码'].nunique().to_dict()
def write_to_excel(matched_data, unmatched_data, output_path):
"""将匹配和未匹配的数据写入Excel文件,并设置SPU编码为文本格式"""
matched_df = pd.DataFrame(matched_data) if matched_data else pd.DataFrame()
unmatched_df = pd.DataFrame(unmatched_data) if unmatched_data else pd.DataFrame()
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
# 写入匹配数据
if not matched_df.empty:
# 计算坑位数并调整列顺序
matched_df['坑位数(spu)'] = matched_df['four'].map(calculate_pit_count(matched_df))
column_order = ['first', 'second', 'three', 'four', '商品名称', '价格', 'SPU编码', '坑位数(spu)']
matched_df = matched_df[column_order]
# 写入Sheet1
matched_df.to_excel(writer, sheet_name='Sheet1', index=False)
ws = writer.sheets['Sheet1']
# ---------------------- 关键修改区域 ----------------------
# 设置SPU编码列为文本格式(解决科学计数法问题)
spu_col_name = 'SPU编码'
col_idx = matched_df.columns.get_loc(spu_col_name) + 1 # Excel列索引从1开始
for row in ws.iter_rows(min_row=2, min_col=col_idx, max_col=col_idx):
for cell in row:
cell.number_format = '@' # 强制设置为文本格式
# ---------------------- 修改结束 ----------------------
# 自动调整列宽
for i, col in enumerate(matched_df.columns, 1):
col_letter = get_column_letter(i)
max_len = max(len(str(x)) for x in matched_df[col]) if not matched_df[col].empty else len(col)
ws.column_dimensions[col_letter].width = min(max_len + 2, 50)
# 表头加粗
for cell in ws[1]:
cell.font = Font(bold=True)
# 写入未匹配数据
if not unmatched_df.empty:
unmatched_df.to_excel(writer, sheet_name='Sheet2', index=False)
ws = writer.sheets['Sheet2']
for i, col in enumerate(unmatched_df.columns, 1):
col_letter = get_column_letter(i)
max_len = max(len(str(x)) for x in unmatched_df[col]) if not unmatched_df[col].empty else len(col)
ws.column_dimensions[col_letter].width = min(max_len + 2, 50)
for cell in ws[1]:
cell.font = Font(bold=True)
print(f"数据已保存至: {output_path}")
def main():
"""主函数,协调整个程序的执行"""
print("开始执行Excel数据处理程序...")
level1_path = select_file("Level-1")
jx_path = select_file("JX标准货盘5.19")
default_output = os.path.join(os.getcwd(), "匹配结果.xlsx")
print(f"默认输出路径: {default_output}")
use_default = input("是否使用默认路径?(y/n): ").strip().lower()
output_path = default_output if use_default == 'y' else filedialog.asksaveasfilename(
title="选择保存位置", defaultextension=".xlsx", filetypes=[("Excel files", "*.xlsx")]
) or default_output
level1_df = read_level1_file(level1_path)
jx_df = read_jx_file(jx_path)
matched_data, unmatched_data = match_products(level1_df, jx_df)
write_to_excel(matched_data, unmatched_data, output_path)
print("程序执行完毕。")
if __name__ == "__main__":
main()
上述代码中存在以下问题
1、JX标准货盘5.19 文件 的商品名称列中某些数据含有 level-1文件中four列中的多项数据 导致在进行关键词匹配时将该条数据进行重复匹配
2、level-1中未匹配到的数据已经写入了Sheet2页面中 请将JX标准货盘5.19 文件 中未匹配到的数据写入Sheet3页面中
请根据上述描述对代码修改, 并对修改后的代码进行正确性验证