Data transformation for real-world data preprocessing in Python neural networks (February 2023)

This code demonstrates data transformation and cleaning with the pandas library in Python. The function data_convert drives a data-processing pipeline: it reads several tables (a date table, a shop table, goods data, sales data, and so on) and performs null handling, filtering of abnormal records, transformation, computation, correction, and saving. It also covers merging tables, mapping columns to indices, and exporting the results. The whole process prepares the data for further analysis or modelling.

Example: data transformation with data_convert

data_convert is the driver function; it processes, one by one, the data tables that were read earlier.

The tables processed include the date table, the shop table, goods data, sales data, sales-target data, predicted-share data, computed sales-share data, and computed sales-target-share data.

The core idea is one function per table. Each function handles null conversion, filtering of abnormal records, data transformation, computation, correction, saving the data, and saving label mappings.

Note that the fixed file names each function expects must match the files actually on disk (the expected input layout is sketched a few lines below).

The debugging entry point for the code is data_convert.
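
Reconstructed from the read_csv calls in the listing below, the expected input layout looks roughly like this (directory names correspond to the arguments of data_convert; the exact on-disk paths are assumptions):

# common_ur_bi_dir (shared ur_bi_dw exports)
#     ur_bi_dw.dim_date.csv             -- date dimension
#     ur_bi_dw.dim_shop.csv             -- shop dimension
#     ur_bi_dw.dim_goods.csv            -- goods (SKU) dimension
#     dwd_daily_sales_size_all/         -- daily sales detail, split across multiple CSV files
# serial_ur_bi_dw_dir
#     month_of_year_sales_goal_amt.csv  -- monthly sales targets
# ims_dir
#     ims_w_amt_pro.csv                 -- predicted share data
# other_dir
#     shop_customer_labels.csv          -- shop customer labels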






import logging
from typing import Tuple, Dict
import pandas as pd
import numpy as np
import os
import shutil
import datetime

import sys
sys.path.append('')
import yaml




# 空值统一转换成Unknown
def null_convert(x):
    """
    isinstance() 函数来判断一个对象是否是一个已知的类型
    strip() 方法用于移除字符串头尾指定的字符(默认为空格或换行符)或字符序列
    返回移除字符串头尾指定的字符生成的新字符串
    """
    if isinstance(x, float) and np.isnan(x):
        return 'Unknown'
    return 'Unknown' if x.strip()=='' else x.strip()
pass
def unknown_first(values):
    '''将Unknown放到第一位'''
    if 'Unknown' in values:
        values.remove('Unknown')
        values.insert(0, 'Unknown')
    return values
pass
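# Example behaviour of the two helpers above (illustrative):
#   null_convert(np.nan)                  -> 'Unknown'
#   null_convert('  ')                    -> 'Unknown'
#   null_convert(' 华南一区 ')             -> '华南一区'
#   unknown_first(['A', 'Unknown', 'M'])  -> ['Unknown', 'A', 'M']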
def to_yml(data, file_path):
    dictionary = data
    if not isinstance(data, dict):
        index2name_dictionary = {}
        name2index_dictionary = {}
        for i, x in enumerate(data):
            index2name_dictionary[i] = x
            name2index_dictionary[x] = i
        dictionary = {
            'count': len(index2name_dictionary), 
            'index2name': index2name_dictionary, 
            'name2index': name2index_dictionary
            }
    with open(file_path, 'w', encoding='utf-8') as file:
        yaml.dump(dictionary, file)
    return dictionary
pass
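# Example: to_yml(['Unknown', 'A', 'M'], 'serial.yml') writes roughly
#   count: 3
#   index2name: {0: Unknown, 1: A, 2: M}
#   name2index: {Unknown: 0, A: 1, M: 2}
# and returns that dictionary, so callers can look up dictionary['name2index'][x].
# If the input is already a dict, it is dumped to YAML unchanged.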
class ShareArgs():
    args = {
        "labels_dir":"./yongjian/shop_group/month_w_amt/data/labels", # 标签目录
        "labels_output_dir":"./yongjian/shop_group/month_w_amt/data/labels_output", # 聚类导出标签目录
        "common_datas_dir":"./yongjian/data", # 共用数据目录。ur_bi_dw的公共
        "only_predict": False, # 只识别,不训练
        "delete_model": True, # 先删除模型,仅在训练时使用
        "export_excel": False, # 导出excel
        "classes": 12, # 聚类数
        "batch_size": 16,
        "hidden_size": 32,
        "max_nrof_epochs": 100,
        "learning_rate": 0.0005,
        "loss_type": "categorical_crossentropy",
        "avg_model_num": 10,
        "steps_per_epoch": 4.0, # 4.0
        "lr_callback_patience": 4, 
        "lr_callback_cooldown": 1,
        "early_stopping_callback_patience": 6,
        "get_data": True,
    }
    def get_args():
        return ShareArgs.args

    def set_args(args):
        ShareArgs.args = args

    def set_args_value(key, value):
        ShareArgs.args[key] = value

    def get_args_value(key, default_value=None):
        return ShareArgs.args.get(key, default_value)

    def contain_key(key):
        return key in ShareArgs.args.keys()

    def update(args):
        ShareArgs.args.update(args)
pass
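# ShareArgs is a simple process-wide configuration holder, e.g.:
#   ShareArgs.get_args_value('classes')             -> 12
#   ShareArgs.set_args_value('only_predict', True)
#   ShareArgs.contain_key('batch_size')             -> True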
def convert_date_data(ur_bi_dw_dir, labels_dir):
    ''',date_key,date_name,date_name_en,day_of_year,
    day_of_month,day_of_week,year_id,year_name,
    quarter_id,quarter_name,quarter_name_en,quarter_of_year,
    quarter_of_year_name,quarter_of_year_name_en,month_id,
    month_name,month_name_full_en,month_name_short_en,
    month_of_year,month_of_year_name,week_id,week_id_name,
    week_seq,week_name,week_name_full_en,week_name_short_en,
    week_of_year,week_year_name,week_of_month,is_week_end,
    is_week_last,is_month_last,festival_key,festival,
    lunar_date,solar_term,season_year,season,big_season,
    goods_season_year,goods_season,big_goods_season,l_week_day,
    cr_week_day,y_week_day,cr_y_week_day,first_day_week,
    last_day_week,l_month_day,y_month_day,first_day_month,
    last_day_month,last_30_day,l_month_last_30_day,y_month_last_30_day,
    etl_date,holiday_status_code,holiday_status,year_id_pd'''
    dim_date = pd.read_csv(
        os.path.join(ur_bi_dw_dir,'ur_bi_dw.dim_date.csv'),
        dtype={'dim_date.festival_key': str, 'dim_date.festival': str,'dim_date.holiday_status_code': str,'dim_date.holiday_status': str}
        ) # 日期数据
    # 去掉前缀
    columns = list(dim_date.columns)
    columns = {c:c.replace('dim_date.','') for c in columns}
    dim_date = dim_date.rename(columns=columns)

    dim_date['season'] = dim_date['season'].map(null_convert) # 空值转换
    return dim_date

def convert_shop_data(
    common_ur_bi_dir,
    labels_dir,
    logger:logging.Logger
    ) -> pd.DataFrame:
    '''店铺数据'''
    ''',dim_shop.shop_no,dim_shop.shop_name,dim_shop.shop_name_en,
    dim_shop.brand,dim_shop.currency_code,dim_shop.shop_type,
    dim_shop.shop_type_desc,dim_shop.shop_sort,dim_shop.shop_sort_desc,
    dim_shop.stock_id,dim_shop.stock_no,dim_shop.region_no,
    dim_shop.region_name,dim_shop.region_name_en,dim_shop.region_manager_code,
    dim_shop.region_manager_name,dim_shop.subregion_no,dim_shop.subregion_name,
    dim_shop.subregion_name_en,dim_shop.subregion_manager_code,
    dim_shop.subregion_manager_name,dim_shop.bussinesscircle_code,
    dim_shop.bussinesscircle_name,dim_shop.bussinesscircle_name_en,
    dim_shop.country_code,dim_shop.country_name,dim_shop.country_name_en,
    dim_shop.province,dim_shop.province_name,dim_shop.province_name_en,
    dim_shop.city_code,dim_shop.city_name,dim_shop.city_name_en,dim_shop.city_level_code,
    dim_shop.city_level_name,dim_shop.city_level_name_en,dim_shop.district,
    dim_shop.shop_manager_code,dim_shop.shop_manager_name,dim_shop.shop_manager_name_en,
    dim_shop.open_date,dim_shop.new_open_date,dim_shop.close_date,dim_shop.store_email,
    dim_shop.shop_tel,dim_shop.shop_manager_tel,dim_shop.shop_property,
    dim_shop.is_new_shop,dim_shop.is_kids,dim_shop.is_same_shop,dim_shop.is_promote,
    dim_shop.is_streetstore,dim_shop.is_outrant,dim_shop.contract_size,dim_shop.shop_size,
    dim_shop.sales_size,dim_shop.shop_dis_level,dim_shop.sales_level,dim_shop.image_level,
    dim_shop.shop_level,dim_shop.ppsregion_no,dim_shop.ppsregion_name,dim_shop.temperature_zone,
    dim_shop.store_status,dim_shop.lon,dim_shop.lat,dim_shop.shop_address,dim_shop.etl_date,
    dim_shop.cost_currency_code,dim_shop.store_centercode,dim_shop.store_centername,dim_shop.data_source,
    dim_shop.bussinesscircle_id,dim_shop.project_id,dim_shop.project_code,dim_shop.project_name'''
    shop_data = pd.read_csv(os.path.join(common_ur_bi_dir,'ur_bi_dw.dim_shop.csv'), index_col=0)
    
    # 去掉前缀
    columns = list(shop_data.columns)
    columns = {c:c.replace('dim_shop.','') for c in columns}
    shop_data = shop_data.rename(columns=columns)
    
    # 修正数据
    shop_data.loc[shop_data['province_name']=='上海', 'province_name'] = '上海市'
    shop_data.loc[shop_data['province_name']=='北京', 'province_name'] = '北京市'
    shop_data.loc[shop_data['province_name']=='天津', 'province_name'] = '天津市'
    shop_data.loc[shop_data['province_name']=='重庆', 'province_name'] = '重庆市'

    # 店铺数据, 空值转换
    null_convert_columns = [
        'shop_no', # 店铺编码
        'shop_name', # 店铺名称
        'brand', # 品牌
        'currency_code', # 币种
        'shop_type', # 店铺类型
        'shop_type_desc', # 店铺类型描述
        'shop_sort', # 店铺归类
        'shop_sort_desc', # 店铺归类描述
        'region_no', # 大区编码
        'region_name', # 大区名称
        'subregion_no', # 小区编码
        'subregion_name', # 小区名称
        'bussinesscircle_code', # 商圈编码
        'bussinesscircle_name', # 商圈名称
        'country_code', # 国家编码
        'country_name', # 国家名称
        'province', # 省份编码
        'province_name', # 省份名称
        'city_code', # 城市编码
        'city_name', # 城市名称
        'city_level_code', # 城市等级编码
        'city_level_name', # 城市等级名称
        'district', # 城市区域
        'open_date', # 开业日期
        'new_open_date', # 再次开业日期
        'shop_property', # 店铺属性
        'is_new_shop', # 是否新老店
        'is_kids', # 是否童装店
        'is_same_shop', # 是否同店
        'is_promote', # 是否特卖
        'is_streetstore', # 是否街铺
        'is_outrant', # 是否外租仓
        # 'contract_size', # 合同面积
        # 'shop_size', # 实用面积
        # 'sales_size', # 销售面积
        'shop_dis_level', # 配货等级
        'sales_level', # 销售等级
        'image_level', # 形象级别
        'shop_level', # 店铺级别
        'ppsregion_no', # 企划分区
        'ppsregion_name', # 企划分区名称
        'temperature_zone', # 气温带
        'store_status', # 店铺状态
        # 'lon', # 经度
        # 'lat', # 纬度
        'store_centercode', # 分部编码
        'store_centername', # 分部名称
    ]
    for c in null_convert_columns:
        shop_data[c] = shop_data[c].map(null_convert)
    shop_data['contract_size'] = shop_data['contract_size'].fillna(0) # 合同面积
    shop_data['shop_size'] = shop_data['shop_size'].fillna(0) # 实用面积
    shop_data['sales_size'] = shop_data['sales_size'].fillna(0) # 销售面积
    shop_data['lon'] = shop_data['lon'].fillna(0) # 经度
    shop_data['lat'] = shop_data['lat'].fillna(0) # 纬度
    shop_data = shop_data.sort_values(['country_code','province_name','city_name','district','bussinesscircle_code']) # 排序

    # 筛选店铺
    # 'shop_type' in ['UR_SHOP']
    # 'shop_sort' in ['UR_LOCAL_STORE']
    # 'city_name' in ['广州市']
    # 'is_new_shop' != '临时店'
    # shop_sort_filter = None
    # shop_sort_filter = ['E_COMMERCE_STORE','ACC_STORE']
    shop_sort_filter = ['E_COMMERCE_STORE']
    # if is_acc:
    #     shop_sort_filter = ['E_COMMERCE_STORE']
    # else:
    #     shop_sort_filter = ['E_COMMERCE_STORE','ACC_STORE']
    shop_data_filter = shop_data[
        # (shop_data['city_name']=='广州市')
        # (shop_data['region_name'].isin(['华南一区','华南二区']))
        (shop_data['is_new_shop']!='临时店')
        &(shop_data['country_code']=='CN')
        &(shop_data['brand'].isin(['URBAN REVIVO','ACC']))
        # &(shop_data['shop_type'].isin(['UR_SHOP','UR_ONLINE_SHOP']))
        # &(shop_data['shop_sort'].isin(['UR_LOCAL_STORE','OUTLET_STORE']))
        &(~shop_data['shop_sort'].isin(shop_sort_filter))
        &(~shop_data['shop_name'].str.contains('临时', na=False))
        &(~shop_data['shop_name'].str.contains('JGO', na=False))
        # &(~shop_data['shop_name'].str.contains('奥特莱斯', na=False))
        &(~shop_data['store_status'].isin(['项目暂停','关店']))
        &((shop_data['province_name']!='Unknown')|(shop_data['city_name']!='Unknown'))
    ]
    # 选中列
    columns_filter = [
        'shop_no', # 店铺编码
        'shop_name', # 店铺名称
        'brand', # 品牌
        'currency_code', # 币种
        'shop_type', # 店铺类型
        'shop_type_desc', # 店铺类型描述
        'shop_sort', # 店铺归类
        'shop_sort_desc', # 店铺归类描述
        'region_no', # 大区编码
        'region_name', # 大区名称
        'subregion_no', # 小区编码
        'subregion_name', # 小区名称
        'bussinesscircle_code', # 商圈编码
        'bussinesscircle_name', # 商圈名称
        'country_code', # 国家编码
        'country_name', # 国家名称
        'province', # 省份编码
        'province_name', # 省份名称
        'city_code', # 城市编码
        'city_name', # 城市名称
        'city_level_code', # 城市等级编码
        'city_level_name', # 城市等级名称
        'district', # 城市区域
        'open_date', # 开业日期
        'new_open_date', # 再次开业日期
        'shop_property', # 店铺属性
        'is_new_shop', # 是否新老店
        'is_kids', # 是否童装店
        'is_same_shop', # 是否同店
        'is_promote', # 是否特卖
        'is_streetstore', # 是否街铺
        'is_outrant', # 是否外租仓
        'contract_size', # 合同面积
        'shop_size', # 实用面积
        'sales_size', # 销售面积
        'shop_dis_level', # 配货等级
        'sales_level', # 销售等级
        'image_level', # 形象级别
        'shop_level', # 店铺级别
        'ppsregion_no', # 企划分区
        'ppsregion_name', # 企划分区名称
        'temperature_zone', # 气温带
        'store_status', # 店铺状态
        'lon', # 经度
        'lat', # 纬度
        'store_centercode', # 分部编码
        'store_centername', # 分部名称
    ]
    shop_data_filter = shop_data_filter[columns_filter]
    # 店铺代码与名称映射转换
    shop_code2name_dictionary = {}
    for i in range(len(shop_data_filter)):
        shop_no = str(shop_data_filter.iloc[i]['shop_no'])
        shop_name = shop_data_filter.iloc[i]['shop_name']
        shop_code2name_dictionary[shop_no] = shop_name
    shop_code2name_dictionary = to_yml(shop_code2name_dictionary, os.path.join(labels_dir,'shop_code2name.yml'))
    logger.info('已导出: {}'.format(os.path.join(labels_dir,'shop_code2name.yml')))
    return shop_data_filter
pass
def convert_goods_data(
    common_ur_bi_dir,
    labels_dir,
    logger:logging.Logger):
    '''SKU数据,汇总系列与品类'''
    # 商品数据
    dim_goods = pd.read_csv(
        os.path.join(common_ur_bi_dir,'ur_bi_dw.dim_goods.csv'),
        dtype={
            'dim_goods.year':str,
            'dim_goods.mat_code':str,
            'dim_goods.serial':str,
            'dim_goods.style':str,
            'dim_goods.category':str,
            'dim_goods.subcategory':str,
            'dim_goods.version_style':str,
            'dim_goods.big_season_name':str,
            'dim_goods.order_no':str,
            'dim_goods.order_type_name':str,
            'dim_goods.month_of_year':str,
        },
        index_col=0
    )
    # 去掉前缀
    columns = list(dim_goods.columns)
    columns = {c:c.replace('dim_goods.','') for c in columns}
    dim_goods = dim_goods.rename(columns=columns)
    # 空值转换
    dim_goods['serial'] = dim_goods['serial'].map(null_convert)
    dim_goods['serial_name'] = dim_goods['serial_name'].map(null_convert)
    # 系列
    serial_data = dim_goods[['serial','serial_name']].drop_duplicates()
    serial_data = serial_data.sort_values(['serial','serial_name'])
    # 系列代码与名称映射转换
    serial_code2name_dictionary = {}
    for i in range(len(serial_data)):
        serial = str(serial_data.iloc[i]['serial'])
        serial_name = serial_data.iloc[i]['serial_name']
        serial_code2name_dictionary[serial] = serial_name
    _ = to_yml(serial_code2name_dictionary, os.path.join(labels_dir,'serial_code2name.yml'))
    logger.info('已导出: {}'.format(os.path.join(labels_dir,'serial_code2name.yml')))

    return dim_goods, serial_code2name_dictionary
pass
class DataHelper(object):
    def __init__(
        self
        ):
        pass

    @staticmethod
    def load_split_data(
        data_dir,   # 处理的原数据放在这里
        cache_file_dir,  # 处理好的数据放在这里
        logger:logging.Logger,
        convert_fun,  # 转换的函数
        delete_last_cache=True,
        delete_cache_indexes=[-1],
        data_dtype=None,
        cache_dtype=None,
        filter_fun=None, # 返回True,则过滤掉当前数据
        **kwargs
        ) -> pd.DataFrame:
        '''分批处理文件'''
        delete_cache_list = []
        # 新建目录
        if not os.path.exists(cache_file_dir):
            os.makedirs(cache_file_dir)
        elif delete_last_cache:
            #删除最后一个文件
            file_list = os.listdir(cache_file_dir)
            if len(file_list)>0:
                file_list.sort()
                for i in delete_cache_indexes:
                    # 将需要删除的文件加入列表
                    delete_cache_list.append(os.path.join(cache_file_dir,file_list[i]))
                    # os.remove(os.path.join(cache_file_dir,file_list[i]))
                    logger.info('删除:%s', file_list[i])
        # 加载文件列表
        file_list = os.listdir(data_dir)
        all_data = None
        if len(file_list)>0:
            file_list.sort()
            for file in file_list:
                if filter_fun is not None and filter_fun(file, **kwargs):
                    continue
                file_path = os.path.join(data_dir,file)
                logger.info(file_path)
                cache_file_path = os.path.join(cache_file_dir,file)
                logger.info(cache_file_path)
                now_data = None
                if not os.path.exists(cache_file_path) or cache_file_path in delete_cache_list:
                    logger.info('读取数据')
                    now_data:pd.DataFrame = pd.read_csv(file_path, dtype=data_dtype) # 读取数据
                    now_data = convert_fun(now_data, **kwargs) # 数据转换
                    now_data.to_csv(cache_file_path, index=False)
                else:
                    logger.info('读取缓存')
                    now_data:pd.DataFrame = pd.read_csv(cache_file_path, dtype=cache_dtype)
                logger.info('now_data: %s', len(now_data))
                if all_data is None:
                    all_data = now_data
                else:
                    all_data = pd.concat([all_data, now_data], ignore_index=True)
        return all_data
pass
def convert_sales_data(
    labels_dir,
    common_ur_bi_dir,
    shop_data_filter:pd.DataFrame,
    dim_goods:pd.DataFrame,
    dim_date:pd.DataFrame,
    logger:logging.Logger):
    '''实际销售'''
    # sales_data = pd.read_csv(os.path.join(ur_bi_dw_dir,'month_of_year_sales_amt.csv'), dtype={'dates.month_of_year':str}, index_col=0) # 月销售数据
    
    data_dir = os.path.join(common_ur_bi_dir,'dwd_daily_sales_size_all')
    cache_file_dir = os.path.join(labels_dir,'dwd_daily_sales_size_all')
    if not os.path.exists(cache_file_dir):
        os.makedirs(cache_file_dir)
    def data_convert(
        data:pd.DataFrame,
        dim_goods:pd.DataFrame,
        dim_date:pd.DataFrame,
        **kwargs,
        ):
        # 关联时间数据
        data = data.merge(
            dim_date[[
                'date_key','month_of_year',
            ]],
            on=['date_key'],
            how='inner',
        )
        # 关联商品数据
        data = data.merge(
            dim_goods[[
                'sku_no',
                'serial',
            ]].rename(columns={
                'sku_no':'sku_no',
                'serial':'goods.serial',
            }),
            on=['sku_no'],
            how='inner',
        )
        # 系列Y转W
        data['goods.serial'] = data['goods.serial'].map(lambda x: 'W' if x=='Y' else x)
        # 按 店-周 汇总数据
        data = data.groupby([
            'month_of_year',
            'shop_no',
            'goods.serial',
        ]).agg({
            'sales_amt': 'sum',
        }).reset_index()
        return data
    sales_data = DataHelper.load_split_data(
        data_dir=data_dir,
        cache_file_dir=cache_file_dir,
        logger=logger,
        convert_fun=data_convert,
        delete_last_cache=True,
        data_dtype={
            'date_key':int,
            'shop_no':str,
            'sku_no':str,
            'tag_price':float,
            'sales_qty':float,
            'sales_tag_amt':float,
            'sales_amt':float,
            'sales_count':int,
        },
        cache_dtype={
            'month_of_year':str,
            'shop_no':str,
            'goods.serial':str,
            'sales_amt':float,
        },
        dim_date=dim_date,
        dim_goods=dim_goods,
    )
    # 销售数据值处理
    sales_data['goods.serial'] = sales_data['goods.serial'].map(null_convert) # 空值转换

    # 筛选销售数据
    sales_data = sales_data[
        (sales_data['goods.serial'].isin(['A','M','W']))
        &(sales_data['shop_no'].isin(shop_data_filter['shop_no']))
    ]

    # 按系列汇总金额
    sales_group = sales_data.groupby([
        'shop_no',
        'goods.serial',
        'month_of_year',
        ])['sales_amt'].sum()
    sales_data = sales_group.reset_index()


    # 去掉月负数的,不然会导致训练异常,不能包含0,不然占比全为0
    sales_data = sales_data[sales_data['sales_amt']>0]

    return sales_data
pass
def convert_sales_goal_data(
    ur_bi_dw_dir,
    shop_data_filter:pd.DataFrame,
    logger:logging.Logger):
    '''销售目标'''
    # 月销售目标数据
    sales_goal_data = pd.read_csv(
        os.path.join(ur_bi_dw_dir,'month_of_year_sales_goal_amt.csv'),
        index_col=0,
        dtype={
            'sales_goal.serial':str,
            'dates.month_of_year':str
        },
    )
    ''',sales_goal.shop_no,sales_goal.serial,dates.month_of_year,sales_goal_amt'''
    sales_goal_data = sales_goal_data.rename(columns={
        'sales_goal.serial':'goods.serial',
        'dates.month_of_year':'month_of_year',
    })
    # 销售数据值处理
    sales_goal_data['goods.serial'] = sales_goal_data['goods.serial'].map(null_convert) # 空值转换
    
    # 筛选销售数据
    sales_goal_data = sales_goal_data[
        (sales_goal_data['goods.serial'].isin(['A','M','W']))
        &(sales_goal_data['sales_goal.shop_no'].isin(shop_data_filter['shop_no']))
    ]
    # 按系列汇总金额
    sales_goal_group = sales_goal_data.groupby([
        'sales_goal.shop_no',
        'goods.serial',
        'month_of_year',
        ])['sales_goal_amt'].sum()
    sales_goal_data = sales_goal_group.reset_index()

    # 去掉月负数的,不然会导致训练异常,不能包含0,不然占比全为0
    sales_goal_data = sales_goal_data[sales_goal_data['sales_goal_amt']>0]

    return sales_goal_data
pass
def convert_predict_data(
    ims_dir,
    ):
    '''预测的占比数据'''
    predict_data = pd.read_csv(os.path.join(ims_dir,'ims_w_amt_pro.csv'), index_col=0)
    '''
    ,id,year,plan_season,month,serial,shop_no,forecast_proportion,create_time,create_by,modify_time,modify_by
    '''
    predict_data['month_of_year'] = predict_data['year'].map('{:04d}'.format) + predict_data['month'].map('{:02d}'.format)
    return predict_data
pass
def convert_sales_percentage_data(
    labels_dir,
    sales_data:pd.DataFrame,
    logger:logging.Logger):
    '''计算销售占比'''
    # 计算店铺每月系列总销售金额,计算占比
    sales_group_data = sales_data.groupby(['shop_no','month_of_year'])
    # 店铺每月总销售金额
    sales_data['sales_amt_sum'] = sales_group_data['sales_amt'].transform('sum')
    # 系列占比
    sales_data['sales_amt_percentage'] = sales_data['sales_amt'] / sales_data['sales_amt_sum']
    
    # 过滤掉异常数据
    sales_data = sales_data[
        (sales_data['sales_amt_percentage']>0)&
        (sales_data['sales_amt_percentage']<1)
    ]

    # 系列数据值处理
    serial_data = sales_data['goods.serial']
    serial_data = serial_data.drop_duplicates() # 去重
    serial_data = serial_data.sort_values() # 排序
    # 系列序号映射转换
    serial_dictionary = serial_data.values
    serial_dictionary = to_yml(serial_dictionary, os.path.join(labels_dir,'serial.yml'))
    # print('serial_dictionary:', serial_dictionary)

    return sales_data
pass
def convert_sales_goal_percentage_data(
    sales_goal_data:pd.DataFrame,
    logger:logging.Logger):
    '''计算销售占比'''
    # 计算店铺每月系列总销售金额,计算占比
    sales_goal_group_data = sales_goal_data.groupby(['sales_goal.shop_no','month_of_year'])
    # 店铺每月总销售金额
    sales_goal_data['sales_goal_amt_sum'] = sales_goal_group_data['sales_goal_amt'].transform('sum')
    # 系列占比
    sales_goal_data['sales_goal_amt_percentage'] = sales_goal_data['sales_goal_amt'] / sales_goal_data['sales_goal_amt_sum']
    
    # 过滤掉异常数据
    sales_goal_data = sales_goal_data[
        (sales_goal_data['sales_goal_amt_percentage']>0)&
        (sales_goal_data['sales_goal_amt_percentage']<1)
    ]
    # 占比行转列
    sales_goal_percentage_data = convert_column(
        df=sales_goal_data,
        columns=['sales_goal.shop_no','month_of_year','goods.serial'],
        unstack_column='sales_goal_amt_percentage',
        default_value=0.0
    )
    return sales_goal_data, sales_goal_percentage_data
pass
def convert_column(df:pd.DataFrame,columns:list,unstack_column:str,default_value=None):
    '''
    将行转列
    
    Args:
        columns: 固定列
        unstack_column: 拆分列
        default_value: 默认空值
    '''
    # 将行转置成列
    df_index_data = df.set_index(columns)[unstack_column]
    df_index_data = df_index_data.unstack()
    # df_index_data = df_index_data.rename_axis(columns=None)
    df_index_data = df_index_data.reset_index()
    # df_index_data = pd.pivot_table(df,index=columns[:-1],columns=columns[-1:],values=unstack_column,dropna=False)
    # # print('df_index_data.columns:', df_index_data.columns)
    # df_index_data = df_index_data.reset_index(drop=False)
    if default_value is not None:
        df_index_data = df_index_data.fillna(default_value)
    return df_index_data
pass
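# Example for convert_column (illustrative values): long-format rows
#   shop_no  month_of_year  goods.serial  sales_goal_amt_percentage
#   S001     202301         A             0.6
#   S001     202301         W             0.4
# with columns=['shop_no','month_of_year','goods.serial'] and
# unstack_column='sales_goal_amt_percentage' become one row per
# (shop_no, month_of_year) with one column per serial:
#   shop_no  month_of_year    A    W
#   S001     202301         0.6  0.4
# Missing serial combinations are filled with default_value (e.g. 0.0).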

def shop_labels_convert(
    other_dir,
    labels_dir,
    logger:logging.Logger,
):

    customer_labels_data = pd.read_csv(os.path.join(other_dir, 'shop_customer_labels.csv'))
    # 处理字符串列
    customer_labels_data = customer_labels_data.astype({
        'shopping_center_type':str,
        'shopping_center_position':str,
        'luxury_shopping_center':str,
        'fashion_type':str,
        'price_sensitivity_women':str,
        'price_sensitivity_men':str,
        'price_sensitivity_acc':str})
    
    # 生成映射
    # 购物中心类型序号映射转换
    shopping_center_type_dictionary = {}
    shopping_center_type_index = customer_labels_data[(customer_labels_data['shopping_center_type']=='nan')|(customer_labels_data['shopping_center_type']=='0')].index
    customer_labels_data.loc[shopping_center_type_index,'shopping_center_type'] = 'Unknown'
    # print(customer_labels_data)
    shopping_center_type = customer_labels_data['shopping_center_type'].drop_duplicates() # 去重
    shopping_center_type = shopping_center_type.sort_values() # 排序
    shopping_center_type_dictionary = shopping_center_type.values
    shopping_center_type_dictionary = to_yml(shopping_center_type_dictionary, os.path.join(labels_dir,'shopping_center_type.yml'))
    # print(shopping_center_type_dictionary)
    # 特征转下标
    customer_labels_data['shopping_center_type_index'] = customer_labels_data['shopping_center_type'].map(lambda x: shopping_center_type_dictionary['name2index'][x])


    # 商场定位趋势序号映射转换
    shopping_center_position_dictionary = {}
    shopping_center_position_index = customer_labels_data[(customer_labels_data['shopping_center_position']=='nan')|(customer_labels_data['shopping_center_position']=='0')].index
    customer_labels_data.loc[shopping_center_position_index,'shopping_center_position'] = 'Unknown'
    shopping_center_position = customer_labels_data['shopping_center_position'].drop_duplicates() # 去重
    shopping_center_position = shopping_center_position.sort_values() # 排序
    shopping_center_position_dictionary = shopping_center_position.values
    shopping_center_position_dictionary = to_yml(shopping_center_position_dictionary, os.path.join(labels_dir,'shopping_center_position.yml'))
    # print(shopping_center_position_dictionary)
    # 特征转下标
    customer_labels_data['shopping_center_position_index'] = customer_labels_data['shopping_center_position'].map(lambda x: shopping_center_position_dictionary['name2index'][x])


    # 是否重奢序号映射转换
    luxury_shopping_center_dictionary = {}
    luxury_shopping_center_index = customer_labels_data[(customer_labels_data['luxury_shopping_center']=='nan')|(customer_labels_data['luxury_shopping_center']=='0')].index
    customer_labels_data.loc[luxury_shopping_center_index,'luxury_shopping_center'] = 'Unknown'
    luxury_shopping_center = customer_labels_data['luxury_shopping_center'].drop_duplicates() # 去重
    luxury_shopping_center = luxury_shopping_center.sort_values() # 排序
    luxury_shopping_center_dictionary = luxury_shopping_center.values
    luxury_shopping_center_dictionary = to_yml(luxury_shopping_center_dictionary, os.path.join(labels_dir,'luxury_shopping_center.yml'))
    # print(luxury_shopping_center_dictionary)
    # 特征转下标
    customer_labels_data['luxury_shopping_center_index'] = customer_labels_data['luxury_shopping_center'].map(lambda x: luxury_shopping_center_dictionary['name2index'][x])


    # 商场时尚度 序号映射转换
    fashion_type_dictionary = {}
    fashion_type_index = customer_labels_data[(customer_labels_data['fashion_type']=='nan')|(customer_labels_data['fashion_type']=='0')].index
    customer_labels_data.loc[fashion_type_index,'fashion_type'] = 'Unknown'
    fashion_type = customer_labels_data['fashion_type'].drop_duplicates() # 去重
    fashion_type = fashion_type.sort_values() # 排序
    fashion_type_dictionary = fashion_type.values
    fashion_type_dictionary = to_yml(fashion_type_dictionary, os.path.join(labels_dir,'fashion_type.yml'))
    # print(fashion_type_dictionary)
    # 特征转下标
    customer_labels_data['fashion_type_index'] = customer_labels_data['fashion_type'].map(lambda x: fashion_type_dictionary['name2index'][x])


    # 客群价格敏感度|WOMEN 序号映射转换
    price_sensitivity_women_dictionary = {}
    price_sensitivity_women_index = customer_labels_data[(customer_labels_data['price_sensitivity_women']=='nan')|(customer_labels_data['price_sensitivity_women']=='0')].index
    customer_labels_data.loc[price_sensitivity_women_index,'price_sensitivity_women'] = 'Unknown'
    price_sensitivity_women = customer_labels_data['price_sensitivity_women'].drop_duplicates() # 去重
    price_sensitivity_women = price_sensitivity_women.sort_values() # 排序
    price_sensitivity_women_dictionary = price_sensitivity_women.values
    price_sensitivity_women_dictionary = to_yml(price_sensitivity_women_dictionary, os.path.join(labels_dir,'price_sensitivity_women.yml'))
    # print(price_sensitivity_women_dictionary)
    # 特征转下标
    customer_labels_data['price_sensitivity_women_index'] = customer_labels_data['price_sensitivity_women'].map(lambda x: price_sensitivity_women_dictionary['name2index'][x])

    # 客群价格敏感度|MEN 序号映射转换
    price_sensitivity_men_dictionary = {}
    price_sensitivity_men_index = customer_labels_data[(customer_labels_data['price_sensitivity_men']=='nan')|(customer_labels_data['price_sensitivity_men']=='0')].index
    customer_labels_data.loc[price_sensitivity_men_index,'price_sensitivity_men'] = 'Unknown'
    price_sensitivity_men = customer_labels_data['price_sensitivity_men'].drop_duplicates() # 去重
    price_sensitivity_men = price_sensitivity_men.sort_values() # 排序
    price_sensitivity_men_dictionary = price_sensitivity_men.values
    price_sensitivity_men_dictionary = to_yml(price_sensitivity_men_dictionary, os.path.join(labels_dir,'price_sensitivity_men.yml'))
    # print(price_sensitivity_men_dictionary)
    # 特征转下标
    customer_labels_data['price_sensitivity_men_index'] = customer_labels_data['price_sensitivity_men'].map(lambda x: price_sensitivity_men_dictionary['name2index'][x])

    # 客群价格敏感度|ACC 映射转换
    price_sensitivity_acc_dictionary = {}
    price_sensitivity_acc_index = customer_labels_data[(customer_labels_data['price_sensitivity_acc']=='nan')|(customer_labels_data['price_sensitivity_acc']=='0')].index
    customer_labels_data.loc[price_sensitivity_acc_index,'price_sensitivity_acc'] = 'Unknown'
    price_sensitivity_acc = customer_labels_data['price_sensitivity_acc'].drop_duplicates() # 去重
    price_sensitivity_acc = price_sensitivity_acc.sort_values() # 排序
    price_sensitivity_acc_dictionary = price_sensitivity_acc.values
    price_sensitivity_acc_dictionary = to_yml(price_sensitivity_acc_dictionary, os.path.join(labels_dir,'price_sensitivity_acc.yml'))
    # print(price_sensitivity_acc_dictionary)
    # 特征转下标
    customer_labels_data['price_sensitivity_acc_index'] = customer_labels_data['price_sensitivity_acc'].map(lambda x: price_sensitivity_acc_dictionary['name2index'][x])

    # 去除无用列
    customer_labels_data = customer_labels_data.drop(columns=[
        'shopping_center_type','shopping_center_position','luxury_shopping_center',
        'fashion_type','price_sensitivity_women','price_sensitivity_men','price_sensitivity_acc'],axis=1)
    # print(customer_labels_data)

    # 处理客群总量列
    resident_index = customer_labels_data[customer_labels_data['resident_amount'] == '无'].index
    office_index = customer_labels_data[customer_labels_data['office_amount'] == '无'].index
    customer_labels_data.loc[resident_index,'resident_amount'] = 0
    customer_labels_data.loc[office_index,'office_amount'] = 0
    customer_labels_data = customer_labels_data.fillna(0).astype({'resident_amount':int, 'office_amount':int})
    # print(customer_labels_data)

    return customer_labels_data
pass
def convert_shop_features_data(
    shop_data_filter:pd.DataFrame,
    labels_dir,
    predict_date:datetime.datetime,
    logger:logging.Logger):
    shop_features_data = shop_data_filter.copy()
    # 计算开店时长
    shop_features_data.loc[shop_features_data['open_date']=='Unknown', 'open_date'] = predict_date.strftime('%Y-%m-%d')
    shop_features_data.loc[shop_features_data['new_open_date']=='Unknown', 'new_open_date'] = predict_date.strftime('%Y-%m-%d')
    shop_features_data['open_days'] = (predict_date-pd.to_datetime(shop_features_data['open_date'])).dt.days
    shop_features_data['new_open_days'] = (predict_date-pd.to_datetime(shop_features_data['new_open_date'])).dt.days
    # 列转下标,需要转下标的列
    columns_filter = [
        # 'shop_no', # 店铺编码
        # 'shop_name', # 店铺名称
        # 'brand', # 品牌
        # 'currency_code', # 币种
        # 'shop_type', # 店铺类型
        # 'shop_type_desc', # 店铺类型描述
        'shop_sort', # 店铺归类
        # 'shop_sort_desc', # 店铺归类描述
        'region_no', # 大区编码
        # 'region_name', # 大区名称
        'subregion_no', # 小区编码
        # 'subregion_name', # 小区名称
        'bussinesscircle_code', # 商圈编码
        # 'bussinesscircle_name', # 商圈名称
        # 'country_code', # 国家编码
        # 'country_name', # 国家名称
        # 'province', # 省份编码
        'province_name', # 省份名称
        # 'city_code', # 城市编码
        'city_name', # 城市名称
        # 'city_level_code', # 城市等级编码
        'city_level_name', # 城市等级名称
        'district', # 城市区域
        # 'open_date', # 开业日期
        # 'new_open_date', # 再次开业日期
        'shop_property', # 店铺属性
        'is_new_shop', # 是否新老店
        'is_kids', # 是否童装店
        # 'is_same_shop', # 是否同店
        # 'is_promote', # 是否特卖
        'is_streetstore', # 是否街铺
        'is_outrant', # 是否外租仓
        # 'contract_size', # 合同面积
        # 'shop_size', # 实用面积
        # 'sales_size', # 销售面积
        'shop_dis_level', # 配货等级
        'sales_level', # 销售等级
        'image_level', # 形象级别
        'shop_level', # 店铺级别
        'ppsregion_no', # 企划分区
        # 'ppsregion_name', # 企划分区名称
        'temperature_zone', # 气温带
        'store_status', # 店铺状态
        # 'lon', # 经度
        # 'lat', # 纬度
        'store_centercode', # 分部编码
        # 'store_centername', # 分部名称
    ]
    # 所有列
    all_columns_filter = [
        'shop_no', # 店铺编码*
        # 'shop_name', # 店铺名称
        # 'brand', # 品牌
        # 'currency_code', # 币种
        # 'shop_type', # 店铺类型
        # 'shop_type_desc', # 店铺类型描述
        'shop_sort', # 店铺归类
        # 'shop_sort_desc', # 店铺归类描述
        'region_no', # 大区编码
        # 'region_name', # 大区名称
        'subregion_no', # 小区编码
        # 'subregion_name', # 小区名称
        'bussinesscircle_code', # 商圈编码
        # 'bussinesscircle_name', # 商圈名称
        # 'country_code', # 国家编码
        # 'country_name', # 国家名称
        # 'province', # 省份编码
        'province_name', # 省份名称
        # 'city_code', # 城市编码
        'city_name', # 城市名称
        # 'city_level_code', # 城市等级编码
        'city_level_name', # 城市等级名称
        'district', # 城市区域
        # 'open_date', # 开业日期*
        # 'new_open_date', # 再次开业日期*
        'open_days', # 开业天数*
        'new_open_days', # 再次开业天数*
        'shop_property', # 店铺属性
        'is_new_shop', # 是否新老店
        'is_kids', # 是否童装店
        # 'is_same_shop', # 是否同店
        # 'is_promote', # 是否特卖
        'is_streetstore', # 是否街铺
        'is_outrant', # 是否外租仓
        'contract_size', # 合同面积*
        'shop_size', # 实用面积*
        'sales_size', # 销售面积*
        'shop_dis_level', # 配货等级
        'sales_level', # 销售等级
        'image_level', # 形象级别
        'shop_level', # 店铺级别
        'ppsregion_no', # 企划分区
        # 'ppsregion_name', # 企划分区名称
        'temperature_zone', # 气温带
        'store_status', # 店铺状态
        'lon', # 经度*
        'lat', # 纬度*
        'store_centercode', # 分部编码
        # 'store_centername', # 分部名称
    ]
    select_columns = [] # 选中列
    dictionary_list = {}
    for c in all_columns_filter:
        # 判断是否需要转下标
        if c in columns_filter:
            # 去重
            values = shop_features_data[c].drop_duplicates().values
            values_indexes_dictionary = unknown_first(list(values))
            # 下标字典保存到文件
            values_indexes_dictionary = to_yml(values_indexes_dictionary, os.path.join(labels_dir, c+'.yml'))
            dictionary_list[c] = values_indexes_dictionary
            # 转下标
            shop_features_data[c+'_index'] = shop_features_data[c].map(lambda x: values_indexes_dictionary['name2index'][x])
            select_columns.append(c+'_index')
        else:
            select_columns.append(c)
    shop_features_data = shop_features_data[select_columns]
    return shop_features_data, dictionary_list
pass
def data_convert(
    other_dir,
    labels_dir,
    serial_ur_bi_dw_dir,
    ims_dir,
    predict_date:datetime.datetime,
    logger:logging.Logger,
):
    logger.info('开始数据转换')
    # 创建子目录
    if not os.path.exists(labels_dir):
        os.makedirs(labels_dir)

    # 创建子目录
    if not os.path.exists(other_dir):
        os.makedirs(other_dir)

    common_datas_dir = ShareArgs.get_args_value('common_datas_dir')
    common_ur_bi_dir = os.path.join(common_datas_dir, 'ur_bi_data')
    common_clickhouse_path = os.path.join(common_datas_dir, 'clickhouse_data')

    # 加载日期数据。去掉列的前缀,然后将季节进行转换
    dim_date = convert_date_data(common_ur_bi_dir, labels_dir)

    # 店铺数据。去掉前缀、修正数据、空值转换、数据过滤、挑选字段列、保存店铺编码和名称字典,将过滤数据进行返回
    shop_data_filter = convert_shop_data(
        common_ur_bi_dir=common_ur_bi_dir,
        labels_dir=labels_dir,
        logger=logger
    )
    # print('shop_data_filter:', shop_data_filter.columns)

    # 加载SKU数据
    dim_goods, serial_code2name_dictionary = convert_goods_data(
        common_ur_bi_dir=common_ur_bi_dir,
        labels_dir=labels_dir,
        logger=logger
    )

    # 加载销售数据。分批读取并转换数据,然后将数据返回和存起来,如果曾经转换过,就不再进行转换了。
    sales_data = convert_sales_data(
        labels_dir=labels_dir,
        common_ur_bi_dir=common_ur_bi_dir,
        shop_data_filter=shop_data_filter,
        dim_goods=dim_goods,
        dim_date=dim_date,
        logger=logger
    )

    # 加载销售目标数据。读取月销售目标数据、空值处理,筛选销售数据、按店月系列汇总金额
    sales_goal_data = convert_sales_goal_data(
        ur_bi_dw_dir=serial_ur_bi_dw_dir,
        shop_data_filter=shop_data_filter,
        logger=logger
    )

    # 加载预测占比数据。将年和月字段拼成一个年月字段
    predict_data = convert_predict_data(
        ims_dir=ims_dir,
    )
    # print('predict_data:', predict_data.columns)

    # 计算销售占比数据。计算店月所有系列总销售金额,即店铺每月总销售金额。然后计算每个系列占比
    sales_data = convert_sales_percentage_data(
        labels_dir=labels_dir,
        sales_data=sales_data,
        logger=logger
    )

    # 计算销售目标占比数据
    sales_goal_data, sales_goal_percentage_data = convert_sales_goal_percentage_data(
        sales_goal_data=sales_goal_data,
        logger=logger
    )

    # print('shop_data_filter:', shop_data_filter[shop_data_filter['shop_no']=='URCN0410'])
    # # 去除没销售的店
    # shop_data_filter = shop_data_filter[shop_data_filter['shop_no'].isin(sales_data['shop_no'])]

    # 当月销售占比数据
    now_sales_data = sales_data[sales_data['month_of_year'] == predict_date.strftime('%Y%m')]

    # 当月销售目标占比数据
    now_sales_goal_data = sales_goal_data[sales_goal_data['month_of_year'] == predict_date.strftime('%Y%m')]

    # 当月预测占比数据
    now_predict_data = predict_data[predict_data['month_of_year'] == predict_date.strftime('%Y%m')]
    
    # 店SKU数

    # 店已审核配货单

    # print('shop_data_filter:', shop_data_filter[shop_data_filter['shop_no']=='URCN0410'])

    # 转换店铺特征数据。将字符串特征变成数字编码,数字编码才可以进行embedding
    shop_features_data, dictionary_list = convert_shop_features_data(
        shop_data_filter=shop_data_filter,
        labels_dir=labels_dir,
        predict_date=predict_date,
        logger=logger
    )

    # 店铺和客群特征。
    customer_labels_data = shop_labels_convert(
        other_dir=other_dir,
        labels_dir=labels_dir,
        logger=logger
    )

    # 保存到文件。黄俊雄:训练和预测都是从这些文件里面读数据
    shop_data_filter.to_csv(os.path.join(labels_dir,'shop_data_filter.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'shop_data_filter.csv'))
    sales_data.to_csv(os.path.join(labels_dir,'sales_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'sales_data.csv'))
    sales_goal_percentage_data.to_csv(os.path.join(labels_dir,'sales_goal_percentage_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'sales_goal_percentage_data.csv'))
    now_sales_data.to_csv(os.path.join(labels_dir,'now_sales_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'now_sales_data.csv'))
    now_sales_goal_data.to_csv(os.path.join(labels_dir,'now_sales_goal_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'now_sales_goal_data.csv'))
    now_predict_data.to_csv(os.path.join(labels_dir,'now_predict_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'now_predict_data.csv'))
    shop_features_data.to_csv(os.path.join(labels_dir,'shop_features_data.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'shop_features_data.csv'))
    customer_labels_data.to_csv(os.path.join(labels_dir,'customer_labels_feature.csv'), index=False)
    logger.info('已导出: %s', os.path.join(labels_dir,'customer_labels_feature.csv'))
    logger.info('完成数据转换')
pass







# data_convert(
#     other_dir=other_dir,  # extra data supplied by the business side, not read from the database
#     labels_dir=labels_dir,  # where the preprocessed files are written
#     serial_ur_bi_dw_dir=serial_ur_bi_dw_dir,  # path to data needed by the share (proportion) model
#     ims_dir=ims_save_dir,  # file data read from the ims database, i.e. share data predicted by another model
#     predict_date=predict_date,  # 
#     logger=self.logger
# )
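
For reference, here is a minimal sketch of wiring up a call outside the original class. The directories other than labels_dir, the predict date, and the logger configuration are assumptions for illustration, not values taken from the source.

import logging
import datetime

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger('data_convert')

# labels_dir comes from the shared configuration above; the other paths are hypothetical.
labels_dir = ShareArgs.get_args_value('labels_dir')
other_dir = './yongjian/shop_group/month_w_amt/data/other'               # extra business data (assumed path)
serial_ur_bi_dw_dir = './yongjian/shop_group/month_w_amt/data/ur_bi_dw'  # sales-target data (assumed path)
ims_save_dir = './yongjian/shop_group/month_w_amt/data/ims'              # predicted-share data (assumed path)

data_convert(
    other_dir=other_dir,
    labels_dir=labels_dir,
    serial_ur_bi_dw_dir=serial_ur_bi_dw_dir,
    ims_dir=ims_save_dir,
    predict_date=datetime.datetime(2023, 2, 1),  # month being predicted (assumed)
    logger=logger,
)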