# coding=utf-8
import csv
import ntpath
import pandas as pd
import pyhdb
def parse_excel(df, file, program, ending_time):
    """Explode one test-CSV DataFrame into per-parameter rows and PARAMETER metadata.

    Derives lot / op / part-no / temperature / retest from the underscore-
    delimited *file* name, builds one row per (PART_ID, parameter) and hands
    them to ``sel_line``, then appends PARAMETER records to the module-level
    ``module_excel``.

    Relies on module-level globals: ``title``, ``unit``, ``limit_l``,
    ``limit_u``, ``prefix_dict_1``, ``prefix_dict_2``, ``module_excel``,
    ``parameter_id`` and ``is_chart``.
    """
    # Token 1 of the file name is the process/step group (e.g. 'FT2').
    file_group = file.replace('__', '_').split('_')[1]
    if file_group not in prefix_dict_1:
        prefix_dict_1[file_group] = []
        prefix_dict_2[file_group] = []
    lots = file.replace('__', '_').split('_')[0]
    op = file.replace('__', '_').split('_')[2]
    part_no = file.replace('__', '_').split('_')[3]
    if len(op) < 2:
        return
    # Map the op token to a retest count; anything outside T1/P1/R1/R2/R3 is skipped.
    if 'T1' == op or 'P1' == op:
        op = 'P1'
        retest = 0
    elif 'R1' == op:
        retest = 1
    elif 'R2' == op:
        retest = 2
    elif 'R3' == op:
        retest = 3
    else:
        print('\t', '非T1R123')
        return
    # Temperature is encoded in token 4: FA=25, FH=150, FM=80, otherwise 0.
    if 'FA' in file.replace('__', '_').split('_')[4]:
        temperature = 25
    elif 'FH' in file.replace('__', '_').split('_')[4]:
        temperature = 150
    elif 'FM' in file.replace('__', '_').split('_')[4]:
        temperature = 80
    else:
        temperature = 0
    res_data = []
    # Handle regular and irregular csv files (original note: 对规则和不规则csv文件判断)
    df_group = df.groupby('PART_ID')
    for key in df_group.groups:
        pro_data = df_group.get_group(key)
        now_row = None
        for idx, row in pro_data.iterrows():
            # Skip groups keyed by an empty/blank PART_ID.
            if key is None or len(key.strip()) == 0:
                continue
            if now_row is None:
                now_row = row
            else:
                # NOTE(review): both branches assign the same value, so this
                # effectively keeps the LAST row of the group — confirm intended.
                now_row = row
        if now_row is None:
            continue
        # One output row per non-blank parameter column.
        for x in range(len(title)):
            # print(title)
            nt = title[x]
            if len(nt.strip()) <= 0:
                continue
            line = [file, ending_time, key, lots, part_no, program, file_group, temperature, retest, nt,
                    now_row[nt], now_row['T_TIME'], now_row['SOFT_BIN'], op]
            # print(line)
            res_data.append(line)
    sel_line(res_data)
    # Emit PARAMETER metadata once (only for the first DataFrame row index).
    for idx, rows in df.iterrows():
        if idx < 1:
            global parameter_id, is_chart
            # NOTE(review): `is_chart` is never initialised at module level;
            # the `else` branch would raise NameError on first use — confirm.
            if parameter_id <= 50:
                is_chart = 1
            else:
                is_chart += 1
            for x in range(len(title)):
                nt = title[x]
                if len(nt.strip()) <= 0:
                    continue
                module_line = ['AUTOMOTIVE', part_no, file_group, nt, limit_l[x], limit_u[x],
                               unit[x], parameter_id, program, parameter_id, unit[x], 1]
                parameter_id += 1
                is_chart += 1
                # print(module_line)
                # PARAMETER
                module_excel.append(module_line)
# Data formatting (original note: 数据格式化)
def sel_line(res_data):
    """Distribute parsed rows into the Fixed / Other / Devices staging lists.

    res_data: rows built by ``parse_excel``, one per (part, parameter).
    Appends into module-level ``prefix_dict_1`` (Other rows, value-filtered),
    ``prefix_dict_2`` (Devices rows, de-duplicated) and ``all_ft`` (Fixed
    rows, de-duplicated).
    """
    # NOTE(review): the last column is named 'lots' but receives the *op*
    # token appended by parse_excel, while 'lot_id' holds the lot — confirm.
    res_header = ['file', 'ending_time', 'part_id', 'lot_id', 'part_no', 'program', 'file_group', 'temperature',
                  'retest', 'nt', 'value', 't_time', 'soft_bin', 'lots']
    df = pd.DataFrame(res_data, columns=res_header)
    df_group = df.groupby('part_id')
    for key in df_group.groups:
        # print(key)
        pro_data = df_group.get_group(key)
        for idx, row in pro_data.iterrows():
            out_line = row
            # Fixed
            line_3 = [out_line['file'], out_line['ending_time'], out_line['part_id'], out_line['lot_id'],
                      out_line['lot_id'], out_line['part_no'], 0, 0, '', '', out_line['program'],
                      out_line['file_group'],
                      out_line['temperature'], out_line['lots'], 0, 0, '', 'TF', 0, 0]
            # Other
            line_1 = [out_line['part_id'], 0, 0, out_line['retest'], out_line['nt'], out_line['value'],
                      out_line['program'], out_line['lot_id'], out_line['ending_time']]
            # Devices
            line_2 = [out_line['part_id'], 0, 0, out_line['retest'], out_line['t_time'], out_line['soft_bin'],
                      out_line['soft_bin'], out_line['lot_id'], '', 1, out_line['program'], out_line['ending_time']]
            # Keep only measurements with a non-empty value.
            if not pd.isna(out_line['value']) and len(str(out_line['value'])) > 0:
                prefix_dict_1[out_line['file_group']].append(line_1)
            # De-duplicate via list membership (O(n) scans per append).
            if line_2 not in prefix_dict_2[out_line['file_group']]:
                prefix_dict_2[out_line['file_group']].append(line_2)
            if line_3 not in all_ft:
                all_ft.append(line_3)
# Fixed — staging-csv header for AUTOMOTIVE_FT_DEVICES_FIXED_ITEM.
# NOTE(review): 'FLIE_NAME' and 'RASS_DIE' look like typos of FILE_NAME /
# PASS_DIE, but the same keys are read back below — keep them in sync.
header = ['FLIE_NAME', 'ENDING_TIME', 'WAFER_ID', 'LOT_ID', 'C_LOT', 'PART_NO', 'RECORDS', 'INSERT_NUM', 'UPDATE_NUM',
          'MACHINE_NAME', 'PROGRAM', 'STEP', 'TEMPERATURE', 'FLOW', 'PARA_RECORDS', 'PARA_INSERT', 'PARA_UPDATE',
          'VENDOR', 'RAW_WAFERID', 'RASS_DIE']
# Other
prefix_header_1 = ['WAFER_ID', 'LOCATE_X', 'LOCATE_Y', 'RETEST', 'TEST_NAME', 'VALUE', 'PROGRAM', 'LOT_ID',
                   'ENDING_TIME']
# Devices
prefix_header_2 = ['WAFER_ID', 'LOCATE_X', 'LOCATE_Y', 'RETEST', 'T_TIME', 'SOFT_BIN', 'HARD_BIN', 'LOT_ID', 'PART_ID',
                   'SITE_NUM', 'PROGRAM', 'ENDING_TIME']
# Color
new_header = ['SOFT_BIN', 'BIN_NAME', 'PROJECT_TYPE', 'PART_NO', 'PROCESS', 'TEST_PROGRAM', 'COLOR', 'CREATE_DATE']
# Parameter
module_header = ['PROJECT_TYPE', 'PART_NO', 'PROCESS', 'PARAMETER', 'LIMIT_L', 'LIMIT_H', 'UNIT', 'TEST_NUM',
                 'TEST_PROGRAM', 'PARAMETER_ID', 'DISPLAY_UNIT', 'IS_CHART']
new_excel = []     # COLOR rows
module_excel = []  # PARAMETER rows
# Output folder (original note: 输出文件夹)
out_file_dir = 'Output'
# Input csv path (original note: 读取excel目录)
file_dir = r'ASAASDAASDC_FT2_P1_ASDASFS_FM111_A3_SR01_2022-02-02_01_11_11_.csv'
# Basename of the input path (Windows-style separator).
file = file_dir.split('\\')[-1]
prefix_dict_1 = {}  # file_group -> Other rows
prefix_dict_2 = {}  # file_group -> Devices rows
all_ft = []         # Fixed rows
num = 0             # NOTE(review): reassigned before every use below
xlsx_file = file_dir
read_data = []
head = None
new_head = None
# File-name tokens 5-6 form the program name; tokens 7-10 the ending time.
program = file.replace('__', '_').split('_')[5] + '_' + file.replace('__', '_').split('_')[6]
file_group = file.replace('__', '_').split('_')[1]
ending_time = file.replace('__', '_').split('_')[7] + ' ' + file.replace('__', '_').split('_')[8] + ':' + \
              file.replace('__', '_').split('_')[9] + ':' + file.replace('__', '_').split('_')[10].split('.')[0]
# print(file_group)
parameter_id = 1
# Read the raw csv: everything after the 'SITE_NUM' header row is data;
# 'SBin[...]' legend rows feed the COLOR staging list.
with open(xlsx_file, encoding='gbk') as f:
    row = csv.reader(f, delimiter=',')
    for idx, r in enumerate(row):
        if len(r) <= 0:
            continue
        # Once a header has been seen, every subsequent row is data.
        if head is not None or new_head is not None:
            read_data.append(r)
        first = r[0]
        # Locate the data start row (original note: 初始化数据开始行).
        if 'SITE_NUM' in first:
            head = r
        elif 'SBin' in first:
            # "SBin[n] Name__..." — extract bin number and bin name.
            l1 = first.split(']')[0].replace('SBin[', '')
            if 'Fail__Default' in first:
                l2 = 'Fail__Default'
            elif 'Pass__Default' in first:
                l2 = 'Pass__Default'
            else:
                l2 = first.split()[1].replace('__AllFail', '')
            part_no = file.replace('__', '_').split('_')[0]
            # COLOR
            new_excel.append([l1, l2, 'AUTOMOTIVE', part_no, file_group, program, 'test color', ''])
# print('\t', 'data start row:', head)
map = {}  # NOTE(review): unused, and shadows the builtin `map`
df = pd.DataFrame(read_data, columns=head, dtype=object)
# NOTE(review): remaps soft bin '3' to '1' in place — confirm intended.
df['SOFT_BIN'].replace('3', '1', inplace=True)
# Columns from index 6 on are measured parameters; the first three data rows
# carry unit, lower limit and upper limit respectively.
title = list(df.columns.values[6:])
unit = df.iloc[0, 6:].values
# print(unit)
limit_l = df.iloc[1, 6:].values
# print(limit_l)
limit_u = df.iloc[2, 6:].values
# print(limit_u)
parse_excel(df, file, program, ending_time)
"""--------------------------------------------------Upsert To HANA--------------------------------------------------"""
"""--------------------------------------------------Upsert To HANA--------------------------------------------------"""
"""--------------------------------------------------Upsert To HANA--------------------------------------------------"""
def get_connection():
    """Open and return a new pyhdb connection to the HANA instance.

    Credentials are intentionally blank in this sample. Tenant databases
    need the exact SQL port (e.g. 30053 — original note).
    """
    return pyhdb.connect(host="", port=34215, user="", password="")
"""---------------------------------------------------Fixed To HANA--------------------------------------------------"""
df_fixed = pd.DataFrame(all_ft, columns=header)
# print(df_fixed['WAFER_ID'])
# ----------------------------------------------------------------------------------------------------------------------
def select_wafer(file_wafer_ids):
cur = conn.cursor()
sql = "SELECT WAFERID,DEVICES_TIME FROM AUTOMOTIVE_FT_DEVICES_FIXED_ITEM WHERE WAFERID IN ({})" \
.format(','.join(["?" for _ in file_wafer_ids]))
cur.execute(sql, file_wafer_ids)
results = cur.fetchall()
return results
conn = get_connection()
cursor = conn.cursor()
# 取文件中的wafer_id 去查数据库
file_wafer_ids = list(df_fixed['WAFER_ID'])
results = select_wafer(file_wafer_ids)
if len(results) > 0:
# 库中有数据才处理, 转为id映射
db_id_time_map = {}
for res in results:
db_id_time_map[res[0]] = res[1]
file_del = []
for idx, rows in df_fixed.iterrows():
file_wafer_id = df_fixed['WAFER_ID']
if file_wafer_id in db_id_time_map:
db_time = db_id_time_map[file_wafer_id]
if db_time < file_wafer_id: # 如果数据库中的时间小于文件中的,删库
cur = conn.cursor()
sql = "DELETE FROM AUTOMOTIVE_FT_DEVICES_FIXED_ITEM WHERE WAFERID = ?"
cur.execute(sql, (file_wafer_id,))
else: # 文件 < 数据库, 删文件里的
file_del.append(idx)
df_fixed = df.drop(file_del)
df_fixed.to_csv(ntpath.join(out_file_dir, 'Automotive_FT_Devices_Fixed_Item.csv'),
header=header, encoding='gbk', index=False)
# -----------------------------------------------------------------------------------------------------------------------
def every_strand(list_temp, n):
    """Yield successive chunks of *n* items from list_temp (last chunk may be shorter)."""
    start = 0
    while start < len(list_temp):
        yield list_temp[start:start + n]
        start += n
def write_fixed_item(fixed_item):
    """UPSERT one batch of fixed-item rows into HANA and commit.

    Uses the module-level `fixed_table_name`, `cursor` and `conn`.
    Column order must match the `val` lists built by the caller.
    """
    columns = ['FILE_NAME', 'DEVICES_TIME', 'C_LOT', 'SUB_LOT', 'PART_NO',
               'WAFERID', 'RECORDS', 'INSERT_NUM', 'UPDATE_NUM', 'MACHINE_NAME',
               'PARA_RECORDS', 'PARA_INSERT', 'STEP', 'TEMPERATURE', 'TESTER_ID',
               'PARA_UPDATE', 'FLOW', 'RAW_WAFERID', 'PASS_DIE', 'VENDOR']
    placeholders = ','.join('?' * len(columns))
    sql = '''UPSERT "{}" ({}) values ({}) WITH PRIMARY KEY'''.format(
        fixed_table_name, ','.join(columns), placeholders)
    cursor.executemany(sql, fixed_item)
    conn.commit()
    print("==== FIXED Inserted Successfully ====")
fixed_table_name = 'AUTOMOTIVE_FT_DEVICES_FIXED_ITEM'
num = 1000  # rows per batch insert (original note: 每次插入的条数)
# NOTE(review): ENDING_TIME values carry a time-of-day component
# ('YYYY-MM-DD HH:MM:SS') while format='%Y-%m-%d' is date-only — confirm
# pd.to_datetime does not raise here.
df_fixed['ENDING_TIME'] = pd.to_datetime(df_fixed['ENDING_TIME'], format='%Y-%m-%d')
list_data_fixed = []
for idx, row in df_fixed.iterrows():
    # Map staging columns onto the HANA column order used by write_fixed_item.
    val = [row['FLIE_NAME'],
           row['ENDING_TIME'],
           row['C_LOT'],
           row['LOT_ID'],
           row['PART_NO'],
           row['WAFER_ID'],
           row['RECORDS'],
           row['INSERT_NUM'],
           row['UPDATE_NUM'],
           row['MACHINE_NAME'],
           row['PARA_RECORDS'],
           row['PARA_INSERT'],
           row['STEP'],
           row['TEMPERATURE'],
           row['PROGRAM'],
           row['PARA_UPDATE'],
           row['FLOW'],
           row['RAW_WAFERID'],
           row['RASS_DIE'],
           row['VENDOR']]
    # Normalise NaN and empty strings to SQL NULL.
    for x in range(len(val)):
        if pd.isna(val[x]):
            val[x] = None
        if val[x] == '':
            val[x] = None
    list_data_fixed.append(val)
# Batch rows `num` at a time, dropping the possibly-empty tail slice.
fixed_result = [list_data_fixed[i * num:(i + 1) * num] for i in range(int(len(list_data_fixed) / num) + 1) if
                list_data_fixed[i * num:(i + 1) * num]]
for fixed_item in fixed_result:
    write_fixed_item(fixed_item)
"""--------------------------------------------------Color To HANA---------------------------------------------------"""
df_color = pd.DataFrame(new_excel, columns=new_header)
df_color.to_csv(ntpath.join(out_file_dir, 'Global_Map_Color_V2.csv'),
header=new_header, encoding='gbk', index=False)
def write_color_item(color_item):
    """UPSERT one batch of COLOR rows into HANA and commit.

    Uses the module-level `color_table_name`, `cursor` and `conn`.
    """
    columns = ['PROJECT_TYPE', 'PART_NO', 'PROCESS', 'COLOR',
               'SOFT_BIN', 'BIN_NAME', 'TEST_PROGRAM']
    placeholders = ','.join('?' * len(columns))
    sql = '''UPSERT "{}" ({}) values ({}) WITH PRIMARY KEY'''.format(
        color_table_name, ','.join(columns), placeholders)
    cursor.executemany(sql, color_item)
    conn.commit()
    print("==== COLOR Inserted successfully ====")
color_table_name = 'GLOBAL_MAP_COLOR_V2'
num = 1000  # rows per batch insert (original note: 每次插入的条数)
list_data_color = []
for idx, row in df_color.iterrows():
    # Column order must match write_color_item.
    val = [row['PROJECT_TYPE'],
           row['PART_NO'],
           row['PROCESS'],
           row['COLOR'],
           row['SOFT_BIN'],
           row['BIN_NAME'],
           row['TEST_PROGRAM']]
    # Normalise NaN and empty strings to SQL NULL.
    for x in range(len(val)):
        if pd.isna(val[x]):
            val[x] = None
        if val[x] == '':
            val[x] = None
    list_data_color.append(val)
# Batch rows `num` at a time, dropping the possibly-empty tail slice.
color_result = [list_data_color[i * num:(i + 1) * num] for i in range(int(len(list_data_color) / num) + 1) if
                list_data_color[i * num:(i + 1) * num]]
for color_item in color_result:
    write_color_item(color_item)
"""------------------------------------------------Parameter To HANA-------------------------------------------------"""
df_parameter = pd.DataFrame(module_excel, columns=module_header)
df_parameter.to_csv(ntpath.join(out_file_dir, 'Global_Test_Parameter_V2.csv'),
header=module_header, encoding='gbk', index=False)
def write_parameter_item(parameter_item):
    """UPSERT one batch of PARAMETER rows into HANA and commit.

    Uses the module-level `parameter_table_name`, `cursor` and `conn`.
    """
    columns = ['PROJECT_TYPE', 'PART_NO', 'PROCESS', 'PARAMETER', 'LIMIT_L',
               'LIMIT_H', 'UNIT', 'TEST_NUM', 'TEST_PROGRAM', 'PARAMETER_ID',
               'DISPLAY_UNIT', 'IS_CHART']
    placeholders = ','.join('?' * len(columns))
    sql = '''UPSERT "{}" ({}) values ({}) WITH PRIMARY KEY'''.format(
        parameter_table_name, ','.join(columns), placeholders)
    # print(item)
    cursor.executemany(sql, parameter_item)
    conn.commit()
    print("==== PARAMETER Inserted successfully ====")
parameter_table_name = 'GLOBAL_TEST_PARAMETER_V2'
num = 1000  # rows per batch insert (original note: 每次插入的条数)
list_data_parameter = []
for idx, row in df_parameter.iterrows():
    # Column order must match write_parameter_item.
    val = [row['PROJECT_TYPE'],
           row['PART_NO'],
           row['PROCESS'],
           row['PARAMETER'],
           row['LIMIT_L'],
           row['LIMIT_H'],
           row['UNIT'],
           row['TEST_NUM'],
           row['TEST_PROGRAM'],
           row['PARAMETER_ID'],
           row['DISPLAY_UNIT'],
           row['IS_CHART']]
    # Normalise NaN and empty strings to SQL NULL.
    for x in range(len(val)):
        if pd.isna(val[x]):
            val[x] = None
        if val[x] == '':
            val[x] = None
    list_data_parameter.append(val)
# Batch rows `num` at a time, dropping the possibly-empty tail slice.
parameter_result = [list_data_parameter[i * num:(i + 1) * num] for i in range(int(len(list_data_parameter) / num) + 1)
                    if list_data_parameter[i * num:(i + 1) * num]]
for parameter_item in parameter_result:
    write_parameter_item(parameter_item)
"""--------------------------------------------------Other To HANA---------------------------------------------------"""
for key in prefix_dict_1:
df_other = pd.DataFrame(prefix_dict_1[key], columns=prefix_header_1)
df_other.to_csv(ntpath.join(out_file_dir, 'Automotive_' + key + '_Devices_Other_Items.csv'),
header=prefix_header_1, encoding='gbk', index=False)
other_table_name = 'AUTOMOTIVE_' + key + '_DEVICES_OTHER_ITEMS'
def write_other_item(other_item):
mark = '?,?,?,?,?,?,?,?,?' # sql values
cols_str = 'WAFERID,' \
'LOCATE_X,' \
'LOCATE_Y,' \
'RETEST,' \
'RESULT,' \
'PARAMETER,' \
'TEST_PROGRAM,' \
'DEVICES_TIME,' \
'SUB_LOT'
sql = '''UPSERT "{}" ({}) values ({}) WITH PRIMARY KEY'''.format(other_table_name, cols_str, mark)
# print(item)
cursor.executemany(sql, other_item)
conn.commit()
print("==== OTHER Inserted successfully ====")
num = 10000 # 每次插入的条数
df_other['ENDING_TIME'] = pd.to_datetime(df_other['ENDING_TIME'], format='%Y-%m-%d')
list_data_other = []
for idx, row in df_other.iterrows():
val = [row['WAFER_ID'],
row['LOCATE_X'],
row['LOCATE_Y'],
row['RETEST'],
row['VALUE'],
row['TEST_NAME'],
row['PROGRAM'],
row['ENDING_TIME'],
row['LOT_ID']]
for x in range(len(val)):
if pd.isna(val[x]):
val[x] = None
if val[x] == '':
val[x] = None
list_data_other.append(val)
other_result = [list_data_other[i * num:(i + 1) * num] for i in range(int(len(list_data_other) / num) + 1) if
list_data_other[i * num:(i + 1) * num]]
for other_item in other_result:
write_other_item(other_item)
"""-------------------------------------------------Devices To HANA--------------------------------------------------"""
for key in prefix_dict_2:
df_devices = pd.DataFrame(prefix_dict_2[key], columns=prefix_header_2)
df_devices.to_csv(ntpath.join(out_file_dir, 'Automotive_' + key + '_Devices.csv'),
header=prefix_header_2, encoding='gbk', index=False)
devices_table_name = 'AUTOMOTIVE_' + key + '_DEVICES'
def write_devices_item(devices_item):
mark = '?,?,?,?,?,?,?,?,?,?,?,?'
cols_str = 'WAFERID,' \
'LOCATE_X,' \
'LOCATE_Y,' \
'SOFT_BIN,' \
'DEVICES_TIME,' \
'RETEST,' \
'HARD_BIN,' \
'PART_ID,' \
'TEST_PROGRAM,' \
'SITE_NUM,' \
'TEST_TIME,' \
'SUB_LOT'
sql = '''UPSERT "{}" ({}) values ({}) WITH PRIMARY KEY'''.format(devices_table_name, cols_str, mark)
# print(item)
cursor.executemany(sql, devices_item)
conn.commit()
print("==== DEVICES Inserted successfully ====")
num = 1000 # 每次插入的条数
df_devices['ENDING_TIME'] = pd.to_datetime(df_devices['ENDING_TIME'], format='%Y-%m-%d')
list_data_devices = []
for idx, row in df_devices.iterrows():
val = [row['WAFER_ID'],
row['LOCATE_X'],
row['LOCATE_Y'],
row['SOFT_BIN'],
row['ENDING_TIME'],
row['RETEST'],
row['HARD_BIN'],
row['PART_ID'],
row['PROGRAM'],
row['SITE_NUM'],
row['T_TIME'],
row['LOT_ID']]
for x in range(len(val)):
if pd.isna(val[x]):
val[x] = None
if val[x] == '':
val[x] = None
list_data_devices.append(val)
devices_result = [list_data_devices[i * num:(i + 1) * num] for i in range(int(len(list_data_devices) / num) + 1) if
list_data_devices[i * num:(i + 1) * num]]
for devices_item in devices_result:
write_devices_item(devices_item)
print("TableName :" + other_table_name)
print("TableName :" + devices_table_name)
"""----------------------------------------------------HANA Close----------------------------------------------------"""
try:
cursor.close()
conn.close()
except Exception as e:
print("Close HANA :" + str(e))
pass
# Original trailing note: 比对删除库中数据 ("compare against the database and delete stale rows").
# Blog-paste residue (publication metadata, not code): 最新推荐文章于 2024-04-30 14:45:52 发布