1数据提取和剔除打卡无效数据
# coding: utf-8
import numpy as np
import pandas as pd
import datetime
import collections
# 计算时间差函数
def difftime(click_in, click_out):
# 输入的日期和时间是字符串形式,需要先将字符串格式化为datetime形式。
time1 = datetime.datetime.strptime(click_in, "%H:%M")
time2 = datetime.datetime.strptime(click_out, "%H:%M")
num = (time2-time1).seconds/60
return num
# 得到每一个姓名下所有数据的行所引
def checkId(target, a):
b = []
for index, nums in enumerate(a):
if nums == target:
b.append(index)
return (b)
in_path = "D:\"
df = pd.read_excel(in_path, sheet_name="原始记录", skiprows=2, header=0)
print(df)
# 提取工时计算所需数据
Name = df["姓名"].astype("str")
Company = df["部门"].astype("str")
Date_attendance = df["考勤日期"].astype("str")
Clock_time = df["打卡时间"].astype("str")
Group_attendance = df["考勤组"].astype("str")
Clock_result = df["打卡结果"].astype("str")
# 原始数据
df_1 = {"姓名": Name, "部门": Company, "考勤组": Group_attendance, "考勤日期": Date_attendance, "打卡时间": Clock_time,
"打卡结果": Clock_result}
df_1 = pd.DataFrame(df_1)
df_1.to_excel("D:\\", index=False)
in_path = "D:\\"
df = pd.read_excel(in_path, skiprows=0, header=0)
print(df)
# 提取工时计算所需数据
Name = df["姓名"].astype("str")
Company = df["部门"].astype("str")
Group_attendance = df["考勤组"].astype("str")
Date_attendance = df["考勤日期"].astype("str")
Clock_time = df["打卡时间"].astype("str")
Clock_result = df["打卡结果"].astype("str")
Click_no_effect_list_all = []
# 1.找到打卡无效字样所在行的索引
for i in Clock_result:
if "打卡无效" in str(i):
Click_no_effect_list = Clock_result[(Clock_result == str(i))].index.tolist()
Click_no_effect_list_all = Click_no_effect_list_all + Click_no_effect_list
Click_no_effect_list_all = list(set(Click_no_effect_list_all))
Click_no_effect_delete_index_list = sorted(Click_no_effect_list_all)
# 2.按列表中数据剔除
df_1 = df.drop(df.index[Click_no_effect_delete_index_list])
df_1.reset_index(inplace=True, drop=True)
print(df_1)
df_1.to_excel("D:\\", index=False)
2.剔除重复打卡(留下最早最晚打卡数据)
# coding=utf-8
import numpy as np
import pandas as pd
import datetime
import collections
# 计算时间差函数
def difftime(click_in, click_out):
# 输入的日期和时间是字符串形式,需要先将字符串格式化为datetime形式。
time1 = datetime.datetime.strptime(click_in, "%H:%M")
time2 = datetime.datetime.strptime(click_out, "%H:%M")
num = (time2-time1).seconds/60
return num
# 得到每一个姓名下所有数据的行所引
def checkId(target, a):
b = []
for index, nums in enumerate(a):
if nums == target:
b.append(index)
return (b)
in_path = "D:\\"
df = pd.read_excel(in_path, skiprows=0, header=0)
print(df)
# 提取工时计算所需数据
Name = df["姓名"].astype("str")
Company = df["部门"].astype("str")
Date_attendance = df["考勤日期"].astype("str")
Clock_time = df["打卡时间"].astype("str")
Group_attendance = df["考勤组"].astype("str")
Clock_result = df["打卡结果"].astype("str")
# 数据预处理,提出打卡异常数据,打卡大于两次,保留最早和最晚打卡时间
# 每个人遍历,分别找到每个人的一天内多次打卡的异常数据
Every_Name = [item for item, count in collections.Counter(Name).items() if count > 1]
Many_clock_data_list_all = []
Delete_index_list = []
for t in Every_Name:
# 每一个人的所有打卡时间数据
Date_index_Every_person = checkId(t, Name)
Clock_time_Every_person = Clock_time[Date_index_Every_person]
Date_attendance_Every_person = Date_attendance[Date_index_Every_person]
# 考勤日期连续重复大于两次
pre_element = None
cur_time = 1
Many_clock_data_list = []
for s in Date_attendance_Every_person:
# 如果当前数据与前一个数据相同
if s == pre_element:
cur_time = cur_time + 1
if cur_time > 2:
# 找到连续重复的考勤日期的索引
Date_attendance_Every_person_repet_index = Date_attendance_Every_person[(Date_attendance_Every_person ==
str(s))].index
# 一天内某个人多次打卡数据
Clock_time_Every_person_repet = Clock_time_Every_person[Date_attendance_Every_person_repet_index]
print(Clock_time_Every_person_repet)
# Series类型 转化为 datetime类型
Clock_time_Every_person_repet_datatime = pd.to_datetime(Clock_time_Every_person_repet)
Clock_time_earliest = str(Clock_time_Every_person_repet_datatime.min())
Clock_time_latest = str(Clock_time_Every_person_repet_datatime.max())
# .min().max()后输出的时间格式与原数据时间格式不同,需切片处理成相同
Clock_time_earliest = Clock_time_earliest[0:16]
Clock_time_latest = Clock_time_latest[0:16]
# 最早打卡时间与最晚打卡时间数据可能对应多个索引
Clock_time_earliest_index = Clock_time_Every_person_repet[(Clock_time_Every_person_repet ==
Clock_time_earliest)].index.tolist()
Clock_time_latest_index = Clock_time_Every_person_repet[(Clock_time_Every_person_repet ==
Clock_time_latest)].index.tolist()
# 删除每个(最早、最晚)列表里的第一个时间数据,索引列表中其他索引用于df.drop,这样就唯一保存了一个人一天最早和最晚打卡数据
# 早不同,晚不同
if len(Clock_time_earliest_index) < 2 and len(Clock_time_latest_index) < 2:
Clock_time_earliest_latest = Clock_time_earliest_index + Clock_time_latest_index
Clock_time_other_index = set(Date_attendance_Every_person_repet_index).difference(
Clock_time_earliest_latest)
Clock_time_other_index = sorted(Clock_time_other_index)
print("删掉不在同一时间打卡的其他数据", Clock_time_other_index)
# 要删除数据的列表
Delete_index_list = Delete_index_list + Clock_time_other_index
if len(Clock_time_earliest_index) >= 2 and len(Clock_time_latest_index) < 2:
# 有多个相同,删掉列表里的第一个,实际是保留重复数据的第一个
Clock_time_earliest_index_one = Clock_time_earliest_index.pop(0)
Clock_time_earliest_latest = [Clock_time_earliest_index_one] + Clock_time_latest_index
Clock_time_other_index = set(Date_attendance_Every_person_repet_index).difference(
Clock_time_earliest_latest)
Clock_time_other_index = sorted(Clock_time_other_index)
print("2.删除元素", Clock_time_other_index)
Delete_index_list = Delete_index_list + Clock_time_other_index
# 同理删除最晚重复打卡除了第一个元素以外元素的索引
if len(Clock_time_latest_index) < 2 and len(Clock_time_latest_index) >= 2:
Clock_time_latest_index_one = Clock_time_latest_index.pop(0)
Clock_time_earliest_latest = [Clock_time_latest_index_one] + Clock_time_latest_index
Clock_time_other_index = set(Date_attendance_Every_person_repet_index).difference(
Clock_time_earliest_latest)
Clock_time_other_index = sorted(Clock_time_other_index)
df = df.drop(df.index[Clock_time_other_index])
print("3.删除元素", Clock_time_other_index)
Delete_index_list = Delete_index_list + Clock_time_other_index
if len(Clock_time_earliest_index) >= 2 and len(Clock_time_latest_index) >= 2:
Clock_time_earliest_index_one = Clock_time_earliest_index.pop(0)
Clock_time_latest_index_one = Clock_time_latest_index.pop(0)
Clock_time_earliest_latest = [Clock_time_earliest_index_one] + [Clock_time_latest_index_one]
Clock_time_other_index = set(Date_attendance_Every_person_repet_index).difference(
Clock_time_earliest_latest)
Clock_time_other_index = sorted(Clock_time_other_index)
print("4.删除元素", Clock_time_other_index)
Delete_index_list = Delete_index_list + Clock_time_other_index
else:
pre_element = s
cur_time = 1
df = df.drop(df.index[Delete_index_list])
print(df)
df.to_excel("D:\\", index=False)
3. 剔除单次打卡数据(获得数据运算数据源)
# coding=utf-8
import numpy as np
import pandas as pd
import datetime
import collections
# 得到每一个姓名下所有数据的行所引
def checkId(target, a):
b = []
for index, nums in enumerate(a):
if nums == target:
b.append(index)
return (b)
in_path = "D:\\LGD"
df = pd.read_excel(in_path, skiprows=0, header=0)
print(df)
# 提取工时计算所需数据
Name_1 = df["姓名"].astype("str")
Company_1 = df["部门"].astype("str")
Group_attendance_1 = df["考勤组"].astype("str")
Date_attendance_1 = df["考勤日期"]
Clock_time_1 = df["打卡时间"].astype("str")
Clock_result_1 = df["打卡结果"].astype("str")
Click_no_effect_list_all_1 = []
'找到单次打卡的索引'
# 1.找到大于等于两次打卡的索引
Every_Name_1 = [item for item, count in collections.Counter(Name_1).items() if count > 1]
Many_clock_data_list_all_1 = []
for t in Every_Name_1:
# 每一个人的所有打卡时间数据
Date_index_Every_person_1 = checkId(t, Name_1)
Clock_time_Every_person_1 = Clock_time_1[Date_index_Every_person_1]
Date_attendance_Every_person_1 = Date_attendance_1[Date_index_Every_person_1]
pre_element = None
cur_time = 1
Many_clock_data_list_1 = []
for s in Date_attendance_Every_person_1:
# 如果当前数据与前一个数据相同
if s == pre_element:
cur_time = cur_time + 1
if cur_time >= 2:
a = Date_attendance_Every_person_1[(Date_attendance_Every_person_1 == str(s))].index.tolist()
Many_clock_data_list_1 = Many_clock_data_list_1 + a
else:
pre_element = s
cur_time = 1
Many_clock_data_list_all_1 = Many_clock_data_list_all_1 + Many_clock_data_list_1
# 剔除重复索引
print(Many_clock_data_list_all_1)
Many_clock_data_list_all_1 = list(set(Many_clock_data_list_all_1))
print(Many_clock_data_list_all_1)
# 用总表的索引与连续重复大于等于两次表的索引取差集
df_index_list = df.index.tolist()
single_click_index = set(df_index_list).difference(Many_clock_data_list_all_1)
single_click_index_list = sorted(single_click_index)
print(single_click_index_list)
df = df.drop(df.index[single_click_index_list])
df = df.reset_index(drop=True)
print(df)
out_path = "D:\\LGD"
df.to_excel(out_path, index=False)