简记一段数据清洗项目经历

Jo乔戈里

已于 2024-10-18 10:21:17 修改

阅读量886

点赞数 24

文章标签： sql database mysql python

于 2024-10-17 21:40:39 首次发布

本文链接：https://blog.youkuaiyun.com/a494665/article/details/143026648

版权

声明：该项目是从github上下载的，原项目地址在:
https://github.com/TurboWay/bigdata_analyse

这个数据分析项目选自其中的"Amoy_job", 即"10万条厦门招聘数据分析", 但是没有严格沿用里面的代码，绝大部分都是自己思考所做的数据清洗。注意，此处仅是数据清洗，尚未进入到真正的数据分析和可视化，得出结论这些阶段！

代码镇楼:

from collections import Counter
import swifter
import modin.pandas as pd
import json
import ray
import re
import os


class DataManager(object):
    PATH = "../dataset/job.csv"
    SAVE_PATH = "../dataset/job_clean.csv"

    @classmethod
    def load_external_data(cls):
        with open("../dataset/additional_data.json", "r", encoding="utf-8") as file:
            return json.load(file)

    @classmethod
    def load_raw_data(cls):
        return pd.read_csv(cls.PATH)

    @classmethod
    def save(cls, save_df: pd.DataFrame):
        from sqlalchemy import create_engine
        engine = create_engine("mysql+pymysql://root:your_password@localhost/data_analysis")
        save_df.to_sql(name="amoy_job", con=engine, index=False, if_exists="replace")
        save_df.to_csv(SAVE_PATH, index=False, encoding="utf_8_sig")


class FillMissingValue(object):
    def __init__(self, data_frame, _external_data):
        # 缺失值的补充，个人认为是最富有挑战性的
        # 异常值的数量远没有缺失值多，因此往往直接drop掉也无伤大雅，但缺失值就相当复杂了
        self.data_frame: pd.DataFrame = data_frame
        self.external_data = _external_data

    def fill_missing_value(self):
        self.data_frame["num"].fillna(3, inplace=True)
        self.data_frame["num"].replace({"若干": 6}, inplace=True)
        self.data_frame["lang"].fillna("不限", inplace=True)
        self.data_frame["age"].fillna("不限", inplace=True)
        self.data_frame["salary"].fillna("面议", inplace=True)
        self.data_frame["welfare"].fillna("面议", inplace=True)
        self.data_frame["industry"].replace({"其他行业": None}, inplace=True)
        self.data_frame["industry"] = self.data_frame.swifter.apply(self.fill_missing_industry, axis=1)
        self.data_frame["company_type"] = self.data_frame.apply(self.fill_missing_company_type, axis=1)

    def fill_missing_company_type(self, row):
        if pd.isnull(row["company_type"]):
            return self.external_data["query_company_type"][row["company"]]
        return row["company_type"]

    def fill_missing_industry(self, row: pd.Series):
        if pd.notnull(row["industry"]):
            return row["industry"]

        position_prefix = row["position"][:2]
        query_cond = f"position.str.startswith('{position_prefix}')"
        words: Counter = Counter()

        try:
            query_result: pd.DataFrame = self.data_frame.query(query_cond)["industry"].dropna()
        except SyntaxError:
            return "其他"

        for string in query_result:
            for word in re.split(r'\W', string):
                words[word] += 1

        words.pop("", None)
        words.pop("其他行业", None)
        most_common_keyword = words.most_common(3)
        industry: list = [keyword[0] for keyword in most_common_keyword]
        return "其他" if len(industry) == 0 else ",".join(industry)


class FormatField(object):
    def __init__(self, data_frame, _external_data):
        self.data_frame = data_frame
        self.external_data = _external_data

    def format_field(self):
        self.data_frame["sex"].replace({"无": "不限"}, inplace=True)
        self.data_frame["lang"].replace({"其他": "不限"}, inplace=True)
        self.data_frame["lang"] = self.data_frame["lang"].apply(self.__language_format)
        self.data_frame["age"] = self.data_frame["age"].apply(lambda x: x.replace("岁至", "-").replace("岁", ""))
        self.data_frame["education"] = self.data_frame["education"].apply(lambda x: x[:2])
        self.data_frame["work_experience"] = self.data_frame["work_experience"].apply(self.__work_experience_format)
        self.data_frame["industry"] = self.data_frame["industry"].apply(self.__industry_format)
        self.data_frame["company_type"] = self.data_frame["company_type"].apply(self.__company_type_format)

    @staticmethod
    def __work_experience_format(row):
        experience_match = r"(不限)|(.*?)工作经验以上|(应届生)"
        match_result = re.match(experience_match, row)
        if match_result is None:
            return "其他"
        for res in match_result.groups():
            if res is not None:
                return res

    def __industry_format(self, row):
        industry_map: dict = self.external_data["industry_map"]
        for industry, pattern in industry_map.items():
            if re.search(pattern, row):
                return industry

        return "其他"

    @staticmethod
    def __language_format(row):
        res = re.match(r"(.*?)语", row)
        return "不限" if res is None else res.group(0)

    @staticmethod
    def __company_type_format(row):
        if re.search(r"民营|私企", row):
            return "民营/私企"
        elif re.search(r"台资|港资", row):
            return "台资/港资"

        company_type_pattern = r"私营股份制|合资|外资|上市公司|国营企业|事业单位|外资代表处"
        search_result = re.search(company_type_pattern, row)
        if search_result:
            return search_result.group(0)
        else:
            return "其他"


class DataCleaning(object):
    def __init__(self, data_frame, _external_data):
        self.data_frame: pd.DataFrame = data_frame
        self.external_data = _external_data

    def drop_redundant_data(self):
        self.data_frame.drop_duplicates(inplace=True)
        self.data_frame.drop("phone", axis=1, inplace=True)
        self.data_frame.drop("HR", axis=1, inplace=True)
        self.data_frame.drop("workplace", axis=1, inplace=True)
        self.data_frame.drop("address", axis=1, inplace=True)

    def fill_missing_value(self):
        filler = FillMissingValue(self.data_frame, external_data)
        filler.fill_missing_value()

    def divestiture_field(self):
        def fill_lose_shift_type(row):
            if pd.isnull(row["shift_type"]):
                if pd.isnull(row["work_hours"]) and pd.isnull(row["work_days"]):
                    return None
                return "正常白班"
            return row["shift_type"]

        work_hour_regex_exp = r"([0-9.]+)小时/天"
        work_day_regex_exp = r"([0-9.]+天/周)|大小周"
        shift_type_regex_exp = r"(不定时工作制|正常白班|正常晚班|2班倒|3班倒)"
        worktime_field = self.data_frame["worktime"].str
        self.data_frame["work_hours"] = worktime_field.extract(work_hour_regex_exp, expand=False)
        self.data_frame["work_days"] = worktime_field.extract(work_day_regex_exp, expand=False)
        self.data_frame["shift_type"] = worktime_field.extract(shift_type_regex_exp, expand=False)
        self.data_frame["shift_type"] = self.data_frame.apply(fill_lose_shift_type, axis=1)
        self.data_frame.drop("worktime", axis=1, inplace=True)

    def deal_abnormal_value(self):
        self.data_frame["work_hours"].replace({40.0: 8.0})
        abnormal_salary_cond = self.data_frame["salary"].str.contains(r'^.-', regex=True, na=False)
        self.data_frame.drop(self.data_frame[abnormal_salary_cond].index, inplace=True)
        abnormal_work_hours = self.data_frame["work_hours"] > 16
        self.data_frame.drop(self.data_frame[abnormal_work_hours].index, inplace=True)
        # 这里的copy()是极为关键的一环，他确保了self.data_frame[abnormal_salary_cond].copy()得到的是副本而不是视图
        # SettingWithCopyWarning往往不是由对应行的代码引发而来的，要逐个函数检查!!
        # self.data_frame = self.data_frame[abnormal_salary_cond].copy()

    def format_field(self):
        formatter = FormatField(self.data_frame, self.external_data)
        formatter.format_field()

    def column_type_transform(self):
        self.data_frame["num"] = self.data_frame["num"].astype("int16")
        self.data_frame["work_hours"] = self.data_frame["work_hours"].astype("float16")


if __name__ == "__main__":
    # 配置好环境，启动Modin
    ray.init()
    swifter.register_modin()
    os.environ["MODIN_CPUS"] = "3"  # 限制 Modin 使用的 CPU 数量

    # 数据清洗
    data = DataManager.load_raw_data()
    external_data = DataManager.load_external_data()
    clean_data = DataCleaning(data, external_data)
    clean_data.drop_redundant_data()
    clean_data.fill_missing_value()
    clean_data.divestiture_field()
    clean_data.format_field()
    clean_data.column_type_transform()
    clean_data.deal_abnormal_value()
    DataManager.save(data)

文件目录:

__personal_process

__etl.py

__dataset

__addition_data.json

__job.csv

特别的，addition_data.json是自己搜集来的外部补充数据，文件数据如下:

{
    "query_company_type": {
        "厦门腾德鑫工贸有限公司": "民营/私营公司",
        "厦门博信强实业有限公司": "民营/私营公司",
        "厦门市黄家明仁堂商贸有限公司": "民营公司",
        "厦门市金佳财务代理有限公司": "民营/私营公司",
        "厦门昱优工贸有限公司": "民营/私营公司",
        "厦门兴宏山金属制品有限公司": "民营/私营公司",
        "厦门芸妆美容有限公司": "民营/私营公司",
        "厦门承翔伟业科技有限公司": "民营/私营公司",
        "厦门十里亭食品有限公司": "民营/私营公司",
        "厦门兆伦纸业有限公司": "民营/私营公司",
        "厦门市湖里区博林民办学校": "事业单位",
        "厦门正朝新物联科技有限公司": "民营/私营公司",
        "厦门思明修志夫整形外科门诊部有限公司": "民营/私营公司"
    },

    "industry_map": {
        "金融与投资行业": "金融|投资|证券|银行|会计|审计|保险",
        "互联网与信息技术行业": "互联网|电子|商务|网络|游戏|计算机|硬件|系统|数据|软件|通信|电信",
        "消费品与零售行业": "快速消费品|食品|饮料|化妆品|工艺品|服装|纺织|皮革|贸易|进出口|批发|零售",
        "建筑与房地产行业": "建筑|工程|原材料|加工|房地产开发|室内设计|装潢|物业管理|家具",
        "交通与物流行业": "交通|运输|物流|航天|航空",
        "服务业": "商业中心|服务|专业|咨询|人力资源|中介|物业|管理|商业|室内|设计|餐饮业|酒店|旅游",
        "媒体与娱乐行业": "影视|媒体|艺术|娱乐|休闲|体育",
        "制造与工业行业": "机械|设备|重工|技术|半导体|集成电路|印刷|包装|工业|自动化|零配件|仪器仪表",
        "教育与培训行业": "教育|培训|科研|学术",
        "医疗与健康行业": "医疗|护理|保健|卫生|美容",
        "法律与公关行业": "法律|广告|公关|市场|会展",
        "办公与用品行业": "办公|用品",
        "环保与农业行业": "环保|农业|渔业|林业|电力|水利",
        "化工与矿产行业": "化工|矿产|石油|采掘|冶炼",
        "产品开发与认证行业": "产品|检测|认证",
        "文字与出版行业": "文字|出版",
        "非盈利性机构": "非盈利",
        "多元化业务": "多元化业务"
    }
}

以及重命名了一下数据的字段，如下:

position,num,company,job_type,work_experience,lang,age,sex,education,workplace,worktime,salary,welfare,HR,phone,address,company_type,industry,require

其中简要说明一下各个字段的含义:

position-职位
num-公司招聘人数
company-公司名
job_type-工作类型，如全职/兼职
work_experience-工作经验要求
lang-语言要求
age-年龄要求
sex-性别要求
education-学历要求
workplace-工作地点
worktime-工作时间
salary-薪资
HR-面试官
phone-联系电话
address-公司地址
company_type-公司类型，如民企/私企等
industry-行业类型, 如服务业/制造业等
require-就职要求

先来总结整体思路, 见图:

先说难点，个人认为最富有挑战性的是③和⑧，数据清洗后效果也有诸多瑕疵，但是我们务必时刻记得数据清洗的目的是服务于数据分析的，数据分析不是一定要基于完全准确的数据，数据之中甚至是3%左右的异常值可能都不会对整个分析结果有太大的影响，因此这里的③虽然耗时很久但其实是可以偷懒直接drop掉缺失的industry的。OK，现在让我们开始!

阶段一、drop

①drop掉重复数据，这个pandas自带的drop_duplicate方法就能实现

self.data_frame.drop_duplicates(inplace=True)

②drop掉冗余字段。在表中，我们很容易知道一些数据对我们是没有必要分析的，比如HR和phone。很明显，这些是我们真的要投简历时才需要的字段，我们分析时根本用不到它们，没必要浪费时间在这些多余的字段上。

self.data_frame.drop("phone", axis=1, inplace=True)
self.data_frame.drop("HR", axis=1, inplace=True)

此外，我们要时刻清晰我们究竟关心什么。想研究地理位置对薪资的影响？因此保留work_place?

想研究公司地理位置对公司职位，招聘人数，薪资待遇的影响？我只能说，一口吃不成大象，由于整个数据段实在太庞大了，我们必须舍去一些研究价值不太大的字段。我们难道要通过研究这些地理位置的影响，决定今后我们去哪些市区找工作？仅对我而言，我只关心自己所居住的地方附加的公司与职位，因此也就没有探究地理关系对工作的影响，因此也是:

self.data_frame.drop("workplace", axis=1, inplace=True)
self.data_frame.drop("address", axis=1, inplace=True)

阶段二、fillna

在fillna时，我们首先要确定在整个数据结构中哪些数据是存在NaN值的，这个通过DataFrame.describe()方法就能知道

print(self.data_frame.describe())

缺失值的字段有: num, lang, age, salary, welfare, industry, company_type

文本型字段: lang, salary, welfare, industry, company_type

其中, 可推断的字段有，industry和company_type

不可推断的字段有: lang, salary, welfare

③fill可推断的字段:

industry

industry指的是这一职位所处的行业是什么。初步观察，这部分的缺失值在900个左右，占整体数据的0.9%左右，因此，事实上，直接drop掉缺失industry的行是没有任何关系的，不会影响整体的数据分析结果。但是要是以后遇到类似的情况，并且像industry这种字段缺失值比例相当大，这又该怎么办。因此我想到了一种比较不太精确的推断法，大致能推断对50%左右的industry。

事实上， industry和position字段以及require字段是有着直接关系的，我们完全可以根据缺失的industry所在行对应的position和require共同推测出industry。当然，本人为了简化情况，只选择使用position字段来推断industry，这当然是一种很质朴的减元思想。

要通过postion -> industry, 我们就必须明确position和industry之间到底存在什么关系。要知道，我们本身就拥有庞大的数据！我们可以基于已有数据建立position-industry之间的关系。

有高人肯定就要想到机器学习或其他更复杂的数学方法了。当然在此处我仍然是采用了一种非常质朴的方法，即词频统计法。

具体的思路是，通过先搜索缺失industry行对应的position行业，通过统计这个position行业绝大部分的industry是什么，得出缺失的industry值。

例如，我们发现一行数据:

IOS, 1, 厦门嘉乐道管理咨询有限公司, 全职, 五年, 不限, 不限, 不限, 本科, 20000-30000元/月, 面议, 民营/私企, , ...

这行数据缺失了industry, 他对应的position是IOS, 我们可以尝试使用SQL搜索:

SELECT industry FROM amoy_job WHERE position LIKE 'IOS%';

这样，我们可以得到一些列IOS这个职位对应的行业，通过词频统计，我们知道绝大部分的IOS都是互联网与信息技术行业，因此我们就可以粗略的推测这个缺失的industry很大可能也是互联网与信息技术行业。具体代码如下:

self.data_frame["industry"].replace({"其他行业": None}, inplace=True)
self.data_frame["industry"] = self.data_frame.swifter.apply(self.fill_missing_industry, axis=1)

    def fill_missing_industry(self, row: pd.Series):
        if pd.notnull(row["industry"]):
            return row["industry"]

        position_prefix = row["position"][:2]
        query_cond = f"position.str.startswith('{position_prefix}')"
        words: Counter = Counter()

        try:
            query_result: pd.DataFrame = self.data_frame.query(query_cond)["industry"].dropna()
        except SyntaxError:
            return "其他"

        for string in query_result:
            for word in re.split(r'\W', string):
                words[word] += 1

        words.pop("", None)
        words.pop("其他行业", None)
        most_common_keyword = words.most_common(3)
        industry: list = [keyword[0] for keyword in most_common_keyword]
        return "其他" if len(industry) == 0 else ",".join(industry)

代码的基本逻辑如上所说，只是有些细节不同，详见代码。

company_type

通过简单的检查，我们发现缺失company_type的行非常少，通过给ai直接喂对应的公司名company, 我们很轻松的就能补上缺失的company_type。这部分缺失数据放入了外部文件：

addition_data.json里面的query_company_type部分

不可推断的字段：

不可推断，意味着我们只能填充一些"保底"词汇，诸如: "面议", "不限", "其他"之类的。特别的，这部分不可推断的字段的缺失值比例往往远比哪些可推测的缺失值要大得多。

self.data_frame["lang"].fillna("不限", inplace=True)
self.data_frame["salary"].fillna("面议", inplace=True)
self.data_frame["welfare"].fillna("面议", inplace=True)

数字型字段: num, age

num的填充就相当简单了，通过下面的查询语句便可发现:

SELECT num, COUNT(1) FROM amoy_job GROUP BY num;

num的大小和它的出现频次基本呈现线性关系，因此我们完全可以根据缺失值以及"若干"出现的频次，合理推断其对应的num

至于age, 虽然age是数字，但是null值对应的语义不能直接粗暴的翻译成完全无限制。我们知道很多公司表面上没有入职年龄限制，实际上年龄稍大的可能根本过不了简历这一关。为了保证null值的语义，我们使用"不限"进行标注，而不是使用具体的岁数区间如18-55之类的完全宽松的岁数限制来代替。

self.data_frame["num"].fillna(3, inplace=True)
self.data_frame["num"].replace({"若干": 6}, inplace=True)
self.data_frame["age"].fillna("不限", inplace=True)

阶段三、abnormal_value

异常值通常存在于数字型字段中，数字型字段有哪些呢？

salary, worktime实际上都可以视为数字型字段。而其他真正的数字型字段如num, age在前一阶段已经检查后并不存在异常值，因此此处没有列举出来。

⑥对于salary, 通过语句:

SELECT DISTINCT salary FROM amoy_job;

我们发现一些奇葩的数据，诸如1-1元/天之类的，这种数据大概率是导入出错，我们简单查看一下这类数据的比例:

SELECT COUNT(1) / (SELECT COUNT(1) FROM amoy_job)
 FROM amoy_job WHERE salary LIKE '_-%'

发现这类数据比例很低，因此可以直接drop掉。否则，我们可能要使用工资的中位数进行填充。

对于worktime，为了方便异常值的识别以及研究，这里对这一字段进行了拆分, 将worktime变成了

三个字段: ①work_hours; ②work_days; ③shift_type;

    def divestiture_field(self):
        def fill_lose_shift_type(row):
            if pd.isnull(row["shift_type"]):
                if pd.isnull(row["work_hours"]) and pd.isnull(row["work_days"]):
                    return None
                return "正常白班"
            return row["shift_type"]

        work_hour_regex_exp = r"([0-9.]+)小时/天"
        work_day_regex_exp = r"([0-9.]+天/周)|大小周"
        shift_type_regex_exp = r"(不定时工作制|正常白班|正常晚班|2班倒|3班倒)"
        worktime_field = self.data_frame["worktime"].str
        self.data_frame["work_hours"] = worktime_field.extract(work_hour_regex_exp, expand=False)
        self.data_frame["work_days"] = worktime_field.extract(work_day_regex_exp, expand=False)
        self.data_frame["shift_type"] = worktime_field.extract(shift_type_regex_exp, expand=False)
        self.data_frame["shift_type"] = self.data_frame.apply(fill_lose_shift_type, axis=1)
        self.data_frame.drop("worktime", axis=1, inplace=True)

通过检查work_hours一天的工作时长极限理论∈[0, 24], 此处取 < 16, 得出异常的work_hours;

同理，对于work_days, 一周的天数∈[0, 7], 此处取 < 7, 得出异常的work_days;

这部分的异常值比例同样很低，因此也是直接drop, 因此异常值部分的⑦，事实上在这里没有进行实践，对异常值通通采用了drop的策略。

    def deal_abnormal_value(self):
        self.data_frame["work_hours"].replace({40.0: 8.0})
        abnormal_salary_cond = self.data_frame["salary"].str.contains(r'^.-', regex=True, na=False)
        self.data_frame.drop(self.data_frame[abnormal_salary_cond].index, inplace=True)
        abnormal_work_hours = self.data_frame["work_hours"] > 16
        self.data_frame.drop(self.data_frame[abnormal_work_hours].index, inplace=True)

事实上，在很多数据中，异常值的比例往往都不是很高。如果异常值能够体现什么额外有价值的信息，我们可以选择保留。否则，绝大多数情况下，我们都可以简单粗暴的直接drop掉，甚至不需要使用什么平均数/众数来填充。当然如果真的希望数据完整，填充也无伤大雅。

阶段四、format_field

这一部分的难度仅次于③的fillna有逻辑的字段值。格式化是一个令人略感纠结的工作，因为你很多时候很像保留字段更多的细节。所有的这些都取决于你最终的研究目的。比如company_type字段，在github下载的源码中，代码将company_type划分成了四类，见github代码:

    @staticmethod
    def __company_type_clean(x):
        if len(x) > 100 or '其他' in x:
            return '其他'
        elif re.findall('私营|民营', x):
            return '民营/私营'
        elif re.findall('外资|外企代表处', x):
            return '外资'
        elif re.findall('合资', x):
            return '合资'
        return x

这个划分可谓是相当粗略，如果我还想知道更多细节呢？因此我采用了保留更多细节的划分:

    @staticmethod
    def __company_type_format(row):
        if re.search(r"民营|私企", row):
            return "民营/私企"
        elif re.search(r"台资|港资", row):
            return "台资/港资"

        company_type_pattern = r"私营股份制|合资|外资|上市公司|国营企业|事业单位|外资代表处"
        search_result = re.search(company_type_pattern, row)
        if search_result:
            return search_result.group(0)
        else:
            return "其他"

其他各个字段的format_field就详见最上面的代码了。format_field本质上就是难在聚类的选择而已，我们到底要保留原字段多少信息？这个度的掌握是要严格服务我们最终研究目的的。

补充

在源代码中，特别的，使用了:

import swifter
import modin.pandas as pd
import ray

事实上，这些都是给pandas加速用的第三方库。由于在③中大量的query推测industry缺失值占用了大量时间，因此我在后续修改代码时寻求了这些加速手段。如果我们自己想通过诸如concurrent下的futures的ProcessPoolExecutor来进行多进程程序的编写，难度是相当大的，因为像Series对象和DataFrame对象的apply方法的执行是在内部操作的，我们似乎没办法直接使用多进程来加速器内部的执行操作。而且，就Python的多进程，它甚至需要拷贝解释器到另外一块独立内存中对进程代码进行执行，其代价是相当高的。我能想到的最质朴的方法就是预先用多个变量拷贝多个DataFrame对象的部分数据，然后分发给不同进程进行处理，最后callback回调进行数据汇集。当然这都是预想中的，本人并没有用代码进行实践，感兴趣的，请留下你的代码!