1. Generate new columns from the values of existing columns
1) Pass the current row to generate a single new column
import re
import pandas as pd

def get_ques_stem(question_id, question_desc_text):
    if "<com_q" in question_desc_text:
        patt = re.compile(r'<.{3}_q.{0,3}>(.*)</.{3}_q.{0,3}>', re.S)
        background = re.sub(patt, '', question_desc_text)
        return background
    else:
        return "###_" + str(question_id)

df = pd.read_excel(path, index_col=None, sheet_name="税法二")
df['ques_stem'] = df.apply(lambda row: get_ques_stem(row['question_id'], row['question_desc_text']), axis=1)
2) Pass the current row to generate multiple new columns
from bs4 import BeautifulSoup

def remove_invalid_item(s):
    soup = BeautifulSoup(s, 'html.parser')
    # part_invalid flags whether the text contained invalid fragments;
    # the detection logic is omitted here, a placeholder is used
    part_invalid = False
    handled_html_text = soup.encode_contents().decode()
    handled_search_text = soup.get_text()
    return handled_html_text, handled_search_text, part_invalid

df = pd.read_json(file_path, orient='records', lines=True)
df[['handled_html_text', 'handled_search_text', 'part_invalid']] = df.apply(
    lambda row: remove_invalid_item(row['html_text']), axis=1, result_type='expand')
3) Pass a specific column to generate a single new column
def get_priority(line):
    """Determine the data priority based on data_source."""
    if line is None or len(line) <= 0:
        return 5
    if line in country_tax:
        priority = 1
    elif "省" in line or "自治区" in line or cu.is_str_contains_any_element_of_array(line, provinces):
        priority = 2
    elif "市" in line or "12366" in line or cu.is_str_contains_any_element_of_array(line, areas):
        priority = 3
    else:
        priority = 4
    return priority

df['priority'] = df['data_source'].apply(lambda x: get_priority(x))
# the lambda can also be omitted, as shown below:
df['priority'] = df['data_source'].apply(get_priority)
4) Pass a specific column to generate multiple new columns
def extract_start_end_date_from_ans(ans):
    s, e = parse_start_end_date(ans)
    # log the rows where either date failed to parse
    if s is None or e is None:
        print(s, e)
    return pd.Series({'start_date': s, 'end_date': e})

df[['start_date', 'end_date']] = df['answer'].apply(extract_start_end_date_from_ans)
Approaches 3) and 4) are recommended, i.e. apply the function to a column rather than to rows: column-wise apply skips building a row object for every record, which gives a noticeable performance gain on large tables. A rough comparison is sketched below.
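A minimal timing sketch of the gap, assuming a toy one-column DataFrame (the column name and the len function are stand-ins for real data and logic):
import time
import pandas as pd

df = pd.DataFrame({'data_source': ['12366纳税服务平台'] * 1_000_000})

t0 = time.time()
df.apply(lambda row: len(row['data_source']), axis=1)  # row-wise: builds a Series per row
t1 = time.time()
df['data_source'].apply(len)                           # column-wise: iterates the raw values
t2 = time.time()
print("row-wise: %.2fs, column-wise: %.2fs" % (t1 - t0, t2 - t1))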
2. Grouping and aggregation
Group the data by a single field:
def get_attachment_dic():
    q_sql = "SELECT parent_id, download_url FROM shuiben_attachment_relevance WHERE deleted = 0 limit 10"
    df = fu.mysql_pd(q_sql, "prod")
    group_df = df.groupby('parent_id')
    group_df_keys = group_df.groups.keys()
    print("######### keys after grouping ##########")
    print(group_df_keys)
    # iterate over the keys produced by the grouping
    for k in group_df_keys:
        # fetch all attachment records that share the same parent_id
        grouped_rows = group_df.get_group(k).to_dict('records')
        print("######### all attachments under this regulation ##########")
        print(grouped_rows)

    # the iteration can also be written like this:
    for k, group in group_df:
        print(k)      # the group key
        print(group)  # the rows under that key, as a DataFrame
The printed output:
######### keys after grouping ##########
dict_keys([4, 6, 8, 12, 13])
######### all attachments under this regulation ##########
[{'parent_id': 4, 'download_url': 'upload/2023/01-18/00-03-130082-1841407387.docx'}]
######### all attachments under this regulation ##########
[{'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-470077701932422.pdf'}, {'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-570084-90822133.pdf'}]
######### all attachments under this regulation ##########
[{'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-1109521442465714.doc'}, {'parent_id': 8, 'download_url': None}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120183-2102821684.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120515-1103417705.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120730301869269.doc'}]
######### all attachments under this regulation ##########
[{'parent_id': 12, 'download_url': 'upload/2023/01-18/00-03-150019365343013.docx'}]
######### all attachments under this regulation ##########
[{'parent_id': 13, 'download_url': 'upload/2023/01-18/00-03-150460-1843529730.pdf'}]
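Since the function is called get_attachment_dic, the grouped rows are presumably meant to end up in a dict; a minimal sketch of that last step, assuming df is the frame queried above:
# one-step version: map each parent_id to the list of its download_urls
attachment_dic = df.groupby('parent_id')['download_url'].apply(list).to_dict()
# e.g. {4: ['upload/2023/01-18/00-03-130082-1841407387.docx'], ...}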
3. Reorder columns
df = fu.mysql_pd(sql, 'prod')
log.info("table %s row count: %d" % (tbl[0], len(df)))
# add a new column
df['data_type'] = tbl[0]
# re-select the columns in the desired order
df = df[['id', 'data_type', 'title', 'create_time', 'update_time']]
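If the only goal is to place one new column at a given position, DataFrame.insert does the add and the placement in one step, without re-listing every column (a sketch under the same setup as above):
df = fu.mysql_pd(sql, 'prod')
# insert data_type as the second column (index 1), right after id
df.insert(1, 'data_type', tbl[0])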
4. Select a subset of columns
df has many columns, but only a few of them are needed (e.g. just id and name):
def y12366_fagui():
    df = pd.read_json(r"../data/12366_fagui.json", orient="records", lines=True)
    df['nid'] = df.apply(lambda row: cu.get_bigint_id(), axis=1)
    # pick the required columns; they are also reordered to match this list
    results = df[['nid', 'id', 'title', 'state', 'source']]
    results.to_json(r"../data/12366_for_faiss.jsonl", orient="records", lines=True, force_ascii=False)
5. Extract an array column and explode it
The data in the file is structured as follows:
{"dayi_id": 414738, "dayi_title": "的解读", "matched_fagui": [{"id": 437858345536151, "title": "中国国际贸易促进委员会关于邀请参加“欧盟《一般数据保护条例》的解读与启示”合规讲坛的函", "poster": "中国国际贸易促进委员会", "law_type": "经济技术纠纷调解", "status": "现行有效"}]}
Read the file into a df and extract the data inside the matched_fagui field into a new df:
df = pd.read_json(r"../data/faguis_regrab.jsonl", orient="records", lines=True)
res = df['matched_fagui'].explode().to_list()
df = pd.DataFrame(res)
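One caveat: if some rows carry an empty matched_fagui list, explode turns them into NaN, which would become all-NaN rows in the new DataFrame; dropping them first avoids that (a sketch):
res = df['matched_fagui'].explode().dropna().to_list()
df = pd.DataFrame(res)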
6. Join two DataFrames and rename columns
unfound_dayi_df = pd.read_json(r"../data/faiss/fuzzy_unfounded_dayis.jsonl", orient="records", lines=True)
dayi_df = pd.read_json(r"../data/etled_dayi_for_faiss.jsonl", orient="records", lines=True)
# perform the join
result = pd.merge(dayi_df, unfound_dayi_df, left_on='id', right_on='dayi_id', how='inner')
# select the required columns
result = result[['id', 'title', 'cleared_title_x']]
# rename a column (merge has a suffixes parameter for columns present on both sides; the default is ("_x", "_y"))
result = result.rename(columns={'cleared_title_x': 'cleared_title'})
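Alternatively, the suffixes can be set when merging so that no rename step is needed afterwards (a sketch; the '_unfound' suffix is arbitrary):
# give only the right-hand duplicates a suffix, so the left-hand
# cleared_title keeps its plain name
result = pd.merge(dayi_df, unfound_dayi_df, left_on='id', right_on='dayi_id',
                  how='inner', suffixes=('', '_unfound'))
result = result[['id', 'title', 'cleared_title']]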
7. Filter rows with a custom function
def filter_row(title):
    return not cu.is_str_contains_any_element_of_array(title, get_area_infos())

df = df[df.apply(lambda row: filter_row(row['title']), axis=1)]
The custom function must return True or False.
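In line with the advice in section 1, the same filter can also be applied to the column directly, which avoids the per-row overhead on large tables:
df = df[df['title'].apply(filter_row)]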
8. Sort by column values
sub_df = sub_df[
    ['nid', 'oid', 'answer', 'answer_html', 'publish_date', 'data_source', 'channel', 'original_source',
     'branch', 'priority']]
sub_df = sub_df.sort_values(by=['priority', 'publish_date'], ascending=[True, False])
The code above sorts first by priority in ascending order, then by publish_date in descending order.
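The sorted frame keeps its original index; if a fresh 0..n-1 index is wanted, sort_values accepts an ignore_index flag (a sketch):
# ignore_index=True relabels the result 0..n-1 instead of keeping the old index
sub_df = sub_df.sort_values(by=['priority', 'publish_date'],
                            ascending=[True, False], ignore_index=True)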