1. Generate new columns from the values of existing columns
1) Pass the current row to generate a single new column
import re
import pandas as pd

def get_ques_stem(question_id, question_desc_text):
    if "<com_q" in question_desc_text:
        patt = re.compile(r'<.{3}_q.{0,3}>(.*)</.{3}_q.{0,3}>', re.S)
        background = re.sub(patt, '', question_desc_text)
        return background
    else:
        return "###_" + str(question_id)

df = pd.read_excel(path, index_col=None, sheet_name="税法二")
df['ques_stem'] = df.apply(lambda row: get_ques_stem(row['question_id'], row['question_desc_text']), axis=1)
2) Pass the current row to generate multiple new columns
from bs4 import BeautifulSoup

def remove_invalid_item(s):
    soup = BeautifulSoup(s, 'html.parser')
    # part_invalid flags whether the text contained invalid fragments;
    # the detection logic is omitted here, a placeholder is used
    part_invalid = False
    handled_html_text = soup.encode_contents().decode()
    handled_search_text = soup.get_text()
    return handled_html_text, handled_search_text, part_invalid

df = pd.read_json(file_path, orient='records', lines=True)
df[['handled_html_text', 'handled_search_text', 'part_invalid']] = df.apply(
    lambda row: remove_invalid_item(row['html_text']), axis=1, result_type='expand')
3) Pass a specific column to generate a single new column
def get_priority(line):
    """Determine the data priority based on data_source."""
    if line is None or len(line) <= 0:
        return 5
    if line in country_tax:
        priority = 1
    elif "省" in line or "自治区" in line or cu.is_str_contains_any_element_of_array(line, provinces):
        priority = 2
    elif "市" in line or "12366" in line or cu.is_str_contains_any_element_of_array(line, areas):
        priority = 3
    else:
        priority = 4
    return priority

df['priority'] = df['data_source'].apply(lambda x: get_priority(x))
# the lambda can also be omitted, as shown below:
df['priority'] = df['data_source'].apply(get_priority)
4) Pass a specific column to generate multiple new columns
def extract_start_end_date_from_ans(ans):
    s, e = parse_start_end_date(ans)
    # log the rows where either date failed to parse
    if s is None or e is None:
        print(s, e)
    return pd.Series({'start_date': s, 'end_date': e})

df[['start_date', 'end_date']] = df['answer'].apply(extract_start_end_date_from_ans)
Approaches 3) and 4) are recommended, i.e. apply the function to a column rather than to rows: column-wise apply skips building a row object for every record, which gives a noticeable performance gain on large tables. A rough comparison is sketched below.
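A minimal timing sketch of the gap, assuming a toy one-column DataFrame (the column name and the len function are stand-ins for real data and logic):
import time
import pandas as pd

df = pd.DataFrame({'data_source': ['12366纳税服务平台'] * 1_000_000})

t0 = time.time()
df.apply(lambda row: len(row['data_source']), axis=1)  # row-wise: builds a Series per row
t1 = time.time()
df['data_source'].apply(len)                           # column-wise: iterates the raw values
t2 = time.time()
print("row-wise: %.2fs, column-wise: %.2fs" % (t1 - t0, t2 - t1))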
2. Grouping and aggregation
Group the data by a single field:
def get_attachment_dic():
    q_sql = "SELECT parent_id, download_url FROM shuiben_attachment_relevance WHERE deleted = 0 limit 10"
    df = fu.mysql_pd(q_sql, "prod")
    group_df = df.groupby('parent_id')
    group_df_keys = group_df.groups.keys()
    print("######### keys after grouping ##########")
    print(group_df_keys)
    # iterate over the keys produced by the grouping
    for k in group_df_keys:
        # fetch all attachment records that share the same parent_id
        grouped_rows = group_df.get_group(k).to_dict('records')
        print("######### all attachments under this regulation ##########")
        print(grouped_rows)

    # the iteration can also be written like this:
    for k, group in group_df:
        print(k)      # the group key
        print(group)  # the rows under that key, as a DataFrame
The printed output:
######### keys after grouping ##########
dict_keys([4, 6, 8, 12, 13])
######### all attachments under this regulation ##########
[{'parent_id': 4, 'download_url': 'upload/2023/01-18/00-03-130082-1841407387.docx'}]
######### all attachments under this regulation ##########
[{'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-470077701932422.pdf'}, {'parent_id': 6, 'download_url': 'upload/2022/11-14/16-23-570084-90822133.pdf'}]
######### all attachments under this regulation ##########
[{'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-1109521442465714.doc'}, {'parent_id': 8, 'download_url': None}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120183-2102821684.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120515-1103417705.doc'}, {'parent_id': 8, 'download_url': 'upload/2023/01-18/00-03-120730301869269.doc'}]
######### all attachments under this regulation ##########
[{'parent_id': 12, 'download_url': 'upload/2023/01-18/00-03-150019365343013.docx'}]
######### all attachments under this regulation ##########
[{'parent_id': 13, 'download_url': 'upload/2023/01-18/00-03-150460-1843529730.pdf'}]
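Since the function is called get_attachment_dic, the grouped rows are presumably meant to end up in a dict; a minimal sketch of that last step, assuming df is the frame queried above:
# one-step version: map each parent_id to the list of its download_urls
attachment_dic = df.groupby('parent_id')['download_url'].apply(list).to_dict()
# e.g. {4: ['upload/2023/01-18/00-03-130082-1841407387.docx'], ...}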
3. Reorder columns
df = fu.mysql_pd(sql, 'prod')
log.info("table %s row count: %d" % (tbl[0], len(df)))
# add a new column
df['data_type'] = tbl[0]
# re-select the columns in the desired order
df = df[['id', 'data_type', 'title', 'create_time', 'update_time']]
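If the only goal is to place one new column at a given position, DataFrame.insert does the add and the placement in one step, without re-listing every column (a sketch under the same setup as above):
df = fu.mysql_pd(sql, 'prod')
# insert data_type as the second column (index 1), right after id
df.insert(1, 'data_type', tbl[0])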
4. Select a subset of columns
df has many columns, but only a few of them are needed (e.g. just id and name):
def y12366_fagui():
    df = pd.read_json(r"../data/12366_fagui.json", orient="records", lines=True)
    df['nid'] = df.apply(lambda row: cu.get_bigint_id(), axis=1)
    # pick the required columns; they are also reordered to match this list
    results = df[['nid', 'id', 'title', 'state', 'source']]
    results.to_json(r"../data/12366_for_faiss.jsonl", orient="records", lines=True, force_ascii=False)
5. Extract an array column and explode it
The data in the file is structured as follows:
{"dayi_id": 414738, "dayi_title": "的解读", "matched_fagui": [{"id": 437858345536151, "title": "中国国际贸易促进委员会关于邀请参加“欧盟《一般数据保护条例》的解读与启示”合规讲坛的函", "poster": "中国国际贸易促进委员会", "law_type": "经济技术纠纷调解", "status": "现行有效"}]}
Read the file into a df and extract the data inside the matched_fagui field into a new df:
df = pd.read_json(r"../data/faguis_regrab.jsonl", orient="records", lines=True)
res = df['matched_fagui'].explode().to_list()
df = pd.DataFrame(res)
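One caveat: if some rows carry an empty matched_fagui list, explode turns them into NaN, which would become all-NaN rows in the new DataFrame; dropping them first avoids that (a sketch):
res = df['matched_fagui'].explode().dropna().to_list()
df = pd.DataFrame(res)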
6. Join two DataFrames and rename columns
unfound_dayi_df = pd.read_json(r"../data/faiss/fuzzy_unfounded_dayis.jsonl", orient="records", lines=True)
dayi_df = pd.read_json(r"../data/etled_dayi_for_faiss.jsonl", orient="records", lines=True)
# perform the join
result = pd.merge(dayi_df, unfound_dayi_df, left_on='id', right_on='dayi_id', how='inner')
# select the required columns
result = result[['id', 'title', 'cleared_title_x']]
# rename a column (merge has a suffixes parameter for columns present on both sides; the default is ("_x", "_y"))
result = result.rename(columns={'cleared_title_x': 'cleared_title'})
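Alternatively, the suffixes can be set when merging so that no rename step is needed afterwards (a sketch; the '_unfound' suffix is arbitrary):
# give only the right-hand duplicates a suffix, so the left-hand
# cleared_title keeps its plain name
result = pd.merge(dayi_df, unfound_dayi_df, left_on='id', right_on='dayi_id',
                  how='inner', suffixes=('', '_unfound'))
result = result[['id', 'title', 'cleared_title']]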
7. Filter rows with a custom function
def filter_row(title):
    return not cu.is_str_contains_any_element_of_array(title, get_area_infos())

df = df[df.apply(lambda row: filter_row(row['title']), axis=1)]
The custom function must return True or False.
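In line with the advice in section 1, the same filter can also be applied to the column directly, which avoids the per-row overhead on large tables:
df = df[df['title'].apply(filter_row)]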
8. Sort by column values
sub_df = sub_df[
    ['nid', 'oid', 'answer', 'answer_html', 'publish_date', 'data_source', 'channel', 'original_source',
     'branch', 'priority']]
sub_df = sub_df.sort_values(by=['priority', 'publish_date'], ascending=[True, False])
The code above sorts first by priority in ascending order, then by publish_date in descending order.
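The sorted frame keeps its original index; if a fresh 0..n-1 index is wanted, sort_values accepts an ignore_index flag (a sketch):
# ignore_index=True relabels the result 0..n-1 instead of keeping the old index
sub_df = sub_df.sort_values(by=['priority', 'publish_date'],
                            ascending=[True, False], ignore_index=True)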