Python字典排序/序列数据切分/日期变换/正则清洗数据

原创已于 2022-05-12 09:59:05 修改 · 411 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#python

于 2020-03-25 19:50:00 首次发布

python经验总结专栏收录该内容

15 篇文章

订阅专栏

本文分享了使用Python进行数据处理的实用技巧，包括列表排序、Pandas序列切分、字符串清理及时间格式转换等，适用于数据预处理和分析场景。

自己偶尔会用到一些Python语法进行数据处理，但用的时候常常想不起来，去百度又要搜索半天，干脆自己记录下来，下次再用到时直接来copy。

以列表中的某个元素为索引进行排序

# A highlighted block
# `test` is a two-level list; order its rows by the first element of each row.
test = [[1, 2, 3], [9, 8, 7], [6, 5, 4]]
sort_test = list(test)  # copy first so `test` itself keeps its original order
sort_test.sort(key=lambda row: row[0], reverse=True)  # reverse=True -> descending

# Order a dict's (key, value) pairs either by key or by value.
test_dic = {'a': 1, 'b': 2, 'c': 6, 'e': 4, 'f': 9}
sort_test_dic = sorted(test_dic.items(), key=lambda pair: pair[0])  # ordered by key
# NOTE: the next line rebinds sort_test_dic, so only the by-value result remains.
sort_test_dic = sorted(test_dic.items(), key=lambda pair: pair[1])  # ordered by value

使用Pandas对序列进行切分

# A highlighted block
# 1. Bin a one-dimensional sequence using explicitly supplied bin edges.
import pandas as pd
import numpy as np

# Bin edges: neighbouring edges form the intervals (3, 6] and (6, 9].
list_cut = [3, 6, 9]

test = list(range(1, 11))
test_np = np.array(test)
test_cut = pd.cut(test_np, bins=list_cut)
test_cut_num = test_cut.value_counts()
print(test_cut_num)
# Printed counts:
# (3, 6]    3
# (6, 9]    3
# Values outside the edges (1, 2, 3 and 10 here) are not counted in any interval.
# test_cut_num can be iterated with a for loop to read each interval's count in turn.

# 2. Split a one-dimensional sequence into equally populated bins instead of
#    using predefined bin edges.
import pandas as pd
import numpy as np

test = list(range(1, 11))
test_np = np.array(test)
test_cut = pd.qcut(test_np, q=5)  # the difference from pd.cut: q=5 asks for 5 equal-sized groups
test_cut_num = test_cut.value_counts()
print(test_cut_num)
# Printed counts:
# (0.999, 2.8]    2
# (2.8, 4.6]      2
# (4.6, 6.4]      2
# (6.4, 8.2]      2
# (8.2, 10.0]     2
# The lowest edge is widened slightly (0.999 rather than 1) so the minimum falls inside.
# test_cut_num can be iterated with a for loop to read each interval's count in turn.

# 3 删除字符串中的数字和英文
import re
def clear(content):
    """Return *content* with every ASCII digit and ASCII letter removed."""
    return re.sub('[0-9a-zA-Z]', '', content)

# 4 全角转半角
def strQ2B(ustring):
    """Convert full-width characters in *ustring* to their half-width forms.

    The full-width space (code point 12288, U+3000) becomes an ASCII space.
    Code points 65281..65374 (U+FF01..U+FF5E) are shifted down by 65248 onto
    the ASCII range 0x21..0x7E. All other characters pass through unchanged.

    Returns the converted string; an empty input yields an empty string.
    """
    converted = []
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:
            # Full-width space has no fixed-offset counterpart; map it directly.
            inside_code = 32
        elif 65281 <= inside_code <= 65374:
            # Every other full-width form sits a constant offset above ASCII.
            inside_code -= 65248
        converted.append(chr(inside_code))
    # Join once at the end instead of growing a string with += in the loop.
    return "".join(converted)

# 5 将句子以标点符号分隔
import re
def split(con):
    """Split *con* at every punctuation mark or space listed in the pattern.

    Returns the list produced by the regex split. Adjacent delimiters, or a
    delimiter at either end of the input, produce empty strings in the result.
    """
    delimiters = re.compile(r',|\.|/|;|\'|`|\[|\]|<|>|\?|:|"|\{|\}|\~|!|@|#|\$|%|\^|&|\(|\)|-|=|\_|\+|，|。|、|；|‘|’|【|】|·|！| |…|（|）|:|：')
    return delimiters.split(con)

# 6.时间字符串互相转换

# 6.1 datetime -> string
import datetime
t = datetime.datetime.now()
t = t.strftime('%Y-%m-%d %H:%M:%S')

# 6.2 string -> datetime
t = '2022-04-22 10:10:10'
t = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')

# 6.3 date arithmetic
t = datetime.datetime.today()
# Only the `datetime` module is imported, so timedelta must be referenced
# through it; a bare `timedelta` raises NameError.
# Shift by hours
t = t - datetime.timedelta(hours=10)
# Shift by days
t = t - datetime.timedelta(days=10)

#7.使用正则表达式删除数据中不想要的内容
import re
con = '怎么huishi??....'
# Every character listed inside the [] character class is stripped from the text.
con = re.compile(r'[a-zA-Z,，.。?？！!]').sub('', con)
print(con)

参考：
https://www.cnblogs.com/houzichiguodong/p/9097790.html
https://www.cnblogs.com/kaituorensheng/p/3554571.html