数据处理与Python技巧-优快云博客

本文介绍了如何使用Python进行数据处理的多种方法，包括时间差计算、数据格式转换、数据读取与截取、数据去重及字符串操作等。并详细展示了如何利用pandas和numpy库进行高效的数据操作。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
时间差：
from dateutil.parser import parse

# Parse two timestamps and inspect the resulting timedelta three ways.
a = parse('2017-10-01/12:12:12')
b = parse('2013-3-4/10:10:10')
elapsed = a - b
elapsed.days             # whole days contained in the difference
elapsed.seconds          # seconds left over after the whole days are removed
elapsed.total_seconds()  # the entire difference expressed in seconds

# Difference between two datetime-like values, converted to a number of seconds
# by dividing by a one-second timedelta64.
start, end = pd.to_datetime(a), pd.to_datetime(b)
tim = end - start
tim / np.timedelta64(1, 's')

out:start_date_time    2400.0
dtype: float64

其中a,b (start_date_time   2017-09-24 10:10:00
 Name: 6045571, dtype: datetime64[ns], start_date_time   2017-09-24 10:50:00
 Name: 0, dtype: datetime64[ns])

 * 1/17/07 has the format "%m/%d/%y"#y表示两位数的年份07
 * 17-1-2007 has the format "%d-%m-%Y"#Y表示四位数的年份2007
# Convert the 'date' column to datetime64 using an explicit format string.
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format = "%m/%d/%y")
# When the values do not share one format, infer_datetime_format=True asks pandas to guess it.
# NOTE(review): this line reads 'Date' while the line above reads 'date' — confirm the column name.
# NOTE(review): infer_datetime_format is deprecated in recent pandas releases — confirm the target version.
landslides['date_parsed'] = pd.to_datetime(landslides['Date'],infer_datetime_format=True)

day_of_month = landslides['date_parsed'].dt.day  # day-of-month component of each parsed date

数据读取：
# Load sheet 'Sheet1' of the Excel workbook into a DataFrame (raw string keeps the backslashes literal).
info_keliu = pd.read_excel(r'D:\project资料\公交.xlsx', sheet_name = 'Sheet1')


数据截取：
# Keep only the rows whose '居住地' value is not '未知'.
has_known_home = info_keliu['居住地'] != '未知'
info_kl = info_keliu[has_known_home]

# Capture whatever follows the '460-00-' prefix of ne_cgi into a new 'cgi' column.
cgi_pattern = '460-00-(.*)'
gz_dt['cgi'] = gz_dt['ne_cgi'].str.extract(cgi_pattern)

# Cast every column to str, then keep the rows whose '工作地' contains '禅城'.
info = info_kl.astype(str)
works_in_chancheng = info['工作地'].str.contains('禅城')
info_cc = info[works_in_chancheng]

数据去除null:
df.dropna(axis = 0, how = 'any')  # 只要有一个值为null就删除该行（axis = 1 时删除该列）
df.dropna(axis = 0, how = 'all')  # 整行都是null才删除（axis = 1 时为整列）

indexs = list(df[np.isnan(df['aaa'])].index)  # 或者：df[np.isnan(df['aaa'])].index.tolist()
df = df.drop(indexs)  # 删除nan行

获取不为nan的行
# Keep only the rows whose 'aaa' value is present. Series.notna() replaces the
# `np.isnan(...) == False` comparison: it avoids comparing against a boolean
# literal and also works on non-float columns, where np.isnan raises TypeError.
df = df[df['aaa'].notna()]

################################### axis ##########################################
axis解释：
# Worked example of the `axis` argument on a 2x2 array:
#     aaa = [[2, 1],
#            [3, 4]]
# The original call np.array([2,3],[3,4]) passed the second list as the dtype
# argument (a TypeError) and did not match the matrix shown here, so the
# literal is now a single nested list.
aaa = np.array([[2, 1], [3, 4]])

# axis = 0 walks down the rows, producing one value per column.
np.mean(aaa, axis = 0)
# out: array([2.5, 2.5])   because (2+3)/2 = 2.5 and (1+4)/2 = 2.5

# axis = 1 walks across the columns, producing one value per row.
np.mean(aaa, axis = 1)
# out: array([1.5, 3.5])   because (2+1)/2 = 1.5 and (3+4)/2 = 3.5

################################# 去重 ###################################

数据去重：
# Order-preserving de-duplication: an item is appended only the first time it is seen.
list_gzd_new = []
for one in list_gzd:
    if one in list_gzd_new:
        continue
    list_gzd_new.append(one)

列表去重：
# De-duplicate the recipients while keeping their first-seen order.
# dict.fromkeys preserves insertion order, so this is a single O(n) pass instead
# of set() followed by a sort keyed on list.index (an O(n) lookup per element).
mailto = ['cc', 'bbbb', 'afa', 'sss', 'bbbb', 'cc', 'shafa']
addr_to = list(dict.fromkeys(mailto))

dataframe转为列表：先取出该列(series)，再tolist
# Copy the 'destination' column into a NumPy array, then convert that array to a plain list.
destination_array = np.array(from_jzd['destination'])
destination = destination_array.tolist()

dict:
# Build the JSON payload that is fed to an existing multi-route script (found
# online) for the Amap (Gaode) API.

all_route = []
# A fresh dict is created on every iteration; creating it once outside the loop
# would make every list entry refer to the same dict object.
# NOTE(review): each *lng key is filled from a *lat sequence and vice versa —
# confirm whether this crossing is intentional.
for i in range(len(flat)):
    route = {
        'flng': flat[i],
        'flat': flng[i],
        'tlng': glat[i],
        'tlat': glng[i],
    }
    all_route.append(route)
    print(route)

route_fs = json.dumps(all_route)
################################################# 字符串  ##########################################   

# Strings: a slice with step -1 yields the reversed string.
# The original third line reassigned b to the literal 'cba', which only
# illustrated the result and overwrote the computed value; it is now a comment.
a = 'abc'
b = a[::-1]  # b == 'cba'

###################################################  json  ###############################################
 # JSON reading and writing.
#
# json.dumps serialises an object to a string.
#   sort_keys:    order the output by key
#   indent:       pretty-print with 4 spaces per nesting level
#   ensure_ascii: set to False so non-ASCII text (e.g. Chinese) is emitted as-is
dumps_options = {'sort_keys': True, 'indent': 4, 'ensure_ascii': False}
s_dumps = json.dumps(data_obj, **dumps_options)
print(s_dumps)

# ---------------------------------------------------divider------------------------------------------------------------


# json.dump serialises an object straight into an open file.
# Its first argument is the object to serialise and its second is the open file
# handle; the file is opened as UTF-8.
# After this runs, data.json exists (relative to the working directory) and
# shows what a JSON document looks like.
#
# json.dump() returns None, so the original `s_dump = json.dump(...)` followed by
# `print(s_dump)` only ever printed "None"; the assignment and print are dropped.
with open("data.json", "w", encoding="UTF-8") as f_dump:
    json.dump(data_obj, f_dump, ensure_ascii=False, indent=4)

# json.load reads JSON back from an open file handle; mind the file's encoding
# when opening it.
with open("data.json", "r", encoding="UTF-8") as f_load:
    r_load = json.load(f_load)
print(r_load)

# ---------------------------------------------------json------------------------------------------------------------


# json.loads parses JSON text held in a string (here, the output of json.dumps).
r_loads = json.loads(s_dumps)
print(r_loads)

# Sample of the JSON text that the prompt below expects.
arg = '{"bakend": "www.oldboy.org", "record": {"server": "100.1.7.9", "weight": 20, "maxconn": 30}}'

# json.loads no longer accepts an `encoding` keyword (it was removed in Python
# 3.9, where passing it raises TypeError). input() already returns str, so no
# encoding argument is needed.
a = json.loads(input('请输入添加的数据：'))
print(a)

###############################################  数据库mysql  #################################################
# Connect to the database and load the query result.
import pandas as pd
import pymysql
from sqlalchemy import create_engine

# NOTE(review): the credentials are embedded in the URL — consider loading them from configuration.
db = create_engine('mysql+pymysql://root:hantele@192.168.6.10:5029/Data_0104?charset=utf8')
sql = 'select * from  gf_subway_5mm_1216 where station LIKE"珠江新城" and start_date_time BETWEEN "2017-12-16 08:00:00" AND "2017-12-16 08:30:00"'
# Engine.execute() was removed in SQLAlchemy 2.0 and returned a result cursor,
# not a DataFrame. pandas.read_sql runs the query through the engine and returns
# the rows as a DataFrame, which is what the name `df` indicates was wanted.
df = pd.read_sql(sql, db)

sql:
union去重，
union all 不去重


########################################### 正则表达式 #####################################
import re

a = 'ItemScore(1805100000001637,7.688509220116103), ItemScore(1805080000001600,7.684840663118415), ItemScore(1805110000001662,7.683625207896754)'
# Pull each "<integer>,<decimal>" pair out of the ItemScore(...) entries.
# The dot is escaped so it only matches a literal decimal point, and the integer
# part of the score may have more than one digit.
pa = r'\d+,\d+\.\d+'
# re.findall takes the pattern first and the text second. The original call had
# them swapped, so it compiled the text as the pattern and found nothing; the
# result is also kept in a name now instead of being discarded.
matches = re.findall(pa, a)


###########################################    编码内容   ###############################################
  #unicode码
# Encode the text to UTF-8 bytes, then decode those bytes with the
# 'unicode_escape' codec.
utf8_bytes = one[1].encode('utf-8')
utf8_bytes.decode('unicode_escape')

# chardet.detect() guesses a file's text encoding so the file can be read
# correctly; the guess is not always right. Only the first 10000 bytes are sampled.
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    sample = rawdata.read(10000)
result = chardet.detect(sample)
# Show what the character encoding might be.
print(result)
# Sample of the printed result:
{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

kicksta_2016 = pd.read_csv("../input/ks-612.csv", encoding = 'Windows-1252')