Time differences:
from dateutil.parser import parse
a = parse('2017-10-01/12:12:12')
b = parse('2013-3-4/10:10:10')
(a-b).days            # whole days only
(a-b).seconds         # seconds component only (0-86399), NOT the full difference
(a-b).total_seconds() # the full difference expressed in seconds
tim = pd.to_datetime(b) - pd.to_datetime(a)
tim/np.timedelta64(1, 's')
out: start_date_time    2400.0
dtype: float64
Here a and b are single-row datetime64[ns] Series (start_date_time 2017-09-24 10:10:00,
Name: 6045571, and start_date_time 2017-09-24 10:50:00, Name: 0), so dividing their
difference by np.timedelta64(1, 's') converts it to seconds: 2400.0.
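A minimal self-contained sketch of the same conversion (the two-row Series is made up to mirror the timestamps above):
import pandas as pd
import numpy as np
s = pd.to_datetime(pd.Series(['2017-09-24 10:10:00', '2017-09-24 10:50:00']))
diff = s.iloc[1] - s.iloc[0]          # Timedelta('0 days 00:40:00')
print(diff / np.timedelta64(1, 's'))  # 2400.0 seconds
print(diff / np.timedelta64(1, 'm'))  # 40.0 minutes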
* 1/17/07 has the format "%m/%d/%y"    # %y = two-digit year (07)
* 17-1-2007 has the format "%d-%m-%Y"  # %Y = four-digit year (2007)
# Parse the 'date' column into datetime64 using an explicit format
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format = "%m/%d/%y")
# If the formats are mixed, let pandas guess with infer_datetime_format=True
landslides['date_parsed'] = pd.to_datetime(landslides['date'], infer_datetime_format=True)
day_of_month = landslides['date_parsed'].dt.day  # day of the month
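Once parsed, the .dt accessor exposes the other components as well; a small self-contained sketch (the two-row frame is a made-up stand-in for landslides):
import pandas as pd
landslides = pd.DataFrame({'date': ['3/2/07', '1/17/07']})
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format='%m/%d/%y')
print(landslides['date_parsed'].dt.day)        # day of month: 2, 17
print(landslides['date_parsed'].dt.month)      # month: 3, 1
print(landslides['date_parsed'].dt.dayofweek)  # weekday, Monday=0 ... Sunday=6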
Reading data:
info_keliu = pd.read_excel(r'D:\project资料\公交.xlsx', sheet_name = 'Sheet1')
Filtering data:
info_kl = info_keliu[info_keliu['居住地'] != '未知']  # drop rows whose residence is unknown ('未知')
gz_dt['cgi'] = gz_dt['ne_cgi'].str.extract(r'460-00-(.*)', expand=False)  # expand=False returns a Series, not a DataFrame
info = info_kl.astype(str)
info_cc = info[info['工作地'].str.contains('禅城')]  # keep rows whose workplace contains '禅城'
Dropping nulls:
df.dropna(axis=0, how='any')  # drop a row if ANY value in it is null (axis=1 for columns)
df.dropna(axis=0, how='all')  # drop a row only if the ENTIRE row is null
indexs = list(df[np.isnan(df['aaa'])].index)  # or: df[np.isnan(df['aaa'])].index.tolist()
df = df.drop(indexs)  # drop the NaN rows by index
Keeping only rows where 'aaa' is not NaN:
df = df[np.isnan(df['aaa']) == False]
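The same filters written with pandas' own null helpers (equivalent results; a sketch with a made-up frame):
import numpy as np
import pandas as pd
df = pd.DataFrame({'aaa': [1.0, np.nan, 3.0], 'bbb': ['x', 'y', 'z']})
nan_idx = df.index[df['aaa'].isna()].tolist()  # indices of the NaN rows: [1]
df_clean = df[df['aaa'].notna()]               # keep only the non-NaN rows
df_dropped = df.drop(nan_idx)                  # same result via drop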
################################### axis ##########################################
Explaining axis:
aaa = np.array([[2, 1], [3, 4]])
# aaa = 2, 1
#       3, 4
np.mean(aaa, axis = 0)
out: (2.5, 2.5)   # (2+3)/2 = 2.5, (1+4)/2 = 2.5
axis = 0 runs down the columns, collapsing the rows: one result per column
np.mean(aaa, axis = 1)
out: (1.5, 3.5)   # (2+1)/2 = 1.5, (3+4)/2 = 3.5
axis = 1 runs across the rows, collapsing the columns: one result per row
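The same rule carries over to pandas; a quick check with a made-up frame:
import pandas as pd
df = pd.DataFrame([[2, 1], [3, 4]], columns=['a', 'b'])
print(df.mean(axis=0))  # per-column means: a=2.5, b=2.5
print(df.mean(axis=1))  # per-row means: 1.5, 3.5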
################################# dedup ###################################
Deduplicating data (keeping first occurrences, order preserved):
list_gzd_new = []
for one in list_gzd:
    if one not in list_gzd_new:
        list_gzd_new.append(one)
Deduplicating a list while keeping the original order:
# dedupe the recipients but keep their original order
mailto = ['cc', 'bbbb', 'afa', 'sss', 'bbbb', 'cc', 'shafa']
addr_to = list(set(mailto))       # set() dedupes but scrambles the order
addr_to.sort(key = mailto.index)  # restore first-occurrence order
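The index-sort trick above is O(n^2); since Python 3.7, dict.fromkeys gives the same order-preserving dedup in one pass (an alternative sketch, not from the original notes):
addr_to = list(dict.fromkeys(mailto))
# ['cc', 'bbbb', 'afa', 'sss', 'shafa'], first-occurrence order kept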
DataFrame column to list: convert to an ndarray (or Series) first, then call tolist()
destination = np.array(from_jzd['destination']).tolist()
dict:
# Build the JSON input for the Amap (高德) API multi-route script (found online)
import json

all_route = []
# NOTE: route = {} must be created INSIDE the loop, not outside; see the sketch below
for i in range(len(flat)):
    route = {}
    route['flng'] = flng[i]  # origin lng/lat (the original note had flat/flng crossed; aligned here)
    route['flat'] = flat[i]
    route['tlng'] = glng[i]  # destination lng/lat
    route['tlat'] = glat[i]
    all_route.append(route)
    print(route)
#print(all_route)
route_fs = json.dumps(all_route)
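To see why route = {} must sit inside the loop: a dict created once outside is appended by reference, so every list entry ends up pointing at the same object. A minimal sketch with made-up coordinates:
import json
flng, flat = [113.1, 113.2], [23.1, 23.2]
# wrong: one shared dict, mutated in place
all_route, route = [], {}
for i in range(len(flat)):
    route['flng'], route['flat'] = flng[i], flat[i]
    all_route.append(route)   # appends a reference, not a copy
print(json.dumps(all_route))  # both entries show the LAST values
# right: a fresh dict per iteration
all_route = []
for i in range(len(flat)):
    all_route.append({'flng': flng[i], 'flat': flat[i]})
print(json.dumps(all_route))  # two distinct entries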
################################################# strings ##########################################
# Strings
a = 'abc'
b = a[::-1]  # slicing with step -1 reverses the string
b == 'cba'   # True
################################################### json ###############################################
Reading/writing JSON:
"""
dumps: serialize an object to a JSON string
sort_keys: sort the output by key
indent: indent 4 spaces for human-readable output
ensure_ascii=False: emit non-ASCII characters (e.g. Chinese) as-is rather than \uXXXX escapes
"""
s_dumps = json.dumps(data_obj, sort_keys=True, indent=4, ensure_ascii=False)
print(s_dumps)
# ---------------------------------------------------divider------------------------------------------------------------
"""
dump:将一个对象序列化存入文件
dump()的第一个参数是要序列化的对象,第二个参数是打开的文件句柄
注意打开文件时加上以UTF-8编码打开
* 运行此文件之后在统计目录下会有一个data.json文件,打开之后就可以看到json类型的文件应该是怎样定义的
"""
with open("data.json", "w", encoding="UTF-8") as f_dump:
s_dump = json.dump(data_obj, f_dump, ensure_ascii=False, indent = 4)
print(s_dump)
"""
load:从一个打开的文件句柄加载数据
注意打开文件的编码
"""
with open("data.json", "r", encoding="UTF-8") as f_load:
r_load = json.load(f_load)
print(r_load)
# ---------------------------------------------------json------------------------------------------------------------
"""
loads: 从一个对象加载数据
"""
r_loads = json.loads(s_dumps)
print(r_loads)
arg = '{"bakend": "www.oldboy.org", "record": {"server": "100.1.7.9", "weight": 20, "maxconn": 30}}'
a = json.loads(input('请输入添加的数据:'),encoding='utf-8')
print(a)
############################################### mysql database #################################################
# Connect to the database
import pymysql
from sqlalchemy import create_engine
db = create_engine('mysql+pymysql://root:hantele@192.168.6.10:5029/Data_0104?charset=utf8')
sql = 'select * from gf_subway_5mm_1216 where station LIKE "珠江新城" and start_date_time BETWEEN "2017-12-16 08:00:00" AND "2017-12-16 08:30:00"'
df = pd.read_sql(sql, db)  # read_sql returns a DataFrame; engine.execute would return a raw result set
SQL:
UNION removes duplicate rows;
UNION ALL keeps them.
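A hypothetical illustration (table names t1/t2 are made up; assumes the engine db from above): if both SELECTs return the same row, UNION keeps one copy, UNION ALL keeps both:
import pandas as pd
dedup = pd.read_sql('SELECT station FROM t1 UNION SELECT station FROM t2', db)
keep_all = pd.read_sql('SELECT station FROM t1 UNION ALL SELECT station FROM t2', db)
# len(keep_all) >= len(dedup), because UNION ALL keeps duplicate rows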
########################################### regular expressions #####################################
import re
a = 'ItemScore(1805100000001637,7.688509220116103), ItemScore(1805080000001600,7.684840663118415), ItemScore(1805110000001662,7.683625207896754)'
pa = r'\d+,\d+\.\d+'  # extract the number pairs inside the parentheses (note the escaped dot)
re.findall(pa, a)     # pattern first, then the string (the arguments were reversed)
########################################### encodings ###############################################
# Unicode escapes: turn literal \uXXXX sequences back into real characters
one[1].encode('utf-8').decode('unicode_escape')
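For example, a string holding literal backslash-u escapes decodes like this (a made-up value, since `one` is not shown here):
s = '\\u4e2d\\u6587'                               # the 12 characters \u4e2d\u6587
print(s.encode('utf-8').decode('unicode_escape'))  # 中文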
import chardet
with open("../input/kickstarter-projects/ks-projects-201801.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))
# chardet.detect() guesses the file's encoding so it can be read correctly
# (the guess is not always right; check the confidence value)
print(result)
# out: {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}
kicksta_2016 = pd.read_csv("../input/ks-612.csv", encoding = 'Windows-1252')