04_强大的数据结构和python扩展库
01_3-1强大的数据结构
01_1字典
# Why use a dictionary? With two parallel lists, finding a salary means
# locating the name's position first and then indexing the other list.
names = ['a', 'b', 'c', 'd']
salaries = ['100', '200', '300', '400']
print('a salary = ', salaries[names.index('a')])
# Keys must be immutable types such as numbers, strings or tuples.
# Dictionary values are not addressed by position; they are tied only to their key.
a salary = 100
# Creating dictionaries: literal syntax versus the dict() constructor.
ainfo = {'a': 100, 'b': 200, 'c': 300, 'd': 400}
print(ainfo)

# dict() accepts a list of (key, value) tuples ...
info = [('a', 100), ('b', 200), ('c', 300), ('d', 400)]
binfo = dict(info)
print(binfo)

# ... a list of two-item lists ...
cinfo = dict([['a', 100], ['b', 200], ['c', 300], ['d', 400]])
print(cinfo)

# ... keyword arguments ...
dinfo = dict(a=100, b=200, c=300, d=400)
print(dinfo)

# ... or a tuple of (key, value) tuples.
einfo = dict((('a', 100), ('b', 200), ('c', 300), ('d', 400)))
print(einfo)

# Create a dictionary in which every employee starts with the default value 3000.
a = dict.fromkeys(('a', 'b', 'c', 'd'), 3000)
print(a)
# Iterating a dict yields its keys, so this evaluates to the sorted key list.
sorted(a)
{'a': 100, 'b': 200, 'c': 300, 'd': 400}
{'a': 100, 'b': 200, 'c': 300, 'd': 400}
{'a': 100, 'b': 200, 'c': 300, 'd': 400}
{'a': 100, 'b': 200, 'c': 300, 'd': 400}
{'a': 100, 'b': 200, 'c': 300, 'd': 400}
{'a': 3000, 'b': 3000, 'c': 3000, 'd': 3000}
['a', 'b', 'c', 'd']
# Build a dictionary from a list of records (explicit-loop variant).
# Each record is (key, description, value); only the key and the value are used.
plist = [
    ('a', 'apple', '100'),
    ('b', 'banana', '200'),
    ('o', 'orange', '300'),
]
# Iterate over the records themselves so the code works for any number of rows.
alist = [record[0] for record in plist]
blist = [record[2] for record in plist]
# Start with every key mapped to a placeholder 0, then fill in the real values.
aDict = dict.fromkeys(alist, 0)
# Pair each key with its value positionally; assigning to an existing key
# while iterating is safe because the dictionary's size does not change.
for key, value in zip(aDict, blist):
    print(key)
    aDict[key] = value
print('aDict = ', aDict)
a
b
o
aDict = {'a': '100', 'b': '200', 'o': '300'}
# Build a dictionary from a list of records (zip-based variant).
# Each record is (key, description, value); only the key and the value are used.
plist = [
    ('a', 'apple', '100'),
    ('b', 'banana', '200'),
    ('o', 'orange', '300'),
]
alist = [record[0] for record in plist]
blist = [record[2] for record in plist]
# zip pairs keys with values positionally; dict() consumes the pairs directly.
aDict = dict(zip(alist, blist))
print('aDict = ', aDict)
aDict = {'a': '100', 'b': '200', 'o': '300'}
02_2字典的使用
# Basic dictionary operations
ainfo = {'a':100,'b':200,'c':300}
print (ainfo['a'])  # look up a value by key
ainfo['a']=888  # update the value of an existing key
print (ainfo)
ainfo ['d']=400  # add a new key/value pair
print (ainfo )
print ('a' in ainfo)  # membership test (checks keys)
del ainfo  # delete the name binding for the dictionary
print (ainfo)  # raises NameError: the name no longer exists after del
100
{'a': 888, 'b': 200, 'c': 300}
{'a': 888, 'b': 200, 'c': 300, 'd': 400}
True
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-44-61aaa9102053> in <module>()
8 print ('a' in ainfo)#成员判断
9 del ainfo#删除字典
---> 10 print (ainfo)
NameError: name 'ainfo' is not defined
# String formatting with dictionaries
ainfo = {'a': 100, 'b': 200, 'c': 300}
# items() yields (key, value) pairs, avoiding a second lookup per key.
for key, salary in ainfo.items():
    print('name = %s,salary = %s' % (key, salary))
# "%(key)s" is a mapping format specifier: the value is taken from the
# dictionary given on the right-hand side of the % operator.
print('a salary is %(a)s' % ainfo)
binfo = {'a': 100, 'b': 200, 'c': 300}
template = '''
welcome to the pay wall
a's salary is %(a)s
b's salary is %(b)s
'''
print(template % binfo)
name = a,salary = 100
name = b,salary = 200
name = c,salary = 300
a salary is 100
welcome to the pay wall
a's salary is 100
b's salary is 200
# Built-in dictionary methods
# Print the employees and their salaries separately.
ainfo = {'a': 100, 'b': 200, 'c': 300}
print(ainfo.keys())
print(ainfo.values())
# Then print each employee followed by that employee's salary.
for name, salary in ainfo.items():
    print(name)
    print(salary)
dict_keys(['a', 'b', 'c'])
dict_values([100, 200, 300])
a
100
b
200
c
300
# Looking up a value by key when the key may be missing
ainfo = {'a': 100, 'b': 200, 'c': 300}
# Method 1: get() returns None (or a supplied default) for a missing key.
print(ainfo.get('d'))
# Method 2: subscripting raises KeyError for a missing key. The key must be
# the string 'd'; an unquoted d would be an undefined variable name.
try:
    print(ainfo['d'])
except KeyError as err:
    print('KeyError:', err)
None
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-20-60c1016b3b86> in <module>()
2 ainfo = {'a':100,'b':200,'c':300}
3 print (ainfo.get('d'))#法一
----> 4 print(ainfo[d])#法二
NameError: name 'd' is not defined
# Updating a dictionary: update() overwrites the values of keys that already
# exist and adds the keys that do not.
ainfo = {'a': 100, 'b': 200, 'c': 300}
binfo = {'a': 888, 'd': 555, 'e': 222}
ainfo.update(binfo)
print(ainfo)
{'a': 888, 'b': 200, 'c': 300, 'd': 555, 'e': 222}
# "Deleting" a dictionary by rebinding the name
ainfo = {'a': 100, 'b': 200, 'c': 300}
binfo = ainfo  # second name for the same dictionary object
# Rebinding ainfo to a new empty dict leaves the original object, still
# reachable through binfo, untouched.
ainfo = {}
print(ainfo)
print(binfo)
{}
{'a': 100, 'b': 200, 'c': 300}
# Emptying a dictionary in place
ainfo = {'a': 100, 'b': 200, 'c': 300}
binfo = ainfo  # second name for the same dictionary object
# clear() empties the shared object itself, so both names now see an empty dict.
ainfo.clear()
print(ainfo)
print(binfo)
# Built-in dictionary methods listed in the course notes: clear(), get(), items(),
# iter(), setdefault(), values(), fromkeys(), has_key(), keys(), pop(), update()
# (has_key() exists only in Python 2; Python 3 uses the "in" operator instead).
{}
{}
# Dictionaries as formal parameters of a function
# Kinds of parameters in Python: positional-or-keyword, positional-only,
# variable-length positional and variable-length keyword (parameters may be
# given default values).
def func(args1, *argst, **argsd):
    """Print the three groups of arguments this function received.

    Args:
        args1: The first positional argument.
        *argst: Any further positional arguments, collected into a tuple.
        **argsd: Any keyword arguments, collected into a dictionary.

    Returns:
        None; the function only prints.
    """
    print(args1)
    print(argst)
    print(argsd)


# func has no return statement, so the outer print shows "None" after the
# three lines printed inside the function.
print(func('hello', 'a', 'b', 'c', d1=1, e=2, f=3))
# *argst: variable-length positional parameter, received as a tuple.
# **argsd: variable-length keyword parameter, received as a dictionary.
hello
('a', 'b', 'c')
{'d1': 1, 'e': 2, 'f': 3}
None
03_3 集合
# Duplicate registrations: converting the list to a set keeps one copy of each name.
name = ['a', 'b', 'c', 'a', 'b']
nameset = set(name)
print(nameset)

# A set is an unordered collection of unique elements. There are two kinds:
# the mutable set and the immutable frozenset.
a = set('hello')
print(a)
b = frozenset('hello')
print(b)
print(type(a))
print(type(b))
{'c', 'b', 'a'}
{'e', 'h', 'o', 'l'}
frozenset({'e', 'h', 'o', 'l'})
<class 'set'>
<class 'frozenset'>
# Set comparison
aset = set('sunrise')
bset = set('sunset')
print('u' in aset)        # membership
print(aset == bset)       # equality
print(aset < bset)        # proper-subset test
print(set('sun') < aset)

# Set operators: & (intersection), | (union), - (difference), ^ (symmetric difference)
print(aset & bset)
print(aset | bset)
print(aset - bset)
print(aset ^ bset)

# Methods available on every set, mutable or not: s.issubset(t), issuperset(t),
# union(t), intersection(t), difference(t), symmetric_difference(t), copy()
print('所有集合')
print(aset.issubset(bset))
print(aset.intersection(bset))
print(aset.difference(bset))
cset = aset.copy()
print(cset)

# Methods available only on mutable sets: update(t), intersection_update(t),
# difference_update(t), symmetric_difference_update(t), add(obj), remove(obj),
# discard(obj), pop(), clear()
print('可变集合')
aset = set('sunrise')
aset.add('!')
print(aset)
aset.remove('!')
print(aset)
aset.update('yeah')  # adds each character of the string as an element
print(aset)
aset.clear()
print(aset)
True
False
False
True
{'e', 'u', 's', 'n'}
{'e', 'r', 't', 'n', 'i', 'u', 's'}
{'i', 'r'}
{'r', 't', 'i'}
所有集合
False
{'e', 'u', 's', 'n'}
{'i', 'r'}
{'e', 'r', 'n', 'i', 'u', 's'}
可变集合
{'e', 'r', 'n', '!', 'i', 'u', 's'}
{'e', 'r', 'n', 'i', 'u', 's'}
{'e', 'r', 'n', 'h', 'i', 'a', 'u', 's', 'y'}
set()
02_3-2 python 扩展库
01_1扩展库 scipy
# Data structures in the SciPy ecosystem (per the course notes): ndarray
# (N-dimensional array), Series (variable-length dictionary) and DataFrame.
# numpy: powerful ndarray object and ufunc functions; well-crafted functions;
#   well suited to scientific computing such as linear algebra and random numbers;
#   an efficient general-purpose multi-dimensional container that can hold
#   arbitrary data types; integrates seamlessly with databases.
# scipy: the core package for scientific computing in Python; computes
#   efficiently on numpy arrays so that numpy and scipy work together; provides
#   toolboxes for common scientific problems, with sub-modules for different
#   applications such as interpolation, integration, optimization and image
#   processing.
# linalg.det(arr) returns the determinant of arr.
import numpy as np
from scipy import linalg

arr = np.array([[1, 2], [3, 4]])
print(arr)
print(linalg.det(arr))
# matplotlib: a 2-D plotting library built on numpy that quickly produces line
#   charts, histograms, scatter plots and similar figures; its commonly used
#   pyplot module offers a simple MATLAB-like interface.
# pandas: built on scipy and numpy; efficient Series and DataFrame data
#   structures; a powerful, extensible Python library for data manipulation and
#   analysis; efficient slicing and similar operations on large data sets;
#   optimized readers/writers for many file formats such as CSV and HDF5.
[[1 2]
[3 4]]
-2.0
02_2-ndarray
# Arrays represent ordered collections of data.
# A list can represent an array, but a list stores pointers to objects: for
# [1, 2, 3, 4] it keeps four integer objects plus four pointers, which wastes
# memory and computing time.
# Lists and tuples can be used as arrays, but they differ from real arrays.
# The array module creates arrays with the array() function; it does not
# support multi-dimensional arrays and offers a limited set of functions.
# Arrays in Python
# Forms
# Using data structures such as list and tuple to represent arrays
# One-dimensional array
list1 = [1, 2, 3, 4]
print(list1)
# Two-dimensional array
list2 = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
print(list2)
# array module: creates arrays through the array() function and provides
# methods such as append(), tolist() and fromfile().
import array

a = array.array("b", range(5))  # typecode "b": signed char
b = np.array(range(5))
c = range(5)
print(a)
print(b)
print(c)
[1, 2, 3, 4]
[[1, 2, 3], [4, 5, 6], [7, 8, 9]]
array('b', [0, 1, 2, 3, 4])
[0 1 2 3 4]
range(0, 5)
None
# ndarray is the N-dimensional array, the basic data structure of numpy (also
# known as array). It saves memory and CPU time and comes with a rich set of
# functions.
# Creating and printing ndarrays: arange, copy, empty_like, fromfile, identity,
# logspace, ogrid, ones_like, zeros, array, empty, eye, fromfunction, linspace,
# mgrid, ones, r_, zeros_like
from numpy import *  # star import: array, zeros and arange are called unqualified below

a = array([1, 2, 3])
print(a)

b = array([[1, 2], [3, 4]])
print(b)

print(zeros((2, 2)))        # 2x2 array of zeros
print(arange(1, 5, 0.5))    # start 1, stop 5 (exclusive), step 0.5
[1 2 3]
[[1 2]
[3 4]]
[[ 0. 0.]
[ 0. 0.]]
[ 1. 1.5 2. 2.5 3. 3.5 4. 4.5]
# Basic ndarray operators
import numpy as np

# Every numpy name is qualified with np so the cell depends only on its own import.
a = np.array([[1, 2, 6], [3, 4, 8]])
b = np.array([[5, 6, 3], [7, 8, 5]])
c = a * b  # element-wise product
d = a + b  # element-wise sum
print(c)
print(d)
print(a > 5)           # element-wise comparison gives a boolean array
print(np.sin(a))
print(a.shape)
print(a.reshape(3, 2))
print(a.sum(axis=0))   # column sums
print(a.sum(axis=1))   # row sums
c = np.array([1, 3, 5])
print(c[:1])
d = np.array([1, 2, 3])
e = np.array([4, 5, 6])
# where(cond, x, y): take the element from d where c > 2, otherwise from e.
np.where(c > 2, d, e)
[[ 5 12 18]
[21 32 40]]
[[ 6 8 9]
[10 12 13]]
[[False False True]
[False False True]]
[[ 0.84147098 0.90929743 -0.2794155 ]
[ 0.14112001 -0.7568025 0.98935825]]
(2, 3)
[[1 2]
[6 3]
[4 8]]
[ 4 6 14]
[ 9 15]
[1]
array([4, 2, 3])
# Built-in ndarray functions
# Print the 9x9 multiplication table.
def fun(i, j):
    """Return the product of the 1-based row and column numbers.

    Args:
        i: Zero-based row index (a scalar or an array of indices).
        j: Zero-based column index (a scalar or an array of indices).

    Returns:
        (i + 1) * (j + 1).
    """
    return (i + 1) * (j + 1)


# fromfunction calls fun with the coordinate arrays of a 9x9 grid.
a = fromfunction(fun, (9, 9))
print(a)
[[ 1. 2. 3. 4. 5. 6. 7. 8. 9.]
[ 2. 4. 6. 8. 10. 12. 14. 16. 18.]
[ 3. 6. 9. 12. 15. 18. 21. 24. 27.]
[ 4. 8. 12. 16. 20. 24. 28. 32. 36.]
[ 5. 10. 15. 20. 25. 30. 35. 40. 45.]
[ 6. 12. 18. 24. 30. 36. 42. 48. 54.]
[ 7. 14. 21. 28. 35. 42. 49. 56. 63.]
[ 8. 16. 24. 32. 40. 48. 56. 64. 72.]
[ 9. 18. 27. 36. 45. 54. 63. 72. 81.]]
# A ufunc (universal function) is a function that operates on every element of
# an array. Many of numpy's built-in ufuncs are implemented at the C level, so
# they run very fast.
# help(ufunc)
import numpy as np

a = np.arange(1, 5)
b = np.arange(2, 6)
print(np.add(a, b))                       # element-wise sum
print(np.add.accumulate([2, 3, 5]))       # running sum
print(np.multiply.accumulate([2, 3, 5]))  # running product
[3 5 7 9]
[ 2 5 10]
[ 2 6 30]
03_3 变长字典 series
# A dictionary is an unordered data structure with a mapping between key and
# value, but its keys and values are not independent of each other. A pandas
# Series is a fixed-length, ordered dictionary whose index and values are
# independent, which makes it more powerful than a dict in some applications.
# Basic characteristics of a Series: a one-dimensional array-like object made
# up of data and an index.
import pandas as pd
# from pandas import Series

a = pd.Series([1, 2.0, 'a'])
print(a)

# Supplying a custom index for the Series
b = pd.Series(['apple', 'banana', 4], index=[1, 2, 3])
print(b)
print(b.index)
print(b.values)
0 1
1 2
2 a
dtype: object
1 apple
2 banana
3 4
dtype: object
Int64Index([1, 2, 3], dtype='int64')
['apple' 'banana' 4]
# Basic Series operations
import numpy as np

# Series is reached through the pd namespace; the bare name is never imported.
a = pd.Series([3, 5, 7], index=['a', 'b', 'c'])
print(a['b'])     # label-based lookup
print(a * 2)      # element-wise arithmetic
print(np.exp(a))  # numpy ufuncs apply element-wise and keep the index
5
a 6
b 10
c 14
dtype: int64
a 20.085537
b 148.413159
c 1096.633158
dtype: float64
# Data alignment in a Series
# NaN: "Not a Number", an undefined or unrepresentable value.
data = {'a': 1, 'b': 2, 'c': 3}
# Use a list for the index: a set has no defined order, so the rows would come
# out in an arbitrary order.
sindex = ['a', 'b', 'c', 'd']
# Labels in the index that are missing from data ('d') are filled with NaN.
b = pd.Series(data, index=sindex)
print(b)
print(pd.isnull(b))
c 3.0
b 2.0
a 1.0
d NaN
dtype: float64
c False
b False
a False
d True
dtype: bool
# Same alignment demo with string values: the label 'd' is absent from data1,
# so its entry is a missing value.
data1 = {'a': '1', 'b': '2', 'c': '3'}
print(data1)
# Use a list for the index: a set has no defined order, so the rows would come
# out in an arbitrary order.
sindex = ['a', 'b', 'c', 'd']
b = pd.Series(data1, index=sindex)
print(b)
{'a': '1', 'b': '2', 'c': '3'}
c 3
b 2
a 1
d NaN
dtype: object
# Data alignment in a Series
# Arithmetic automatically aligns data that carry different indexes.
data1 = {'a': '1', 'b': '2', 'c': '3'}
# Use a list for the index: a set has no defined order, so the rows would come
# out in an arbitrary order.
sindex = ['a', 'b', 'c', 'd']
b = pd.Series(data1, index=sindex)
print(b)
c = {'a': '1', 'b': '2', 'd': '4'}
d = pd.Series(c)
# Values are combined label by label (string concatenation here); a label
# that is missing on either side produces a missing value.
print(b + d)
c 3
b 2
a 1
d NaN
dtype: object
a 11
b 22
c NaN
d NaN
dtype: object
# The name attribute of a Series: both the Series object and its index have a
# name attribute, and the name is closely tied to other important features.
data1 = {'a': '1', 'b': '2', 'c': '3'}
# Use a list for the index: a set has no defined order, so the rows would come
# out in an arbitrary order.
sindex = ['a', 'b', 'c', 'd']
b = pd.Series(data1, index=sindex)
b.name = 'cnames'
b.index.name = 'volume'
print(b)
volume
c 3
b 2
a 1
d NaN
Name: cnames, dtype: object
04_4-dataframe
# A Series corresponds to a one-dimensional sequence; a DataFrame corresponds
# to a two-dimensional table. It is a tabular data structure that can be seen
# as a collection of Series sharing the same index.
# Basic characteristics of a DataFrame: a tabular data structure with an
# ordered set of columns (similar to an index), roughly a collection of Series
# sharing one index.
# As with a Series, an index can be specified and data are aligned.
# data = {'name':[a,b,c],'pay':[1,2,3]}  # does not work: a, b, c are unquoted names
data = {'name': ['a', 'b', 'c'], 'pay': ['1', '2', '3']}
frame = pd.DataFrame(data)
print(frame)
# Selecting a column or a row of a DataFrame yields a Series.
print(frame['name'])  # column by key
print(frame.pay)      # column by attribute access
# .ix was removed from pandas; .loc selects the row by its index label.
print(frame.loc[2])
name pay
0 a 1
1 b 2
2 c 3
0 a
1 b
2 c
Name: name, dtype: object
0 1
1 2
2 3
Name: pay, dtype: object
name c
pay 3
Name: 2, dtype: object
D:\software\Anaconda\anaconda\lib\site-packages\ipykernel_launcher.py:12: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
if sys.path[0] == '':
# The name attribute of a DataFrame's index
data = {'name': ['a', 'b', 'c'], 'pay': ['1', '2', '3']}
frame = pd.DataFrame(data)
print(frame)
frame.index.name = 'NO'
print(frame)

# Basic DataFrame operations
frame['name'] = 'admin'  # assigning a scalar fills the whole column
print(frame)
del frame['pay']         # remove a column
print(frame)
name pay
0 a 1
1 b 2
2 c 3
name pay
NO
0 a 1
1 b 2
2 c 3
name pay
NO
0 admin 1
1 admin 2
2 admin 3
name
NO
0 admin
1 admin
2 admin