学习了numpy之后,继续学习pandas。以下是pandas的一些基本语法及其使用方法,这一节主要介绍pandas中的Series结构。运行环境:Python 2.7。
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
# pandas has two main data structures: 1. Series; 2. DataFrame.
# A Series is built on NumPy's ndarray: a one-dimensional labelled array.
#
# Every print(...) below takes exactly one argument, so the script emits the
# same text whether it is parsed as a Python 2 print statement or called as
# the Python 3 print function.
#
# pd.Series(list, index=list)
# The data is given as a list. index is optional: when omitted the labels
# default to 0, 1, 2, ...; when supplied it must be as long as the values.
# Example 1: default integer labels (the NaN makes the dtype float64).
s = pd.Series([1, 2, 6, np.nan, 44, 1])
print("s:")
print(s)
# Example 2: explicit string labels.
s1 = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
print("s1:")
print(s1)
# pd.Series(dict): the dict keys become the labels.
s2 = pd.Series({'a': 3, 'b': 4, 'c': 5, 'd': 2, 'e': 6})
print("s2:")
print(s2)
# Selection: s[label] or s[[list of labels]]. It reads like array indexing;
# pass a list to pick several non-contiguous entries at once.
v = np.random.random_sample(20)
print("v: %s" % (v,))
l = pd.Series(v)
print("l:")
print(l)
l1 = l[[1, 5, 7, 9]]
print("l1:")
print(l1)
l2 = l[2:10]
print("l2:")
print(l2)
l3 = l[17]
print("l3:")
print(l3)
# head(n) / tail(n): the first / last n rows; n is optional and defaults to 5.
print("l的前五个值:")
print(l.head())
print("l的后三个值:")
print(l.tail(3))
# index and values are attributes (not methods): index is an Index object
# (a RangeIndex here) and values is a NumPy array. Wrap them in list() when
# a plain list is needed.
print(type(l.index))  # <class 'pandas.core.indexes.range.RangeIndex'>
print("l的标签:")
print(list(l.index))
print("l的值:")
print(l.values)
# Size, shape, uniqueness, counts of values.
print("len(): %s" % (len(l),))  # length including NaN -> 20
print("shape(): %s" % (np.shape(l),))  # (20,); Python 2 may show (20L,)
print("count(): %s" % (l.count(),))  # number of non-NaN values -> 20
print("unique(): %s" % (l.unique(),))  # the distinct values
print("value_counts():")
print(l.value_counts())  # how many times each value occurs
# Addition aligns on the index: values with the same label are added, and a
# label present in only one operand yields NaN.
# index:  0    1    2    3    4
# s4:     NaN  1    2    3    4
# s5:     1    1    1    1    NaN
# s6:     NaN  2    3    4    NaN
s4 = pd.Series([1, 2, 3, 4], index=[1, 2, 3, 4])
s5 = pd.Series([1, 1, 1, 1])
s6 = s4 + s5
print("s4 + s5:")
print(s6)
以下是上述代码的运行结果(v 由随机数生成,每次运行得到的具体数值会不同):
D:\software\Anaconda2\python.exe D:/PycharmProjects/Learn/learn_panda/learn_pd1.py
s:
0 1.0
1 2.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
s1:
a 1
b 2
c 3
d 4
e 5
dtype: int64
s2:
a 3
b 4
c 5
d 2
e 6
dtype: int64
v: [0.37635755 0.4886929 0.07210501 0.79949712 0.45134537 0.87680035
0.30748949 0.99859402 0.61613633 0.83520501 0.07343604 0.81827534
0.25139748 0.65326514 0.45755433 0.31827714 0.3407211 0.07814553
0.64210118 0.69003531]
l:
0 0.376358
1 0.488693
2 0.072105
3 0.799497
4 0.451345
5 0.876800
6 0.307489
7 0.998594
8 0.616136
9 0.835205
10 0.073436
11 0.818275
12 0.251397
13 0.653265
14 0.457554
15 0.318277
16 0.340721
17 0.078146
18 0.642101
19 0.690035
dtype: float64
l1:
1 0.488693
5 0.876800
7 0.998594
9 0.835205
dtype: float64
l2:
2 0.072105
3 0.799497
4 0.451345
5 0.876800
6 0.307489
7 0.998594
8 0.616136
9 0.835205
dtype: float64
l3:
0.0781455282331196
l的前五个值:
0 0.376358
1 0.488693
2 0.072105
3 0.799497
4 0.451345
dtype: float64
l的后三个值:
17 0.078146
18 0.642101
19 0.690035
dtype: float64
<class 'pandas.core.indexes.range.RangeIndex'>
l的标签:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
l的值:
[0.37635755 0.4886929 0.07210501 0.79949712 0.45134537 0.87680035
0.30748949 0.99859402 0.61613633 0.83520501 0.07343604 0.81827534
0.25139748 0.65326514 0.45755433 0.31827714 0.3407211 0.07814553
0.64210118 0.69003531]
len(): 20
shape(): (20L,)
count(): 20
unique(): [0.37635755 0.4886929 0.07210501 0.79949712 0.45134537 0.87680035
0.30748949 0.99859402 0.61613633 0.83520501 0.07343604 0.81827534
0.25139748 0.65326514 0.45755433 0.31827714 0.3407211 0.07814553
0.64210118 0.69003531]
value_counts():
0.451345 1
0.835205 1
0.307489 1
0.799497 1
0.818275 1
0.457554 1
0.616136 1
0.998594 1
0.078146 1
0.376358 1
0.318277 1
0.251397 1
0.073436 1
0.072105 1
0.488693 1
0.642101 1
0.690035 1
0.876800 1
0.340721 1
0.653265 1
dtype: int64
s4 + s5:
0 NaN
1 2.0
2 3.0
3 4.0
4 NaN
dtype: float64
Process finished with exit code 0