pandas网站链接
Object Creation
import pandas as pd
import numpy as np
s = pd.Series([1,3,5, np.NAN,6,8 ])
Series is just like the list in python
s
0 1.0
1 3.0
2 5.0
3 NaN
4 6.0
5 8.0
dtype: float64
s[1]
3.0
Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:
dates = pd.date_range('20130101', periods=6)
dates
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df
| A | B | C | D |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 |
---|
2013-01-05 | 0.159338 | 1.616634 | -0.567605 | -0.221172 |
---|
2013-01-06 | -0.758477 | -1.364958 | -0.398516 | -0.241077 |
---|
df2 = pd.DataFrame({'A': 1.,
'B': pd.Timestamp('20130102'),
'C': pd.Series(1, index=list(range(4)), dtype='float32'),
'D': np.array([3] * 4, dtype='int32'),
'E': pd.Categorical(["test", "train", "test", "train"]),
'F': 'foo'})
df2
| A | B | C | D | E | F |
---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
---|
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
---|
df2.dtypes
A float64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
View Data
df.head()
| A | B | C | D |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 |
---|
2013-01-05 | 0.159338 | 1.616634 | -0.567605 | -0.221172 |
---|
df.tail(2)
| A | B | C | D |
---|
2013-01-05 | 0.159338 | 1.616634 | -0.567605 | -0.221172 |
---|
2013-01-06 | -0.758477 | -1.364958 | -0.398516 | -0.241077 |
---|
df.index
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
'2013-01-05', '2013-01-06'],
dtype='datetime64[ns]', freq='D')
df.columns
Index(['A', 'B', 'C', 'D'], dtype='object')
df.to_xarray
<bound method NDFrame.to_xarray of A B C D
2013-01-01 -1.815560 -2.066970 0.083446 0.457541
2013-01-02 0.878679 -1.279713 -2.170213 -0.317302
2013-01-03 -2.504659 0.727344 -0.062560 1.473940
2013-01-04 0.245661 1.158579 0.431938 -0.511382
2013-01-05 0.159338 1.616634 -0.567605 -0.221172
2013-01-06 -0.758477 -1.364958 -0.398516 -0.241077>
df.describe()
| A | B | C | D |
---|
count | 6.000000 | 6.000000 | 6.000000 | 6.000000 |
---|
mean | -0.632503 | -0.201514 | -0.447252 | 0.106758 |
---|
std | 1.311665 | 1.550118 | 0.915211 | 0.746014 |
---|
min | -2.504659 | -2.066970 | -2.170213 | -0.511382 |
---|
25% | -1.551289 | -1.343647 | -0.525333 | -0.298246 |
---|
50% | -0.299569 | -0.276185 | -0.230538 | -0.231124 |
---|
75% | 0.224080 | 1.050770 | 0.046944 | 0.287863 |
---|
max | 0.878679 | 1.616634 | 0.431938 | 1.473940 |
---|
df.T
| 2013-01-01 00:00:00 | 2013-01-02 00:00:00 | 2013-01-03 00:00:00 | 2013-01-04 00:00:00 | 2013-01-05 00:00:00 | 2013-01-06 00:00:00 |
---|
A | -1.815560 | 0.878679 | -2.504659 | 0.245661 | 0.159338 | -0.758477 |
---|
B | -2.066970 | -1.279713 | 0.727344 | 1.158579 | 1.616634 | -1.364958 |
---|
C | 0.083446 | -2.170213 | -0.062560 | 0.431938 | -0.567605 | -0.398516 |
---|
D | 0.457541 | -0.317302 | 1.473940 | -0.511382 | -0.221172 | -0.241077 |
---|
df.sort_values(by = 'B')
| A | B | C | D |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 |
---|
2013-01-06 | -0.758477 | -1.364958 | -0.398516 | -0.241077 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 |
---|
2013-01-05 | 0.159338 | 1.616634 | -0.567605 | -0.221172 |
---|
df.sort_index(axis=1, ascending=False)
| D | C | B | A |
---|
2013-01-01 | 0.457541 | 0.083446 | -2.066970 | -1.815560 |
---|
2013-01-02 | -0.317302 | -2.170213 | -1.279713 | 0.878679 |
---|
2013-01-03 | 1.473940 | -0.062560 | 0.727344 | -2.504659 |
---|
2013-01-04 | -0.511382 | 0.431938 | 1.158579 | 0.245661 |
---|
2013-01-05 | -0.221172 | -0.567605 | 1.616634 | 0.159338 |
---|
2013-01-06 | -0.241077 | -0.398516 | -1.364958 | -0.758477 |
---|
Selection
df['A']
2013-01-01 -1.815560
2013-01-02 0.878679
2013-01-03 -2.504659
2013-01-04 0.245661
2013-01-05 0.159338
2013-01-06 -0.758477
Freq: D, Name: A, dtype: float64
df[0:3]
| A | B | C | D |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 |
---|
df['20130102':'20130104']
| A | B | C | D |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 |
---|
df.loc[dates[0]]
A -1.815560
B -2.066970
C 0.083446
D 0.457541
Name: 2013-01-01 00:00:00, dtype: float64
df.loc['20130102':'20130104', ['A', 'B']]
| A | B |
---|
2013-01-02 | 0.878679 | -1.279713 |
---|
2013-01-03 | -2.504659 | 0.727344 |
---|
2013-01-04 | 0.245661 | 1.158579 |
---|
df.loc[dates[0], 'A']
-1.8155597588741252
df.iloc[3]
A 0.245661
B 1.158579
C 0.431938
D -0.511382
Name: 2013-01-04 00:00:00, dtype: float64
df[df.A > 0]
| A | B | C | D |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 |
---|
2013-01-05 | 0.159338 | 1.616634 | -0.567605 | -0.221172 |
---|
Missing data
df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])
df1
| A | B | C | D | E |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 | NaN |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 | NaN |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 | NaN |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 | NaN |
---|
df1.loc[dates[0]:dates[1], 'E'] = 1
df1
| A | B | C | D | E |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 | 1.0 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 | 1.0 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 | NaN |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 | NaN |
---|
df1.fillna(value=5)
| A | B | C | D | E |
---|
2013-01-01 | -1.815560 | -2.066970 | 0.083446 | 0.457541 | 1.0 |
---|
2013-01-02 | 0.878679 | -1.279713 | -2.170213 | -0.317302 | 1.0 |
---|
2013-01-03 | -2.504659 | 0.727344 | -0.062560 | 1.473940 | 5.0 |
---|
2013-01-04 | 0.245661 | 1.158579 | 0.431938 | -0.511382 | 5.0 |
---|
Operation
df.mean()
A -0.632503
B -0.201514
C -0.447252
D 0.106758
dtype: float64
df.mean(1)
2013-01-01 -0.835386
2013-01-02 -0.722137
2013-01-03 -0.091484
2013-01-04 0.331199
2013-01-05 0.246799
2013-01-06 -0.690757
Freq: D, dtype: float64
df.apply(lambda x: x.max() - x.min())
A 3.383339
B 3.683604
C 2.602150
D 1.985322
dtype: float64
Grouping
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
df
| A | B | C | D |
---|
0 | foo | one | 0.445369 | -0.374983 |
---|
1 | bar | one | 1.147522 | 0.078017 |
---|
2 | foo | two | 0.411960 | -2.394559 |
---|
3 | bar | three | -1.254990 | -0.817442 |
---|
4 | foo | two | 0.421721 | -0.667637 |
---|
5 | bar | two | -1.153841 | 0.159370 |
---|
6 | foo | one | 0.170444 | -0.078069 |
---|
7 | foo | three | -0.302964 | 0.584654 |
---|
df.groupby('A').sum()
| C | D |
---|
A | | |
---|
bar | -1.261309 | -0.580056 |
---|
foo | 1.146529 | -2.930594 |
---|
df.groupby(['A', 'B']).sum()
| | C | D |
---|
A | B | | |
---|
bar | one | 1.147522 | 0.078017 |
---|
three | -1.254990 | -0.817442 |
---|
two | -1.153841 | 0.159370 |
---|
foo | one | 0.615813 | -0.453052 |
---|
three | -0.302964 | 0.584654 |
---|
two | 0.833681 | -3.062196 |
---|
另外,pandas还可以读入csv和excel文件,感觉非常方便,就是python版本的Excel