金融和经济数据应用
数据规整化方面的话题
时间序列以及截面对齐
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
stock_px = pd.read_csv("E:\python_study_files\python\pydata-book-2nd-edition\examples\stock_px.csv",parse_dates=True,index_col=0)
prices = stock_px[['AAPL','JNJ','SPX','XOM']]['2011-09-06':'2011-09-14']
prices
|
AAPL |
JNJ |
SPX |
XOM |
| 2011-09-06 |
379.74 |
64.64 |
1165.24 |
71.15 |
| 2011-09-07 |
383.93 |
65.43 |
1198.62 |
73.65 |
| 2011-09-08 |
384.14 |
64.95 |
1185.90 |
72.82 |
| 2011-09-09 |
377.48 |
63.64 |
1154.23 |
71.01 |
| 2011-09-12 |
379.94 |
63.59 |
1162.27 |
71.84 |
| 2011-09-13 |
384.62 |
63.61 |
1172.87 |
71.65 |
| 2011-09-14 |
389.30 |
63.73 |
1188.68 |
72.64 |
stock_pxx = pd.read_csv("E:\\python_study_files\\python\\pydata-book-2nd-edition\\examples\\volume.csv",parse_dates=True,index_col=0)
volume = stock_pxx[['AAPL','JNJ','XOM']]['2011-09-06':'2011-09-12']
volume
|
AAPL |
JNJ |
XOM |
| 2011-09-06 |
18173500.0 |
15848300.0 |
25416300.0 |
| 2011-09-07 |
12492000.0 |
10759700.0 |
23108400.0 |
| 2011-09-08 |
14839800.0 |
15551500.0 |
22434800.0 |
| 2011-09-09 |
20171900.0 |
17008200.0 |
27969100.0 |
| 2011-09-12 |
16697300.0 |
13448200.0 |
26205800.0 |
prices*volume
|
AAPL |
JNJ |
SPX |
XOM |
| 2011-09-06 |
6.901205e+09 |
1.024434e+09 |
NaN |
1.808370e+09 |
| 2011-09-07 |
4.796054e+09 |
7.040072e+08 |
NaN |
1.701934e+09 |
| 2011-09-08 |
5.700561e+09 |
1.010070e+09 |
NaN |
1.633702e+09 |
| 2011-09-09 |
7.614489e+09 |
1.082402e+09 |
NaN |
1.986086e+09 |
| 2011-09-12 |
6.343972e+09 |
8.551710e+08 |
NaN |
1.882625e+09 |
| 2011-09-13 |
NaN |
NaN |
NaN |
NaN |
| 2011-09-14 |
NaN |
NaN |
NaN |
NaN |
vwap = (prices*volume).sum()/volume.sum()
vwap
AAPL 380.655181
JNJ 64.394769
SPX NaN
XOM 72.024288
dtype: float64
vwap.dropna()
AAPL 380.655181
JNJ 64.394769
XOM 72.024288
dtype: float64
prices.align(volume,join='inner')
( AAPL JNJ XOM
2011-09-06 379.74 64.64 71.15
2011-09-07 383.93 65.43 73.65
2011-09-08 384.14 64.95 72.82
2011-09-09 377.48 63.64 71.01
2011-09-12 379.94 63.59 71.84,
AAPL JNJ XOM
2011-09-06 18173500.0 15848300.0 25416300.0
2011-09-07 12492000.0 10759700.0 23108400.0
2011-09-08 14839800.0 15551500.0 22434800.0
2011-09-09 20171900.0 17008200.0 27969100.0
2011-09-12 16697300.0 13448200.0 26205800.0)
s1 = Series(range(3),index=['a','b','c'])
s2 = Series(range(4),index=['d','b','c','e'])
s3 = Series(range(3),index=['f','a','c'])
DataFrame({
'one':s1,'two':s2,'three':s3})
|
one |
two |
three |
| a |
0.0 |
NaN |
1.0 |
| b |
1.0 |
1.0 |
NaN |
| c |
2.0 |
2.0 |
2.0 |
| d |
NaN |
0.0 |
NaN |
| e |
NaN |
3.0 |
NaN |
| f |
NaN |
NaN |
0.0 |
DataFrame({
'one':s1,'two':s2,'three':s3},index=list('face'))
|
one |
two |
three |
| f |
NaN |
NaN |
0.0 |
| a |
0.0 |
NaN |
1.0 |
| c |
2.0 |
2.0 |
2.0 |
| e |
NaN |
3.0 |
NaN |
频率不同的时间序列的运算
ts1 = Series(np.random.randn(3),index=pd.date_range('2012-6-13',periods=3,freq='W-WED'))
ts1
2012-06-13 1.581292
2012-06-20 0.891047
2012-06-27 -0.209271
Freq: W-WED, dtype: float64
ts1.resample('B')
<pandas.core.resample.DatetimeIndexResampler object at 0x000001F9A5F93610>
ts1.resample('B').ffill()
2012-06-13 1.581292
2012-06-14 1.581292
2012-06-15 1.581292
2012-06-18 1.581292
2012-06-19 1.581292
2012-06-20 0.891047
2012-06-21 0.891047
2012-06-22 0.891047
2012-06-25 0.891047
2012-06-26 0.891047
2012-06-27 -0.209271
Freq: B, dtype: float64
dates = pd.DatetimeIndex(['2012-6-12','2012-6-17','2012-6-18','2012-6-21','2012-6-22','2012-6-29']