1.读取文件
逻辑上可被解释为二维数组的文本文件:
数据项1<分隔符>数据项2<分隔符>…<分隔符>数据项n
…
numpy.loadtxt(
文件路径,
delimiter=分隔符(缺省为None,即以任意空白字符分隔),
usecols=列序列(缺省所有列),
unpack=是否展开列(缺省False),
dtype=元素类型(缺省float),
converters=转换器字典(缺省不做转换))
->一个二维数组(unpack=False)或
多个一维数组(unpack=True)
代码:k.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
import matplotlib.pyplot as mp
import matplotlib.dates as md
def dmy2ymd(dmy):
    """Convert a ``DD-MM-YYYY`` date string to ``YYYY-MM-DD`` form.

    Used as a ``numpy.loadtxt`` converter. Depending on the NumPy version
    and the ``encoding`` argument, ``loadtxt`` hands converters either
    ``bytes`` or ``str``, so both are accepted here.

    Args:
        dmy: The date as ``bytes`` (UTF-8) or ``str``, e.g. ``'28-01-2011'``.

    Returns:
        The same date formatted as ``'%Y-%m-%d'``.

    Raises:
        ValueError: If the text does not match ``'%d-%m-%Y'``.
    """
    # Only decode when we actually received bytes; str(x, encoding=...)
    # raises TypeError if x is already a str.
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.strftime('%Y-%m-%d')
# Read columns 1, 3, 4, 5 and 6 of the CSV. Column 1 goes through dmy2ymd
# before being parsed as a day-resolution datetime64; the remaining four
# columns are parsed as 64-bit floats. unpack=True yields one array per column.
(dates, opening_prices, highest_prices,
 lowest_prices, closing_prices) = np.loadtxt(
    'aapl.csv',
    usecols=(1, 3, 4, 5, 6),
    delimiter=',',
    converters={1: dmy2ymd},
    dtype='M8[D], f8, f8, f8, f8',
    unpack=True)
# Figure window, title and axis labels.
mp.figure('Candlestick', facecolor='lightgray')
mp.title('Candlestick', fontsize=20)
mp.xlabel('Date', fontsize=14)
mp.ylabel('Price', fontsize=14)
ax = mp.gca()
# Major ticks on the horizontal axis: every Monday.
ax.xaxis.set_major_locator(md.WeekdayLocator(
    byweekday=md.MO))
# Minor ticks on the horizontal axis: every day.
ax.xaxis.set_minor_locator(md.DayLocator())
# Label format for the major ticks.
ax.xaxis.set_major_formatter(md.DateFormatter(
    '%d %b %Y'))
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
# Convert the datetime64 values to Python date/time objects for plotting.
# Use the script's own `dt` alias instead of reaching for the datetime
# module through matplotlib.dates, which only exposes it incidentally.
dates = dates.astype(dt.datetime)
# Mask of rising days: close exceeds open by at least 0.01.
rise = closing_prices - opening_prices >= 0.01
# Mask of falling days: open exceeds close by at least 0.01.
fall = opening_prices - closing_prices >= 0.01
# Fill colours, one RGB triple per day; days in neither mask stay (0, 0, 0).
fc = np.zeros(dates.size, dtype='3f4')
fc[rise], fc[fall] = (1, 1, 1), (0, 0.5, 0)
# Edge colours, one RGB triple per day.
ec = np.zeros(dates.size, dtype='3f4')
ec[rise], ec[fall] = (1, 0, 0), (0, 0.5, 0)
# Zero-width bar spanning low..high for each day.
mp.bar(dates, highest_prices - lowest_prices, 0,
       lowest_prices, color=fc, edgecolor=ec)
# 0.8-wide bar spanning open..close for each day.
mp.bar(dates, closing_prices - opening_prices, 0.8,
       opening_prices, color=fc, edgecolor=ec)
# Let matplotlib rotate/align the date labels on the horizontal axis.
mp.gcf().autofmt_xdate()
mp.show()
这里我将csv文件和ipynb放在一个文件夹里面,所以路径才写成这样的。可是奇怪了,也没报错,但是竟然出不来效果。只出来这么一个东西:
2.算术平均值
样本:S = [s1, s2, …, sn]
算术平均值:m = (s1+s2+…+sn) / n
s1 = s+d1
s2 = s+d2
…
sn = s+dn
m = (s1+s2+…+sn) / n
=(s+s+…+s) / n + (d1+d2+…+dn)/n
=s + (d1+d2+…+dn)/n
当 n->∞ 时,若各次误差 d1, d2, …, dn 正负相互抵消,则 (d1+d2+…+dn)/n -> 0,于是 m -> s。
算术平均值表示对真值的无偏估计。
numpy.mean(S)->m
代码:mean.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load column 6 of the CSV as a one-dimensional float array.
# usecols is written as a real one-element tuple; `(6)` is just the int 6.
closing_prices = np.loadtxt(
    'aapl.csv', delimiter=',',
    usecols=(6,), unpack=True)

# Arithmetic mean computed by hand: accumulate, then divide by the count.
mean = 0
for closing_price in closing_prices:
    mean += closing_price
mean /= closing_prices.size
print(mean)

# The same quantity computed by NumPy, for comparison.
mean = np.mean(closing_prices)
print(mean)
结果:
351.03766666666667
351.0376666666667
3.加权平均值
样本:S = [s1, s2, …, sn]
权重:W = [w1, w2, …, wn]
加权平均值:
a = (s1w1+s2w2+…+snwn)/(w1+w2+…+wn)
numpy.average(S, weights=W)->a
VWAP - 成交量加权平均价格
代码:vwap.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load columns 6 and 7 of the CSV; they are used below as the values
# (closing prices) and the weights (volumes) of a weighted average.
closing_prices, volumes = np.loadtxt(
    'aapl.csv', delimiter=',',
    usecols=(6, 7), unpack=True)

# Weighted average computed by hand: sum(price * volume) / sum(volume).
vwap, wsum = 0, 0
for closing_price, volume in zip(
        closing_prices, volumes):
    vwap += closing_price * volume
    wsum += volume
vwap /= wsum
print(vwap)

# The same quantity computed by NumPy, for comparison.
vwap = np.average(closing_prices, weights=volumes)
print(vwap)
结果:
350.5895493532009
350.5895493532009
TWAP - 时间加权平均价格
代码:twap.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
def dmy2days(dmy):
    """Convert a ``DD-MM-YYYY`` date string to a day count.

    Used as a ``numpy.loadtxt`` converter. Depending on the NumPy version
    and the ``encoding`` argument, ``loadtxt`` hands converters either
    ``bytes`` or ``str``, so both are accepted here.

    Args:
        dmy: The date as ``bytes`` (UTF-8) or ``str``, e.g. ``'28-01-2011'``.

    Returns:
        The number of days between ``datetime.date.min`` and the given date.

    Raises:
        ValueError: If the text does not match ``'%d-%m-%Y'``.
    """
    # Only decode when we actually received bytes; str(x, encoding=...)
    # raises TypeError if x is already a str.
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return (date - dt.date.min).days
# Column 1 is mapped through dmy2days (a day count per row); column 6 is
# read as floats. unpack=True yields one array per column.
days, closing_prices = np.loadtxt(
    'aapl.csv',
    usecols=(1, 6),
    delimiter=',',
    converters={1: dmy2days},
    unpack=True)
# Weighted average of the closing prices, using the day counts as weights.
twap = np.average(closing_prices, weights=days)
print(twap)
结果:
351.0377051146597
4.最值
1)max/min: 返回一个数组中最大/最小元素
2)argmax/argmin: 返回一个数组中最大/最小元素的下标
3)maximum/minimum: 将两个同维数组中对应元素中最大/最小元素构成一个新的数组
4)ptp: 返回一个数组中最大值和最小值之差
代码:max.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Draw 9 random integers from the half-open interval [10, 100) and
# arrange them as a 3x3 matrix.
a = np.random.randint(10, 100, size=9).reshape((3, 3))
print(a)
# Largest element, smallest element, and the difference between them.
print(a.max(), a.min(), np.ptp(a))
# Flat indices of the largest and the smallest element.
print(a.argmax(), a.argmin())
# A second 3x3 matrix drawn the same way.
b = np.random.randint(10, 100, size=9).reshape((3, 3))
print(b)
# Element-wise maximum and minimum of the two matrices, one per line.
print(np.maximum(a, b), np.minimum(a, b), sep='\n')
结果:
range.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Column 1 is kept as text (up to 10 characters); columns 4 and 5 are
# read as 64-bit floats. unpack=True yields one array per column.
dates, highest_prices, lowest_prices = np.loadtxt(
    'aapl.csv',
    usecols=(1, 4, 5),
    delimiter=',',
    dtype='U10, f8, f8',
    unpack=True)

# Overall range: the lowest low and the highest high.
min_price = lowest_prices.min()
max_price = highest_prices.max()
print(min_price, '~', max_price)

# Dates of the rows where those two extremes occur.
min_index = lowest_prices.argmin()
max_index = highest_prices.argmax()
print(dates[min_index], dates[max_index])

# Spread (max - min) within each of the two price series.
lowest_ptp = np.ptp(lowest_prices)
highest_ptp = np.ptp(highest_prices)
print(lowest_ptp, highest_ptp)
结果:
333.53 ~ 364.9
28-01-2011 16-02-2011
26.970000000000027 24.859999999999957
5.中位数
将多个样本按照大小排序,取中间位置的元素。
10 20 30 40 50
^ (a[(5-1)/2]+a[5/2])/2
10 20 30 40 50 60
^ ^
\ /
平均 (a[(6-1)/2]+a[6/2])/2
通式(s为样本数,下标中的“/”为整数除法):(a[(s-1)/2]+a[s/2])/2
np.median(无序样本)->中位数
代码:med.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load column 6 of the CSV as a one-dimensional float array.
# usecols is written as a real one-element tuple; `(6)` is just the int 6.
closing_prices = np.loadtxt(
    '../../data/aapl.csv', delimiter=',',
    usecols=(6,), unpack=True)
size = closing_prices.size

# Median computed by hand: sort, then average the two middle elements
# (which coincide when size is odd).
# np.sort(a, axis=0) replaces np.msort(a), which was deprecated and then
# removed from NumPy.
sorted_prices = np.sort(closing_prices, axis=0)
median = (sorted_prices[(size - 1) // 2] +
          sorted_prices[size // 2]) / 2
print(median)

# The same quantity computed by NumPy, for comparison.
median = np.median(closing_prices)
print(median)
结果:
352.055
352.055
6.标准差
样本:S = [s1, s2, …, sn]
平均值:m = (s1+s2+…+sn)/n
离差:D = [d1, d2, …, dn], di = si-m
离差方:Q = [q1, q2, …, qn], qi = di^2
总体方差:v = (q1+q2+…+qn)/n
总体标准差:s = sqrt(v),方均根
样本方差:v’ = (q1+q2+…+qn)/(n-1)
样本标准差:s’ = sqrt(v’),方均根
10
2 + 2 + … + 2 = 20
10000
10-8
numpy.std(S)->s
numpy.std(S, ddof=1)->s’  # ddof(Delta Degrees of Freedom):计算方差时分母取 n-ddof,ddof=1 即得样本标准差
代码:var.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
# Load column 6 of the CSV as a one-dimensional float array.
closing_prices = np.loadtxt(
    'aapl.csv',
    usecols=6,
    delimiter=',',
    unpack=True)

# Standard deviations computed step by step.
mean = closing_prices.mean()            # arithmetic mean
devs = closing_prices - mean            # deviations from the mean
dsqs = np.square(devs)                  # squared deviations
total = dsqs.sum()
pvar = total / dsqs.size                # population variance (divide by n)
svar = total / (dsqs.size - 1)          # sample variance (divide by n - 1)
pstd = np.sqrt(pvar)                    # population standard deviation
sstd = np.sqrt(svar)                    # sample standard deviation
print(pstd, sstd)

# The same two quantities computed by NumPy, for comparison.
pstd = np.std(closing_prices)           # population standard deviation
sstd = np.std(closing_prices, ddof=1)   # sample standard deviation
print(pstd, sstd)
结果:
7.080008325481608 7.201042876260849
7.080008325481608 7.201042876260849
7.时间数据处理
1)按星期取平均值
代码:week.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
# Converter: turn a day-month-year date string into a weekday number.
def dmy2wday(dmy):
    """Convert a ``DD-MM-YYYY`` date string to its weekday number.

    Used as a ``numpy.loadtxt`` converter. Depending on the NumPy version
    and the ``encoding`` argument, ``loadtxt`` hands converters either
    ``bytes`` or ``str``, so both are accepted here.

    Args:
        dmy: The date as ``bytes`` (UTF-8) or ``str``, e.g. ``'28-01-2011'``.

    Returns:
        An int from 0 (Monday) to 6 (Sunday), as given by ``date.weekday()``.

    Raises:
        ValueError: If the text does not match ``'%d-%m-%Y'``.
    """
    # Only decode when we actually received bytes; str(x, encoding=...)
    # raises TypeError if x is already a str.
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(
        dmy, '%d-%m-%Y').date()
    return date.weekday()
# Column 1 is mapped through dmy2wday (a weekday number per row);
# column 6 is read as floats. unpack=True yields one array per column.
wdays, closing_prices = np.loadtxt(
    'aapl.csv', delimiter=',',
    usecols=(1, 6), unpack=True,
    converters={1: dmy2wday})
print(wdays)

# One slot per weekday number 0..4.
ave_closing_prices = np.zeros(5)
for wday in range(ave_closing_prices.size):
    # Mean closing price over the rows whose weekday equals wday.
    # Two other ways to make the same selection:
    #   np.take(closing_prices, np.where(wdays == wday)).mean()
    #   closing_prices[np.where(wdays == wday)].mean()
    ave_closing_prices[wday] = \
        closing_prices[wdays == wday].mean()

# Print each weekday label next to its mean, rounded to 2 decimals.
for wday, ave_closing_price in zip(
        ['MON', 'TUE', 'WED', 'THU', 'FRI'],
        ave_closing_prices):
    print(wday, np.round(ave_closing_price, 2))
结果:
2)按星期汇总数据
数组的轴向汇总
np.apply_along_axis(处理函数, 轴向, 数组)
沿着数组中所指定的轴向,调用处理函数,并将每次调用的返回值重新组织成数组返回。
代码:axis.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
def foo(arg):
    """Print the argument prefixed with ``'foo:'`` and return its sum.

    Args:
        arg: An object with a ``sum()`` method, e.g. a NumPy array.

    Returns:
        Whatever ``arg.sum()`` returns.
    """
    print('foo:', arg)
    return arg.sum()
# 3x3 matrix holding the integers 1..9 in row-major order.
a = np.arange(1, 10).reshape((3, 3))
print(a)
# Call foo along axis 0 and collect the return values.
b = np.apply_along_axis(foo, axis=0, arr=a)
print(b)
# Call foo along axis 1 and collect the return values.
c = np.apply_along_axis(foo, axis=1, arr=a)
print(c)
sum.py
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import datetime as dt
import numpy as np
def dmy2wday(dmy):
    """Convert a ``DD-MM-YYYY`` date string to its weekday number.

    Used as a ``numpy.loadtxt`` converter. Depending on the NumPy version
    and the ``encoding`` argument, ``loadtxt`` hands converters either
    ``bytes`` or ``str``, so both are accepted here.

    Args:
        dmy: The date as ``bytes`` (UTF-8) or ``str``, e.g. ``'28-01-2011'``.

    Returns:
        An int from 0 (Monday) to 6 (Sunday), as given by ``date.weekday()``.

    Raises:
        ValueError: If the text does not match ``'%d-%m-%Y'``.
    """
    # Only decode when we actually received bytes; str(x, encoding=...)
    # raises TypeError if x is already a str.
    if isinstance(dmy, bytes):
        dmy = dmy.decode('utf-8')
    date = dt.datetime.strptime(dmy, '%d-%m-%Y').date()
    return date.weekday()
# Column 1 is mapped through dmy2wday (a weekday number per row);
# columns 3, 4, 5 and 6 are read as floats. unpack=True yields one
# array per column.
(wdays, opening_prices, highest_prices,
 lowest_prices, closing_prices) = np.loadtxt(
    'aapl.csv',
    usecols=(1, 3, 4, 5, 6),
    delimiter=',',
    converters={1: dmy2wday},
    unpack=True)

# Keep only the first 16 rows of every column.
(wdays, opening_prices, highest_prices,
 lowest_prices, closing_prices) = (
    column[:16] for column in (
        wdays, opening_prices, highest_prices,
        lowest_prices, closing_prices))

# Row index of the first Monday (weekday 0) and of the last Friday
# (weekday 4) within those rows.
first_monday = np.flatnonzero(wdays == 0)[0]
last_friday = np.flatnonzero(wdays == 4)[-1]

# All row indices from that Monday to that Friday inclusive, cut into
# 3 equal-length pieces (np.split raises if the count is not divisible by 3).
indices = np.split(
    np.arange(first_monday, last_friday + 1), 3)
def week_summary(indices):
    """Summarise the price rows selected by ``indices``.

    Reads the module-level arrays ``opening_prices``, ``highest_prices``,
    ``lowest_prices`` and ``closing_prices``.

    Args:
        indices: A non-empty sequence of integer row indices, in row order.

    Returns:
        A 4-tuple: the opening price at the first index, the maximum of
        the highest prices over all indices, the minimum of the lowest
        prices over all indices, and the closing price at the last index.
    """
    opening_price = opening_prices[indices[0]]
    highest_price = highest_prices[indices].max()
    lowest_price = lowest_prices[indices].min()
    closing_price = closing_prices[indices[-1]]
    return opening_price, highest_price, \
        lowest_price, closing_price
# Call week_summary on each row of indices and stack the results.
summaries = np.apply_along_axis(week_summary, axis=1, arr=indices)
print(summaries)
# Write the result as comma-separated text using the '%g' number format.
np.savetxt(
    'summary.csv',
    summaries,
    fmt='%g',
    delimiter=',')
结果:
[[335.8 346.7 334.3 346.5 ]
[347.89 360. 347.64 356.85]
[356.79 364.9 349.52 350.56]]