Contents
Polynomial regression
Categorical variables

Polynomial regression
# Model: Y = 5 + 2X + 3X^2
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

nsample = 50  # number of samples
# linspace returns evenly spaced values over an interval
x = np.linspace(0, 10, nsample)
# column_stack stacks 1-D arrays as columns of a 2-D array
X = np.column_stack((x, x**2))
X = sm.add_constant(X)  # prepend a column of ones for the intercept
X
array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.04081633e-01, 4.16493128e-02],
       [1.00000000e+00, 4.08163265e-01, 1.66597251e-01],
       ...,
       [1.00000000e+00, 9.79591837e+00, 9.59600167e+01],
       [1.00000000e+00, 1.00000000e+01, 1.00000000e+02]])
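As an aside, the same three-column design matrix [1, x, x^2] can be built in a single call with numpy's Vandermonde helper; a quick equivalent sketch:

# Equivalent construction: a Vandermonde matrix with increasing powers 0, 1, 2
X_alt = np.vander(x, 3, increasing=True)
assert np.allclose(X, X_alt)  # identical to the column_stack + add_constant version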
beta = np.array([5, 2, 3])  # true coefficients
e = np.random.normal(size=nsample)  # Gaussian noise
y = np.dot(X, beta) + e
# Fit by ordinary least squares to estimate the coefficients
model = sm.OLS(y, X)
result = model.fit()
result.params
array([4.91388904, 1.89789012, 3.01647615])
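The estimates land close to the true coefficients (5, 2, 3). As a quick sanity check, OLS is just the least-squares solution of X·β ≈ y, so numpy's solver should reproduce the same numbers:

# Least-squares solution of X @ b = y; should match result.params
beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
print(beta_hat)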
result.summary()
OLS Regression Results

| Dep. Variable: | y | R-squared: | 1.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 1.000 |
| Method: | Least Squares | F-statistic: | 2.449e+05 |
| Date: | Wed, 19 Sep 2018 | Prob (F-statistic): | 3.80e-95 |
| Time: | 16:48:00 | Log-Likelihood: | -68.540 |
| No. Observations: | 50 | AIC: | 143.1 |
| Df Residuals: | 47 | BIC: | 148.8 |
| Df Model: | 2 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 4.9139 | 0.401 | 12.257 | 0.000 | 4.107 | 5.720 |
| x1 | 1.8979 | 0.185 | 10.236 | 0.000 | 1.525 | 2.271 |
| x2 | 3.0165 | 0.018 | 168.240 | 0.000 | 2.980 | 3.053 |

| Omnibus: | 0.197 | Durbin-Watson: | 2.100 |
|---|---|---|---|
| Prob(Omnibus): | 0.906 | Jarque-Bera (JB): | 0.003 |
| Skew: | 0.019 | Prob(JB): | 0.998 |
| Kurtosis: | 3.011 | Cond. No. | 142. |

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Fitted values from the regression
y_fitted = result.fittedvalues
# Plot the raw data against the OLS fit
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()
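To visualize uncertainty around the fit, statsmodels can also produce pointwise 95% prediction intervals; a minimal sketch using the wls_prediction_std helper (it lives under statsmodels.sandbox in versions from this era, which is an assumption about the installed release):

from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Pointwise prediction standard errors and 95% interval bounds
prstd, iv_l, iv_u = wls_prediction_std(result)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.plot(x, iv_u, 'g--', label='95% interval')
ax.plot(x, iv_l, 'g--')
ax.legend(loc='best')
plt.show()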

Categorical variables
# Suppose a categorical variable has 3 levels (a, b, c). We encode it with one-hot
# dummy variables: a -> (1,0,0), b -> (0,1,0), c -> (0,0,1). Each level gets its own
# coefficient, so the category contributes b0*z0 + b1*z1 + b2*z2 to the model,
# where exactly one of z0, z1, z2 is 1 and the rest are 0.
# In general: one dummy column per level, with a 1 at the position of that
# observation's level and 0 everywhere else.
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

nsamples = 50
groups = np.zeros(nsamples, int)
groups[20:40] = 1  # second group
groups[40:] = 2    # third group
print('groups:', groups)
dummy = sm.categorical(groups, drop=True)  # drop=True keeps only the dummy columns
print('dummy', dummy)
groups: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2]
dummy [[1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]]
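For reference, pandas offers the more common route to the same one-hot matrix (a sketch assuming pandas is available; sm.categorical was deprecated in later statsmodels releases):

import pandas as pd

# One-hot encode the group labels; cast to float to match sm.categorical's output
dummy_pd = pd.get_dummies(groups).values.astype(float)
assert np.allclose(dummy, dummy_pd)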
# Model: Y = 5 + 2X + 3Z1 + 6Z2 + 9Z3 + e
x = np.linspace(0, 10, nsamples)
X = np.column_stack((x, dummy))
print(X)
X = sm.add_constant(X)  # note: the constant plus all three dummies makes X rank-deficient (see below)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsamples)
y = np.dot(X, beta) + e
[[ 0.          1.          0.          0.        ]
 [ 0.20408163  1.          0.          0.        ]
 ...
 [ 4.08163265  0.          1.          0.        ]
 ...
 [ 8.16326531  0.          0.          1.        ]
 ...
 [ 9.79591837  0.          0.          1.        ]
 [10.          0.          0.          1.        ]]
# Fit the regression
res = sm.OLS(y, X).fit()
res.summary()
OLS Regression Results

| Dep. Variable: | y | R-squared: | 0.981 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.980 |
| Method: | Least Squares | F-statistic: | 789.5 |
| Date: | Thu, 20 Sep 2018 | Prob (F-statistic): | 1.50e-39 |
| Time: | 10:23:05 | Log-Likelihood: | -76.762 |
| No. Observations: | 50 | AIC: | 161.5 |
| Df Residuals: | 46 | BIC: | 169.2 |
| Df Model: | 3 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 7.6313 | 0.664 | 11.501 | 0.000 | 6.296 | 8.967 |
| x1 | 2.1754 | 0.153 | 14.247 | 0.000 | 1.868 | 2.483 |
| x2 | 0.2556 | 0.421 | 0.607 | 0.547 | -0.591 | 1.103 |
| x3 | 2.2886 | 0.352 | 6.508 | 0.000 | 1.581 | 2.997 |
| x4 | 5.0870 | 0.792 | 6.421 | 0.000 | 3.492 | 6.682 |

| Omnibus: | 3.522 | Durbin-Watson: | 2.244 |
|---|---|---|---|
| Prob(Omnibus): | 0.172 | Jarque-Bera (JB): | 2.958 |
| Skew: | -0.596 | Prob(JB): | 0.228 |
| Kurtosis: | 3.026 | Cond. No. | 1.32e+17 |

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1e-31. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
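Warning [2] is the dummy variable trap: the constant column is exactly the sum of the three dummy columns, so the design matrix is singular and the five coefficients are not individually identifiable (which is why the estimates above differ from the true values even though the fit itself is fine). A minimal sketch of the standard fix, dropping one dummy level so the constant absorbs the baseline group:

# Drop the first dummy column; group 0 becomes the baseline absorbed by the constant
X_fixed = sm.add_constant(np.column_stack((x, dummy[:, 1:])))
res_fixed = sm.OLS(y, X_fixed).fit()
# Expected (up to noise): const ~ 5+3 = 8, slope ~ 2,
# group-1 offset ~ 6-3 = 3, group-2 offset ~ 9-3 = 6
print(res_fixed.params)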
# Fitted values from the regression
y_fitted = res.fittedvalues
# Plot the raw data against the OLS fit
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()
