Contents
Polynomial regression
Categorical variables

Polynomial regression
# Model: Y = 5 + 2X + 3X^2
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

nsample = 50  # number of samples
# linspace returns evenly spaced values over an interval
x = np.linspace(0, 10, nsample)
# column_stack stacks 1-D arrays as columns of a 2-D array
X = np.column_stack((x, x**2))
X = sm.add_constant(X)  # prepend a column of ones for the intercept
X
array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.04081633e-01, 4.16493128e-02],
       [1.00000000e+00, 4.08163265e-01, 1.66597251e-01],
       ...,
       [1.00000000e+00, 9.79591837e+00, 9.59600167e+01],
       [1.00000000e+00, 1.00000000e+01, 1.00000000e+02]])
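As an aside, the same three-column design matrix [1, x, x^2] can be built in a single call with numpy's Vandermonde helper; a quick equivalent sketch:

# Equivalent construction: a Vandermonde matrix with increasing powers 0, 1, 2
X_alt = np.vander(x, 3, increasing=True)
assert np.allclose(X, X_alt)  # identical to the column_stack + add_constant version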
beta = np.array([5, 2, 3])  # true coefficients
e = np.random.normal(size=nsample)  # Gaussian noise
y = np.dot(X, beta) + e
# Fit by ordinary least squares to estimate the coefficients
model = sm.OLS(y, X)
result = model.fit()
result.params
array([4.91388904, 1.89789012, 3.01647615])
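The estimates land close to the true coefficients (5, 2, 3). As a quick sanity check, OLS is just the least-squares solution of X·β ≈ y, so numpy's solver should reproduce the same numbers:

# Least-squares solution of X @ b = y; should match result.params
beta_hat, *_ = np.linalg.lstsq(X, y, rcond=None)
print(beta_hat)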
result.summary()
OLS Regression Results

| Dep. Variable: | y | R-squared: | 1.000 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 1.000 |
| Method: | Least Squares | F-statistic: | 2.449e+05 |
| Date: | Wed, 19 Sep 2018 | Prob (F-statistic): | 3.80e-95 |
| Time: | 16:48:00 | Log-Likelihood: | -68.540 |
| No. Observations: | 50 | AIC: | 143.1 |
| Df Residuals: | 47 | BIC: | 148.8 |
| Df Model: | 2 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 4.9139 | 0.401 | 12.257 | 0.000 | 4.107 | 5.720 |
| x1 | 1.8979 | 0.185 | 10.236 | 0.000 | 1.525 | 2.271 |
| x2 | 3.0165 | 0.018 | 168.240 | 0.000 | 2.980 | 3.053 |

| Omnibus: | 0.197 | Durbin-Watson: | 2.100 |
|---|---|---|---|
| Prob(Omnibus): | 0.906 | Jarque-Bera (JB): | 0.003 |
| Skew: | 0.019 | Prob(JB): | 0.998 |
| Kurtosis: | 3.011 | Cond. No. | 142. |

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Fitted values from the regression
y_fitted = result.fittedvalues
# Plot the raw data against the OLS fit
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()
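To visualize uncertainty around the fit, statsmodels can also produce pointwise 95% prediction intervals; a minimal sketch using the wls_prediction_std helper (it lives under statsmodels.sandbox in versions from this era, which is an assumption about the installed release):

from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Pointwise prediction standard errors and 95% interval bounds
prstd, iv_l, iv_u = wls_prediction_std(result)
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.plot(x, iv_u, 'g--', label='95% interval')
ax.plot(x, iv_l, 'g--')
ax.legend(loc='best')
plt.show()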

Categorical variables
# Suppose a categorical variable has 3 levels (a, b, c). We encode it with one-hot
# dummy variables: a -> (1,0,0), b -> (0,1,0), c -> (0,0,1). Each level gets its own
# coefficient, so the category contributes b0*z0 + b1*z1 + b2*z2 to the model,
# where exactly one of z0, z1, z2 is 1 and the rest are 0.
# In general: one dummy column per level, with a 1 at the position of that
# observation's level and 0 everywhere else.
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

nsamples = 50
groups = np.zeros(nsamples, int)
groups[20:40] = 1  # second group
groups[40:] = 2    # third group
print('groups:', groups)
dummy = sm.categorical(groups, drop=True)  # drop=True keeps only the dummy columns
print('dummy', dummy)
groups: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2]
dummy [[1. 0. 0.]
 [1. 0. 0.]
 ...
 [0. 1. 0.]
 [0. 1. 0.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]]
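For reference, pandas offers the more common route to the same one-hot matrix (a sketch assuming pandas is available; sm.categorical was deprecated in later statsmodels releases):

import pandas as pd

# One-hot encode the group labels; cast to float to match sm.categorical's output
dummy_pd = pd.get_dummies(groups).values.astype(float)
assert np.allclose(dummy, dummy_pd)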
# Model: Y = 5 + 2X + 3Z1 + 6Z2 + 9Z3 + e
x = np.linspace(0, 10, nsamples)
X = np.column_stack((x, dummy))
print(X)
X = sm.add_constant(X)  # note: the constant plus all three dummies makes X rank-deficient (see below)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsamples)
y = np.dot(X, beta) + e
[[ 0.          1.          0.          0.        ]
 [ 0.20408163  1.          0.          0.        ]
 ...
 [ 4.08163265  0.          1.          0.        ]
 ...
 [ 8.16326531  0.          0.          1.        ]
 ...
 [ 9.79591837  0.          0.          1.        ]
 [10.          0.          0.          1.        ]]
# Fit the regression
res = sm.OLS(y, X).fit()
res.summary()
OLS Regression Results

| Dep. Variable: | y | R-squared: | 0.981 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.980 |
| Method: | Least Squares | F-statistic: | 789.5 |
| Date: | Thu, 20 Sep 2018 | Prob (F-statistic): | 1.50e-39 |
| Time: | 10:23:05 | Log-Likelihood: | -76.762 |
| No. Observations: | 50 | AIC: | 161.5 |
| Df Residuals: | 46 | BIC: | 169.2 |
| Df Model: | 3 | | |
| Covariance Type: | nonrobust | | |

| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| const | 7.6313 | 0.664 | 11.501 | 0.000 | 6.296 | 8.967 |
| x1 | 2.1754 | 0.153 | 14.247 | 0.000 | 1.868 | 2.483 |
| x2 | 0.2556 | 0.421 | 0.607 | 0.547 | -0.591 | 1.103 |
| x3 | 2.2886 | 0.352 | 6.508 | 0.000 | 1.581 | 2.997 |
| x4 | 5.0870 | 0.792 | 6.421 | 0.000 | 3.492 | 6.682 |

| Omnibus: | 3.522 | Durbin-Watson: | 2.244 |
|---|---|---|---|
| Prob(Omnibus): | 0.172 | Jarque-Bera (JB): | 2.958 |
| Skew: | -0.596 | Prob(JB): | 0.228 |
| Kurtosis: | 3.026 | Cond. No. | 1.32e+17 |

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1e-31. This might indicate that there are strong multicollinearity problems or that the design matrix is singular.
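Warning [2] is the dummy variable trap: the constant column is exactly the sum of the three dummy columns, so the design matrix is singular and the five coefficients are not individually identifiable (which is why the estimates above differ from the true values even though the fit itself is fine). A minimal sketch of the standard fix, dropping one dummy level so the constant absorbs the baseline group:

# Drop the first dummy column; group 0 becomes the baseline absorbed by the constant
X_fixed = sm.add_constant(np.column_stack((x, dummy[:, 1:])))
res_fixed = sm.OLS(y, X_fixed).fit()
# Expected (up to noise): const ~ 5+3 = 8, slope ~ 2,
# group-1 offset ~ 6-3 = 3, group-2 offset ~ 9-3 = 6
print(res_fixed.params)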
# Fitted values from the regression
y_fitted = res.fittedvalues
# Plot the raw data against the OLS fit
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()
