.csv(file="C:/Users/Public/Desktop/EduRecivedFiles/lm.blood.csv",header=T)
> blood
y x1 x2 x3 x4
1 11.2 5.68 1.90 4.53 8.2
2 8.8 3.79 1.64 7.32 6.9
3 12.3 6.02 3.56 6.95 10.8
4 11.6 4.85 1.07 5.88 8.3
5 13.4 4.60 2.32 4.05 7.5
6 18.3 6.05 0.64 1.42 13.6
7 11.1 4.90 8.50 12.60 8.5
8 12.1 7.08 3.00 6.75 11.5
9 9.6 3.85 2.11 16.28 7.9
10 8.4 4.65 0.63 6.59 7.1
11 9.3 4.59 1.97 3.61 8.7
12 10.6 4.29 1.97 6.61 7.8
13 8.4 7.97 1.93 7.57 9.9
14 9.6 6.19 1.18 1.42 6.9
15 10.9 6.13 2.06 10.35 10.5
16 10.1 5.71 1.78 8.53 8.0
17 14.8 6.40 2.40 4.53 10.3
18 9.1 6.06 3.67 12.79 7.1
19 10.8 5.09 1.03 2.53 8.9
20 10.2 6.13 1.71 5.28 9.9
21 13.6 5.78 3.36 2.96 8.0
22 14.9 5.43 1.13 4.31 11.3
23 16.0 6.50 6.21 3.47 12.3
24 13.2 7.98 7.92 3.37 9.8
25 20.0 11.54 10.89 1.20 10.5
26 13.3 5.84 0.92 8.61 6.4
27 10.4 3.84 1.20 6.45 9.6
> x1 <- blood[,2]
> x2 <- blood[,3]
> x4 <- blood[,4]
> x3 <- blood[,4]
> x4 <- blood[,5]
> y <- blood[,1]
>
> lm.reg <- lm(y~x1+x2+x3+x4,data=blood)
> summary(lm.reg)
Call:
lm(formula = y ~ x1 + x2 + x3 + x4, data = blood)
Residuals:
Min 1Q Median 3Q Max
-3.6268 -1.2004 -0.2276 1.5389 4.4467
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.9433 2.8286 2.101 0.0473 *
x1 0.1424 0.3657 0.390 0.7006
x2 0.3515 0.2042 1.721 0.0993 .
x3 -0.2706 0.1214 -2.229 0.0363 *
x4 0.6382 0.2433 2.623 0.0155 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.01 on 22 degrees of freedom
Multiple R-squared: 0.6008, Adjusted R-squared: 0.5282
F-statistic: 8.278 on 4 and 22 DF, p-value: 0.0003121
> cor(blood)
y x1 x2 x3 x4
y 1.0000000 0.5585251 0.45850963 -0.51012130 0.6096420
x1 0.5585251 1.0000000 0.63150583 -0.35479471 0.4152708
x2 0.4585096 0.6315058 1.00000000 -0.03863221 0.2189743
x3 -0.5101213 -0.3547947 -0.03863221 1.00000000 -0.3297787
x4 0.6096420 0.4152708 0.21897432 -0.32977870 1.0000000
> lm.step <- step(lm.reg)
Start: AIC=42.16
y ~ x1 + x2 + x3 + x4
Df Sum of Sq RSS AIC
- x1 1 0.6129 89.454 40.343
<none> 88.841 42.157
- x2 1 11.9627 100.804 43.568
- x3 1 20.0635 108.905 45.655
- x4 1 27.7939 116.635 47.507
Step: AIC=40.34
y ~ x2 + x3 + x4
Df Sum of Sq RSS AIC
<none> 89.454 40.343
- x3 1 25.690 115.144 45.159
- x2 1 26.530 115.984 45.356
- x4 1 32.269 121.723 46.660
> # 因此x1的AIC最小 ,即对y的影响最小。因此删去x1
> # 最后当只有x2,x3,x4时 AIC最小。
> lm.step <- step(lm.reg)
Start: AIC=42.16
y ~ x1 + x2 + x3 + x4
Df Sum of Sq RSS AIC
- x1 1 0.6129 89.454 40.343
<none> 88.841 42.157
- x2 1 11.9627 100.804 43.568
- x3 1 20.0635 108.905 45.655
- x4 1 27.7939 116.635 47.507
Step: AIC=40.34
y ~ x2 + x3 + x4
Df Sum of Sq RSS AIC
<none> 89.454 40.343
- x3 1 25.690 115.144 45.159
- x2 1 26.530 115.984 45.356
- x4 1 32.269 121.723 46.660
> summary(lm.step)
Call:
lm(formula = y ~ x2 + x3 + x4, data = blood)
Residuals:
Min 1Q Median 3Q Max
-3.2692 -1.2305 -0.2023 1.4886 4.6570
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.4996 2.3962 2.713 0.01242 *
x2 0.4023 0.1541 2.612 0.01559 *
x3 -0.2870 0.1117 -2.570 0.01712 *
x4 0.6632 0.2303 2.880 0.00845 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.972 on 23 degrees of freedom
Multiple R-squared: 0.5981, Adjusted R-squared: 0.5456
F-statistic: 11.41 on 3 and 23 DF, p-value: 8.793e-05
> # 最后得到的方程为:y=6.4996 +0.4023*x1-0.2870*x3+0.6632*x4
ile="C:/Users/Public/Desktop/EduRecivedFiles/lm.toothpaste.csv",header = T)
> data
p1 p2 p12 ad sale
1 3.85 3.80 -0.05 5.50 7.38
2 3.75 4.00 0.25 6.75 8.51
3 3.70 4.30 0.60 7.25 9.52
4 3.70 3.70 0.00 5.50 7.50
5 3.60 3.85 0.25 7.00 9.33
6 3.60 3.80 0.20 6.50 8.28
7 3.60 3.75 0.15 6.75 8.75
8 3.80 3.85 0.05 5.25 7.87
9 3.80 3.65 -0.15 5.25 7.10
10 3.85 4.00 0.15 6.00 8.00
11 3.90 4.10 0.20 6.50 7.89
12 3.90 4.00 0.10 6.25 8.15
13 3.70 4.10 0.40 7.00 9.10
14 3.75 4.20 0.45 6.90 8.86
15 3.75 4.10 0.35 6.80 8.90
16 3.80 4.10 0.30 6.80 8.87
17 3.70 4.20 0.50 7.10 9.26
18 3.80 4.30 0.50 7.00 9.00
19 3.70 4.10 0.40 6.80 8.75
20 3.80 3.75 -0.05 6.50 7.95
21 3.80 3.75 -0.05 6.25 7.65
22 3.75 3.65 -0.10 6.00 7.27
23 3.70 3.90 0.20 6.50 8.00
24 3.55 3.65 0.10 7.00 8.50
25 3.60 4.10 0.50 6.80 8.75
26 3.65 4.25 0.60 6.80 9.21
27 3.70 3.65 -0.05 6.50 8.27
28 3.75 3.75 0.00 5.75 7.67
29 3.80 3.85 0.05 5.80 7.93
30 3.70 4.25 0.55 6.80 9.26
> p1 <- data[,1]
> p2 <- data[,2]
> p12 <- data[,3]
> ad <- data[,4]
> sale <- data[,5]
> lm <- lm(sale~p1+p2+p12+ad)
> summaty(lm)
Error in summaty(lm) : 没有"summaty"这个函数
> summary(lm)
Call:
lm(formula = sale ~ p1 + p2 + p12 + ad)
Residuals:
Min 1Q Median 3Q Max
-0.41065 -0.11562 -0.00984 0.13466 0.51361
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.5891 2.4450 3.104 0.004567 **
p1 -2.3577 0.6379 -3.696 0.001028 **
p2 1.6122 0.2954 5.459 1.01e-05 ***
p12 NA NA NA NA
ad 0.5012 0.1259 3.981 0.000491 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2347 on 26 degrees of freedom
Multiple R-squared: 0.8936, Adjusted R-squared: 0.8813
F-statistic: 72.8 on 3 and 26 DF, p-value: 8.883e-13
> # 由上表得,影响sale的是p12
> lm <- lm(sale~p12+ad)
> summary(lm)
Call:
lm(formula = sale ~ p12 + ad)
Residuals:
Min 1Q Median 3Q Max
-0.49779 -0.12031 -0.00867 0.11084 0.58106
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.4075 0.7223 6.102 1.62e-06 ***
p12 1.5883 0.2994 5.304 1.35e-05 ***
ad 0.5635 0.1191 4.733 6.25e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2383 on 27 degrees of freedom
Multiple R-squared: 0.886, Adjusted R-squared: 0.8776
F-statistic: 105 on 2 and 27 DF, p-value: 1.845e-13
> # 回归诊断
> sale.res <- residuals(lm)
> sale.fit <- predict(lm)
> plot(sale.res~sale.fit)
> # 删除残差在[-2,2]的样本点
> data.1 <- data[-c(5,8,11),]
> lm.1 <- lm(data.1$sale~data.1$p12+data.1$ad)
> summary(lm.1)
Call:
lm(formula = data.1$sale ~ data.1$p12 + data.1$ad)
Residuals:
Min 1Q Median 3Q Max
-0.37130 -0.10114 0.03066 0.10016 0.30162
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.0759 0.6267 6.504 1.00e-06 ***
data.1$p12 1.5276 0.2354 6.489 1.04e-06 ***
data.1$ad 0.6138 0.1027 5.974 3.63e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1767 on 24 degrees of freedom
Multiple R-squared: 0.9378, Adjusted R-squared: 0.9327
F-statistic: 181 on 2 and 24 DF, p-value: 3.33e-15
> # r方接近1,sale=4.0759+1.5276*p12+0.6138*ad
>
> plot(data$sale~data$p12)
> abline(lm(data$sale~data$p12)
+ )
# 导入数据:
> data <- read.csv(file="C:/Users/Public/Desktop/EduRecivedFiles/lm.toothpaste.csv",header = T)
# 切片提取
> p1 <- data[,1]
> p2 <- data[,2]
> p12 <- data[,3]
> ad <- data[,4]
> sale <- data[,5]
# 建立回归方程
> lm <- lm(sale~p1+p2+p12+ad,data=data)
> summary(lm)
# lm的回归检验
Call:
lm(formula = sale ~ p1 + p2 + p12 + ad, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.41065 -0.11562 -0.00984 0.13466 0.51361
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.5891 2.4450 3.104 0.004567 **
p1 -2.3577 0.6379 -3.696 0.001028 **
p2 1.6122 0.2954 5.459 1.01e-05 ***
p12 NA NA NA NA
ad 0.5012 0.1259 3.981 0.000491 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2347 on 26 degrees of freedom
Multiple R-squared: 0.8936, Adjusted R-squared: 0.8813
F-statistic: 72.8 on 3 and 26 DF, p-value: 8.883e-13
# 逐步回归法筛选变量
> lm.step <- step(lm)
Start: AIC=-83.27
sale ~ p1 + p2 + p12 + ad
Step: AIC=-83.27
sale ~ p1 + p2 + ad
Df Sum of Sq RSS AIC
<none> 1.4318 -83.268
- p1 1 0.75219 2.1840 -72.601
- ad 1 0.87293 2.3047 -70.987
- p2 1 1.64088 3.0727 -62.359
> # 筛选得到的变量有p1,p2,ad
# 回归检验新模型
> summary(lm.step)
Call:
lm(formula = sale ~ p1 + p2 + ad, data = data)
Residuals:
Min 1Q Median 3Q Max
-0.41065 -0.11562 -0.00984 0.13466 0.51361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.5891 2.4450 3.104 0.004567 **
p1 -2.3577 0.6379 -3.696 0.001028 **
p2 1.6122 0.2954 5.459 1.01e-05 ***
ad 0.5012 0.1259 3.981 0.000491 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2347 on 26 degrees of freedom
Multiple R-squared: 0.8936, Adjusted R-squared: 0.8813
F-statistic: 72.8 on 3 and 26 DF, p-value: 8.883e-13
> # 此上是对逐步回归法得到的线性回归方程的回归诊断,可以发现变量均显著,模型的R方=0.8936,接近于1
> # 残差图
>
> sale.res <- residuals(lm.step)
> sale.fit <- predict(lm.step)
> plot(sale.res~sale.fit)
>
> data.1 <- data[-c{5,8,11),]
错误: 意外的'{'在"data.1 <- data[-c{"里
> data.1 <- data[-c(5,8,11),]
> lm.1 <- lm(data.1$sale~data.1$p1+data.1$p2+data.1$ad)
> summary(lm.1)
Call:
lm(formula = data.1$sale ~ data.1$p1 + data.1$p2 + data.1$ad)
Residuals:
Min 1Q Median 3Q Max
-0.36892 -0.10228 0.03239 0.09380 0.31041
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.7261 2.1127 1.764 0.0911 .
data.1$p1 -1.4430 0.5433 -2.656 0.0141 *
data.1$p2 1.5252 0.2408 6.335 1.83e-06 ***
data.1$ad 0.6206 0.1118 5.549 1.21e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1804 on 23 degrees of freedom
Multiple R-squared: 0.9379, Adjusted R-squared: 0.9298
F-statistic: 115.8 on 3 and 23 DF, p-value: 5.046e-14
> data
p1 p2 p12 ad sale
1 3.85 3.80 -0.05 5.50 7.38
2 3.75 4.00 0.25 6.75 8.51
3 3.70 4.30 0.60 7.25 9.52
4 3.70 3.70 0.00 5.50 7.50
5 3.60 3.85 0.25 7.00 9.33
6 3.60 3.80 0.20 6.50 8.28
7 3.60 3.75 0.15 6.75 8.75
8 3.80 3.85 0.05 5.25 7.87
9 3.80 3.65 -0.15 5.25 7.10
10 3.85 4.00 0.15 6.00 8.00
11 3.90 4.10 0.20 6.50 7.89
12 3.90 4.00 0.10 6.25 8.15
13 3.70 4.10 0.40 7.00 9.10
14 3.75 4.20 0.45 6.90 8.86
15 3.75 4.10 0.35 6.80 8.90
16 3.80 4.10 0.30 6.80 8.87
17 3.70 4.20 0.50 7.10 9.26
18 3.80 4.30 0.50 7.00 9.00
19 3.70 4.10 0.40 6.80 8.75
20 3.80 3.75 -0.05 6.50 7.95
21 3.80 3.75 -0.05 6.25 7.65
22 3.75 3.65 -0.10 6.00 7.27
23 3.70 3.90 0.20 6.50 8.00
24 3.55 3.65 0.10 7.00 8.50
25 3.60 4.10 0.50 6.80 8.75
26 3.65 4.25 0.60 6.80 9.21
27 3.70 3.65 -0.05 6.50 8.27
28 3.75 3.75 0.00 5.75 7.67
29 3.80 3.85 0.05 5.80 7.93
30 3.70 4.25 0.55 6.80 9.26
# 删除样本点在【-0.4,0.4】内的残差点,以下找出
> # 计算出逐步回归模型的残差
> residuals <- resid(lm.step)
> # 找到样本点残差小于等于0.4的
> yangben <- which(abs(residuals)<=0.4)
> yangben
1 2 3 4 6 7 9 10 11 12 13 14 15 16 17 18 19 20 21 22 24 25 26 27 28 29 30
1 2 3 4 6 7 9 10 11 12 13 14 15 16 17 18 19 20 21 22 24 25 26 27 28 29 30
> # 删除样本点重新拟合
> data.1 <- data[-c(5,8),]
> lm.step.1 <- lm(data.1$sale~data.1$p1+data.1$p2+data.1$ad)
> summary(lm.step.1)
Call:
lm(formula = data.1$sale ~ data.1$p1 + data.1$p2 + data.1$ad)
Residuals:
Min 1Q Median 3Q Max
-0.39962 -0.10260 0.03131 0.13708 0.29236
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.7582 2.1416 2.689 0.01283 *
data.1$p1 -1.9461 0.5543 -3.511 0.00179 **
data.1$p2 1.5582 0.2646 5.889 4.48e-06 ***
data.1$ad 0.5742 0.1214 4.731 8.22e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1986 on 24 degrees of freedom
Multiple R-squared: 0.923, Adjusted R-squared: 0.9134
F-statistic: 95.9 on 3 and 24 DF, p-value: 1.686e-13