import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error # 使用sklearn的mean_squared_error
from sklearn import metrics
# Load the advertising data set; the first CSV column becomes the row index.
filename = "D:\\aaa东华大学\\专业课\\数据科学\\homework\\advertising.csv"
data = pd.read_csv(filename, index_col=0)
# Show the first 100 rows as a quick sanity check of what was loaded.
print(data.head(100))
# Draw one scatter plot of Sales against each advertising channel.
# Each figure is shown before the next one is created.
for channel in ('TV', 'Weibo', 'WeChat'):
    data.plot(kind='scatter', x=channel, y='Sales',
              title='Sales by {}'.format(channel))
    plt.xlabel(channel)
    plt.ylabel('Sales')
    plt.show()
# Prepare the data: the first three columns are the predictors and the
# fourth column is the response.
x = data.iloc[:, :3]
y = data.iloc[:, 3]
# fit() returns the estimator itself, so construction and training chain.
linreg = LinearRegression().fit(x, y)
print("线性回归模型的截距和系数是:", linreg.intercept_, linreg.coef_)
# Persist the model fitted on the full data set, then reload it so the
# prediction below exercises the saved file rather than the in-memory model.
# NOTE: joblib.load unpickles the file; only load model files you trust.
joblib.dump(linreg, filename='linreg.pkl')
load_linreg = joblib.load('linreg.pkl')
# Label the new observation with the training column names. The model was
# fitted on a labeled DataFrame, so an unlabeled frame makes scikit-learn
# warn that X has no valid feature names, and the printed spend would show
# 0/1/2 as headers instead of the channel names.
new_x = pd.DataFrame([[130.1, 87.8, 69.2]], columns=x.columns)
print("六月广告投入:", new_x)
print("预期销售:", load_linreg.predict(new_x))
# Hold out 35% of the rows as a test set (fixed seed so the split is
# repeatable) and train a second model on the remaining rows only.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.35, random_state=1
)
linregTr = LinearRegression().fit(x_train, y_train)
print("训练集的linrgeTr的截距和系数是:", linregTr.intercept_, linregTr.coef_)
# Score the model trained on ALL rows (linreg) against the test rows.
# NOTE: the test rows are a subset of the data linreg was fitted on, so this
# error is not an out-of-sample estimate.
y_test_pred_linreg = linreg.predict(x_test)
# Mean squared error on the test set.
test_err_linreg = mean_squared_error(y_test, y_test_pred_linreg)
# Root mean squared error: the square root of the MSE. Previously this
# variable was just a copy of the MSE despite its name.
test_err_rmse_linreg = np.sqrt(test_err_linreg)
# Report the MSE (same line as before) followed by the RMSE.
print("使用全部数据训练的模型linreg在测试集上的均方误差是:{:.2f}".format(test_err_linreg))
print("使用全部数据训练的模型linreg在测试集上的均方根误差是:{:.2f}".format(test_err_rmse_linreg))
# Evaluate the model trained on the training split only (linregTr).
# Training-set error first ...
y_train_pred = linregTr.predict(x_train)
train_err = mean_squared_error(y_train, y_train_pred)
# ... then the error on the held-out test split.
y_test_pred = linregTr.predict(x_test)
test_err = mean_squared_error(y_test, y_test_pred)
print(
    "\n使用训练集训练的linrgeTr在训练集和测试集的均方误差是:{:.2f},{:.2f}".format(
        train_err, test_err
    )
)
# score() on a regressor returns the coefficient of determination (R^2).
predict_score = linregTr.score(x_test, y_test)
print('决定系数为{:.2f}'.format(predict_score))
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
import joblib
import numpy as np
from sklearn.metrics import mean_squared_error # 使用sklearn的mean_squared_error
from sklearn import metrics
# Load the advertising data set; the first CSV column becomes the row index.
filename = "D:\\aaa东华大学\\专业课\\数据科学\\homework\\advertising.csv"
data = pd.read_csv(filename, index_col=0)
# Draw one scatter plot of Sales against each advertising channel.
# Each figure is shown before the next one is created.
for channel in ('TV', 'Weibo', 'WeChat'):
    data.plot(kind='scatter', x=channel, y='Sales',
              title='Sales by {}'.format(channel))
    plt.xlabel(channel)
    plt.ylabel('Sales')
    plt.show()
# Prepare the data: the first three columns are the predictors and the
# fourth column is the response.
x = data.iloc[:, :3]
y = data.iloc[:, 3]
# fit() returns the estimator itself, so construction and training chain.
linreg = LinearRegression().fit(x, y)
print("linreg的截距和系数是:", linreg.intercept_, linreg.coef_)
# Build linregHalf: the same regression, fitted on the first 100 rows only.
x_half = data.iloc[:100, :3]
y_half = data.iloc[:100, 3]
linregHalf = LinearRegression().fit(x_half, y_half)
print("linregHalf的截距和系数是:", linregHalf.intercept_, linregHalf.coef_)
# Persist the model fitted on the full data set and reload it from disk.
# NOTE: joblib.load unpickles the file; only load model files you trust.
joblib.dump(linreg, filename='linreg.pkl')
load_linreg = joblib.load('linreg.pkl')
# New observation to predict for, labeled with the training column names.
# The model was fitted on a labeled DataFrame, so passing an unlabeled frame
# to predict() would make scikit-learn warn about missing feature names.
new_x = pd.DataFrame([[130.1, 87.8, 69.2]], columns=x.columns)
# Hold out 35% of the rows as a test set (fixed seed so the split is
# repeatable) and train a separate model on the remaining rows only.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x, y, test_size=0.35, random_state=1
)
linregTr = LinearRegression().fit(x_train, y_train)
print("训练集的linrgeTr的截距和系数是:", linregTr.intercept_, linregTr.coef_)
# Score the model trained on ALL rows (linreg) against the test rows.
# NOTE: the test rows are a subset of the data linreg was fitted on, so this
# error is not an out-of-sample estimate.
y_test_pred_linreg = linreg.predict(x_test)
# Mean squared error on the test set.
test_err_linreg = mean_squared_error(y_test, y_test_pred_linreg)
# Root mean squared error: the square root of the MSE. Previously this
# variable was just a copy of the MSE despite its name.
test_err_rmse_linreg = np.sqrt(test_err_linreg)
# Report the MSE (same line as before) followed by the RMSE.
print("使用全部数据训练的模型linreg在测试集上的均方误差是:{:.2f}".format(test_err_linreg))
print("使用全部数据训练的模型linreg在测试集上的均方根误差是:{:.2f}".format(test_err_rmse_linreg))
# Score the model trained on the first 100 rows (linregHalf) against the
# test rows.
# NOTE(review): the test split is drawn from the full data set, so it may
# contain rows linregHalf was fitted on.
y_test_pred_linregHalf = linregHalf.predict(x_test)
# Mean squared error on the test set.
test_err_linregHalf = mean_squared_error(y_test, y_test_pred_linregHalf)
# Root mean squared error: the square root of the MSE. Previously this
# variable was just a copy of the MSE despite its name.
test_err_rmse_linregHalf = np.sqrt(test_err_linregHalf)
# Report the MSE (same line as before) followed by the RMSE.
print("使用前100条数据训练的模型linregHalf在测试集上的均方误差是:{:.2f}".format(test_err_linregHalf))
print("使用前100条数据训练的模型linregHalf在测试集上的均方根误差是:{:.2f}".format(test_err_rmse_linregHalf))
# Evaluate the model trained on the training split only (linregTr).
# Training-set error first ...
y_train_pred = linregTr.predict(x_train)
train_err = mean_squared_error(y_train, y_train_pred)
# ... then the error on the held-out test split.
y_test_pred = linregTr.predict(x_test)
test_err = mean_squared_error(y_test, y_test_pred)
print(
    "使用训练集训练的linrgeTr在训练集和测试集的均方误差是:{:.2f},{:.2f}".format(
        train_err, test_err
    )
)
# score() on a regressor returns the coefficient of determination (R^2).
predict_score = linregTr.score(x_test, y_test)
print('决定系数为{:.2f}'.format(predict_score))