Linear Regression
Univariate Linear Regression with Gradient Descent
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Set a random seed so the results are reproducible
np.random.seed(0)
# Generate the independent variable x: 100 points drawn uniformly from [0, 100]
x = np.random.uniform(0, 100, 100)
# Assume a linear relationship between y and x, plus some noise:
# y = 3 * x + 5 + noise, with noise ~ N(0, 10^2)
noise = np.random.normal(0, 10, 100)
y = 3 * x + 5 + noise
# Store the data in a DataFrame
data = pd.DataFrame({
    'X': x, 'Y': y})
# Save the raw data to a CSV file
data.to_csv('linear_regression_data.csv', index=False)
data.insert(0, 'ones', 1)  # add a column of ones for the intercept term (x0 = 1)
# Scatter plot of the data
plt.scatter(x, y)
plt.title('Scatter Plot of Linear Regression Data')
plt.xlabel('X')
plt.ylabel('Y')
plt.grid(True)
plt.show()
# Data preparation
X = data.iloc[:,0:-1]
X.head()
X = X.values
X.shape
y = data.iloc[:,-1]
y.head()
y = y.values
y.shape
y = y.reshape(100,1)
y.shape
# Compute the value of the cost function J(θ)
def cost_func(X, y, theta):
    inner = np.power(X @ theta - y, 2)
    return np.sum(inner) / (2 * len(X))
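For reference, cost_func implements the standard least-squares cost used throughout this walkthrough; in LaTeX notation, with m = len(X) and each row x^{(i)} of X carrying a leading 1 for the intercept:

J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( x^{(i)} \theta - y^{(i)} \right)^2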
# Initialize theta (2x1) to zeros
theta = np.zeros((2, 1))
print(theta)
cost0 = cost_func(X, y, theta)
print("cost0:")
print(cost0)
# Learning rate and number of iterations; x is left unscaled (up to 100),
# so a very small alpha is needed to keep the updates stable
alpha = 0.000001
count = 1000000
# Batch gradient descent
def gradient_descent(X, y, theta, alpha, count):
    costs = []
    for i in range(count):
        # simultaneous vectorized update of all parameters
        theta = theta - (X.T @ (X @ theta - y)) * alpha / len(X)
        nowcost = cost_func(X, y, theta)
        costs.append(nowcost)
        if i % 100000 == 0:  # log the cost periodically
            print(nowcost)
    return theta, costs
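The line inside the loop is the vectorized form of the batch gradient-descent update, applied to all parameters simultaneously:

\theta := \theta - \frac{\alpha}{m} X^{\top} (X\theta - y)

Because the whole training set enters every update, this is batch (not stochastic) gradient descent.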
theta_ans, cost_ans = gradient_descent(X, y, theta, alpha, count)
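As a sanity check (an addition, not part of the original walkthrough), the least-squares solution can also be computed in closed form with the normal equation; gradient descent should land close to it, and both should be near the true parameters b = 5, k = 3 used to generate the data:

# Closed-form least-squares solution: theta = (X^T X)^(-1) X^T y
theta_closed = np.linalg.inv(X.T @ X) @ X.T @ y
print("normal equation:", theta_closed.ravel())   # expect roughly [5, 3]
print("gradient descent:", theta_ans.ravel())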
# Plot the cost curve
fig, ax = plt.subplots()
ax.plot(np.arange(count), cost_ans)
ax.set(xlabel='count', ylabel='cost')
plt.show()
# Plot the fitted line against the data
x_line = np.linspace(x.min(), x.max(), 100)          # grid over the range of x
y_line = theta_ans[0, 0] + theta_ans[1, 0] * x_line  # intercept is theta[0, 0], slope is theta[1, 0]
print("b:")
print(theta_ans[0, 0])
print("k:")
print(theta_ans[1, 0])
fig, ax = plt.subplots()
ax.scatter(X[:, 1], y, label='training')       # the raw x values are the second column of X
ax.plot(x_line, y_line, 'r', label='predict')  # fitted line
ax.legend()
ax.set(xlabel='X', ylabel='Y')
plt.show()
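For comparison (a sketch added here, not part of the original code), scikit-learn's LinearRegression fits the same univariate model directly; its intercept and slope should roughly agree with theta_ans:

from sklearn.linear_model import LinearRegression
reg = LinearRegression()                      # fits the intercept itself, so the ones column is not needed
reg.fit(X[:, 1].reshape(-1, 1), y)            # train on the raw x column only
print("sklearn intercept:", reg.intercept_)   # expect roughly 5
print("sklearn slope:", reg.coef_)            # expect roughly 3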
Multivariate Linear Regression with Gradient Descent
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
np.random.seed(0)
# Dataset size
num_samples = 1000
# Generate the features
area = np.random.normal(loc=1500, scale=300, size=num_samples)  # house area: mean 1500, std 300
year = np.random.randint(1950, 2023, size=num_samples)          # build year: between 1950 and 2022
num_rooms = np.random.randint(2, 6, size=num_samples)           # number of rooms: between 2 and 5
# Generate the target variable (price), assuming the linear relationship
# price = 100 * area + 500 * (2022 - year) - 300 * num_rooms + noise
noise = np.random.normal(loc=0, scale=10000, size=num_samples)  # additive noise
price = 100 * area + 500 * (2022 - year) - 300 * num_rooms + noise
# Z-score standardization
def z_score_normalization(feature):
    mean = np.mean(feature)
    std = np.std(feature)
    normalized_feature = (feature - mean) / std
    return normalized_feature
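The hand-rolled z_score_normalization is equivalent to scikit-learn's StandardScaler; a minimal sketch of the same transform with that class (an alternative, not what the code below uses):

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit_transform expects a 2-D array, hence the reshape; the result equals (feature - mean) / std
area_scaled = scaler.fit_transform(area.reshape(-1, 1)).ravel()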
# Apply z-score standardization to each feature
area_normalized = z_score_normalization(area)
year_normalized = z_score_normalization(year)
num_rooms_normalized = z_score_normalization(num_rooms)
# Print the standardized features
print("Normalized Area:", area_normalized)
print("Normalized Year:", year_normalized)
print("Normalized Number of Rooms:", num_rooms_normalized)
# Build the DataFrame
data = pd.DataFrame({
    'Area': area_normalized,
    'Year': year_normalized,
    'NumRooms': num_rooms_normalized,
    'Price': price
})
# Save the dataset to a file
data.to_csv('linear_regression_data1.csv', index=False)
data.insert(0, 'ones', 1)  # x0 = 1, the intercept column
# Data preparation
X = data.iloc[:, :-1].values                # feature matrix
y = data.iloc[:, -1].values.reshape(-1, 1)  # target column
# Split into training and test sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Compute the value of the cost function J(θ) (same definition as the univariate case)
def cost_func(X, y, theta):
    inner = (X @ theta - y) ** 2
    return np.sum(inner) / (2 * len(X))
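A minimal sketch of how training might continue on the standardized features, assuming the same vectorized update rule as the univariate case (alpha, count, and the variable names below are illustrative choices, not from the original):

# Hypothetical continuation: train on X_train / y_train with the same batch update
theta = np.zeros((X_train.shape[1], 1))  # 4 parameters: intercept + 3 features
alpha, count = 0.01, 10000               # assumed settings; standardized features tolerate a larger alpha
for i in range(count):
    theta = theta - (X_train.T @ (X_train @ theta - y_train)) * alpha / len(X_train)
print("train cost:", cost_func(X_train, y_train, theta))
print("test cost:", cost_func(X_test, y_test, theta))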