import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
import math
from IPython import display
from matplotlib import cm, gridspec
from sklearn import metrics
from tensorflow.python.data import Dataset
tf.logging.set_verbosity(tf.logging.ERROR)
# TensorFlow has five log levels; in increasing order of severity they are
# DEBUG, INFO, WARN, ERROR and FATAL.
# Selecting a level makes TensorFlow emit messages at that level and at every
# more severe level, so with ERROR only ERROR and FATAL messages are shown.
pd.options.display.max_rows = 10
# Show at most 10 rows when pandas displays a DataFrame or Series.
pd.options.display.float_format = '{:.1f}'.format
# Render floating-point values with one decimal place.
print('section1 finished.')
section1 finished.
california_housing_dataframe = pd.read_csv("https://storage.googleapis.com/mledu-datasets/california_housing_train.csv", sep=",")
# Fetch the CSV from the URL and parse it with pandas.read_csv, which returns
# a DataFrame.
print('section2 finished.')
section2 finished.
# Preview two columns side by side: the label and the population feature.
california_housing_dataframe[["median_house_value",'population']]
| median_house_value | population | |
|---|---|---|
| 0 | 66900.0 | 1015.0 |
| 1 | 80100.0 | 1129.0 |
| 2 | 85700.0 | 333.0 |
| 3 | 73400.0 | 515.0 |
| 4 | 65500.0 | 624.0 |
| ... | ... | ... |
| 16995 | 111400.0 | 907.0 |
| 16996 | 79000.0 | 1194.0 |
| 16997 | 103600.0 | 1244.0 |
| 16998 | 85800.0 | 1298.0 |
| 16999 | 94600.0 | 806.0 |
17000 rows × 2 columns
california_housing_dataframe = california_housing_dataframe.reindex(
np.random.permutation(california_housing_dataframe.index))
# Shuffle the rows. Two functions are involved:
# 1. np.random.permutation takes an array and returns a random permutation of it.
# 2. DataFrame.reindex takes that array and reorders the rows to follow it.
california_housing_dataframe["median_house_value"] /= 1000.0
# Divide the label column by 1000 in place, so median_house_value is expressed
# in thousands. The column is selected with dict-style indexing.
california_housing_dataframe
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| 3958 | -114.3 | 34.2 | 15.0 | 5612.0 | 1283.0 | 1015.0 | 472.0 | 1.5 | 66.9 |
| 6515 | -114.5 | 34.4 | 19.0 | 7650.0 | 1901.0 | 1129.0 | 463.0 | 1.8 | 80.1 |
| 11198 | -114.6 | 33.7 | 17.0 | 720.0 | 174.0 | 333.0 | 117.0 | 1.7 | 85.7 |
| 3501 | -114.6 | 33.6 | 14.0 | 1501.0 | 337.0 | 515.0 | 226.0 | 3.2 | 73.4 |
| 8838 | -114.6 | 33.6 | 20.0 | 1454.0 | 326.0 | 624.0 | 262.0 | 1.9 | 65.5 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2805 | -124.3 | 40.6 | 52.0 | 2217.0 | 394.0 | 907.0 | 369.0 | 2.4 | 111.4 |
| 11369 | -124.3 | 40.7 | 36.0 | 2349.0 | 528.0 | 1194.0 | 465.0 | 2.5 | 79.0 |
| 11180 | -124.3 | 41.8 | 17.0 | 2677.0 | 531.0 | 1244.0 | 456.0 | 3.0 | 103.6 |
| 1002 | -124.3 | 41.8 | 19.0 | 2672.0 | 552.0 | 1298.0 | 478.0 | 2.0 | 85.8 |
| 15287 | -124.3 | 40.5 | 52.0 | 1820.0 | 300.0 | 806.0 | 270.0 | 3.0 | 94.6 |
17000 rows × 9 columns
california_housing_dataframe.describe()
# DataFrame.describe returns, for every column, the count, mean, standard
# deviation, minimum, quartiles and maximum.
| longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | median_house_value | |
|---|---|---|---|---|---|---|---|---|---|
| count | 17000.0 | 17000.0 | 17000.0 | 17000.0 | 17000.0 | 17000.0 | 17000.0 | 17000.0 | 17000.0 |
| mean | -119.6 | 35.6 | 28.6 | 2643.7 | 539.4 | 1429.6 | 501.2 | 3.9 | 207.3 |
| std | 2.0 | 2.1 | 12.6 | 2179.9 | 421.5 | 1147.9 | 384.5 | 1.9 | 116.0 |
| min | -124.3 | 32.5 | 1.0 | 2.0 | 1.0 | 3.0 | 1.0 | 0.5 | 15.0 |
| 25% | -121.8 | 33.9 | 18.0 | 1462.0 | 297.0 | 790.0 | 282.0 | 2.6 | 119.4 |
| 50% | -118.5 | 34.2 | 29.0 | 2127.0 | 434.0 | 1167.0 | 409.0 | 3.5 | 180.4 |
| 75% | -118.0 | 37.7 | 37.0 | 3151.2 | 648.2 | 1721.0 | 605.2 | 4.8 | 265.0 |
| max | -114.3 | 42.0 | 52.0 | 37937.0 | 6445.0 | 35682.0 | 6082.0 | 15.0 | 500.0 |
# The value we try to predict is median_house_value; it is the label/target.
# total_rooms is used as the input feature.
# The model is built with the LinearRegressor from the TensorFlow Estimator API.
# Two kinds of data are common: categorical (textual) data and
# numerical data (such as total_rooms in this example).
# TensorFlow uses "feature columns" to declare the data type of each feature;
# numeric_column marks a feature as numerical.
my_feature = california_housing_dataframe[["total_rooms"]]
# To use several features, list more column names inside the inner list,
# e.g. [["feature1", "feature2", ...]].
feature_column = [tf.feature_column.numeric_column('total_rooms')]
# feature_column is only a list holding one numeric-column description;
# it does not contain any data yet.
# my_feature
# feature_column
[_NumericColumn(key='total_rooms', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]
# Define the target (label) column: median_house_value.
targets = california_housing_dataframe['median_house_value']
# Summary statistics of the label.
targets.describe()
count 17000.0
mean 207.3
std 116.0
min 15.0
25% 119.4
50% 180.4
75% 265.0
max 500.0
Name: median_house_value, dtype: float64
# Configure the LinearRegressor.
my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.0000001)
# Use gradient descent as the optimizer; learning_rate controls the step size.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
# As a safety measure, wrap the optimizer with clip_gradients_by_norm.
# Gradient clipping keeps the gradient magnitude from growing too large during
# training, which would make gradient descent fail.
# More on gradient clipping: https://blog.youkuaiyun.com/y12345678904/article/details/79581550
linear_regressor = tf.estimator.LinearRegressor(
feature_columns=feature_column,
optimizer=my_optimizer
)
# Preview the mapping my_input_fn builds from the features:
# dict(DataFrame) maps each column name to its Series.
dict(my_feature)
{'total_rooms': 3958 5612.0
6515 7650.0
11198 720.0
3501 1501.0
8838 1454.0
...
2805 2217.0
11369 2349.0
11180 2677.0
1002 2672.0
15287 1820.0
Name: total_rooms, Length: 17000, dtype: float64}
# Input function: tells TensorFlow how to preprocess the data and how to
# shuffle, batch and repeat it during training.
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build a tf.data pipeline and return tensors for its next batch.

    Args:
        features: pandas DataFrame of input features, one column per feature.
        targets: pandas Series or DataFrame holding the labels to predict.
        batch_size: int, number of examples per batch.
        shuffle: bool, whether to shuffle the examples.
        num_epochs: int, number of passes over the data; None repeats
            indefinitely.

    Returns:
        Tuple (features, labels) for the next batch, where features is a dict
        mapping column name to tensor.
    """
    # Convert the DataFrame into a dict of numpy arrays keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}

    dataset = Dataset.from_tensor_slices((features, targets))

    if shuffle:
        # Shuffle individual examples BEFORE batching and repeating. Shuffling
        # after batch() would only reorder whole batches, and shuffling after
        # repeat() would mix examples from different epochs.
        # A buffer of 1000 examples is kept; each example is drawn at random
        # from it and the buffer is then refilled from the dataset.
        dataset = dataset.shuffle(buffer_size=1000)

    # Group examples into batches and repeat for the requested epoch count.
    dataset = dataset.batch(batch_size).repeat(num_epochs)

    # Return the next batch of data.
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
# Train the model with linear_regressor.
# my_input_fn is wrapped in a lambda so that its arguments (features, targets)
# can be supplied while train() still receives a zero-argument callable.
_ = linear_regressor.train(
input_fn = lambda:my_input_fn(my_feature, targets),
steps = 100
)
# Training runs for 100 steps (steps = 100 above).
# Predict on the training data to see how well the model fits it.
prediction_input_fn =lambda: my_input_fn(my_feature, targets, num_epochs=1,shuffle=False)
# Prediction input function: a single unshuffled pass over the data, so the
# predictions come back in the same order as the rows.
predictions = linear_regressor.predict(input_fn=prediction_input_fn)
# predict() takes the input function and yields one result dict per example.
predictions = np.array([item['predictions'][0] for item in predictions])
# Collect the predictions into a numpy array so the error can be computed.
mean_squared_error = metrics.mean_squared_error(predictions, targets)
# Mean squared error. NOTE: scikit-learn documents the argument order as
# (y_true, y_pred); MSE is symmetric, so the value is the same either way.
root_mean_squared_error = math.sqrt(mean_squared_error)
# Root mean squared error.
print("Mean Squared Error (on training data): %0.3f"%mean_squared_error)
print("Root Mean Squared Error (on training data): %0.3f"%root_mean_squared_error)
Mean Squared Error (on training data): 27664.247
Root Mean Squared Error (on training data): 166.326
# Put the RMSE in context by comparing it with the range of the label:
# the smallest and largest median_house_value and the difference between them.
min_house_value = california_housing_dataframe["median_house_value"].min()
max_house_value = california_housing_dataframe["median_house_value"].max()
min_max_difference = max_house_value - min_house_value
print("Min median house value is %0.3f"%min_house_value)
print("Max median house value is %0.3f"%max_house_value)
print("Difference between min and max house value is %0.3f"%min_max_difference)
print("Root Mean Squared Error (on training data): %0.3f"%root_mean_squared_error)
Min median house value is 14.999
Max median house value is 500.001
Difference between min and max house value is 485.002
Root Mean Squared Error (on training data): 166.326
# Put predictions and targets side by side to compare their distributions.
calibration_data = pd.DataFrame()
calibration_data['predictions'] = pd.Series(predictions)
# `targets` still carries the shuffled row labels of the DataFrame, while the
# predictions are numbered 0..n-1 in row order. Column assignment aligns on
# index labels, so the label index must be dropped (.values) to keep each
# prediction next to the target of the same row.
calibration_data['targets'] = pd.Series(targets.values)
calibration_data.describe()
| predictions | targets | |
|---|---|---|
| count | 17000.0 | 17000.0 |
| mean | 129.1 | 207.3 |
| std | 106.5 | 116.0 |
| min | 0.1 | 15.0 |
| 25% | 71.4 | 119.4 |
| 50% | 103.9 | 180.4 |
| 75% | 153.9 | 265.0 |
| max | 1852.7 | 500.0 |
# Plot the learned line over a random sample of 300 rows.
sample = california_housing_dataframe.sample(n=300)
# The line is drawn between the smallest and largest total_rooms in the sample.
x_0 = sample['total_rooms'].min()
x_1 = sample['total_rooms'].max()
# Read the trained weight (first entry of the weights variable) and the bias
# from the estimator by variable name.
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
# Model output at the two endpoints: y = weight * x + bias.
y_0 = weight*x_0+bias
y_1 = weight*x_1+bias
# Draw the regression line in red, then the sampled data points.
plt.plot([x_0,x_1],[y_0,y_1],c='r')
plt.ylabel('median_house_value')
plt.xlabel('total_rooms')
plt.scatter(sample['total_rooms'],sample['median_house_value'])
plt.show()

from mpl_toolkits.mplot3d import Axes3D
def train_model(learning_rate, steps, batch_size, input_feature=["population","total_rooms"]):
    """Train a two-feature linear regression model and plot its progress.

    Training is split into equal periods. After each period the RMSE on the
    training data is printed and the model's predictions for a random sample
    are added to a 3D scatter plot. When training ends, RMSE per period is
    plotted and summary statistics of predictions vs. targets are displayed.

    Args:
        learning_rate: float, learning rate of the gradient descent optimizer.
        steps: int, total number of training steps across all periods.
        batch_size: int, batch size used for training.
        input_feature: sequence of exactly two column names of
            california_housing_dataframe to use as input features.

    Raises:
        ValueError: if input_feature is not a sequence of two column names.
    """
    # Training is reported in 5 equal periods.
    periods = 5
    # Estimator.train expects a whole, positive number of steps.
    steps_per_period = max(1, int(steps // periods))

    if isinstance(input_feature, str):
        raise ValueError('input_feature must be a sequence of two column names')
    # Copy into a list: DataFrame indexing needs a list to select several
    # columns, and the shared default argument is never touched.
    my_feature = list(input_feature)
    if len(my_feature) != 2:
        raise ValueError('input_feature must contain exactly two column names')

    my_feature_data = california_housing_dataframe[my_feature]
    my_label = 'median_house_value'
    targets = california_housing_dataframe[[my_label]]
    # Plain 1-D array of labels (no index) for metrics and the summary table.
    target_values = targets[my_label].values

    # Feature columns, one numeric column per input feature.
    feature_columns = [tf.feature_column.numeric_column(k) for k in my_feature]

    # Input functions: shuffled/repeating for training, and a single ordered
    # pass for prediction so results line up with the rows.
    train_input_fn = lambda: my_input_fn(my_feature_data, targets, batch_size)
    prediction_input_fn = lambda: my_input_fn(
        my_feature_data, targets, num_epochs=1, shuffle=False)

    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns,
        optimizer=my_optimizer)

    # Left panel: 3D scatter of a data sample plus the model output per period.
    plt.figure(figsize=(15, 6))
    ax = plt.subplot(1, 2, 1, projection='3d')
    ax.set_title('Learned Line by Period')
    ax.set_xlabel(my_feature[0])
    ax.set_ylabel(my_feature[1])
    ax.set_zlabel(my_label)
    n = 300  # number of sampled points
    sample = california_housing_dataframe.sample(n)
    sample_x = sample[my_feature[0]].values
    sample_y = sample[my_feature[1]].values
    ax.scatter(sample_x, sample_y, sample[my_label].values)
    # Colormaps take floats in [0, 1]; values below 0 are clipped to the lowest
    # colour, so spread the periods over [0, 1] to give each its own colour.
    colors = [cm.coolwarm(x) for x in np.linspace(0, 1, periods)]

    print("Train model...")
    print('RMSE (on training data):')
    root_mean_squared_errors = []
    for period in range(periods):
        linear_regressor.train(
            input_fn=train_input_fn,
            steps=steps_per_period)
        predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        predictions = np.array([item['predictions'][0] for item in predictions])
        # scikit-learn's argument order is (y_true, y_pred).
        root_mean_squared_error = math.sqrt(
            metrics.mean_squared_error(target_values, predictions))
        print("period %02d : %0.2f"%(period, root_mean_squared_error))
        root_mean_squared_errors.append(root_mean_squared_error)

        # Current parameters of the plane y = w0*x0 + w1*x1 + b.
        weight0 = linear_regressor.get_variable_value(
            'linear/linear_model/%s/weights'%my_feature[0])[0]
        weight1 = linear_regressor.get_variable_value(
            'linear/linear_model/%s/weights'%my_feature[1])[0]
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')

        # Model output for every sampled point, computed in one vectorized
        # expression and drawn with a single scatter call.
        y_extents = weight0 * sample_x + weight1 * sample_y + bias
        ax.scatter(sample_x, sample_y, y_extents, color=colors[period])
    print('Model training finished.')

    # Right panel: RMSE after each period.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title('Root Mean Squared Error vs. Periods')
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)

    # Summary statistics of predictions next to targets. Both columns are
    # built from plain arrays so rows pair up by position rather than by the
    # shuffled index labels of the DataFrame.
    calibration_data = pd.DataFrame()
    calibration_data['predictions'] = pd.Series(predictions)
    calibration_data['targets'] = pd.Series(target_values)
    display.display(calibration_data.describe())

    print('Final RMSE (on training data): %0.2f'%root_mean_squared_error)
# Run training with the default feature pair; learning rate, total step count
# and batch size are the hyperparameters to tune.
train_model(learning_rate=0.0001,
steps=100,
batch_size=1
)
Train model...
RMSE (on training data):
period 00 : 214.40
period 01 : 198.69
period 02 : 183.96
period 03 : 176.80
period 04 : 173.32
Model training finished.
Final RMSE (on training data): 173.32

# 3D scatter of a random sample of 300 rows: two features against the label.
sample = california_housing_dataframe.sample(300)
my_feature = ['total_rooms','population']
my_label = 'median_house_value'
fig = plt.figure(figsize=(15,6))
# Create the 3D axes through the figure so they are attached to it;
# constructing Axes3D(fig) directly no longer adds the axes to the figure in
# current Matplotlib releases, which leaves the figure empty.
ax = fig.add_subplot(111, projection='3d')
# sample[my_label].values
ax.scatter(sample[my_feature[0]],sample[my_feature[1]],sample[my_label])
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1c263d734e0>

"""
总结:
使用tensorflow进行线性回归的步骤:
1.加载数据集,分析数据主要特征,最好能够分析特征之间的相关性
(violin图查看各个变量的分布,表格构建相关性)
相关效果见:https://blog.youkuaiyun.com/u010099080/article/details/72824899?ref=myread
计算方法见:https://blog.youkuaiyun.com/weixin_37272286/article/details/80079673
2.数据随机排序,防止病态排序结果
3.定义特征和标签
4.配置LinearRegressor (包括optimizer,梯度裁剪上界,学习率)
位于tf.estimator.LinearRegressor
5.定义输入函数 (构建一个迭代器,将数据拆分成多批数据,按指定周期向LinearRegressor输入训练数据)
输入参数包括特征、标签、批尺寸、随机性、循环周期
返回特征、标签
6.训练LR模型并查看训练效果
训练效果主要通过均方根误差(RMSE)和max_min_difference的差距体现
获取weight和bias的位置:linear/linear_model/%s/weights %my_features
linear/linear_model/bias_weights
7.调整模型超参(在此时是学习率learning_rate),寻求更好的拟合和泛化效果
"""
本文详细介绍使用TensorFlow进行线性回归的过程,包括数据加载、预处理、模型配置、训练及评估,展示了如何通过调整超参数提升模型表现。
60

被折叠的 条评论
为什么被折叠?



