数据集链接
链接:https://pan.baidu.com/s/1hK-eTBgkLL7ZQuWCKqyvNA
提取码:fz74
线性回归
纯Python构建的经典算法实战Kaggle真实项目:红酒质量分析预测

# 1.load csv
# 2.convert string to float
# 3.normalization
# 4.cross validation
# 5.evaluate our algo(RMSE)
# 1 . Import standard Lib
from csv import reader
from math import sqrt
from random import randrange
from random import seed
# 2. Load our csv file
def csv_loader(filename):
    """Read a CSV file and return its rows as a list of lists of strings.

    Blank rows (which csv.reader yields as empty lists) are skipped.
    """
    with open(filename, 'r') as file:
        return [row for row in reader(file) if row]
# dataset_list = csv_loader('winequality-white.csv')
#
# print(dataset_list)
# 3.Convert our datatype
def string_to_float_converter(dataset, column):
    """Convert the given column of every row from string to float, in place.

    strip() tolerates padded fields such as " 7.4 ".
    """
    for record in dataset:
        record[column] = float(record[column].strip())
# 4.find the min and max of our dataset
def find_the_min_and_max_of_our_dataset(dateset):
    """Return a list of [min, max] pairs, one per column of the dataset.

    NOTE(review): the parameter name 'dateset' is a typo for 'dataset';
    kept as-is to preserve the public signature.
    """
    return [[min(column), max(column)] for column in zip(*dateset)]
# 5.normalization our data
def normalization(dataset, min_max_list):
    """Min-max scale every value in `dataset` into [0, 1], in place.

    `min_max_list` is [[min, max], ...] per column, as produced by
    find_the_min_and_max_of_our_dataset. Columns with zero range
    (min == max) are mapped to 0.0 — the original code raised
    ZeroDivisionError on any constant column.
    """
    for row in dataset:
        for i in range(len(row)):
            low, high = min_max_list[i]
            span = high - low
            # Guard against constant columns (span == 0).
            row[i] = (row[i] - low) / span if span else 0.0
# 6.splitting our data
def k_fold_cross_validation_split(dataset, n_folds):
    """Randomly partition `dataset` into `n_folds` folds of equal (floor) size.

    Draws rows without replacement from a copy, so the caller's list is
    untouched. Leftover rows (len(dataset) % n_folds) are discarded.
    """
    pool = list(dataset)
    fold_size = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        current = []
        while len(current) < fold_size:
            current.append(pool.pop(randrange(len(pool))))
        folds.append(current)
    return folds
# 7.using root mean squared error method to calculate our model
def rmse_method(actual_data, predicted_data):
    """Root-mean-squared error between two parallel lists of numbers."""
    count = float(len(actual_data))
    squared_total = sum((p - a) ** 2 for a, p in zip(actual_data, predicted_data))
    return sqrt(squared_total / count)
# 8. how good is our algo by using cross validation
def how_good_is_our_algo(dataset, algo, n_folds, *args):
    """Cross-validate `algo` and return one RMSE score per fold.

    For each fold: train on all other folds, predict on the fold with the
    target column blanked out (None, so the model cannot peek), then score
    the predictions against the true last-column values.
    """
    folds = k_fold_cross_validation_split(dataset, n_folds)
    scores = []
    for held_out in folds:
        remaining = list(folds)
        remaining.remove(held_out)
        training = [row for fold in remaining for row in fold]
        testing = []
        for row in held_out:
            blanked = list(row)
            blanked[-1] = None
            testing.append(blanked)
        predicted = algo(training, testing, *args)
        actual = [row[-1] for row in held_out]
        scores.append(rmse_method(actual, predicted))
    return scores
# 9.make prediction
def predict(row, coefficients):
    """Linear prediction: bias (coefficients[0]) plus the dot product of the
    remaining coefficients with every row value except the last (the target)."""
    features = row[:-1]
    weights = coefficients[1:]
    return coefficients[0] + sum(w * x for w, x in zip(weights, features))
# 10. using stochastic gradient descent method to calculate the coefficient
def sgd_method_to_calculate_coefficient(training_data, learning_rate, n_epoch):
    """Fit linear-regression coefficients with stochastic gradient descent.

    The last column of each training row is the target. Returns
    [b0, b1, ..., bk] where b0 is the bias term.

    NOTE: the original printed (learning_rate, n_epoch, error) for every
    training row of every epoch, flooding stdout; that debug print and the
    stray "# typo" marker were removed.
    """
    coefficients_list = [0.0 for _ in range(len(training_data[0]))]
    for _epoch in range(n_epoch):
        for row in training_data:
            yhat = predict(row, coefficients_list)
            error = yhat - row[-1]
            # Gradient step: bias first, then one weight per feature.
            coefficients_list[0] -= learning_rate * error
            for i in range(len(row) - 1):
                coefficients_list[i + 1] -= learning_rate * error * row[i]
    return coefficients_list
# 11. using linear regression algo
def using_sgd_method_to_calculate_linear_regression(training_data, testing_data, learning_rate, n_epoch):
    """Train SGD linear regression on `training_data`, then return a list of
    predictions, one per row of `testing_data`."""
    coefficients_list = sgd_method_to_calculate_coefficient(training_data, learning_rate, n_epoch)
    return [predict(row, coefficients_list) for row in testing_data]
# 12. Using our real wine quality data
# Driver: evaluate SGD linear regression on the white-wine quality data.
seed(1)
wine_quality_data_name = 'winequality-white.csv'
dataset = csv_loader(wine_quality_data_name)
# Every CSV field arrives as a string; convert all columns to float.
for column_index in range(len(dataset[0])):
    string_to_float_converter(dataset, column_index)
# Rescale every column into [0, 1] so one feature cannot dominate SGD.
min_and_max = find_the_min_and_max_of_our_dataset(dataset)
normalization(dataset, min_and_max)
# Evaluate with 5-fold cross validation.
n_folds = 5
learning_rate = 0.1
n_epoch = 50
algo_score = how_good_is_our_algo(dataset, using_sgd_method_to_calculate_linear_regression,
                                  n_folds, learning_rate, n_epoch)
print("Our algo's score is %s" % algo_score)
print("The mean of our algo's RMSE is %.3f" % (sum(algo_score) / float(len(algo_score))))

纯Python构建的经典算法实战Kaggle真实项目:车险赔付建模预测

from csv import reader
from math import sqrt
from random import randrange, seed
# 1.Load our csv data
# 1.读取csv文件
def load_csv(data_file):
    """Read a CSV file into a list of rows (lists of strings), skipping blank rows."""
    with open(data_file, 'r') as handle:
        return [row for row in reader(handle) if row]
# print(load_csv('insurance.csv'))
# 发现数据均为字符串,我们接下来对字符串进行转化
# 最终要转换成float
# 2.数据类型转换
def string_converter(data_set, column):
    """In place, cast the given column of every row from string to float.

    strip() removes leading/trailing whitespace first, because fields may
    arrive padded, e.g. " 99.9 ".
    """
    for entry in data_set:
        entry[column] = float(entry[column].strip())
# 模型预测的准确性基本判断方法RMSE(衡量模型的标尺)
# 3.root mean squared error
def calculate_RMSE(actual_data, predicted_data):
    """Root mean squared error between actual and predicted value lists."""
    count = float(len(actual_data))
    total = 0.0
    for actual, predicted in zip(actual_data, predicted_data):
        total += (predicted - actual) ** 2
    return sqrt(total / count)
# 4.测试集、训练集切分train/test split
def train_test_split(data_set, split):
    """Randomly split `data_set` into (train, test).

    `split` is the fraction of rows that go to the training set; the
    un-drawn remainder of the copy becomes the test set.
    """
    remaining = list(data_set)
    target_size = split * len(data_set)
    train = []
    while len(train) < target_size:
        train.append(remaining.pop(randrange(len(remaining))))
    return train, remaining
# 5.模型到底如何(train/test split)通过在训练集,测试集切分后,用RMSE进行衡量模型好坏
def how_good_is_our_algo(data_set, algo, split, *args):
    """Train/test-split evaluation: fit `algo` on the train part and return
    the RMSE of its predictions on the held-out part.

    The target (last) column of each test row is blanked with None before
    prediction so the algorithm cannot peek at the answer.
    """
    train, test = train_test_split(data_set, split)
    blinded = []
    for row in test:
        row_copy = list(row)
        row_copy[-1] = None
        blinded.append(row_copy)
    predicted = algo(train, blinded, *args)
    actual = [row[-1] for row in test]
    return calculate_RMSE(actual, predicted)
# 6.为了实现简单线形回归的小玩意儿
def mean(values):
    """Arithmetic mean of a non-empty list of numbers."""
    total = sum(values)
    return total / float(len(values))
def covariance(x, the_mean_of_x, y, the_mean_of_y):
    """Sum of products of deviations of x and y from their means
    (covariance without the 1/n normalization)."""
    paired_deviations = ((xi - the_mean_of_x) * (yi - the_mean_of_y) for xi, yi in zip(x, y))
    return sum(paired_deviations)
def variance(values, mean):
    """Sum of squared deviations of `values` from `mean` (no 1/n factor).

    The parameter deliberately shares the name of the mean() helper above
    and shadows it inside this function.
    """
    squared_deviations = ((value - mean) ** 2 for value in values)
    return sum(squared_deviations)
def coefficients(data_set):
    """Ordinary-least-squares fit for y = b0 + b1 * x.

    Each row of `data_set` is an [x, y] pair; returns [b0, b1]
    (intercept first, then slope).
    """
    xs = [row[0] for row in data_set]
    ys = [row[1] for row in data_set]
    x_bar = mean(xs)
    y_bar = mean(ys)
    # Slope = Sxy / Sxx; intercept forces the line through (x_bar, y_bar).
    b1 = covariance(xs, x_bar, ys, y_bar) / variance(xs, x_bar)
    b0 = y_bar - b1 * x_bar
    return [b0, b1]
# 7.这里写简单线性回归的具体预测
def using_simple_linear_regression(train, test):
    """Fit simple linear regression on `train` and predict y from row[0]
    for each row of `test`; returns the list of predictions."""
    b0, b1 = coefficients(train)
    return [b0 + b1 * row[0] for row in test]
# 8.带入真实数据
# 可调项,避免"魔法数字"
# Driver: evaluate simple linear regression on the real insurance data.
# Tunables are named variables to avoid "magic numbers".
seed(4)
split = 0.6
# Load the data. The original used the author's absolute Windows path
# ('F:/py/算法学习/.../insurance.csv'), which breaks on any other machine;
# a relative path expects the CSV next to this script.
data_set = load_csv('insurance.csv')
# All CSV fields are strings; convert every column to float.
for i in range(len(data_set[0])):
    string_converter(data_set, i)
rmse = how_good_is_our_algo(data_set, using_simple_linear_regression, split)
print("RMSE of our algo is : %.3f" % (rmse))

逻辑回归
纯Python构建的经典算法实战Kaggle真实项目:糖尿病建模预测

from random import seed
from random import randrange
from csv import reader
from math import exp
# 1.Load our data using csv reader
def load_data_from_csv_file(file_name):
    """Read a CSV file into a list of rows (lists of strings); blank rows are dropped."""
    with open(file_name, 'r') as file:
        return [row for row in reader(file) if row]
# print(load_data_from_csv_file('diabetes.csv'))
# 2. convert string in list of lists to float(data type change)
def change_string_to_float(dataset, column):
    """In place, strip surrounding whitespace and cast the given column of
    every row to float."""
    for record in dataset:
        record[column] = float(record[column].strip())
# ret_data = load_data_from_csv_file('diabetes.csv')
#
# change_string_to_float(ret_data, 1)
#
# print(ret_data)
# 3.Find the min and max value of our data
def find_the_min_and_max_of_our_data(dataset):
    """Return a list of [min, max] pairs, one per column of the dataset."""
    return [[min(column), max(column)] for column in zip(*dataset)]
# 4.rescale our data so it fits to range 0 ~ 1
def rescale_our_data(dataset, min_max_list):
    """Min-max scale every value of `dataset` into [0, 1], in place.

    `min_max_list` is [[min, max], ...] per column, as produced by
    find_the_min_and_max_of_our_data. Constant columns (min == max) are
    mapped to 0.0 — the original code raised ZeroDivisionError on them.
    """
    for row in dataset:
        for i in range(len(row)):
            low, high = min_max_list[i]
            span = high - low
            # Guard against constant columns (span == 0).
            row[i] = (row[i] - low) / span if span else 0.0
# 5.k fold train and test split
def k_fold_cross_validation(dataset, how_many_fold_do_you_want):
    """Randomly split `dataset` into k folds of equal (floor) size.

    Draws without replacement from a copy so the caller's list is left
    untouched; leftover rows (len(dataset) % k) are discarded.
    """
    pool = list(dataset)
    fold_size = int(len(dataset) / how_many_fold_do_you_want)
    folds = []
    for _ in range(how_many_fold_do_you_want):
        current = []
        while len(current) < fold_size:
            current.append(pool.pop(randrange(len(pool))))
        folds.append(current)
    return folds
# 6.Calculate the accuracy of our model
def calculate_the_accuracy_of_our_model(actual_data, predicted_data):
    """Percentage (0-100) of positions where the prediction equals the actual value."""
    correct = sum(1 for actual, predicted in zip(actual_data, predicted_data)
                  if actual == predicted)
    return correct / float(len(actual_data)) * 100.0
# 7. how good is our algo ?
def how_good_is_our_algo(dataset, algo, how_many_fold_do_you_want, *args):
folds = k_fold_cross_validation(dataset, how_many_fold_do_you_want)
scores = list()
for fold in folds:
training_data_set = list(folds)
training_data_set.remove(fold)
training_data_set = sum(training_data_set, [])
testing_data_set = list()
# 保险操作,去除真实数据,避免影响模型的学习结果
for row in fold:
row_copy = list(row)
testing_data_set.append(row_copy)
row_copy[-1] = None
predicted

本文档详细介绍了使用纯Python实现的各种机器学习算法,并通过Kaggle数据集进行实战应用,包括线性回归、逻辑回归、感知器、决策树、朴素贝叶斯、KNN、LVQ学习向量和神经网络等,覆盖了红酒质量分析、糖尿病预测、自动验钞等多个项目。
（注：原文在第三个 how_good_is_our_algo 函数中途被截断，其余内容位于 CSDN 付费墙之后，未收录。）





