import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns
# Configure fonts (SimHei listed first so CJK glyphs render; Arial as fallback)
sns.set_style('whitegrid', {'font.sans-serif': ['SimHei', 'Arial']})
# Build the dataset
data = {
    'ID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
    'x1': [59, 36, 61, 58, 55, 61, 38, 42, 50, 58, 68, 25, 52, 31, 36, 42],
    'x2': [2, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 2, 1, 1, 3, 1],
    'x3': [43.4, 57.2, 190, 128, 80, 94.4, 76, 240, 74, 68.6, 132.8, 94.6, 56, 47.8, 31.6, 66.2],
    'x4': [2, 1, 2, 4, 3, 2, 1, 3, 1, 2, 4, 4, 1, 2, 3, 2],
    'x5': [1, 1, 1, 3, 4, 1, 1, 2, 1, 2, 2, 3, 1, 1, 1, 1],
    'y': [0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0]
}
df = pd.DataFrame(data)
# Prepare features and target variable
X = df[['x1', 'x2', 'x3', 'x4', 'x5']]
y = df['y']
# With so few samples, use leave-one-out cross-validation
def leave_one_out_cv(X, y, normalized=False):
    accuracies = []
    for i in range(len(X)):
        # Training set is everything except sample i; sample i is the test set
        X_train = X.drop(i).values
        y_train = y.drop(i).values
        X_test = X.iloc[i].values.reshape(1, -1)
        y_test = y.iloc[i]
        if normalized:
            # Standardize (z-score) the features; fit the scaler on the
            # training fold only, so no information leaks from the held-out sample
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
        # Train the logistic regression model
        model = LogisticRegression(random_state=42, max_iter=1000)
        model.fit(X_train, y_train)
        # Predict the held-out sample and record the accuracy
        y_pred = model.predict(X_test)
        accuracy = accuracy_score([y_test], y_pred)
        accuracies.append(accuracy)
    return np.mean(accuracies)
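For reference, scikit-learn ships the same protocol built in. A minimal sketch using LeaveOneOut with cross_val_score (assuming the same X and y as above; the Pipeline re-fits the scaler inside each fold, matching the hand-rolled loop):

from sklearn.model_selection import LeaveOneOut, cross_val_score
from sklearn.pipeline import make_pipeline

# Equivalent LOOCV in a few lines; default scoring for a classifier is accuracy
loo_pipe = make_pipeline(StandardScaler(),
                         LogisticRegression(random_state=42, max_iter=1000))
loo_scores = cross_val_score(loo_pipe, X, y, cv=LeaveOneOut())
print(f"LOOCV accuracy (normalized, pipeline): {loo_scores.mean():.4f}")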
# Compute accuracy with and without normalization
accuracy_unnormalized = leave_one_out_cv(X, y, normalized=False)
accuracy_normalized = leave_one_out_cv(X, y, normalized=True)
print(f"Unnormalized model accuracy: {accuracy_unnormalized:.4f}")
print(f"Normalized model accuracy: {accuracy_normalized:.4f}")
# Bar chart comparing the two results
categories = ['Unnormalized', 'Normalized']
accuracies = [accuracy_unnormalized, accuracy_normalized]
plt.figure(figsize=(10, 6))
bars = plt.bar(categories, accuracies, color=['lightblue', 'lightcoral'], alpha=0.8)
# Annotate each bar with its value
for bar, accuracy in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
             f'{accuracy:.4f}', ha='center', va='bottom', fontsize=12)
plt.ylabel('Prediction accuracy', fontsize=12)
plt.title('Logistic regression accuracy: normalized vs. unnormalized', fontsize=14)
plt.ylim(0, 1.0)
plt.grid(axis='y', alpha=0.3)
# Add a summary text box
plt.text(0.5, 0.1,
         f'Summary:\n• Unnormalized accuracy: {accuracy_unnormalized:.1%}\n• Normalized accuracy: {accuracy_normalized:.1%}',
         transform=plt.gca().transAxes, fontsize=11, bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow"))
plt.tight_layout()
plt.show()
# Print basic dataset information
print("\nDataset summary:")
print(f"Number of samples: {len(df)}")
print(f"Number of features: {X.shape[1]}")
print(f"Positive samples (y=1): {sum(y)}")
print(f"Negative samples (y=0): {len(y) - sum(y)}")
print(f"Positive sample ratio: {sum(y) / len(y):.2%}")
# Print feature statistics
print("\nFeature statistics:")
print(X.describe())
Run output:
Unnormalized model accuracy: 0.8125
Normalized model accuracy: 0.8125
D:\python\pythonProject1\Test\机器学习原理与应用.py:347: UserWarning: Glyph 8226 (\N{BULLET}) missing from font(s) SimHei.
plt.tight_layout()
C:\Python311\Lib\tkinter\__init__.py:861: UserWarning: Glyph 8226 (\N{BULLET}) missing from font(s) SimHei.
func(*args)
Dataset summary:
Number of samples: 16
Number of features: 5
Positive samples (y=1): 5
Negative samples (y=0): 11
Positive sample ratio: 31.25%
Feature statistics:
x1 x2 x3 x4 x5
count 16.00000 16.000000 16.000000 16.000000 16.000000
mean 48.25000 1.812500 92.537500 2.312500 1.625000
std 12.66228 0.910586 55.907386 1.078193 0.957427
min 25.00000 1.000000 31.600000 1.000000 1.000000
25% 37.50000 1.000000 56.900000 1.750000 1.000000
50% 51.00000 1.500000 75.000000 2.000000 1.000000
75% 58.25000 3.000000 102.950000 3.000000 2.000000
max 68.00000 3.000000 240.000000 4.000000 4.000000
Here the normalized model's accuracy did not improve. Please provide complete, working code that does not use leave-one-out cross-validation and that lets the normalized model score higher than the unnormalized one.
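A hedged sketch of one possible direction, not a guaranteed fix: under LOOCV the lbfgs solver converges on both raw and standardized features with max_iter=1000, so scaling mainly changes how the L2 penalty weighs each feature, and with only 16 samples the two models often make identical held-out predictions. Replacing LOOCV with a single stratified train/test split (the scaler kept inside a Pipeline so it is fit on the training portion only) can separate the two; test_size=0.25 and random_state=42 below are illustrative assumptions, and other seeds may reverse the comparison.

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Stratified split so both classes appear in train and test (4 test samples here)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42)

# Unnormalized baseline
base = LogisticRegression(random_state=42, max_iter=1000)
base.fit(X_train, y_train)
acc_raw = accuracy_score(y_test, base.predict(X_test))

# Normalized: the scaler is fit on the training split only, inside the pipeline
pipe = make_pipeline(StandardScaler(),
                     LogisticRegression(random_state=42, max_iter=1000))
pipe.fit(X_train, y_train)
acc_scaled = accuracy_score(y_test, pipe.predict(X_test))

print(f"Unnormalized accuracy: {acc_raw:.4f}")
print(f"Normalized accuracy:   {acc_scaled:.4f}")

If a single 4-sample test set is too noisy, averaging over many splits with RepeatedStratifiedKFold and cross_val_score gives a more stable comparison between the two setups.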