import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from data.RandimForestFunctions import train_model, save_model
def main(dataset_path, saved_model_path):
    # Train the model
    # model, score_r, mean_score, Xtest, Ytest = train_model(dataset_path)
    model, score_r, train_accuracy, class_accuracy, mean_score, Xtest, Ytest = train_model(dataset_path)
    # Save the trained model
    save_model(model, saved_model_path)
    # Evaluate the model on the held-out test set
    Ypred = model.predict(Xtest)
    accuracy = accuracy_score(Ytest, Ypred)
    # For multi-class labels, precision/recall/F1 need an explicit
    # averaging strategy, e.g.:
    # precision = precision_score(Ytest, Ypred, average='macro')
    # recall = recall_score(Ytest, Ypred, average='macro')
    # f1 = f1_score(Ytest, Ypred, average='macro')
    # Print the training scores and the save path
    print("Random Forest Score: {}".format(score_r))
    print("Ridge Mean Score: {}".format(mean_score))
    print("Model saved at: {}".format(saved_model_path))
    print("Accuracy: {}".format(accuracy))
    print("Training Accuracy: {}".format(train_accuracy))
    print("Class Accuracy: {}".format(class_accuracy))
    print("Done!")
    # print("Precision: {}".format(precision))
    # print("Recall: {}".format(recall))
    # print("F1 Score: {}".format(f1))


if __name__ == '__main__':
    # Absolute path of the directory containing this script
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Path to the dataset file
    dataset_path = os.path.join(current_dir, "data", "TRAISN.csv")
    # Path where the trained model will be saved
    saved_model_path = os.path.join(current_dir, "train_model.m")
    # Call main with the dataset path and the model save path
    main(dataset_path, saved_model_path)
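# Reloading the saved model later only needs joblib. This standalone snippet
# is a minimal sketch rather than part of the original script: new_samples.csv
# is a hypothetical file, and it assumes the new feature matrix was
# preprocessed exactly as in training (same MinMax scaling and the same 50
# columns kept by SelectKBest; save_model persists only the classifier).
import joblib
import numpy as np

model = joblib.load("train_model.m")
new_features = np.loadtxt("new_samples.csv", delimiter=",")
predictions = model.predict(new_features)
print(predictions)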
封装.py
import numpy as np
import joblib
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import (MinMaxScaler, StandardScaler, RobustScaler,
                                   MaxAbsScaler, PowerTransformer,
                                   QuantileTransformer, OneHotEncoder,
                                   LabelEncoder)
def get_datasets(dataset, way=None):
    # Preprocessing: several scalers were tried by hand; the accuracies
    # noted below were measured on this dataset.
    # scaler = StandardScaler()
    scaler = MinMaxScaler()  # accuracy 0.45
    # scaler = RobustScaler()  # accuracy 0.31
    # Scales with the median and interquartile range instead of mean and
    # variance; preserves the distribution better when outliers are present.
    # scaler = MaxAbsScaler()  # accuracy 0.41
    # Scales each feature by its maximum absolute value.
    # scaler = PowerTransformer(method='box-cox')
    # Power transform mapping the data to an (approximately) normal
    # distribution; common choices are Box-Cox and Yeo-Johnson.
    # Box-Cox requires strictly positive data.
    # scaler = QuantileTransformer(output_distribution='uniform')  # accuracy 0.31
    # Maps the data onto a uniform (or Gaussian) distribution via quantiles,
    # reducing the influence of outliers.
    # encoder = OneHotEncoder()
    # encoded_data = encoder.fit_transform(dataset)
    # One-Hot encoding turns each categorical feature into binary indicator columns.
    # encoder = LabelEncoder()
    # encoded_labels = encoder.fit_transform(labels)
    # LabelEncoder maps categorical labels to integers.
    # Replace empty strings ("") with NaN
    dataset[dataset == ""] = np.nan
    # Replace NaN with 0
    dataset = np.nan_to_num(dataset.astype(float))
    normalized_data = scaler.fit_transform(dataset)
    return normalized_data
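# The scaler choices above were compared by re-running training with a
# different scaler each time and noting the accuracy. A small helper can
# automate that comparison. This is an illustrative sketch, not part of the
# original module: compare_scalers is a hypothetical name, and it assumes
# `features` is the cleaned (pre-scaling) matrix and `labels` the label list
# built in train_model below.
def compare_scalers(features, labels):
    from sklearn.pipeline import make_pipeline
    candidates = {
        "MinMaxScaler": MinMaxScaler(),
        "RobustScaler": RobustScaler(),
        "MaxAbsScaler": MaxAbsScaler(),
        "QuantileTransformer": QuantileTransformer(output_distribution='uniform'),
    }
    for name, scaler in candidates.items():
        # A pipeline keeps scaling inside each CV fold, avoiding leakage
        pipe = make_pipeline(scaler, RandomForestClassifier(random_state=60))
        scores = cross_val_score(pipe, features, labels, cv=5)
        print("{}: mean CV accuracy {:.3f}".format(name, scores.mean()))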
def train_model(dataset_path):
    root = dataset_path
    # raw_datasets = np.genfromtxt(root, dtype=str, delimiter=",")[1:, 1:][:1000]
    # raw_datasets = np.genfromtxt(root, dtype=str, delimiter=",")[1:].reshape(-1, 1)
    # Skip the header row and the first (index) column; keep the first 1000 rows
    raw_datasets = np.loadtxt(root, dtype=str, delimiter=",")[1:, 1:][:1000]
    datasets = get_datasets(raw_datasets[:, :-1])
    # labels = [int(one_data[-1]) for one_data in raw_datasets]
    # If the label is a non-empty string, convert it to an integer;
    # if it is an empty string, map it to -1.
    labels = [int(one_data[-1]) if one_data[-1] != '' else -1 for one_data in raw_datasets]
    # Feature selection: accuracy was 0.45 before adding it and 0.41 after
    selector = SelectKBest(score_func=f_classif, k=50)
    datasets_selected = selector.fit_transform(datasets, labels)
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(datasets_selected, labels, test_size=0.3)
    rfc = RandomForestClassifier(random_state=60)
    rfc = rfc.fit(Xtrain, Ytrain)
    score_r = rfc.score(Xtest, Ytest)
    # Training-set accuracy
    train_accuracy = rfc.score(Xtrain, Ytrain)
    # Overall accuracy on the test set
    Ypred = rfc.predict(Xtest)
    class_accuracy = accuracy_score(Ytest, Ypred)
    # Ridge regression with cross-validation as a regularized baseline
    ridge = Ridge(alpha=0.5)
    cv_scores = cross_val_score(ridge, datasets, labels, cv=5)
    mean_score = cv_scores.mean()
    return rfc, score_r, train_accuracy, class_accuracy, mean_score, Xtest, Ytest
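# Note: cross_val_score with a Ridge regressor reports R^2, so mean_score is
# not directly comparable with the classifier's accuracy. An alternative the
# original code does not use, shown only as a sketch reusing the imports
# above, is to cross-validate the random forest itself:
def cross_validate_forest(features, labels, folds=5):
    # k-fold classification accuracy of the forest, comparable with score_r
    rfc = RandomForestClassifier(random_state=60)
    scores = cross_val_score(rfc, features, labels, cv=folds)
    return scores.mean(), scores.std()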
def save_model(model, save_path):
    # Persist the trained model with joblib
    joblib.dump(model, save_path)
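# A matching loader is not part of the original module; this hypothetical
# counterpart of save_model is added as a sketch to keep the persistence
# API symmetric.
def load_model(save_path):
    # Restore a model previously saved with joblib.dump
    return joblib.load(save_path)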
Although the pipeline is simple, after validation and data preprocessing it reaches over 80% accuracy on standard datasets.