# Backend file 1 of 2: new_algorithm.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import clone
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import os
import re
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import seaborn as sns
# 添加数据插补器
from sklearn.impute import SimpleImputer
def check_chinese_font_support():
    """Check whether the system has a usable Chinese (CJK) font installed.

    Returns:
        bool: True if one of the known CJK fonts is installed (and it is
        selected via ``plt.rcParams``); False otherwise (falls back to
        Latin fonts so labels will be rendered in English).

    Note: assigning a missing font name to ``plt.rcParams["font.family"]``
    does NOT raise, so a try/except probe cannot detect missing fonts —
    instead we look the candidates up in matplotlib's font registry.
    """
    from matplotlib import font_manager  # local import: only needed here

    chinese_fonts = ['SimHei', 'WenQuanYi Micro Hei', 'Heiti TC', 'Microsoft YaHei', 'SimSun']
    installed = {f.name for f in font_manager.fontManager.ttflist}
    for font in chinese_fonts:
        if font in installed:
            plt.rcParams["font.family"] = font
            print(f"系统支持中文字体: {font}")
            return True
    print("系统不支持中文字体,将使用英文标签")
    plt.rcParams["font.family"] = ['Arial', 'sans-serif']
    return False
class GasSensorDataAnalyzer:
    """Loading and preprocessing of harmful-gas classification data.

    Maintains label maps for single gases, gas mixtures and sensors, plus a
    dynamically grown "sensor_gas_concentration" multi-dimension label map.
    """

    def __init__(self):
        # Base single-gas labels.
        self.gas_labels = {
            'acetone': 0,
            'toluene': 1,
            'methanol': 2,
            'formaldehyde': 3,
            'ethanol': 4
        }
        # Mixture labels (each mixture gets a unique numeric id).
        self.mixture_labels = self._generate_mixture_labels()
        # All gas labels combined.
        self.all_gas_labels = {**self.gas_labels, **self.mixture_labels}
        # Chinese/English display names for gases.
        self.gas_names = {
            'acetone': {'cn': '丙酮', 'en': 'Acetone'},
            'toluene': {'cn': '甲苯', 'en': 'Toluene'},
            'methanol': {'cn': '甲醇', 'en': 'Methanol'},
            'formaldehyde': {'cn': '甲醛', 'en': 'Formaldehyde'},
            'ethanol': {'cn': '乙醇', 'en': 'Ethanol'},
            'toluene+formaldehyde': {'cn': '甲苯+甲醛', 'en': 'Toluene+Formaldehyde'},
            'methanol+toluene+formaldehyde': {'cn': '甲醇+甲苯+甲醛', 'en': 'Methanol+Toluene+Formaldehyde'}
            # Add more mixture name mappings here as needed.
        }
        # Sensor type ids.
        self.sensor_types = {
            'MP2': 0,
            'MP3B': 1,
            'MP503': 2,
            'MP801': 3,
            'MQ2': 4,
            'MQ7B': 5
        }
        # Dynamically grown map: "sensor_gas_Nppm" key -> label id.
        self.multi_dimension_labels = {}
        self.next_label_id = 0
        # Chinese/English display names for sensors (identical in both).
        self.sensor_names = {
            'MP2': {'cn': 'MP2', 'en': 'MP2'},
            'MP3B': {'cn': 'MP3B', 'en': 'MP3B'},
            'MP503': {'cn': 'MP503', 'en': 'MP503'},
            'MP801': {'cn': 'MP801', 'en': 'MP801'},
            'MQ2': {'cn': 'MQ2', 'en': 'MQ2'},
            'MQ7B': {'cn': 'MQ7B', 'en': 'MQ7B'}
        }

    def _generate_mixture_labels(self):
        """Generate the label map for gas mixtures.

        Labels continue numbering after the single-gas labels.
        """
        mixtures = [
            'toluene+formaldehyde',
            'methanol+toluene+formaldehyde'
            # Add more mixture combinations here as needed.
        ]
        next_label = max(self.gas_labels.values()) + 1
        return {mixture: next_label + i for i, mixture in enumerate(mixtures)}

    def get_or_create_multi_dimension_label(self, sensor_type, gas_type, concentration):
        """Get or create a multi-dimension class label.

        Args:
            sensor_type: sensor model (e.g. 'MP2').
            gas_type: gas type key (e.g. 'acetone').
            concentration: concentration value in ppm.

        Returns:
            tuple: (label_id, {'cn': ..., 'en': ...}) — the numeric label id
            and its localized display names.
        """
        key = f"{sensor_type}_{gas_type}_{concentration}ppm"
        # Allocate a new id the first time this combination is seen.
        if key not in self.multi_dimension_labels:
            self.multi_dimension_labels[key] = self.next_label_id
            self.next_label_id += 1
        label_id = self.multi_dimension_labels[key]
        # Build localized display names, falling back to the raw keys.
        sensor_name_cn = self.sensor_names.get(sensor_type, {}).get('cn', sensor_type)
        sensor_name_en = self.sensor_names.get(sensor_type, {}).get('en', sensor_type)
        gas_name_cn = self.gas_names.get(gas_type, {}).get('cn', gas_type)
        gas_name_en = self.gas_names.get(gas_type, {}).get('en', gas_type)
        label_name_cn = f"{sensor_name_cn}_{gas_name_cn}_{concentration}ppm"
        label_name_en = f"{sensor_name_en}_{gas_name_en}_{concentration}ppm"
        return label_id, {
            'cn': label_name_cn,
            'en': label_name_en
        }

    def load_single_gas_data(self, file_path, gas_type, concentration, sensor_type):
        """Load a single-gas dataset from an Excel file.

        Args:
            file_path: path to the Excel file ('Sheet1' with an index column).
            gas_type: gas type key (e.g. 'acetone', 'toluene').
            concentration: concentration value (e.g. 20, 30, 50).
            sensor_type: sensor model (e.g. 'MP2', 'MP801').

        Returns:
            tuple: (X, y) feature matrix and label vector, or (None, None)
            on any error.
        """
        try:
            if not os.path.exists(file_path):
                raise FileNotFoundError(f"文件不存在: {file_path}")
            df = pd.read_excel(file_path, sheet_name='Sheet1', index_col=0)
            X = df.values
            try:
                X = X.astype(float)
            except (ValueError, TypeError):
                print("警告: 数据中包含非数值类型,将过滤掉非数值类型的数据")
                # Coerce cell-by-cell: non-numeric entries become NaN, which
                # preserves the column structure. (The previous boolean-mask
                # + reshape approach flattened the array and could raise.)
                X = df.apply(pd.to_numeric, errors='coerce').values.astype(float)
            # Report (but do not silently alter) NaN values.
            nan_count = np.isnan(X).sum()
            if nan_count > 0:
                print(f"警告: 数据中包含 {nan_count} 个NaN值")
                # Optional: replace NaN with 0
                # X = np.nan_to_num(X, nan=0.0)
            # Every sample in the file gets the same multi-dimension label.
            label_id, label_name = self.get_or_create_multi_dimension_label(
                sensor_type, gas_type, concentration
            )
            y = np.full(len(X), label_id, dtype=int)
            print(f"已加载 {label_name['cn']} 数据: {len(X)} 样本, 特征维度: {X.shape[1]}")
            return X, y
        except Exception as e:
            print(f"加载数据时出错: {e}")
            return None, None

    def load_multiple_gas_data(self, file_paths, gas_types, concentrations, sensor_types):
        """Load several gas datasets and merge them into one (X, y) pair.

        Args:
            file_paths: list of Excel file paths.
            gas_types: list of gas type keys, aligned with file_paths.
            concentrations: list of concentration values, aligned.
            sensor_types: list of sensor models, aligned.

        Returns:
            tuple: merged (X, y), or (None, None) if nothing valid loaded.
            Datasets whose feature dimension disagrees with the majority are
            dropped with a warning.
        """
        X_all = []
        y_all = []
        feature_dimensions = []  # feature dimension of each loaded dataset
        for file_path, gas_type, concentration, sensor_type in zip(
            file_paths, gas_types, concentrations, sensor_types
        ):
            X, y = self.load_single_gas_data(file_path, gas_type, concentration, sensor_type)
            if X is not None and len(X) > 0:
                X_all.append(X)
                y_all.append(y)
                feature_dimensions.append(X.shape[1])
        if not X_all:
            print("没有加载到有效数据")
            return None, None
        # All datasets must share one feature dimension to be stackable.
        unique_dimensions = np.unique(feature_dimensions)
        if len(unique_dimensions) > 1:
            print(f"警告: 检测到不同的特征维度: {unique_dimensions}")
            print("这可能导致合并数据时出错。请检查您的Excel文件是否具有相同的列数。")
            # Keep only datasets that match the most common dimension.
            from collections import Counter
            dimension_counts = Counter(feature_dimensions)
            most_common_dimension = dimension_counts.most_common(1)[0][0]
            print(f"最常见的特征维度是: {most_common_dimension}")
            filtered_X_all = []
            filtered_y_all = []
            for i, X in enumerate(X_all):
                if X.shape[1] == most_common_dimension:
                    filtered_X_all.append(X)
                    filtered_y_all.append(y_all[i])
                else:
                    print(f"忽略特征维度不匹配的数据集: {file_paths[i]} (维度: {X.shape[1]})")
            if not filtered_X_all:
                print("没有找到特征维度匹配的数据集")
                return None, None
            X_all = filtered_X_all
            y_all = filtered_y_all
        # Stack all datasets into one matrix / label vector.
        X_combined = np.vstack(X_all)
        y_combined = np.concatenate(y_all)
        # Report NaN statistics on the merged data.
        total_nan = np.isnan(X_combined).sum()
        if total_nan > 0:
            print(f"警告: 合并后的数据中包含 {total_nan} 个NaN值,占比: {total_nan/(X_combined.size):.4f}")
            print(f"NaN值在样本中的分布: {np.isnan(X_combined).any(axis=1).sum()} 个样本包含NaN值")
            print(f"NaN值在特征中的分布: {np.isnan(X_combined).any(axis=0).sum()} 个特征包含NaN值")
        print(f"合并后的数据: {len(X_combined)} 样本,{len(np.unique(y_combined))} 个类别,特征维度: {X_combined.shape[1]}")
        return X_combined, y_combined

    def load_dataset(self, file_path, gas_type, concentration, sensor_type):
        """Convenience wrapper: load a single dataset."""
        return self.load_single_gas_data(file_path, gas_type, concentration, sensor_type)
class AlgorithmSelector:
    """Selection, training and comparison of multiple classifiers."""

    def __init__(self, use_chinese=True):
        # Localized algorithm display names.
        self.algorithm_names = {
            'knn': {'cn': 'K-近邻算法', 'en': 'K-Nearest Neighbors'},
            'svm': {'cn': '支持向量机', 'en': 'Support Vector Machine'},
            'random_forest': {'cn': '随机森林', 'en': 'Random Forest'},
            'decision_tree': {'cn': '决策树', 'en': 'Decision Tree'},
            'neural_network': {'cn': '神经网络', 'en': 'Neural Network'}
        }
        # Algorithm configuration. Parameter names are stored WITHOUT the
        # Pipeline step prefix; train_models() adds "<step>__" as needed, so
        # prefixed and unprefixed keys both work.
        self.algorithms = {
            'knn': {
                'model': KNeighborsClassifier(),
                'params': {'n_neighbors': 5, 'metric': 'euclidean'}
            },
            'svm': {
                'model': SVC(),
                'params': {'kernel': 'rbf', 'C': 1.0, 'probability': True}
            },
            'random_forest': {
                'model': RandomForestClassifier(),
                'params': {'n_estimators': 100, 'random_state': 42}
            },
            'decision_tree': {
                'model': DecisionTreeClassifier(),
                'params': {'max_depth': None, 'random_state': 42}
            },
            'neural_network': {
                'model': MLPClassifier(),
                'params': {
                    'hidden_layer_sizes': (100, 50),
                    'max_iter': 500,
                    'random_state': 42}
            }
        }
        # Whether each algorithm benefits from feature standardization.
        self.needs_scaling = {
            'knn': True,
            'svm': True,
            'random_forest': False,
            'decision_tree': False,
            'neural_network': True
        }
        # Whether to render Chinese labels in plots/reports.
        self.use_chinese = use_chinese

    def set_algorithm_params(self, algorithm_name, params):
        """Override the parameters of one algorithm.

        Args:
            algorithm_name: one of the keys of self.algorithms.
            params: dict of estimator parameters (no Pipeline prefix needed).

        Raises:
            ValueError: if algorithm_name is unknown.
        """
        if algorithm_name in self.algorithms:
            # Store with the Pipeline prefix; train_models() also accepts
            # unprefixed names, so either form is fine.
            formatted_params = {f"{algorithm_name}__{k}": v for k, v in params.items()}
            self.algorithms[algorithm_name]['params'] = formatted_params
        else:
            raise ValueError(f"不支持的算法: {algorithm_name}")

    def train_models(self, X, y, test_size=0.2, random_state=42):
        """Train every configured algorithm on (X, y).

        Args:
            X: feature matrix.
            y: label vector.
            test_size: fraction held out for testing.
            random_state: seed for the train/test split.

        Returns:
            dict: per-algorithm results (model, accuracies, predictions),
            or an 'error' entry when training failed / was skipped.
        """
        unique_classes = np.unique(y)
        num_classes = len(unique_classes)
        if num_classes < 2:
            print(f"警告: 数据集中只有 {num_classes} 个类别,某些算法可能无法训练")
            print(f"单一类别值: {unique_classes[0]}")
            # SVM requires at least two classes — skip it.
            algorithms_to_train = [name for name in self.algorithms if name != 'svm']
            print(f"由于单类别数据,将跳过 SVM 算法,仅训练: {', '.join([self.algorithm_names[name]['cn'] for name in algorithms_to_train])}")
            # Cannot stratify a single-class split.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state
            )
            is_single_class_data = True
        else:
            # Stratified split keeps class proportions in train and test.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, random_state=random_state, stratify=y
            )
            algorithms_to_train = list(self.algorithms.keys())
            is_single_class_data = False
        # Report NaN statistics (informational only; the Pipeline imputes).
        try:
            X_train_numeric = X_train.astype(float)
            X_test_numeric = X_test.astype(float)
            train_nan = np.isnan(X_train_numeric).sum()
            if train_nan > 0:
                print(f"警告: 训练数据中包含 {train_nan} 个NaN值,占比: {train_nan/(X_train_numeric.size):.4f}")
                print(f"NaN值在训练样本中的分布: {np.isnan(X_train_numeric).any(axis=1).sum()} 个样本包含NaN值")
                print(f"NaN值在训练特征中的分布: {np.isnan(X_train_numeric).any(axis=0).sum()} 个特征包含NaN值")
            test_nan = np.isnan(X_test_numeric).sum()
            if test_nan > 0:
                print(f"警告: 测试数据中包含 {test_nan} 个NaN值,占比: {test_nan/(X_test_numeric.size):.4f}")
                print(f"NaN值在测试样本中的分布: {np.isnan(X_test_numeric).any(axis=1).sum()} 个样本包含NaN值")
                print(f"NaN值在测试特征中的分布: {np.isnan(X_test_numeric).any(axis=0).sum()} 个特征包含NaN值")
        except ValueError as e:
            print(f"警告: 无法将数据转换为数值类型,跳过NaN值检查: {e}")
        results = {}
        for name in algorithms_to_train:
            algo = self.algorithms[name]
            algo_name = self.algorithm_names[name]['cn'] if self.use_chinese else self.algorithm_names[name]['en']
            try:
                print(f"\n训练 {algo_name}...")
                # Build the pipeline: impute -> (optional scale) -> estimator.
                steps = [('imputer', SimpleImputer(strategy='mean'))]
                if self.needs_scaling[name]:
                    steps.append(('scaler', StandardScaler()))
                steps.append((name, clone(algo['model'])))
                model = Pipeline(steps)
                # Normalise parameter keys to the Pipeline form
                # "<step>__<param>". Previously, unprefixed defaults (e.g.
                # svm's) crashed Pipeline.set_params, and decision_tree/
                # random_forest were special-cased with raw setattr that
                # broke once set_algorithm_params added prefixes.
                formatted = {
                    (k if '__' in k else f"{name}__{k}"): v
                    for k, v in algo['params'].items()
                }
                model.set_params(**formatted)
                model.fit(X_train, y_train)
                # Evaluate on both splits.
                train_accuracy = model.score(X_train, y_train)
                test_accuracy = model.score(X_test, y_test)
                y_pred = model.predict(X_test)
                print(f"训练集准确率: {train_accuracy:.4f}")
                print(f"测试集准确率: {test_accuracy:.4f}")
                print("分类报告:")
                print(classification_report(y_test, y_pred))
                results[name] = {
                    'name': algo_name,
                    'model': model,
                    'train_accuracy': train_accuracy,
                    'test_accuracy': test_accuracy,
                    'y_pred': y_pred,
                    'X_test': X_test,
                    'y_test': y_test,
                    'unique_labels': np.unique(y_test),
                    'is_single_class': is_single_class_data
                }
            except Exception as e:
                print(f"训练 {algo_name} 时发生错误: {e}")
                results[name] = {
                    'name': algo_name,
                    'error': str(e),
                    'is_single_class': is_single_class_data
                }
        # Record the skipped SVM so callers see all algorithms in results.
        if 'svm' not in algorithms_to_train:
            svm_name = self.algorithm_names['svm']['cn'] if self.use_chinese else self.algorithm_names['svm']['en']
            results['svm'] = {
                'name': svm_name,
                'error': "由于单类别数据,跳过SVM算法",
                'is_single_class': is_single_class_data
            }
        return results

    def compare_algorithms(self, results):
        """Plot a bar chart comparing the test accuracy of all algorithms.

        Args:
            results: dict returned by train_models().

        Returns:
            the pyplot module (with the figure current), or None when no
            algorithm trained successfully.
        """
        # Drop algorithms whose training failed.
        valid_results = {name: result for name, result in results.items() if 'test_accuracy' in result}
        if not valid_results:
            print("没有算法成功训练,无法生成比较图。")
            return None
        names = [valid_results[name]['name'] for name in valid_results]
        accuracies = [valid_results[name]['test_accuracy'] for name in valid_results]
        plt.figure(figsize=(12, 6))
        bars = plt.bar(names, accuracies, color='skyblue')
        # Localize labels according to font support.
        title = "不同算法的测试集准确率比较" if self.use_chinese else "Comparison of Test Set Accuracies for Different Algorithms"
        x_label = "算法" if self.use_chinese else "Algorithm"
        y_label = "准确率" if self.use_chinese else "Accuracy"
        plt.ylim(0, 1.05)
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)
        # Annotate each bar with its accuracy value.
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                     f'{height:.4f}', ha='center', va='bottom')
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        return plt

    def plot_confusion_matrix(self, results, gas_data_loader, use_chinese=True, rotate_labels=45, fig_width=12, fig_height=10, font_size=10):
        """Plot a confusion matrix for each successfully trained algorithm.

        Args:
            results: dict returned by train_models().
            gas_data_loader: GasSensorDataAnalyzer instance (for label names).
            use_chinese: render Chinese label names when True.
            rotate_labels: x-tick rotation angle (degrees).
            fig_width: figure width in inches.
            fig_height: figure height in inches.
            font_size: tick label font size.

        Returns:
            the pyplot module (last figure current), or None when no
            algorithm trained successfully.
        """
        valid_results = {name: result for name, result in results.items() if 'test_accuracy' in result}
        if not valid_results:
            print("没有算法成功训练,无法生成混淆矩阵。")
            return None
        # Union of labels seen across all algorithms, sorted for stable axes.
        all_unique_labels = set()
        for name, result in valid_results.items():
            all_unique_labels.update(result['unique_labels'])
        all_unique_labels = sorted(list(all_unique_labels))
        # Resolve each numeric label back to its display name.
        label_names = []
        for label in all_unique_labels:
            label_name = None
            for key, label_id in gas_data_loader.multi_dimension_labels.items():
                if label_id == label:
                    # Re-derive the display-name dict from the key parts
                    # ("sensor_gas_Nppm").
                    label_name = gas_data_loader.get_or_create_multi_dimension_label(
                        key.split('_')[0],                              # sensor type
                        key.split('_')[1],                              # gas type
                        int(key.split('_')[2].replace('ppm', ''))       # concentration
                    )[1]
                    break
            if label_name and isinstance(label_name, dict):
                if use_chinese:
                    label_names.append(label_name.get('cn', f"类别 {label}"))
                else:
                    label_names.append(label_name.get('en', f"Class {label}"))
            else:
                # Unknown label id: fall back to a generic name.
                label_names.append(f"类别 {label}" if use_chinese else f"Class {label}")
        for name, result in valid_results.items():
            # Create the axes explicitly and hand them to disp.plot();
            # otherwise ConfusionMatrixDisplay creates its own figure and
            # the requested figsize is silently ignored (and a blank
            # figure leaks).
            fig, ax = plt.subplots(figsize=(fig_width, fig_height))
            cm = confusion_matrix(result['y_test'], result['y_pred'], labels=all_unique_labels)
            disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
            disp.plot(cmap=plt.cm.Blues, ax=ax)
            title = f"{result['name']} 混淆矩阵" if use_chinese else f"{result['name']} Confusion Matrix"
            # Annotate single-class runs in the title.
            if result.get('is_single_class', False):
                title += " (单类别数据)"
            plt.title(title)
            plt.xticks(rotation=rotate_labels, ha='right', rotation_mode='anchor', fontsize=font_size)
            plt.yticks(fontsize=font_size)
            plt.tight_layout()
        return plt
def detect_dataset_type(dataset_path):
    """Detect whether a path is a single dataset file or a folder of them.

    Args:
        dataset_path: file or directory path.

    Returns:
        tuple: (dataset_type, file_paths, gas_types, concentrations,
        sensor_types) where dataset_type is 'single' or 'multiple'.

    Raises:
        FileNotFoundError: when the path does not exist.
        ValueError: when a folder holds no Excel files or the path is
        neither a file nor a directory.
    """
    if not os.path.exists(dataset_path):
        raise FileNotFoundError(f"路径不存在: {dataset_path}")
    if os.path.isfile(dataset_path):
        # Single file: derive metadata straight from the file name.
        base_name = os.path.basename(dataset_path)
        sensor = extract_sensor_type(base_name)
        gas = extract_gas_type(base_name)
        conc = extract_concentration(base_name)
        print(f"检测到单一数据集: {base_name}")
        print(f"传感器类型: {sensor}, 气体类型: {gas}, 浓度: {conc}ppm")
        return 'single', [dataset_path], [gas], [conc], [sensor]
    if os.path.isdir(dataset_path):
        # Folder: gather every Excel file inside it.
        excel_files = [f for f in os.listdir(dataset_path) if f.endswith(('.xlsx', '.xls'))]
        if not excel_files:
            raise ValueError(f"文件夹中没有找到Excel文件: {dataset_path}")
        file_paths, gas_types, concentrations, sensor_types = [], [], [], []
        for entry in excel_files:
            file_paths.append(os.path.join(dataset_path, entry))
            sensor = extract_sensor_type(entry)
            gas = extract_gas_type(entry)
            conc = extract_concentration(entry)
            sensor_types.append(sensor)
            gas_types.append(gas)
            concentrations.append(conc)
            print(f"找到数据集文件: {entry}")
            print(f"传感器类型: {sensor}, 气体类型: {gas}, 浓度: {conc}ppm")
        print(f"总共找到 {len(file_paths)} 个数据集文件")
        return 'multiple', file_paths, gas_types, concentrations, sensor_types
    raise ValueError(f"无法识别的路径: {dataset_path}")
def extract_sensor_type(file_name):
    """Extract the sensor model from a file name.

    Args:
        file_name: dataset file name, e.g. "MP2_jiaquan_20.xlsx".

    Returns:
        str: one of the known sensor names, or 'MP2' (with a warning) when
        none matches.

    The previous patterns only matched a sensor name at the very start or
    very end of the file name; this version matches it anywhere, as long as
    it is a standalone token (not embedded in a longer alphanumeric run).
    """
    # Longest names first so no shorter name could shadow a longer one.
    known_sensors = ['MP503', 'MP801', 'MP3B', 'MQ7B', 'MP2', 'MQ2']
    # Match case-insensitively by upper-casing once.
    file_name_upper = file_name.upper()
    for sensor_type in known_sensors:
        # Look-arounds ensure the name is not part of a longer token
        # (e.g. 'MP2X' must not match 'MP2').
        pattern = rf'(?<![A-Z0-9]){re.escape(sensor_type)}(?![A-Z0-9])'
        if re.search(pattern, file_name_upper):
            return sensor_type
    print(f"警告: 无法从文件名 '{file_name}' 中提取传感器类型,使用默认值 'MP2'")
    return 'MP2'
def extract_gas_type(file_name):
    """Extract the gas type from a "sensor_gas_concentration" file name.

    Args:
        file_name: dataset file name, e.g. "MP2_jiaben_20.xlsx" or
        "MP2_jiaben+jiaquan_20.xlsx" for mixtures.

    Returns:
        str: a standard gas key ('acetone', 'toluene', ...) or a sorted
        '+'-joined mixture key; defaults to 'acetone' with a warning.

    Fix: the original condition `'+' in part or '+' in part` and no-op
    `replace('+', '+')` were a garbled handling of the FULL-WIDTH plus
    sign '＋' (U+FF0B); both separators are now normalized to ASCII '+'.
    """
    # Pinyin / Chinese name -> standard English gas key.
    gas_name_mapping = {
        'bingtong': 'acetone',
        '丙酮': 'acetone',
        'jiaben': 'toluene',
        '甲苯': 'toluene',
        'jiachun': 'methanol',
        '甲醇': 'methanol',
        'jiaquan': 'formaldehyde',
        '甲醛': 'formaldehyde',
        'yichun': 'ethanol',
        '乙醇': 'ethanol'
    }
    known_english = ('acetone', 'toluene', 'methanol', 'formaldehyde', 'ethanol')
    # Expected file name layout: "sensor_gas_concentration".
    file_name_without_ext = os.path.splitext(file_name)[0]
    parts = file_name_without_ext.split('_')
    if len(parts) < 3:
        print(f"警告: 文件名格式不符合预期: {file_name}")
        return 'acetone'
    gas_name_part = parts[1]
    # Mixtures may use either the ASCII '+' or the full-width '＋' separator.
    if '+' in gas_name_part or '＋' in gas_name_part:
        gas_name_part = gas_name_part.replace('＋', '+')
        gas_components = gas_name_part.split('+')
        standard_gas_names = []
        for component in gas_components:
            standard_name = gas_name_mapping.get(component)
            if standard_name:
                standard_gas_names.append(standard_name)
            elif component.lower() in known_english:
                # Already an English gas name.
                standard_gas_names.append(component.lower())
            else:
                print(f"警告: 无法识别的气体成分: {component}")
        # Sort alphabetically so the same mixture always yields one key.
        standard_gas_names.sort()
        if len(standard_gas_names) > 1:
            return '+'.join(standard_gas_names)
        elif len(standard_gas_names) == 1:
            return standard_gas_names[0]
        # Nothing recognized: fall through to single-gas handling below.
    # Single gas: try the pinyin/Chinese mapping first.
    standard_name = gas_name_mapping.get(gas_name_part)
    if standard_name:
        return standard_name
    # Then accept an English name verbatim (lower-cased).
    if gas_name_part.lower() in known_english:
        return gas_name_part.lower()
    print(f"警告: 无法从文件名 '{file_name}' 中提取气体类型,使用默认值 'acetone'")
    return 'acetone'
def extract_concentration(file_name):
    """Extract the ppm concentration from a "sensor_gas_concentration" file name.

    Args:
        file_name: dataset file name, e.g. "MP2_jiaquan_20.xlsx".

    Returns:
        int: the concentration value, or 20 (with a warning) when the name
        does not follow the expected format or holds no digits.
    """
    stem = os.path.splitext(file_name)[0]
    segments = stem.split('_')
    # The concentration lives in the third underscore-separated segment.
    if len(segments) < 3:
        print(f"警告: 文件名格式不符合预期: {file_name}")
        return 20
    digits = re.search(r'(\d+)', segments[2])
    if digits:
        return int(digits.group(1))
    print(f"警告: 无法从文件名 '{file_name}' 中提取浓度值,使用默认值 20ppm")
    return 20
def main():
    """Entry point: load data, train all algorithms, save comparison plots.

    The dataset path may be supplied as the first command-line argument;
    otherwise the historical hard-coded default is used (backward
    compatible).
    """
    import sys  # local import: only the CLI entry point needs it

    # Decide plot language based on available fonts.
    chinese_supported = check_chinese_font_support()
    data_loader = GasSensorDataAnalyzer()
    # Dataset path: CLI argument wins, else the original default.
    dataset_path = sys.argv[1] if len(sys.argv) > 1 else r"C:\Users\Cong\Desktop\作业\项目\六通道2混合\2_MP2"
    try:
        # Work out whether we have one file or a folder of files.
        dataset_type, file_paths, gas_types, concentrations, sensor_types = detect_dataset_type(dataset_path)
        if dataset_type == 'single':
            X, y = data_loader.load_dataset(file_paths[0], gas_types[0], concentrations[0], sensor_types[0])
        else:
            # Merge every detected dataset into one training set.
            X, y = data_loader.load_multiple_gas_data(file_paths, gas_types, concentrations, sensor_types)
        if X is None or len(X) == 0:
            print("No valid data available for training. Please check file paths and formats.")
            return
        print(f"加载的数据集总样本数: {len(X)}")
        print(f"数据集中的类别数量: {len(np.unique(y))}")
        # Build the selector; localization follows the font check above.
        selector = AlgorithmSelector(use_chinese=chinese_supported)
        # Example custom hyper-parameters.
        selector.set_algorithm_params('knn', {'n_neighbors': 3, 'metric': 'manhattan'})
        selector.set_algorithm_params('svm', {'C': 0.8, 'kernel': 'linear'})
        selector.set_algorithm_params('neural_network', {'hidden_layer_sizes': (150, 75)})
        # Train everything and persist the plots.
        results = selector.train_models(X, y)
        plt1 = selector.compare_algorithms(results)
        if plt1:
            plt1.savefig('algorithm_comparison.png')
            plt1.close()
        plt2 = selector.plot_confusion_matrix(results, data_loader, use_chinese=chinese_supported, rotate_labels=45, fig_width=20, fig_height=20, font_size=8)
        if plt2:
            plt2.savefig('confusion_matrix.png')
            plt2.close()
        print("\n算法比较结果已保存为 'algorithm_comparison.png'")
        print("混淆矩阵已保存为 'confusion_matrix.png'")
    except Exception as e:
        print(f"程序执行过程中发生错误: {e}")
if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# Backend file 2 of 2: tempcoderunnerfile.py — Flask upload endpoint used by
# the algorithm-selection page. Fixed so that selecting two (or more) files
# loads and merges ALL of them before analysis:
#   * load_single_gas_data was called with 3 args but requires sensor_type;
#   * GasSensorDataAnalyzer has no combine_datasets method — merging is done
#     by load_multiple_gas_data.
# ---------------------------------------------------------------------------
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle multi-file upload: save, load, merge, and persist the data."""
    if 'files' not in request.files:
        return jsonify({'error': 'No file part'}), 400
    files = request.files.getlist('files')
    gas_type = request.form.get('gas_type', 'acetone')
    concentration = int(request.form.get('concentration', 20))
    # Sensor type is required by the loader; let the client supply it.
    sensor_type = request.form.get('sensor_type', 'MP2')
    if not files or files[0].filename == '':
        return jsonify({'error': 'No selected file'}), 400
    # Save every accepted upload to a temporary location first.
    saved_paths = []
    for file in files:
        if file and allowed_file(file.filename):
            file_path = os.path.join(app.config['UPLOAD_FOLDER'], file.filename)
            file.save(file_path)
            saved_paths.append(file_path)
    if not saved_paths:
        return jsonify({'error': 'No valid data loaded'}), 400
    try:
        # load_multiple_gas_data stacks every file into one (X, y) pair —
        # this is what lets multiple selected files be analysed together.
        n = len(saved_paths)
        X, y = data_loader.load_multiple_gas_data(
            saved_paths, [gas_type] * n, [concentration] * n, [sensor_type] * n
        )
    finally:
        # The temporary uploads are no longer needed once loaded.
        for path in saved_paths:
            os.remove(path)
    if X is None or len(X) == 0:
        return jsonify({'error': 'No valid data loaded'}), 400
    # Persist the merged dataset for the subsequent analysis step.
    df = pd.DataFrame(X)
    df['label'] = y
    out_path = os.path.join(app.config['UPLOAD_FOLDER'], 'temp_data.xlsx')
    df.to_excel(out_path, index=False)
    return jsonify({
        'status': 'success',
        'sample_count': len(X),
        'feature_count': X.shape[1],
        'gas_type': gas_type,
        'concentration': concentration
    })
# NOTE(review): original message asked to update app.py so the
# algorithm-selection page can analyse two selected files — the merged
# loading above (load_multiple_gas_data over all uploads) is that fix.