import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
class SpectralDataPreprocessor:
"""
光谱数据预处理类
专门用于处理B题中碳化硅和硅晶圆片的红外干涉光谱数据
"""
def __init__(self):
self.processed_data = {}
self.anomaly_reports = {}
print("光谱数据预处理器已初始化")
print("=" * 60)
def load_data(self, filepath):
"""
加载Excel数据文件
Parameters:
filepath: str, 文件路径
Returns:
pandas.DataFrame: 原始数据
"""
try:
# 尝试读取Excel文件
data = pd.read_excel(filepath, header=None)
# 检查数据维度
if data.shape[1] < 2:
raise ValueError(f"数据列数不足,期望至少2列,实际{data.shape[1]}列")
# 设置列名:第1列为波数,第2列为反射率
data.columns = ['wavenumber', 'reflectance'] + [f'col_{i}' for i in range(2, data.shape[1])]
print(f"✓ 成功加载文件: {filepath}")
print(f" 数据维度: {data.shape[0]} 行 × {data.shape[1]} 列")
print(f" 前5行预览:")
print(data.head())
print()
return data
except Exception as e:
print(f"✗ 加载文件失败: {e}")
return None
def detect_and_remove_header(self, data):
"""
检测并移除可能的标题行
Parameters:
data: pandas.DataFrame, 原始数据
Returns:
pandas.DataFrame: 移除标题行后的数据
"""
# 检查第一行是否为数值
first_row_numeric = True
try:
pd.to_numeric(data.iloc[0, 0])
pd.to_numeric(data.iloc[0, 1])
except:
first_row_numeric = False
if not first_row_numeric:
print(" 检测到标题行,已移除")
data = data.iloc[1:].copy()
return data
def preprocess_data(self, data, filename):
"""
预处理光谱数据的主函数
Parameters:
data: pandas.DataFrame, 原始数据
filename: str, 文件名
Returns:
dict: 预处理结果
"""
print(f"开始预处理文件: {filename}")
print("-" * 40)
# 初始化结果字典
result = {
'filename': filename,
'original_shape': data.shape,
'clean_data': None,
'anomalies': {
'header_rows': 0,
'missing_values': [],
'negative_wavenumber': [],
'invalid_reflectance': [],
'duplicates': [],
'outliers': []
},
'statistics': {}
}
# 创建数据副本进行处理
work_data = data.copy()
# 1. 检测并移除标题行
original_length = len(work_data)
work_data = self.detect_and_remove_header(work_data)
result['anomalies']['header_rows'] = original_length - len(work_data)
# 2. 数据类型转换
print(" 步骤1: 数据类型转换")
work_data['wavenumber'] = pd.to_numeric(work_data['wavenumber'], errors='coerce')
work_data['reflectance'] = pd.to_numeric(work_data['reflectance'], errors='coerce')
# 3. 检查并处理缺失值
print(" 步骤2: 检查缺失值")
missing_mask = work_data['wavenumber'].isna() | work_data['reflectance'].isna()
missing_count = missing_mask.sum()
if missing_count > 0:
missing_indices = work_data[missing_mask].index.tolist()
result['anomalies']['missing_values'] = missing_indices
print(f" 发现 {missing_count} 个缺失值行,已移除")
work_data = work_data.dropna().copy()
else:
print(" 未发现缺失值")
# 4. 检查负波数或零波数
print(" 步骤3: 检查波数有效性")
negative_wavenumber_mask = work_data['wavenumber'] <= 0
negative_count = negative_wavenumber_mask.sum()
if negative_count > 0:
negative_indices = work_data[negative_wavenumber_mask].index.tolist()
result['anomalies']['negative_wavenumber'] = negative_indices
print(f" 发现 {negative_count} 个非正波数,已移除")
work_data = work_data[work_data['wavenumber'] > 0].copy()
else:
print(" 所有波数均为正值")
# 5. 检查反射率范围
print(" 步骤4: 检查反射率有效性")
invalid_reflectance_mask = (work_data['reflectance'] < 0) | (work_data['reflectance'] > 100)
invalid_count = invalid_reflectance_mask.sum()
if invalid_count > 0:
invalid_indices = work_data[invalid_reflectance_mask].index.tolist()
result['anomalies']['invalid_reflectance'] = invalid_indices
print(f" 发现 {invalid_count} 个无效反射率值,已移除")
work_data = work_data[(work_data['reflectance'] >= 0) & (work_data['reflectance'] <= 100)].copy()
else:
print(" 所有反射率值均在有效范围内(0-100%)")
# 6. 检查重复波数
print(" 步骤5: 检查重复波数")
duplicate_mask = work_data['wavenumber'].duplicated()
duplicate_count = duplicate_mask.sum()
if duplicate_count > 0:
duplicate_indices = work_data[duplicate_mask].index.tolist()
result['anomalies']['duplicates'] = duplicate_indices
print(f" 发现 {duplicate_count} 个重复波数,保留第一个值")
work_data = work_data.drop_duplicates(subset=['wavenumber']).copy()
else:
print(" 未发现重复波数")
# 7. 按波数排序
print(" 步骤6: 按波数排序")
work_data = work_data.sort_values('wavenumber').reset_index(drop=True)
# 8. 异常值检测(使用IQR方法)
print(" 步骤7: 异常值检测(IQR方法)")
Q1 = work_data['reflectance'].quantile(0.25)
Q3 = work_data['reflectance'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outlier_mask = (work_data['reflectance'] < lower_bound) | (work_data['reflectance'] > upper_bound)
outlier_count = outlier_mask.sum()
if outlier_count > 0:
outlier_indices = work_data[outlier_mask].index.tolist()
result['anomalies']['outliers'] = outlier_indices
print(f" 发现 {outlier_count} 个统计异常值")
print(f" 异常值范围: < {lower_bound:.2f} 或 > {upper_bound:.2f}")
# 对于光谱数据,建议保留异常值,但标记出来
# work_data = work_data[~outlier_mask].copy()
print(f" 建议: 光谱数据的异常值可能包含重要信息,暂时保留")
else:
print(" 未发现统计异常值")
# 9. 计算最终统计信息
print(" 步骤8: 计算统计信息")
result['statistics'] = {
'final_count': len(work_data),
'wavenumber_range': [work_data['wavenumber'].min(), work_data['wavenumber'].max()],
'reflectance_range': [work_data['reflectance'].min(), work_data['reflectance'].max()],
'mean_reflectance': work_data['reflectance'].mean(),
'std_reflectance': work_data['reflectance'].std(),
'median_reflectance': work_data['reflectance'].median(),
'total_removed_count': result['original_shape'][0] - len(work_data),
'data_density': len(work_data) / (work_data['wavenumber'].max() - work_data['wavenumber'].min())
}
result['clean_data'] = work_data
# 打印统计摘要
stats = result['statistics']
print(f"\n 预处理完成:")
print(f" 原始数据: {result['original_shape'][0]} 行")
print(f" 清洗后数据: {stats['final_count']} 行")
print(
f" 移除数据: {stats['total_removed_count']} 行 ({stats['total_removed_count'] / result['original_shape'][0] * 100:.1f}%)")
print(f" 波数范围: {stats['wavenumber_range'][0]:.1f} - {stats['wavenumber_range'][1]:.1f} cm⁻¹")
print(f" 反射率范围: {stats['reflectance_range'][0]:.2f} - {stats['reflectance_range'][1]:.2f} %")
print(f" 平均反射率: {stats['mean_reflectance']:.2f} ± {stats['std_reflectance']:.2f} %")
print()
return result
def save_cleaned_data(self, result, output_dir="cleaned_data"):
"""
保存清洗后的数据
Parameters:
result: dict, 预处理结果
output_dir: str, 输出目录
"""
if result['clean_data'] is None or len(result['clean_data']) == 0:
print(f" ✗ {result['filename']}: 没有有效数据可保存")
return
# 创建输出目录
os.makedirs(output_dir, exist_ok=True)
# 生成输出文件名
base_name = result['filename'].replace('.xlsx', '').replace('.xls', '')
output_path = os.path.join(output_dir, f"cleaned_{base_name}.xlsx")
# 保存数据
clean_df = result['clean_data'][['wavenumber', 'reflectance']].copy()
clean_df.columns = ['波数(cm-1)', '反射率(%)']
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
# 保存清洗后的数据
clean_df.to_excel(writer, sheet_name='清洗后数据', index=False)
# 保存统计信息
stats_df = pd.DataFrame([
['原始数据行数', result['original_shape'][0]],
['清洗后数据行数', result['statistics']['final_count']],
['移除行数', result['statistics']['total_removed_count']],
['移除比例(%)',
f"{result['statistics']['total_removed_count'] / result['original_shape'][0] * 100:.1f}"],
['波数最小值(cm-1)', f"{result['statistics']['wavenumber_range'][0]:.2f}"],
['波数最大值(cm-1)', f"{result['statistics']['wavenumber_range'][1]:.2f}"],
['反射率最小值(%)', f"{result['statistics']['reflectance_range'][0]:.2f}"],
['反射率最大值(%)', f"{result['statistics']['reflectance_range'][1]:.2f}"],
['平均反射率(%)', f"{result['statistics']['mean_reflectance']:.2f}"],
['反射率标准差(%)', f"{result['statistics']['std_reflectance']:.2f}"],
['反射率中位数(%)', f"{result['statistics']['median_reflectance']:.2f}"]
], columns=['统计项目', '数值'])
stats_df.to_excel(writer, sheet_name='统计信息', index=False)
# 保存异常值报告
anomaly_data = []
for anomaly_type, indices in result['anomalies'].items():
if isinstance(indices, list) and indices:
anomaly_data.append(
[anomaly_type, len(indices), str(indices[:10]) + ('...' if len(indices) > 10 else '')])
elif isinstance(indices, (int, float)) and indices > 0:
anomaly_data.append([anomaly_type, indices, ''])
if anomaly_data:
anomaly_df = pd.DataFrame(anomaly_data, columns=['异常类型', '数量', '样例索引'])
anomaly_df.to_excel(writer, sheet_name='异常值报告', index=False)
print(f" ✓ 清洗后数据已保存到: {output_path}")
def plot_data_comparison(self, results, output_dir="plots"):
"""
绘制数据清洗前后的对比图
Parameters:
results: list, 预处理结果列表
output_dir: str, 图片输出目录
"""
os.makedirs(output_dir, exist_ok=True)
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
for result in results:
if result['clean_data'] is None:
continue
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle(f'{result["filename"]} - 数据预处理对比分析', fontsize=16, fontweight='bold')
clean_data = result['clean_data']
# 1. 清洗后的光谱曲线
axes[0, 0].plot(clean_data['wavenumber'], clean_data['reflectance'], 'b-', linewidth=1.5, alpha=0.8)
axes[0, 0].set_title('清洗后的光谱数据', fontsize=12, fontweight='bold')
axes[0, 0].set_xlabel('波数 (cm$^{-1}$)')
axes[0, 0].set_ylabel('反射率 (%)')
axes[0, 0].grid(True, alpha=0.3)
# 2. 反射率分布直方图
axes[0, 1].hist(clean_data['reflectance'], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
axes[0, 1].axvline(clean_data['reflectance'].mean(), color='red', linestyle='--', linewidth=2,
label=f'均值: {clean_data["reflectance"].mean():.2f}%')
axes[0, 1].axvline(clean_data['reflectance'].median(), color='green', linestyle='--', linewidth=2,
label=f'中位数: {clean_data["reflectance"].median():.2f}%')
axes[0, 1].set_title('反射率分布', fontsize=12, fontweight='bold')
axes[0, 1].set_xlabel('反射率 (%)')
axes[0, 1].set_ylabel('频次')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)
# 3. 数据质量统计
quality_data = [
result['statistics']['final_count'],
result['statistics']['total_removed_count']
]
quality_labels = ['保留数据', '移除数据']
colors = ['lightgreen', 'lightcoral']
wedges, texts, autotexts = axes[0, 2].pie(quality_data, labels=quality_labels, colors=colors,
autopct='%1.1f%%', startangle=90)
axes[0, 2].set_title('数据质量分布', fontsize=12, fontweight='bold')
# 4. 波数密度分析
axes[1, 0].scatter(clean_data['wavenumber'], clean_data['reflectance'], alpha=0.6, s=1)
axes[1, 0].set_title('波数-反射率散点图', fontsize=12, fontweight='bold')
axes[1, 0].set_xlabel('波数 (cm$^{-1}$)')
axes[1, 0].set_ylabel('反射率 (%)')
axes[1, 0].grid(True, alpha=0.3)
# 5. 异常值统计
anomaly_types = []
anomaly_counts = []
for anomaly_type, indices in result['anomalies'].items():
if isinstance(indices, list) and indices:
anomaly_types.append(anomaly_type.replace('_', ' ').title())
anomaly_counts.append(len(indices))
elif isinstance(indices, (int, float)) and indices > 0:
anomaly_types.append(anomaly_type.replace('_', ' ').title())
anomaly_counts.append(indices)
if anomaly_types:
bars = axes[1, 1].bar(range(len(anomaly_types)), anomaly_counts, color='orange', alpha=0.7)
axes[1, 1].set_title('异常值检测结果', fontsize=12, fontweight='bold')
axes[1, 1].set_xlabel('异常类型')
axes[1, 1].set_ylabel('数量')
axes[1, 1].set_xticks(range(len(anomaly_types)))
# axes[1, 1].set_xticklabels(anomaly_types, rotation=45, ha='right')
# 在柱子上添加数值标签
for bar, count in zip(bars, anomaly_counts):
axes[1, 1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + max(anomaly_counts) * 0.01,
str(count), ha='center', va='bottom')
else:
axes[1, 1].text(0.5, 0.5, '未发现异常值', ha='center', va='center', transform=axes[1, 1].transAxes,
fontsize=14)
axes[1, 1].set_title('异常值检测结果', fontsize=12, fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)
# 删除第六张子图的坐标框(隐藏该子图)
axes[1, 2].axis('off') # 隐藏坐标框和内容
plt.tight_layout()
# 保存图片
output_path = os.path.join(output_dir, f"{result['filename']}_analysis.png")
plt.savefig(output_path, dpi=300, bbox_inches='tight')
print(f" ✓ 分析图表已保存到: {output_path}")
plt.close()
def generate_comprehensive_report(self, results, output_dir="reports"):
"""
生成综合预处理报告
Parameters:
results: list, 所有文件的预处理结果
output_dir: str, 报告输出目录
"""
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
report_path = os.path.join(output_dir, f"preprocessing_report_{timestamp}.txt")
with open(report_path, 'w', encoding='utf-8') as f:
f.write("=" * 80 + "\n")
f.write("光谱数据预处理综合报告\n")
f.write("=" * 80 + "\n")
f.write(f"生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"处理文件数量: {len(results)}\n\n")
# 总体统计
total_original = sum(r['original_shape'][0] for r in results)
total_final = sum(r['statistics']['final_count'] for r in results if r['statistics'])
total_removed = total_original - total_final
f.write("总体统计:\n")
f.write("-" * 40 + "\n")
f.write(f"总原始数据行数: {total_original}\n")
f.write(f"总清洗后行数: {total_final}\n")
f.write(f"总移除行数: {total_removed}\n")
f.write(f"总体数据保留率: {total_final / total_original * 100:.1f}%\n\n")
# 各文件详细信息
for i, result in enumerate(results, 1):
f.write(f"{i}. 文件: {result['filename']}\n")
f.write("-" * 50 + "\n")
if result['statistics']:
stats = result['statistics']
f.write(f"原始数据行数: {result['original_shape'][0]}\n")
f.write(f"清洗后数据行数: {stats['final_count']}\n")
f.write(f"移除行数: {stats['total_removed_count']}\n")
f.write(f"数据保留率: {stats['final_count'] / result['original_shape'][0] * 100:.1f}%\n")
f.write(f"波数范围: {stats['wavenumber_range'][0]:.2f} - {stats['wavenumber_range'][1]:.2f} cm⁻¹\n")
f.write(
f"反射率范围: {stats['reflectance_range'][0]:.2f} - {stats['reflectance_range'][1]:.2f} %\n")
f.write(f"平均反射率: {stats['mean_reflectance']:.2f} ± {stats['std_reflectance']:.2f} %\n")
f.write(f"数据密度: {stats['data_density']:.2f} 点/cm⁻¹\n")
# 异常值详情
f.write(f"\n异常值检测结果:\n")
for anomaly_type, data in result['anomalies'].items():
if isinstance(data, list) and data:
f.write(f" - {anomaly_type}: {len(data)} 个\n")
elif isinstance(data, (int, float)) and data > 0:
f.write(f" - {anomaly_type}: {data} 个\n")
else:
f.write("处理失败,无统计信息\n")
f.write("\n")
# 数据质量评估
f.write("数据质量评估:\n")
f.write("-" * 40 + "\n")
quality_issues = []
for result in results:
if result['statistics']:
retention_rate = result['statistics']['final_count'] / result['original_shape'][0]
if retention_rate < 0.9:
quality_issues.append(f"{result['filename']}: 数据保留率较低 ({retention_rate * 100:.1f}%)")
# 检查数据密度
density = result['statistics']['data_density']
if density < 1:
quality_issues.append(f"{result['filename']}: 数据密度较低 ({density:.2f} 点/cm⁻¹)")
if quality_issues:
f.write("发现的质量问题:\n")
for issue in quality_issues:
f.write(f" {issue}\n")
else:
f.write("✓ 所有文件的数据质量良好\n")
f.write("\n")
def main():
"""
主函数:执行完整的数据预处理流程
"""
print(" 光谱数据预处理程序启动")
print("=" * 60)
# 创建预处理器实例
preprocessor = SpectralDataPreprocessor()
# 定义要处理的文件
files = [
('附件1.xlsx', '碳化硅_10度入射'),
('附件2.xlsx', '碳化硅_15度入射'),
('附件3.xlsx', '硅_10度入射'),
('附件4.xlsx', '硅_15度入射')
]
results = []
# 处理每个文件
for filepath, description in files:
print(f"\n 处理文件: {filepath} ({description})")
print("=" * 60)
# 检查文件是否存在
if not os.path.exists(filepath):
print(f" ✗ 文件不存在: {filepath}")
print(f" 请确保文件位于当前目录中")
continue
# 加载数据
data = preprocessor.load_data(filepath)
if data is None:
continue
# 预处理数据
result = preprocessor.preprocess_data(data, description)
results.append(result)
# 保存清洗后的数据
preprocessor.save_cleaned_data(result)
# 生成可视化图表
if results:
print("\n 生成可视化图表...")
preprocessor.plot_data_comparison(results)
# 生成综合报告
print("\n 生成综合报告...")
preprocessor.generate_comprehensive_report(results)
print("\n 所有数据预处理完成!")
print("输出文件:")
print(" - cleaned_data/: 清洗后的Excel文件")
print(" - plots/: 数据分析图表")
print(" - reports/: 综合处理报告")
else:
print("\n 没有成功处理任何文件")
if __name__ == "__main__":
main()(对代码进行检查和优化,确保代码结果的正确性和严谨性,可视化图的高级性)