pandas:sort_values排序后concat"失效“问题解决

本文探讨了在使用Pandas进行数据排序后,如何正确地利用concat函数连接已排序的数据框,避免因索引不匹配导致的排序失效问题。通过实例演示了reset_index方法的应用,确保数据框在连接时保持正确的排序状态。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

pandas:sort_values排序后concat"失效“问题解决

问题描述:

import pandas as pd
import numpy as np
data1=np.array([[1,1,1],[3,2,2],[2,3,3]])
index=np.arange(1,4,1)#起点,终点+1,步长
columns=list('ABC')
df1=pd.DataFrame(data1,index=index,columns=columns)
print('df1:\n',df1)

数据1

df1.sort_values(inplace=True,by=['A'])#排序
print('排序后的df1(注意index也变化了):\n',df1)

数据1.1

#创建一个df2
df2=pd.DataFrame([4,4,4],index=[1,2,3],columns=['D'])
connect=pd.concat([df1,df2],axis=1,sort=True)
print('希望连接排序后的df1和df2:\n',connect)

connect_data
此时我们发现,刚刚明明排好的数据df1,连接df2时,为什么又复原了?因为concat连接是根据那个索引值进行连接(第一行连接第一行……),索引值在排序的时候也跟着排好了。

我们这么解决:

#直接重置2个索引,让它们一致
df1.reset_index(inplace=True,drop=True)
df2.reset_index(inplace=True,drop=True)
connect=pd.concat([df1,df2],axis=1,sort=True)#0:纵向合并;1;横向合并
print('成功连接排序后的df1和df2:\n',connect)

在这里插入图片描述

# -*- coding: utf-8 -*- """整合气象数据获取与处理系统 功能: 1. 在5分钟整点(0分、5分、10分等)获取实况气象数据 2. 将5分钟数据按月保存到Excel(存储在"5分钟级数据"文件夹) 3. 将5分钟数据插值为分钟级数据 4. 计算每分钟的太阳位置(高度角、方位角)和地外辐照度(保留两位小数) 5. 将分钟级数据按月保存到独立Excel(存储在"分钟级数据"文件夹) """ import urllib, urllib3, sys, uuid import ssl import json from openpyxl import Workbook, load_workbook from datetime import datetime, timedelta import warnings from urllib3.exceptions import InsecureRequestWarning import time import os import pandas as pd import numpy as np from scipy.interpolate import CubicSpline import math import openpyxl from openpyxl.styles import PatternFill, Font, Alignment import traceback # 强制刷新标准输出 - 确保实时输出 sys.stdout = sys.__stdout__ sys.stderr = sys.__stderr__ if hasattr(sys.stdout, 'reconfigure'): sys.stdout.reconfigure(line_buffering=True) # 确保实时输出 else: # 对于不支持reconfigure的环境 sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 1) # 禁用SSL警告 warnings.filterwarnings("ignore", category=InsecureRequestWarning) # 创建绕过SSL验证的连接池 http = urllib3.PoolManager(cert_reqs='CERT_NONE') # API配置 host = 'http://aliv18.data.moji.com' condition_path = '/whapi/json/alicityweather/condition' method = 'POST' appcode = '2c642e12da774c918fa78ac56b5b4c50' # 保定地理位置 LOCATION_NAME = "保定" LONGITUDE = 115.480 # 东经 LATITUDE = 38.855 # 北纬 # 定义API请求头 API_HEADERS = { 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 'Authorization': 'APPCODE ' + appcode } # 实况数据API参数 CONDITION_TOKEN = '50b53ff8dd7d9fa320d3d3ca32cf8ed1' CITY_ID = '1819' # 保定城市ID # 创建数据存储目录 FIVEMIN_DIR = "5分钟级数据" MINUTELY_DIR = "分钟级数据" os.makedirs(FIVEMIN_DIR, exist_ok=True) os.makedirs(MINUTELY_DIR, exist_ok=True) def log_message(message): """记录日志到屏幕 - 确保实时输出""" timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S') log_line = f"[{timestamp}] {message}" # 直接输出到控制台,强制刷新 print(log_line, flush=True) # 同时写入日志文件以便调试 with open("weather_system.log", "a", encoding="utf-8") as log_file: log_file.write(log_line + "\n") log_file.flush() # 立即刷新文件写入 def safe_save(wb, filename): """安全保存Excel文件,避免占用问题""" log_message(f"正在保存文件: {filename}") # 确保目标目录存在 os.makedirs(os.path.dirname(filename), exist_ok=True) for attempt in range(5): # 增加尝试次数 try: # 使用绝对路径避免相对路径问题 abs_filename = os.path.abspath(filename) temp_file = abs_filename.replace(".xlsx", f"_temp_{attempt}.xlsx") log_message(f"尝试保存临时文件: {temp_file}") wb.save(temp_file) # 替换原文件 if os.path.exists(abs_filename): os.remove(abs_filename) os.rename(temp_file, abs_filename) log_message(f"文件保存成功: {abs_filename}") return True except (PermissionError, OSError) as e: log_message(f"文件保存失败 ({attempt+1}/5): {str(e)}") time.sleep(3) except Exception as e: log_message(f"文件保存时发生意外错误: {str(e)}") traceback.print_exc() time.sleep(3) return False def get_condition_data(): """获取实况天气数据""" try: log_message(f"正在获取{LOCATION_NAME}实况天气数据...") condition_bodys = {'token': CONDITION_TOKEN, 'cityId': CITY_ID} post_data_condition = urllib.parse.urlencode(condition_bodys).encode('utf-8') condition_url = host + condition_path response_condition = http.request('POST', condition_url, body=post_data_condition, headers=API_HEADERS) content_condition = response_condition.data.decode('utf-8') condition_data = json.loads(content_condition) log_message(f"API响应内容: {json.dumps(condition_data, ensure_ascii=False)[:200]}...") if condition_data.get('code') != 0: log_message(f"实况API请求失败: 代码 {condition_data.get('code')}, 消息: {condition_data.get('msg')}") return None condition_info = condition_data['data']['condition'] current_time = datetime.now() # 对齐到最近的5分钟 aligned_minute = (current_time.minute // 5) * 5 aligned_time = current_time.replace(minute=aligned_minute, second=0, microsecond=0) time_str = aligned_time.strftime("%Y-%m-%d %H:%M") # 计算风速(m/s)并保留一位小数 raw_wind_speed = condition_info.get('windSpeed', '') wind_speed_mps = '' if raw_wind_speed and raw_wind_speed.replace('.', '', 1).isdigit(): try: wind_speed_mps = round(float(raw_wind_speed) / 3.6, 1) # 保留一位小数 except: wind_speed_mps = '' # 处理温度值,确保保留一位小数 temp_value = condition_info.get('temp', '') if temp_value and temp_value.replace('.', '', 1).isdigit(): try: temp_value = round(float(temp_value), 1) # 温度保留一位小数 except: pass # 构建实况数据行 condition_row = [ time_str, temp_value, # 温度保留一位小数 condition_info.get('condition', ''), condition_info.get('humidity', ''), condition_info.get('windDir', ''), condition_info.get('windLevel', ''), raw_wind_speed, wind_speed_mps, # 风速保留一位小数 condition_info.get('precip', '0.0'), condition_info.get('pressure', '') ] log_message(f"成功获取实况数据: {time_str}") log_message(f" 温度: {temp_value}℃, 天气状况: {condition_info.get('condition', '')}, 湿度: {condition_info.get('humidity', '')}%") return condition_row except Exception as e: log_message(f"获取实况数据时出错: {str(e)}") traceback.print_exc() return None def normalize_time(time_str): """统一时间格式为 YYYY-MM-DD HH:MM""" try: # 如果已经是字符串,尝试解析 if isinstance(time_str, str): # 尝试解析为日期时间 if len(time_str) == 16: # 格式为 "YYYY-MM-DD HH:MM" return time_str elif len(time_str) == 19: # 包含秒数 "YYYY-MM-DD HH:MM:00" return time_str[:16] elif ' ' not in time_str: # 只有日期部分 return time_str + " 00:00" else: # 其他格式,尝试转换 dt = datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S") return dt.strftime("%Y-%m-%d %H:%M") # 如果是datetime对象 elif isinstance(time_str, datetime): return time_str.strftime("%Y-%m-%d %H:%M") except Exception: pass # 无法识别的格式,原样返回 return str(time_str) def update_fivemin_excel(condition_row): """更新5分钟级Excel文件""" try: current_month = datetime.now().strftime("%Y%m") # 确保目录存在 os.makedirs(FIVEMIN_DIR, exist_ok=True) filename = os.path.join(os.path.abspath(FIVEMIN_DIR), f"{LOCATION_NAME}_5分钟级数据_{current_month}.xlsx") wb = None ws = None existing_data = {} # 存储时间点 -> 行号的映射 headers = [] log_message(f"正在更新5分钟数据文件: {filename}") if os.path.exists(filename): for attempt in range(3): try: log_message(f"尝试加载现有5分钟数据文件... (尝试 {attempt+1}/3)") wb = load_workbook(filename) ws = wb.active if ws.max_row > 0: headers = [cell.value for cell in ws[1]] # 读取现有数据到字典 (时间 -> 行号) for row_idx, row in enumerate(ws.iter_rows(min_row=2, values_only=True), start=2): if not row or not row[0]: continue time_key = normalize_time(row[0]) if time_key in existing_data: log_message(f"警告: 发现重复时间点: {time_key} (行 {row_idx})") existing_data[time_key] = row_idx log_message(f"成功加载现有5分钟数据文件: {filename} (共 {len(existing_data)} 条记录)") break except PermissionError: if attempt < 2: log_message(f"文件被占用,等待5秒后重试 ({attempt+1}/3)") time.sleep(5) else: log_message("无法打开文件,创建新工作簿") wb = Workbook() ws = wb.active ws.title = f"{LOCATION_NAME}5分钟级数据" headers = ["时间", "温度(℃)", "天气状况", "湿度(%)", "风向", "风力等级", "原始风速(km/h)", "风速(m/s)", "降水量(mm)", "气压(hPa)"] ws.append(headers) else: wb = Workbook() ws = wb.active ws.title = f"{LOCATION_NAME}5分钟极数据" headers = ["时间", "温度(℃)", "天气状况", "湿度(%)", "风向", "风力等级", "原始风速(km/h)", "风速(m/s)", "降水量(mm)", "气压(hPa)"] ws.append(headers) log_message(f"创建新的5分钟数据文件: {filename}") if not headers: headers = ["时间", "温度(℃)", "天气状况", "湿度(%)", "风向", "风力等级", "原始风速(km/h)", "风速(m/s)", "降水量(mm)", "气压(hPa)"] # 处理实况数据 if condition_row: current_time = condition_row[0] normalized_current = normalize_time(current_time) if normalized_current in existing_data: row_idx = existing_data[normalized_current] # 更新所有字段(覆盖旧数据) for col_idx in range(1, len(condition_row) + 1): new_value = condition_row[col_idx - 1] ws.cell(row=row_idx, column=col_idx, value=new_value) log_message(f"更新5分钟时间点: {normalized_current} (行 {row_idx})") else: ws.append(condition_row) # 更新字典,记录新行号 existing_data[normalized_current] = ws.max_row log_message(f"新增5分钟时间点: {normalized_current} (行 {ws.max_row})") # 保存文件 if safe_save(wb, filename): log_message(f"5分钟数据保存成功: {filename}") return filename else: log_message(f"5分钟数据保存失败") return None except Exception as e: log_message(f"更新5分钟数据文件时出错: {str(e)}") traceback.print_exc() return None def calculate_solar_parameters(dt, longitude, latitude): """计算太阳高度角、方位角和地外辐照度(高度角和方位角保留两位小数)""" try: # 将时间转换为UTC+0 utc_time = dt - timedelta(hours=8) n = utc_time.timetuple().tm_yday # 年中的第几天 # 计算太阳赤纬 (δ) delta = 23.45 * math.sin(math.radians(360 * (284 + n) / 365)) delta_rad = math.radians(delta) # 计算时角 (ω) utc_hour = utc_time.hour + utc_time.minute/60.0 omega = (utc_hour - 12) * 15 + longitude omega_rad = math.radians(omega) # 纬度转弧度 phi_rad = math.radians(latitude) # 计算太阳高度角 (α) sin_alpha = math.sin(phi_rad) * math.sin(delta_rad) + math.cos(phi_rad) * math.cos(delta_rad) * math.cos(omega_rad) alpha = math.degrees(math.asin(sin_alpha)) # 计算太阳方位角 (γ) sin_gamma = -math.cos(delta_rad) * math.sin(omega_rad) / math.cos(math.radians(alpha)) cos_gamma = (math.sin(delta_rad) * math.cos(phi_rad) - math.cos(delta_rad) * math.sin(phi_rad) * math.cos(omega_rad)) / math.cos(math.radians(alpha)) gamma = math.degrees(math.atan2(sin_gamma, cos_gamma)) gamma = (gamma + 360) % 360 # 转换为0-360度范围 # 计算地外辐照度 (W/m²) g0 = 1367 * (1 + 0.033 * math.cos(math.radians(360 * n / 365))) * sin_alpha g0 = max(0, g0) # 确保非负 # 保留两位小数 return round(alpha, 2), round(gamma, 2), round(g0, 1) except Exception as e: log_message(f"计算太阳参数错误: {str(e)}") return None, None, None def process_minutely_data(fivemin_file): """处理分钟级数据并添加太阳参数""" try: log_message("="*50) log_message(f"开始生成分钟级数据...") # 检查文件是否存在 if not os.path.exists(fivemin_file): log_message(f"错误: 5分钟数据文件不存在: {fivemin_file}") return None # 读取5分钟数据 log_message(f"读取5分钟数据文件: {fivemin_file}") try: fivemin_df = pd.read_excel(fivemin_file) log_message(f"读取5分钟数据成功,共 {len(fivemin_df)} 行") except Exception as e: log_message(f"读取5分钟数据文件失败: {str(e)}") return None # 查找时间列 time_col = next((col for col in fivemin_df.columns if '时间' in col), None) if not time_col: log_message("未找到时间列") return None # 转换时间格式并处理重复值 fivemin_df[time_col] = pd.to_datetime(fivemin_df[time_col], errors='coerce') # 删除无效行 original_count = len(fivemin_df) fivemin_df = fivemin_df.dropna(subset=[time_col]) if original_count > len(fivemin_df): log_message(f"删除了 {original_count - len(fivemin_df)} 行无效时间数据") # 处理重复时间点 - 保留最后一个 duplicate_mask = fivemin_df.duplicated(subset=[time_col], keep='last') if duplicate_mask.any(): log_message(f"发现 {duplicate_mask.sum()} 个重复时间点,保留最后一个") fivemin_df = fivemin_df[~duplicate_mask] # 按时间排序 fivemin_df = fivemin_df.sort_values(by=time_col) fivemin_df.set_index(time_col, inplace=True) # 创建分钟级时间索引 start_time = fivemin_df.index.min() end_time = fivemin_df.index.max() log_message(f"时间范围: {start_time} 至 {end_time}") minutely_index = pd.date_range(start=start_time, end=end_time, freq='1min') minutely_df = pd.DataFrame(index=minutely_index) log_message(f"将生成 {len(minutely_index)} 条分钟级数据") # 插值处理 - 确保时间戳严格递增 base_time = start_time fivemin_timestamps = (fivemin_df.index - base_time).total_seconds() # 检查时间戳是否严格递增 if not (np.diff(fivemin_timestamps) > 0).all(): log_message("时间戳不是严格递增,尝试修复...") # 重新排序 fivemin_df = fivemin_df.sort_index() fivemin_timestamps = (fivemin_df.index - base_time).total_seconds() minutely_timestamps = (minutely_df.index - base_time).total_seconds() # 对连续参数进行插值 log_message("\n开始数据插值处理...") for param in ['风速(m/s)', '温度(℃)', '湿度(%)', '气压(hPa)']: if param in fivemin_df.columns: param_data = fivemin_df[param].dropna() if len(param_data) >= 3: log_message(f"对 {param} 使用三次样条插值 (数据点: {len(param_data)})") cs = CubicSpline(fivemin_timestamps, param_data.values) interpolated_values = cs(minutely_timestamps) # 对温度和风速保留一位小数 if param in ['温度(℃)', '风速(m/s)']: interpolated_values = np.round(interpolated_values, 1) minutely_df[param] = interpolated_values else: log_message(f"对 {param} 使用前向填充 (有效点不足: {len(param_data)})") minutely_df[param] = fivemin_df[param].resample('1min').ffill() # 处理降水量 if '降水量(mm)' in fivemin_df.columns: log_message("处理降水量数据") # 将5分钟降水量转换为每分钟降水量 precip_min = fivemin_df['降水量(mm)'].resample('1min').asfreq().ffill() minutely_df['降水量(mm)'] = precip_min / 5.0 # 平均分配到每分钟 # 处理文本数据 for param in ['天气状况', '风向']: if param in fivemin_df.columns: log_message(f"处理 {param} 数据") minutely_df[param] = fivemin_df[param].resample('1min').ffill() # 数值列保留适当的小数位数 # 温度和风速保留一位小数 if '温度(℃)' in minutely_df.columns: minutely_df['温度(℃)'] = minutely_df['温度(℃)'].round(1) if '风速(m/s)' in minutely_df.columns: minutely_df['风速(m/s)'] = minutely_df['风速(m/s)'].round(1) # 其他数值列保留一位小数 other_numeric_cols = ['湿度(%)', '气压(hPa)', '降水量(mm)'] for col in other_numeric_cols: if col in minutely_df.columns: minutely_df[col] = minutely_df[col].round(1) # 计算太阳参数 log_message(f"\n开始计算太阳位置和辐照度...") solar_data = [] total_points = len(minutely_df) last_progress = -1 start_time = time.time() for i, dt in enumerate(minutely_df.index): alpha, gamma, g0 = calculate_solar_parameters(dt, LONGITUDE, LATITUDE) solar_data.append({ '太阳高度角(度)': alpha, '太阳方位角(度)': gamma, '地外辐照度(W/m2)': g0 }) # 显示进度 progress = int((i + 1) / total_points * 100) if progress != last_progress and progress % 10 == 0: elapsed = time.time() - start_time remaining = (total_points - i) * (elapsed / (i+1)) log_message(f"计算进度: {progress}% | 预计剩余时间: {remaining:.1f}秒") last_progress = progress solar_df = pd.DataFrame(solar_data, index=minutely_df.index) minutely_df = pd.concat([minutely_df, solar_df], axis=1) # 生成输出文件名 current_month = datetime.now().strftime("%Y%m") filename = f"{LOCATION_NAME}_分钟级数据_{current_month}.xlsx" filepath = os.path.join(os.path.abspath(MINUTELY_DIR), filename) # 保存分钟级数据 minutely_df.reset_index(inplace=True) minutely_df.rename(columns={'index': '时间'}, inplace=True) # 创建Excel工作簿 wb = openpyxl.Workbook() ws = wb.active ws.title = "分钟级数据" # 写入数据 log_message(f"\n正在写入分钟级数据到Excel...") for col_idx, col_name in enumerate(minutely_df.columns, 1): ws.cell(row=1, column=col_idx, value=col_name) for r_idx, row in enumerate(minutely_df.values, 2): for c_idx, value in enumerate(row, 1): ws.cell(row=r_idx, column=c_idx, value=value) # 应用样式 header_fill = PatternFill(start_color="1F4E78", end_color="1F4E78", fill_type="solid") header_font = Font(color="FFFFFF", bold=True) header_alignment = Alignment(horizontal="center", vertical="center") for cell in ws[1]: cell.fill = header_fill cell.font = header_font cell.alignment = header_alignment # 设置列宽 log_message("设置列宽...") for col in ws.columns: max_length = 0 for cell in col: try: if cell.value and len(str(cell.value)) > max_length: max_length = len(str(cell.value)) except: pass adjusted_width = (max_length + 2) * 1.2 ws.column_dimensions[col[0].column_letter].width = min(adjusted_width, 50) # 保存文件 if safe_save(wb, filepath): log_message(f"分钟级数据保存成功: {filepath}") return filepath else: log_message(f"分钟级数据保存失败") return None except Exception as e: log_message(f"处理分钟级数据时出错: {str(e)}") traceback.print_exc() return None def get_weather_data(): """获取天气数据并处理""" log_message(f"\n获取{LOCATION_NAME}实况天气数据...") try: # 获取实况数据 condition_row = get_condition_data() if not condition_row: log_message("获取实况数据失败") return False # 更新5分钟数据文件 fivemin_file = update_fivemin_excel(condition_row) if not fivemin_file: log_message("5分钟数据更新失败") return False # 处理分钟级数据(只在5分钟整点处理) current_minute = datetime.now().minute if current_minute % 5 == 0: # 0分、5分、10分等 log_message(f"当前时间 {datetime.now().strftime('%H:%M')} 是5分钟整点,开始生成分钟级数据") minutely_file = process_minutely_data(fivemin_file) if minutely_file: log_message(f"\n数据处理完成!") log_message(f" 5分钟数据 -> {fivemin_file}") log_message(f" 分钟数据 -> {minutely_file}") else: log_message(f"分钟数据处理失败") else: log_message(f"跳过分钟级数据处理(非5分钟整点)") return True except Exception as e: log_message(f"获取天气数据时出错: {str(e)}") traceback.print_exc() return False # 主程序循环 - 确保实时输出日志 if __name__ == '__main__': try: # 程序启动横幅 - 使用print确保直接输出 print("=" * 70, flush=True) print(f"{LOCATION_NAME}气象数据自动获取与处理系统", flush=True) print(f"系统启动时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", flush=True) print(f"Python版本: {sys.version}", flush=True) print(f"工作目录: {os.getcwd()}", flush=True) print(f"5分钟数据存储目录: {os.path.abspath(FIVEMIN_DIR)}", flush=True) print(f"分钟数据存储目录: {os.path.abspath(MINUTELY_DIR)}", flush=True) print("=" * 70, flush=True) print("按 Ctrl+C 停止程序", flush=True) print("=" * 70, flush=True) # 初始执行一次 log_message("\n执行初始数据获取...") success = get_weather_data() # 主循环 current_month = datetime.now().month while True: try: # 计算到下一个整分钟的时间(确保在0秒执行) now = datetime.now() next_minute = now.replace(second=0, microsecond=0) + timedelta(minutes=1) wait_seconds = (next_minute - now).total_seconds() if wait_seconds > 0: wait_minutes = wait_seconds / 60 log_message(f"\n等待 {wait_seconds:.1f} 秒 ({wait_minutes:.2f} 分钟) 到下一个整分钟") time.sleep(wait_seconds) # 检查月份变化 new_month = datetime.now().month if new_month != current_month: log_message(f"\n月份变更: {current_month} -> {new_month}") current_month = new_month # 获取并处理数据 log_message("\n" + "="*70) log_message(f"开始新一轮数据采集 ({datetime.now().strftime('%Y-%m-%d %H:%M:%S')})") success = get_weather_data() except KeyboardInterrupt: log_message("\n程序已停止") break except Exception as e: # 获取详细错误信息 exc_type, exc_value, exc_traceback = sys.exc_info() error_details = traceback.format_exception(exc_type, exc_value, exc_traceback) error_msg = ''.join(error_details) log_message(f"\n主循环错误: {str(e)}") log_message(f"详细错误信息:\n{error_msg}") log_message("5分钟后重试...") time.sleep(300) # 5分钟后重试 # 重新计算下一个整分钟时间 now = datetime.now() next_minute = now.replace(second=0, microsecond=0) + timedelta(minutes=1) current_month = datetime.now().month log_message(f"新的重试时间: {next_minute.strftime('%Y-%m-%d %H:%M:%S')}") except Exception as e: # 启动错误处理 print(f"程序启动失败: {str(e)}", flush=True) traceback.print_exc(file=sys.stdout)把这段程序也用上面那个方法实现在Python程序在运行时在页面显示结果
07-16
【我现在已经使用jupyter notebook将数据预处理做好填充了缺失值以及处理异常值 import pandas as pd df = pd.read_csv("数据分析:train_data.csv") df.info() # 重复值 duplicate_rows = df.duplicated() print("\n重复行的总数:", duplicate_rows.sum()) # 重复值 import pandas as pd import matplotlib.pyplot as plt df = pd.read_csv("数据分析:train_data.csv") # 检测重复行 duplicate_rows = df.duplicated() num_duplicates = duplicate_rows.sum() num_non_duplicates = len(df) - num_duplicates # 准备可视化数据 categories = ['非重复行', '重复行'] values = [num_non_duplicates, num_duplicates] plt.rcParams['font.sans-serif']=['SimHei']#设置中文字体黑体 plt.rcParams['axes.unicode_minus']=False#正常显示负号 # 创建条形图 plt.figure(figsize=(10, 6)) bars = plt.bar(categories, values, color=['lightblue', 'red']) # 添加数据标签 for bar in bars: height = bar.get_height() plt.text(bar.get_x() + bar.get_width()/2., height + 0.1,f'{height}', ha='center', va='bottom', fontsize=10) # 添加标题和标签 plt.title('数据中重复行与非重复行数量对比', fontsize=16) plt.xlabel('行类型', fontsize=14) plt.ylabel('数量', fontsize=14) plt.tight_layout() plt.show() # 缺失值 import pandas as pd import matplotlib.pyplot as plt import seaborn as sns # 设置中文字体 plt.rcParams['font.sans-serif']=['SimHei'] plt.rcParams['axes.unicode_minus']=False # 读取数据 df = pd.read_csv("数据分析:train_data.csv") # 计算每列缺失值数量 missing_counts = df.isnull().sum() # 计算每列缺失值占比 missing_ratios = (missing_counts / len(df)) * 100 # 创建缺失值统计表格 missing_stats = pd.DataFrame({'特征名称': missing_counts.index,'缺失值数量': missing_counts.values,'缺失值占比(%)': missing_ratios.round(2)}) # 按缺失值数量降序排列 missing_stats = missing_stats.sort_values('缺失值数量', ascending=False) print("缺失值统计结果:") print(missing_stats[missing_stats['缺失值数量'] > 0] if not missing_stats.empty else "无缺失值") # 仅选择有缺失值的列进行可视化 has_missing = missing_counts[missing_counts > 0] if not has_missing.empty: # 原始数据缺失值热力图 plt.figure(figsize=(12, 8)) sns.heatmap(df.isnull(), cbar=False, cmap='coolwarm', yticklabels=False) plt.title('原始数据缺失值分布图(红色表示缺失值)', fontsize=15) plt.tight_layout() plt.show() # 用-1填充缺失值 df_filled = df.fillna(-1) # 填充后缺失值热力图 plt.figure(figsize=(12, 8)) sns.heatmap(df_filled.isnull(), cbar=False, cmap='coolwarm', yticklabels=False) plt.title('填充后缺失值分布图(无缺失值)', fontsize=15) plt.tight_layout() plt.show() # 保存填充后的文件 df_filled.to_csv("train_data_填充缺失值.csv", index=False) print("已将缺失值用-1填充,并保存为 train_data_填充缺失值.csv") else: print("数据集中没有缺失值,无需填充") # 异常值 import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats def setup_chinese_font(): chinese_fonts = ["SimHei", "WenQuanYi Micro Hei", "Heiti TC", "Microsoft YaHei", "Arial Unicode MS"] for font in chinese_fonts: try: plt.rcParams["font.family"] = [font, "sans-serif"] plt.rcParams["axes.unicode_minus"] = False # 解决负号显示问题 sns.set(font=font) return font except: continue plt.rcParams["font.family"] = ["sans-serif"] plt.rcParams["axes.unicode_minus"] = False return "默认字体" def anomaly_detection_and_handling(): setup_chinese_font() # 1. 数据加载与初步探索 print("\n=== 1. 数据加载与初步探索 ===") # 定义所有数值型特征列 numeric_cols = ['Installation_Year', 'Operational_Hours', 'Temperature_C', 'Vibration_mms', 'Sound_dB', 'Oil_Level_pct', 'Coolant_Level_pct', 'Power_Consumption_kW', 'Last_Maintenance_Days_Ago', 'Maintenance_History_Count', 'Failure_History_Count', 'Error_Codes_Last_30_Days', 'Remaining_Useful_Life_days', 'Laser_Intensity', 'Hydraulic_Pressure_bar', 'Coolant_Flow_L_min', 'Heat_Index', 'AI_Override_Events'] # 定义使用-1填充的特殊列 special_cols = ['Laser_Intensity', 'Hydraulic_Pressure_bar', 'Coolant_Flow_L_min', 'Heat_Index'] # 定义需要特殊处理的特征列表 non_negative_features = ['Vibration_mms', 'Power_Consumption_kW','Maintenance_History_Count', 'Failure_History_Count', 'Error_Codes_Last_30_Days', 'AI_Override_Events','Remaining_Useful_Life_days', 'Last_Maintenance_Days_Ago'] # 读取数据 df = pd.read_csv("train_data_填充缺失值.csv", usecols=numeric_cols) print("\n=== 数据预处理 - 物理意义修正 ===") # 1. 安装年份修正 (>2025 → 2025) df['Installation_Year'] = np.where(df['Installation_Year'] > 2025, 2025, df['Installation_Year']) print(f"安装年份修正: >2025 → 2025,修正后范围: [{df['Installation_Year'].min()}, {df['Installation_Year'].max()}]") # 2. 非负特征负数修正 (负数 → 0) for col in non_negative_features: original_neg_count = (df[col] < 0).sum() df[col] = np.maximum(df[col], 0) print(f"{col} 负数修正: {original_neg_count}个负样本 → 0") # 3. 温度特征修正 (<19℃ → 20℃) original_low_temp_count = (df['Temperature_C'] < 19).sum() df['Temperature_C'] = np.where(df['Temperature_C'] < 19, 20, df['Temperature_C']) print(f"温度修正: {original_low_temp_count}个<19℃样本 → 20℃") # 采样用于可视化 sample_df = df.sample(frac=0.1, random_state=42) print("\n--- 数据基本信息 ---") info_df = pd.DataFrame({'总样本量': [df.shape[0]],'特征数量': [df.shape[1]],'采样可视化样本量': [sample_df.shape[0]]}) print(info_df) # 2. Z-score异常值检测 print("\n=== 2. Z-score异常值检测 (Z-score > 3) ===") # 计算Z-score统计指标 stats_df = pd.DataFrame() for col in numeric_cols: if col in special_cols: # 排除-1值 data = df[col].replace(-1, np.nan) else: data = df[col] # 计算Z-score相关统计量 col_stats = pd.DataFrame({'mean': [data.mean()],'std': [data.std()],'min': [data.min()],'max': [data.max()],'z_threshold': [3]}, index=[col]) # 特殊特征的边界设置 if col == 'Installation_Year': col_stats['lower_bound'] = 1995 col_stats['upper_bound'] = 2025 elif col in non_negative_features: col_stats['lower_bound'] = 0 # 非负特征下界为0 # 上界使用Z-score计算 col_stats['upper_bound'] = col_stats['mean'] + 3 * col_stats['std'] else: # 其他特征使用Z-score边界 col_stats['lower_bound'] = col_stats['mean'] - 3 * col_stats['std'] col_stats['upper_bound'] = col_stats['mean'] + 3 * col_stats['std'] # 合并到stats_df stats_df = pd.concat([stats_df, col_stats]) print("\n--- Z-score统计与异常值边界 ---") print(stats_df[['mean', 'std', 'min', 'max', 'z_threshold', 'lower_bound', 'upper_bound']]) # 检测异常值 outliers = {} for col in numeric_cols: lower = stats_df.loc[col, 'lower_bound'] upper = stats_df.loc[col, 'upper_bound'] if col in special_cols: # 排除-1的值 outliers[col] = df[(df[col] != -1) & ((df[col] < lower) | (df[col] > upper))] else: outliers[col] = df[(df[col] < lower) | (df[col] > upper)] # 异常值统计 outliers_stats = pd.DataFrame({'特征名称': numeric_cols,'异常值数量': [len(outliers[col]) for col in numeric_cols], '异常值比例(%)': [round(len(outliers[col])/len(df)*100, 2) for col in numeric_cols]}).sort_values('异常值比例(%)', ascending=False) print("\n--- 异常值统计结果 ---") print(outliers_stats[outliers_stats['异常值数量'] > 0]) # 3. 异常值可视化(处理前) print("\n=== 3. 异常值可视化(处理前) ===") # 选择异常值比例最高的4个特征进行可视化 top_anomaly_features = outliers_stats[outliers_stats['异常值数量'] > 0]['特征名称'].head(4).tolist() # 箱线图 fig, axes = plt.subplots(2, 2, figsize=(16, 12)) axes = axes.flatten() for i, col in enumerate(top_anomaly_features): sns.boxplot(data=sample_df, x=col, ax=axes[i]) axes[i].set_title(f'{col} 箱线图(异常值标记)') axes[i].axvline(x=stats_df.loc[col, 'lower_bound'], color='r', linestyle='--', label='下界') axes[i].axvline(x=stats_df.loc[col, 'upper_bound'], color='g', linestyle='--', label='上界') axes[i].legend() plt.tight_layout() plt.show() # 直方图可视化 fig, axes = plt.subplots(2, 2, figsize=(16, 12)) axes = axes.flatten() for i, col in enumerate(top_anomaly_features): sns.histplot(data=sample_df, x=col, kde=True, ax=axes[i]) axes[i].set_title(f'{col} 分布直方图') axes[i].axvline(x=stats_df.loc[col, 'lower_bound'], color='r', linestyle='--') axes[i].axvline(x=stats_df.loc[col, 'upper_bound'], color='g', linestyle='--') plt.tight_layout() plt.show() # 4. 异常值处理 print("\n=== 4. 异常值处理 ===") df_processed = df.copy() # 不同特征采用不同处理策略 handling_strategies = { 'Power_Consumption_kW': 'capping', # 盖帽法 'Temperature_C': 'capping', 'Vibration_mms': 'median', # 中位数替换 'Installation_Year': 'capping' # 安装年份使用盖帽法 } # 应用处理策略 for col in top_anomaly_features: lower = stats_df.loc[col, 'lower_bound'] upper = stats_df.loc[col, 'upper_bound'] if col in handling_strategies and handling_strategies[col] == 'capping': # 盖帽法 df_processed[col] = df_processed[col].clip(lower, upper) strategy = "盖帽法" else: # 中位数替换 if col in special_cols: median = df_processed[df_processed[col] != -1][col].median() else: median = df_processed[col].median() df_processed[col] = np.where( (df_processed[col] < lower) | (df_processed[col] > upper), median, df_processed[col]) strategy = "中位数替换" print(f"\n--- {col} 处理完成 ---") result_df = pd.DataFrame({'处理策略': [strategy],'异常值数量': [len(outliers[col])], '处理后异常值数量': [len(df_processed[(df_processed[col] < lower) | (df_processed[col] > upper)])]}) print(result_df) # 5. 处理前后对比可视化 print("\n=== 5. 处理前后对比可视化 ===") # 采样处理后数据 processed_sample = df_processed.sample(frac=0.1, random_state=42) # 对比箱线图 fig, axes = plt.subplots(2, 2, figsize=(16, 12)) axes = axes.flatten() for i, col in enumerate(top_anomaly_features): # 原始数据 sns.boxplot(data=sample_df, x=col, ax=axes[i], color='lightblue', label='处理前') # 处理后数据 sns.boxplot(data=processed_sample, x=col, ax=axes[i], color='lightgreen', label='处理后') axes[i].set_title(f'{col} 处理前后对比') axes[i].legend() plt.tight_layout() plt.show() # 对比直方图 fig, axes = plt.subplots(2, 2, figsize=(16, 12)) axes = axes.flatten() for i, col in enumerate(top_anomaly_features): # 原始数据 sns.histplot(data=sample_df, x=col, kde=True, ax=axes[i], color='lightblue', label='处理前', alpha=0.5) # 处理后数据 sns.histplot(data=processed_sample, x=col, kde=True, ax=axes[i], color='lightgreen', label='处理后', alpha=0.5) axes[i].set_title(f'{col} 分布对比') axes[i].legend() plt.tight_layout() plt.show() # 6. 总结 print("\n=== 6. 异常值处理总结 ===") summary = pd.DataFrame({'特征名称': top_anomaly_features,'异常值数量': [len(outliers[col]) for col in top_anomaly_features], '处理策略': [ '盖帽法' if col in handling_strategies and handling_strategies[col] == 'capping' else '中位数替换' for col in top_anomaly_features], '处理效果': ['有效消除异常值' for _ in top_anomaly_features]}) print(summary) print("\n--- 后续建议 ---") print("1. 处理后的数据可直接用于模型训练") print("2. 建议补充设备类型特征,实现更精细化的异常值检测") print("3. 对关键特征建立实时监控机制,及时发现新的异常值") # 保存处理后的数据 df_processed.to_csv('train_data_Zscore异常值处理后.csv', index=False) print("\n处理结果保存:") print("已保存处理后的数据至 'train_data_Zscore异常值处理后.csv'") if __name__ == "__main__": anomaly_detection_and_handling() 请帮我分别仔细地做出接下来的这个任务题目,每个步骤都做得完美详细,可视化的形式展示。】(如有环境问题请忽略,后续我会自己在运行代码中配置,请给我完整的代码) 题1: 使用机器编号、机器类型、运行小时数、温度、振动、声音、油位、冷却液位、功耗、距 上次维护天数、维护历史次数、故障历史次数、人工智能监控、过去30天错误编码等特征,构 建回归模型,预测机床设备在7天内是否会发生故障。要求输出模型准确率、召回率、F1值,并 分析前5个最重要的特征。
最新发布
07-26
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值