dataframe中的x_source、y_source、x_target、y_target填充

使用Pandas和Numpy操作DataFrame:数据合并与更新
本文展示了如何使用Python的pandas库和numpy库对数据进行操作,包括创建DataFrame、应用元组转换、设置和更新数据值,以及在两个数据集之间进行匹配并更新特定列的值。
import pandas as pd
import numpy as np

a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
d = [10, 21, 10, 21, 21, 10, 23, 23, 25, 25]
f = [0, 1, 1, 2, 0, 0, 2, 2, 0, 0]
df = pd.DataFrame({'gid': a, 'x': d, 'y': f})
#
print(df)

aa = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
bb = [10, 21, 10, 21, 21, 10, 23, 23, 25, 25]
cc = [0, 1, 1, 2, 0, 0, 2, 2, 0, 0]
dd = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
ee = [10, 21, 10, 21, 21, 10, 23, 23, 25, 25]
ff = [0, 1, 1, 2, 0, 0, 2, 2, 0, 0]
gg = [10, 21, 10, 21, 21, 10, 23, 23, 25, 25]

df_ = pd.DataFrame({'id': aa, 'source': bb, 'target': cc, 'x_source': dd, 'x_target': ee, 'y_source': ff, 'y_target': gg})
#
print(df_)

df['x_y'] = df[['x', 'y']].apply(tuple, axis=1)
print(df)
df_dict = df[['x_y', 'gid']].set_index('gid').to_dict()['x_y']
print(df)

for index, row in df_.iterrows():
    print(index, row['source'])
    if row['source'] in df_dict:
        df_.loc[df_['source'] == row['source'], 'x_source'] = df_dict[row['source']][0]
        df_.loc[df_['source'] == row['source'], 'y_source'] = df_dict[row['source']][1]

    if row['target'] in df_dict:
        df_.loc[df_['target'] == row['target'], 'x_target'] = df_dict[row['target']][0]
        df_.loc[df_['target'] == row['target'], 'y_target'] = df_dict[row['target']][1]

print(df_)
这是代码:# -*- coding: utf-8 -*- import numpy as np import pandas as pd import torch import torch.nn as nn import torch.optim as optim from torch.utils.data import DataLoader, TensorDataset from sklearn.preprocessing import StandardScaler, LabelEncoder from scipy.stats import kurtosis, skew from scipy.signal import hilbert, butter, filtfilt from scipy.io import loadmat import os print("🚀 开始执行完整流程...") # ================== 1. 增强版特征提取函数 ================== def extract_bearing_features(signal, fs=32000): """ 提取单通道轴承信号的36维故障敏感特征(强化 NaN/Inf 处理) """ # 输入验证 if len(signal) == 0 or not isinstance(signal, np.ndarray): signal = np.array([]) signal = signal[~np.isnan(signal)] if len(signal) > 0 else np.array([]) if len(signal) < 10: # 太短无法分析 empty = {k: 0.0 for k in [ 'mean', 'std', 'peak', 'p2p', 'kurtosis', 'skewness', 'crest_factor', 'form_factor', 'dominant_freq' ]} env_keys = [f'env_{name}_ratio' for name in ['BPFI', 'BPFO', 'BSF', 'FTF']] empty.update({k: 0.0 for k in env_keys}) return {k: 0.0 for k in empty} # ---- 时域特征 ---- mean_val = float(np.mean(signal)) std_val = float(np.std(signal)) peak_val = float(np.max(np.abs(signal))) p2p_val = float(np.ptp(signal)) kurt_val = float(kurtosis(signal)) skew_val = float(skew(signal)) crest_factor = peak_val / (std_val + 1e-8) form_factor = std_val / (np.mean(np.abs(signal)) + 1e-8) # ---- 包络谱分析 ---- try: low, high = 2000, 8000 if high >= fs / 2: high = int(fs / 2 * 0.9) nyq = fs / 2 if low >= nyq: energy_ratios = {f'env_{k}_ratio': 0.0 for k in ['BPFI', 'BPFO', 'BSF', 'FTF']} else: b, a = butter(3, [low / nyq, high / nyq], btype='band') filtered = filtfilt(b, a, signal) envelope = np.abs(hilbert(filtered)) N = min(len(envelope), 4096) yf_env = np.fft.fft(envelope[:N]) xf_env = np.fft.fftfreq(N, 1/fs)[:N//2] mag_env = np.abs(yf_env[:N//2]) rpm = 1796 fr = rpm / 60 ball_diameter = 0.3 pitch_diameter = 1.0 num_balls = 8 contact_angle = 0.7 fault_frequencies = { 'BPFI': num_balls / 2 * (1 + contact_angle * ball_diameter / pitch_diameter) * fr, 'BPFO': num_balls / 2 * (1 - contact_angle * ball_diameter / pitch_diameter) * fr, 'BSF': fr / 2 * (1 - (ball_diameter / pitch_diameter)**2 * contact_angle**2) / ball_diameter, 'FTF': fr / 2 * (1 - contact_angle * ball_diameter / pitch_diameter) } energy_ratios = {} for name, freq in fault_frequencies.items(): idx_range = (xf_env >= freq * 0.8) & (xf_env <= freq * 1.2) band_energy = np.sum(mag_env[idx_range]) if idx_range.any() else 0 total_energy = np.sum(mag_env) + 1e-8 energy_ratios[f'env_{name}_ratio'] = float(band_energy / total_energy) except Exception as e: energy_ratios = {f'env_{k}_ratio': 0.0 for k in ['BPFI', 'BPFO', 'BSF', 'FTF']} # ---- 主频分析 ---- try: N_fft = min(len(signal), 4096) yf = np.fft.fft(signal[:N_fft]) xf = np.fft.fftfreq(N_fft, 1/fs)[:N_fft//2] mag_fft = 2.0 / N_fft * np.abs(yf[:N_fft//2]) dominant_freq = float(xf[np.argmax(mag_fft)]) if len(mag_fft) > 0 and np.any(mag_fft > 0) else 0.0 except: dominant_freq = 0.0 # ---- 合并并强制数值安全 ---- features = { 'mean': mean_val, 'std': std_val, 'peak': peak_val, 'p2p': p2p_val, 'kurtosis': kurt_val, 'skewness': skew_val, 'crest_factor': crest_factor, 'form_factor': form_factor, 'dominant_freq': dominant_freq } features.update(energy_ratios) # ✅ 最终兜底:确保所有值为有限数 for k, v in features.items(): if not np.isfinite(v): features[k] = 0.0 return features # ================== 2. 构建源域特征矩阵 ================== print("📁 加载源域元数据...") features_list = [] # 示例 df_meta 构造(你需要提供自己的数据) # 假设你已有 df_meta,每行包含 signals, fault_type 等字段 for idx, row in df_meta.iterrows(): signals = row['signals'] file_id = row['file_id'] fault_type = row['fault_type'] fault_size = row['fault_size'] load = row['load'] rpm = row['rpm'] path = row['path'] valid_sensor_found = False for sensor in ['DE', 'BA', 'FE']: sig = signals.get(sensor, np.array([])) if len(sig) == 0 or np.isnan(sig).all(): continue feat = extract_bearing_features(sig, fs=32000) feat = {f"{sensor}_{k}": float(v) for k, v in feat.items()} feat.update({ 'file_id': file_id, 'fault_type': fault_type, 'fault_size': fault_size, 'load': load, 'rpm': rpm, 'path': path, 'sensor': sensor }) features_list.append(feat) valid_sensor_found = True if not valid_sensor_found: print(f"⚠️ 文件 {file_id} 所有传感器均无效,跳过") if len(features_list) == 0: raise ValueError("❌ 未提取到任何有效特征,请检查输入信号") df_source_raw = pd.DataFrame(features_list) print(f"✅ 原始源域样本数: {len(df_source_raw)}") # --- 安全聚合:优先使用 DE --- def select_sensor_priority(group): for s in ['DE', 'BA', 'FE']: subset = group[group['sensor'] == s] if len(subset) > 0: return subset.iloc[0] return group.iloc[0] df_source = df_source_raw.groupby('file_id').apply(select_sensor_priority) df_source.reset_index(drop=True, inplace=True) feature_columns = [col for col in df_source.columns if col not in ['file_id', 'fault_type', 'fault_size', 'load', 'rpm', 'path', 'sensor']] df_X_source = df_source[feature_columns].copy() df_y_source = df_source[['file_id', 'fault_type']].copy() # ================== 3. 构建目标域特征矩阵 ================== print("📁 加载目标域数据...") target_dir = r"D:/数学建模/数据集/目标域_12kHz" target_features = [] if not os.path.exists(target_dir): raise FileNotFoundError(f"目录不存在: {target_dir}") for file_name in sorted(os.listdir(target_dir)): if not file_name.endswith('.mat'): continue path = os.path.join(target_dir, file_name) try: mat_data = loadmat(path) keys = [k for k in mat_data.keys() if not k.startswith('__')] if not keys: print(f"跳过空文件: {file_name}") continue signal = mat_data[keys[0]].flatten() feats_raw = extract_bearing_features(signal, fs=12000) feats = {} for sensor in ['DE', 'BA', 'FE']: for k, v in feats_raw.items(): feats[f"{sensor}_{k}"] = v feats['file_id'] = file_name.split('.')[0] target_features.append(feats) except Exception as e: print(f"处理 {file_name} 失败: {e}") if len(target_features) == 0: raise ValueError("❌ 目标域未提取到任何特征") df_target = pd.DataFrame(target_features).set_index('file_id') # 对齐列 for col in feature_columns: if col not in df_target.columns: df_target[col] = 0.0 df_target = df_target[feature_columns] # ================== 4. 强力清洗 + 强制标准化 ================== print("🧹 开始强力清洗与强制标准化...") # --- 步骤 1: 提取并转为 float64 --- df_X_clean = df_X_source[feature_columns].copy() for col in df_X_clean.columns: df_X_clean[col] = pd.to_numeric(df_X_clean[col], errors='coerce') # --- 步骤 2: 填充缺失值 --- median_fill = df_X_clean.median(numeric_only=True) df_X_clean.fillna(median_fill, inplace=True) df_X_clean.fillna(0.0, inplace=True) # 验证是否还有 NaN assert not df_X_clean.isnull().any().any(), "清洗失败:仍有 NaN" # --- 步骤 3: 检查每列标准差,避免零方差 --- zero_std_cols = df_X_clean.std() == 0 if zero_std_cols.any(): print(f"⚠️ 以下 {zero_std_cols.sum()} 列方差为 0,将添加微小噪声扰动:") for col in df_X_clean.columns[zero_std_cols]: print(f" - {col}") df_X_clean.loc[df_X_clean[col].index[0], col] += 1e-6 # 扰动第一个值 # --- 步骤 4: 重新标准化 --- scaler = StandardScaler() X_np = df_X_clean.astype(np.float64).values X_scaled = scaler.fit_transform(X_np) # 保证在这里完成 fit 和 transform df_source_norm = pd.DataFrame(X_scaled, index=df_X_clean.index, columns=feature_columns) # --- 步骤 5: 验证标准化效果 --- X_s_tensor = torch.tensor(df_source_norm.values, dtype=torch.float32) std_per_feature = X_s_tensor.std(dim=0) mean_per_feature = X_s_tensor.mean(dim=0) print("✅ 标准化后统计验证:") print(" Mean (abs avg):", mean_per_feature.abs().mean().item()) print(" Std (avg) :", std_per_feature.mean().item()) print(" Std (min) :", std_per_feature.min().item()) print(" Std (max) :", std_per_feature.max().item()) # 如果 std 偏离太多,手动警告 if std_per_feature.mean().item() < 0.8 or std_per_feature.mean().item() > 1.2: print("❗ 注意:平均标准差偏离 1.0,建议检查特征分布") print(f"✅ 标准化完成,数据已保存至: {output_dir}") # ================== 5. 输出摘要 ================== print("\n📊 数据摘要:") print("源域样本数:", len(df_source_norm)) print("目标域样本数:", len(df_target_norm)) print("特征数量:", len(df_source_norm.columns)) print("标签分布:\n", df_y_source['fault_type'].value_counts()) # ================== 5. CORAL 模型定义 ================== class CORAL(nn.Module): def __init__(self, input_dim=39, hidden_dim=64, num_classes=5): super(CORAL, self).__init__() self.feature_extractor = nn.Sequential( nn.Linear(input_dim, hidden_dim), nn.ReLU(), nn.Dropout(0.3), nn.Linear(hidden_dim, hidden_dim), nn.ReLU() ) self.classifier = nn.Linear(hidden_dim, num_classes) def forward(self, x): features = self.feature_extractor(x) logits = self.classifier(features) return features, logits def coral_loss(source, target): if source.size(0) != target.size(0): min_b = min(source.size(0), target.size(0)) source, target = source[:min_b], target[:min_b] if source.size(0) == 1: return torch.tensor(0.0, device=source.device) src_cent = source - source.mean(0, keepdim=True) tgt_cent = target - target.mean(0, keepdim=True) cov_s = torch.mm(src_cent.t(), src_cent) / (source.size(0) - 1) cov_t = torch.mm(tgt_cent.t(), tgt_cent) / (target.size(0) - 1) return torch.norm(cov_s - cov_t, p='fro') ** 2 # ================== 6. 训练准备 ================== le = LabelEncoder() y_encoded = le.fit_transform(df_y_source['fault_type']) num_classes = len(le.classes_) X_s = torch.tensor(df_source_norm.values, dtype=torch.float32) y_s = torch.tensor(y_encoded, dtype=torch.long) X_t = torch.tensor(df_target_norm.values, dtype=torch.float32) dataset_s = TensorDataset(X_s, y_s) loader_s = DataLoader(dataset_s, batch_size=32, shuffle=True) loader_t = DataLoader(TensorDataset(X_t), batch_size=32, shuffle=True) # 模型初始化 model = CORAL(input_dim=39, num_classes=num_classes) optimizer = optim.Adam(model.parameters(), lr=1e-4) criterion_cls = nn.CrossEntropyLoss() print("🏷️ 类别映射:") for i, cls in enumerate(le.classes_): print(f" {i} -> {cls}") # ================== 7. 训练循环 ================== print("🔥 开始训练...") for epoch in range(100): model.train() iter_t = iter(loader_t) total_cls = 0.0 total_coral = 0.0 total_loss = 0.0 for x_s_batch, y_s_batch in loader_s: try: (x_t_batch,) = next(iter_t) except StopIteration: iter_t = iter(loader_t) (x_t_batch,) = next(iter_t) feat_s, logit_s = model(x_s_batch) feat_t, _ = model(x_t_batch) cls_loss = criterion_cls(logit_s, y_s_batch) coral_l = coral_loss(feat_s, feat_t) loss = cls_loss + 0.001 * coral_l optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) optimizer.step() total_loss += loss.item() total_cls += cls_loss.item() total_coral += coral_l.item() if epoch % 20 == 0: avg_cls = total_cls / len(loader_s) avg_coral = total_coral / len(loader_s) print(f"Epoch {epoch}: Loss={total_loss:.4f} (Cls={avg_cls:.4f}, Coral={avg_coral:.4f})") print("🎉 训练完成!")
10-09
import pandas as pd import re import numpy as np import tkinter as tk from tkinter import filedialog, messagebox, ttk from urllib.parse import urlparse, parse_qs from collections import OrderedDict import os from openpyxl.styles import Font, Alignment from openpyxl import Workbook from openpyxl.utils.dataframe import dataframe_to_rows import webbrowser class ExcelProcessorApp: def __init__(self, root): self.root = root self.root.title("Excel表格处理器") self.root.geometry("700x550") # 增加高度以容纳预览按钮 # 初始化变量 self.source_file_path = tk.StringVar() self.hyperlink_source_path = tk.StringVar() self.output_file_path = tk.StringVar() self.preview_df = None # 存储预览数据 # 创建界面 self.create_widgets() def create_widgets(self): # 主框架 main_frame = ttk.Frame(self.root, padding="20") main_frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) # 源文件选择区域 ttk.Label(main_frame, text="源表文件:").grid(row=0, column=0, sticky=tk.W, pady=5) ttk.Entry(main_frame, textvariable=self.source_file_path, width=50).grid(row=0, column=1, padx=5, pady=5) ttk.Button(main_frame, text="浏览...", command=self.browse_source_file).grid(row=0, column=2, padx=5, pady=5) # 超链接源表选择区域 ttk.Label(main_frame, text="超链接源表:").grid(row=1, column=0, sticky=tk.W, pady=5) ttk.Entry(main_frame, textvariable=self.hyperlink_source_path, width=50).grid(row=1, column=1, padx=5, pady=5) ttk.Button(main_frame, text="浏览...", command=self.browse_hyperlink_source_file).grid(row=1, column=2, padx=5, pady=5) # 输出文件选择区域 ttk.Label(main_frame, text="输出文件:").grid(row=2, column=0, sticky=tk.W, pady=5) ttk.Entry(main_frame, textvariable=self.output_file_path, width=50).grid(row=2, column=1, padx=5, pady=5) ttk.Button(main_frame, text="浏览...", command=self.browse_output_file).grid(row=2, column=2, padx=5, pady=5) # 按钮框架 button_frame = ttk.Frame(main_frame) button_frame.grid(row=3, column=0, columnspan=3, pady=10) # 预览按钮 ttk.Button(button_frame, text="预览结果", command=self.preview_results).pack(side=tk.LEFT, padx=5) # 处理按钮 ttk.Button(button_frame, text="开始处理", command=self.process_file).pack(side=tk.LEFT, padx=5) # 进度条 self.progress = ttk.Progressbar(main_frame, mode='indeterminate') self.progress.grid(row=4, column=0, columnspan=3, sticky=(tk.W, tk.E), pady=10) # 状态标签 self.status_label = ttk.Label(main_frame, text="请选择源表和输出文件") self.status_label.grid(row=5, column=0, columnspan=3, pady=10) # 配置网格权重 main_frame.columnconfigure(1, weight=1) self.root.columnconfigure(0, weight=1) self.root.rowconfigure(0, weight=1) def browse_source_file(self): file_path = filedialog.askopenfilename( title="选择源表文件", filetypes=[("Excel文件", "*.xlsx"), ("所有文件", "*.*")] ) if file_path: self.source_file_path.set(file_path) # 自动设置输出文件路径 dir_name = os.path.dirname(file_path) base_name = os.path.basename(file_path) name, ext = os.path.splitext(base_name) output_path = os.path.join(dir_name, f"{name}_处理结果{ext}") self.output_file_path.set(output_path) self.status_label.config(text="已选择源表文件,请选择超链接源表") def browse_hyperlink_source_file(self): file_path = filedialog.askopenfilename( title="选择超链接源表文件", filetypes=[("Excel文件", "*.xlsx"), ("所有文件", "*.*")] ) if file_path: self.hyperlink_source_path.set(file_path) self.status_label.config(text="已选择超链接源表,请点击开始处理") def browse_output_file(self): file_path = filedialog.asksaveasfilename( title="选择输出文件", defaultextension=".xlsx", filetypes=[("Excel文件", "*.xlsx"), ("所有文件", "*.*")] ) if file_path: self.output_file_path.set(file_path) def preview_results(self): """预览处理结果""" if not self.source_file_path.get(): messagebox.showerror("错误", "请先选择源表文件") return if not self.hyperlink_source_path.get(): messagebox.showerror("错误", "请先选择超链接源表文件") return # 开始处理预览 self.progress.start() self.status_label.config(text="正在生成预览,请稍候...") self.root.update() try: # 调用处理函数,生成全部预览数据 self.preview_df = self.process_excel_file_preview( self.source_file_path.get(), self.hyperlink_source_path.get() ) # 停止进度条 self.progress.stop() self.status_label.config(text="预览生成完成") # 显示预览窗口 self.show_preview(self.preview_df) except Exception as e: self.progress.stop() self.status_label.config(text="预览生成失败") messagebox.showerror("错误", f"生成预览时发生错误:\n{str(e)}") def show_preview(self, preview_df): """显示预览窗口""" # 创建预览窗口 preview_window = tk.Toplevel(self.root) preview_window.title("处理结果预览") preview_window.geometry("1200x700") # 增加宽度以容纳更多列 # 创建框架 frame = ttk.Frame(preview_window, padding="10") frame.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) # 添加标题 title_label = ttk.Label(frame, text="处理结果预览", font=("Arial", 12, "bold")) title_label.grid(row=0, column=0, columnspan=2, pady=(0, 10)) # 创建表格框架 table_frame = ttk.Frame(frame) table_frame.grid(row=1, column=0, columnspan=2, sticky=(tk.W, tk.E, tk.N, tk.S)) # 创建表格 columns = list(preview_df.columns) tree = ttk.Treeview(table_frame, columns=columns, show="headings", height=20) # 设置列标题和宽度 col_widths = { '顺序': 50, '商品名称': 150, '商品链接': 150, '商品ID': 80, '优惠方式(xx元优惠券/拍立减xx元)': 150, '优惠券链接': 150, '优惠券提取': 100, '公式放这一列': 100, '优惠券1': 100, '优惠券2': 100, '优惠券3': 100, '优惠券4': 100, '优惠券5': 100, '优惠券链接(提取)': 150, '优惠券链接1': 150, '优惠券链接2': 150, '优惠券链接3': 150, '优惠券链接4': 150, '优惠券链接5': 150 } for col in columns: tree.heading(col, text=col) width = col_widths.get(col, 100) tree.column(col, width=width, minwidth=80, anchor=tk.CENTER) # 添加滚动条 vsb = ttk.Scrollbar(table_frame, orient="vertical", command=tree.yview) hsb = ttk.Scrollbar(table_frame, orient="horizontal", command=tree.xview) tree.configure(yscrollcommand=vsb.set, xscrollcommand=hsb.set) # 布局 tree.grid(row=0, column=0, sticky=(tk.W, tk.E, tk.N, tk.S)) vsb.grid(row=0, column=1, sticky=(tk.N, tk.S)) hsb.grid(row=1, column=0, sticky=(tk.W, tk.E)) # 添加数据 for _, row in preview_df.iterrows(): tree.insert("", "end", values=list(row)) # 绑定点击事件到C列(商品链接) tree.bind("<Button-1>", lambda event: self.on_tree_click(event, tree, preview_window)) # 创建可编辑的E列和F列 self.create_editable_columns(tree, preview_window, preview_df) # 添加按钮 button_frame = ttk.Frame(frame) button_frame.grid(row=2, column=0, columnspan=2, pady=10) ttk.Button(button_frame, text="确认并开始处理", command=lambda: [preview_window.destroy(), self.process_file()]).pack(side=tk.LEFT, padx=5) ttk.Button(button_frame, text="关闭预览", command=preview_window.destroy).pack(side=tk.LEFT, padx=5) # 配置权重 frame.columnconfigure(0, weight=1) frame.rowconfigure(1, weight=1) table_frame.columnconfigure(0, weight=1) table_frame.rowconfigure(0, weight=1) preview_window.columnconfigure(0, weight=1) preview_window.rowconfigure(0, weight=1) # 存储tree和df引用以便后续使用 self.preview_tree = tree self.preview_window = preview_window self.preview_df = preview_df def create_editable_columns(self, tree, window, df): """为E列和F列创建可编辑的Entry小部件""" # 获取E列和F列的索引 columns = tree["columns"] # 找到E列和F列在Treeview中的索引 e_col_index = None f_col_index = None for i, col in enumerate(columns): if tree.heading(col)["text"] == "优惠券链接": # E列 e_col_index = i elif tree.heading(col)["text"] == "优惠券提取": # F列 f_col_index = i if e_col_index is None or f_col_index is None: messagebox.showerror("错误", "无法找到可编辑列") return # 存储所有Entry小部件的引用 self.entry_widgets_e = {} self.entry_widgets_f = {} # 为每一行创建Entry小部件 for i, item_id in enumerate(tree.get_children()): # 获取行位置和大小 e_bbox = tree.bbox(item_id, column=e_col_index) f_bbox = tree.bbox(item_id, column=f_col_index) if e_bbox: # 创建E列的Entry entry_e = ttk.Entry(window) entry_e.place(x=e_bbox[0], y=e_bbox[1], width=e_bbox[2], height=e_bbox[3]) # 设置初始值 current_value = df.iloc[i, e_col_index] entry_e.insert(0, str(current_value) if not pd.isna(current_value) else "") # 绑定事件以更新DataFrame entry_e.bind("<FocusOut>", lambda e, row=i, col=e_col_index: self.update_dataframe_from_entry(e, row, col)) # 绑定回车键事件 entry_e.bind("<Return>", lambda e: e.widget.master.focus()) self.entry_widgets_e[item_id] = entry_e if f_bbox: # 创建F列的Entry entry_f = ttk.Entry(window) entry_f.place(x=f_bbox[0], y=f_bbox[1], width=f_bbox[2], height=f_bbox[3]) # 设置初始值 current_value = df.iloc[i, f_col_index] entry_f.insert(0, str(current_value) if not pd.isna(current_value) else "") # 绑定事件以更新DataFrame entry_f.bind("<FocusOut>", lambda e, row=i, col=f_col_index: self.update_dataframe_from_entry(e, row, col)) # 绑定回车键事件 entry_f.bind("<Return>", lambda e: e.widget.master.focus()) self.entry_widgets_f[item_id] = entry_f # 绑定Treeview的滚动事件,以便在滚动时更新Entry位置 def update_entry_positions(event=None): for item_id in tree.get_children(): e_bbox = tree.bbox(item_id, column=e_col_index) f_bbox = tree.bbox(item_id, column=f_col_index) if e_bbox and item_id in self.entry_widgets_e: self.entry_widgets_e[item_id].place(x=e_bbox[0], y=e_bbox[1], width=e_bbox[2], height=e_bbox[3]) if f_bbox and item_id in self.entry_widgets_f: self.entry_widgets_f[item_id].place(x=f_bbox[0], y=f_bbox[1], width=f_bbox[2], height=f_bbox[3]) # 绑定滚动事件 tree.bind("<MouseWheel>", update_entry_positions) tree.bind("<Button-4>", update_entry_positions) # Linux上滚 tree.bind("<Button-5>", update_entry_positions) # Linux下滚 # 获取滚动条并绑定事件 vsb = tree.master.children.get('!scrollbar') hsb = tree.master.children.get('!scrollbar2') if vsb: vsb.configure(command=lambda *args: [tree.yview(*args), update_entry_positions()]) if hsb: hsb.configure(command=lambda *args: [tree.xview(*args), update_entry_positions()]) # 初始更新位置 window.after(100, update_entry_positions) def update_dataframe_from_entry(self, event, row, col): """从Entry小部件更新DataFrame""" widget = event.widget new_value = widget.get() if self.preview_df is not None and row < len(self.preview_df): self.preview_df.iloc[row, col] = new_value # 同时更新Treeview中的显示 item_id = self.preview_tree.get_children()[row] values = list(self.preview_tree.item(item_id)['values']) values[col] = new_value self.preview_tree.item(item_id, values=values) def on_tree_click(self, event, tree, window): """处理树形视图的点击事件""" region = tree.identify("region", event.x, event.y) if region == "cell": column = tree.identify_column(event.x) row = tree.identify_row(event.y) # 检查是否是C列(商品链接列) if column == "#3": # 列索引从1开始,C列是第3列 item = tree.item(row) values = item['values'] if len(values) >= 3: # 确保有足够的值 url = values[2] # C列是第3个值(索引2) if url and isinstance(url, str) and url.startswith(('http://', 'https://')): webbrowser.open_new(url) def process_file(self): if not self.source_file_path.get(): messagebox.showerror("错误", "请先选择源表文件") return if not self.hyperlink_source_path.get(): messagebox.showerror("错误", "请先选择超链接源表文件") return if not self.output_file_path.get(): messagebox.showerror("错误", "请先选择输出文件") return # 开始处理 self.progress.start() self.status_label.config(text="正在处理文件,请稍候...") self.root.update() try: # 如果有预览数据,使用预览数据 if self.preview_df is not None: # 保存结果 with pd.ExcelWriter(self.output_file_path.get(), engine='openpyxl') as writer: # 保存Sheet2 self.preview_df.to_excel(writer, sheet_name='Sheet2', index=False) # 创建超链接工作表 self.create_hyperlinks_sheet(writer, self.hyperlink_source_path.get(), self.preview_df) else: # 调用处理函数 self.process_excel_file( self.source_file_path.get(), self.hyperlink_source_path.get(), self.output_file_path.get() ) # 处理完成 self.progress.stop() self.status_label.config(text="处理完成!") messagebox.showinfo("成功", f"文件处理完成,已保存到:\n{self.output_file_path.get()}") except Exception as e: self.progress.stop() self.status_label.config(text="处理失败") messagebox.showerror("错误", f"处理文件时发生错误:\n{str(e)}") # 以下所有其他方法保持不变 def process_excel_file_preview(self, source_path, hyperlink_source_path): """处理Excel文件用于预览""" # 读取源表 source_df = pd.read_excel(source_path, sheet_name='Sheet1') # 清除C列(优惠方式列)的空格 source_df.iloc[:, 2] = source_df.iloc[:, 2].apply(self.clean_text) # 创建目标DataFrame target_columns = [ '顺序', '商品名称', '商品链接', '商品ID', '优惠方式(xx元优惠券/拍立减xx元)', '优惠券链接', '优惠券提取', '公式放这一列', '优惠券1', '优惠券2', '优惠券3', '优惠券4', '优惠券5', '优惠券链接(提取)', '优惠券链接1', '优惠券链接2', '优惠券链接3', '优惠券链接4', '优惠券链接5' ] target_df = pd.DataFrame(columns=target_columns) # 填充数据 for idx, row in source_df.iterrows(): # 顺序 target_df.loc[idx, '顺序'] = idx + 1 # 商品名称和链接 target_df.loc[idx, '商品名称'] = row.iloc[0] # A列 target_df.loc[idx, '商品链接'] = row.iloc[1] # B列 # 商品ID item_id = self.extract_item_id(row.iloc[1]) target_df.loc[idx, '商品ID'] = item_id # 优惠方式(已清除空格) discount_info = row.iloc[2] # C列 target_df.loc[idx, '优惠方式(xx元优惠券/拍立减xx元)'] = discount_info # 优惠券链接 coupon_links = row.iloc[3] # D列 target_df.loc[idx, '优惠券链接'] = coupon_links # 应用ExtractCouponInfo函数 coupon_extract = self.extract_coupon_info(discount_info) target_df.loc[idx, '优惠券提取'] = coupon_extract # 应用FormatCoupon函数 formatted_coupon = self.format_coupon(coupon_extract) target_df.loc[idx, '公式放这一列'] = formatted_coupon # 根据H列的内容填充I列到M列 coupon_values = self.fill_coupon_columns(formatted_coupon, discount_info) for i in range(5): target_df.loc[idx, f'优惠券{i+1}'] = coupon_values[i] # 处理优惠券链接 urls, coupon_ids = self.split_coupon_links(coupon_links) # 优惠券链接提取 (用逗号分隔的URL) target_df.loc[idx, '优惠券链接(提取)'] = ", ".join(urls) if urls else "" # 填充优惠券链接1-5列(O列到S列) for i in range(5): if i < len(urls): target_df.loc[idx, f'优惠券链接{i+1}'] = urls[i] else: target_df.loc[idx, f'优惠券链接{i+1}'] = "" return target_df def clean_text(self, text): """清除文本中的多余空格""" if pd.isna(text) or not isinstance(text, str): return text # 去除前后空格 cleaned = text.strip() # 将多个连续空格替换为单个空格 cleaned = re.sub(r'\s+', ' ', cleaned) return cleaned def extract_coupon_info(self, text): """模拟VBA中的ExtractCouponInfo函数""" if pd.isna(text) or not isinstance(text, str): return "" # 第一段正则表达式:匹配优惠券相关短语 pattern1 = r"-?[1-9]\d*[\u4e00-\u9fa5][优][惠][券]|[领]-?[1-9]\d*[\u4e00-\u9fa5]{3,6}|[领][\u4e00-\u9fa5]{1,4}-?[1-9]\d*[元]|[领]-?[1-9]\d*[\u4e00-\u9fa5]{1,3}|[优][惠][券]-?[1-9]\d*|[领][券]-?[1-9]\d*|-?[1-9]\d*[优][惠][券]|[拍][立][减]|[直][接][拍]|[达][人][闪][降]|[闪][降]" # 第二段正则表达式:提取数字部分或特定关键词 pattern2 = r"-?[1-9]\d*[元]|-?[1-9]\d*|[拍][立][减]|[直][接][拍]|[达][人][闪][降]|[闪][降]" # 正则表达式匹配双破折号分隔的数字格式 pattern_dash = r"\b(?:\d{1,3}--)+\d{1,3}\b" # 执行第一段匹配 matches1 = re.findall(pattern1, text) # 执行双破折号分隔数字的匹配 dash_matches = re.findall(pattern_dash, text) # 使用有序字典来存储唯一值和它们的出现顺序 unique_values = OrderedDict() counter = 0 # 对每个匹配项执行第二段匹配 for match in matches1: matches2 = re.findall(pattern2, match) for value in matches2: # 添加到字典(自动去重)并记录顺序 if value not in unique_values: unique_values[value] = counter counter += 1 # 处理双破折号分隔的数字 for dash_match in dash_matches: # 分割数字并添加"元"后缀 numbers = dash_match.split("--") for number in numbers: number_with_yuan = number + "元" # 添加到字典(自动去重)并记录顺序 if number_with_yuan not in unique_values: unique_values[number_with_yuan] = counter counter += 1 # 按照原始顺序将值连接成字符串 result = "/".join(unique_values.keys()) return result def format_coupon(self, input_value): """模拟VBA中的FormatCoupon函数""" if pd.isna(input_value) or (isinstance(input_value, str) and input_value.strip() == ""): return "" # 确保处理字符串类型 if not isinstance(input_value, str): input_str = str(input_value) else: input_str = input_value # 检查是否已包含优惠券后缀 if "元优惠券" in input_str: return input_str # 处理带斜杠的分段情况 if "/" in input_str: segments = input_str.split("/") result_segments = [] unique_segments = set() # 处理每一段并去重 for segment in segments: # 递归处理每一段 processed_segment = self.format_coupon(segment) # 去重 if processed_segment not in unique_segments: unique_segments.add(processed_segment) result_segments.append(processed_segment) return "/".join(result_segments) # 关键词检测 keywords = ["拍立减", "直接拍", "闪降", "达人闪降"] for keyword in keywords: if keyword in input_str: return "直接拍" # 处理包含"元"的情况 if "元" in input_str: return input_str.replace("元", "元优惠券") # 处理纯数字情况 if input_str.isdigit(): if int(input_str) > 0: return input_str + "元优惠券" else: return input_str # 默认返回原始值 return input_str def extract_item_id(self, url): """从商品链接中提取商品ID""" if pd.isna(url) or not isinstance(url, str): return "" # 尝试从URL参数中提取id try: parsed_url = urlparse(url) query_params = parse_qs(parsed_url.query) if 'id' in query_params: return query_params['id'][0] elif 'ft.t' in url and 'id=' in url: # 处理淘宝链接格式 match = re.search(r'id=(\d+)', url) if match: return match.group(1) except: pass # 如果上述方法失败,尝试使用正则表达式匹配 match = re.search(r'(?:id=|&id=)(\d+)', url) if match: return match.group(1) return "" def split_coupon_links(self, links_text): """拆分优惠券链接""" if pd.isna(links_text) or not isinstance(links_text, str): return [], [] # 使用正则表达式提取所有URL urls = re.findall(r'https?://[^\s<>"{}|\\^`[\]]+', links_text) # 提取优惠券ID或关键信息 coupon_ids = [] for url in urls: # 尝试从URL中提取优惠券ID match = re.search(r'(?:activityId|activity_id)=([^&]+)', url) if match: coupon_ids.append(match.group(1)) else: # 如果没有activityId,尝试提取其他标识 match = re.search(r'/([^/]+)(?=\?|$)', url) if match: coupon_ids.append(match.group(1)) else: coupon_ids.append("优惠券") return urls, coupon_ids def extract_coupon_amounts(self, discount_text): """从优惠信息中提取优惠券金额""" if pd.isna(discount_text) or not isinstance(discount_text, str): return [] # 匹配优惠券金额模式 patterns = [ r"领(\d+)元优惠券", # 匹配"领XX元优惠券" r"优惠券(\d+)元", # 匹配"优惠券XX元" r"(\d+)元优惠券", # 匹配"XX元优惠券" r"领券(\d+)元", # 匹配"领券XX元" r"领(\d+)元券", # 匹配"领XX元券" r"(\d+)元券", # 匹配"XX元券" r"立减(\d+)元", # 匹配"立减XX元" r"减(\d+)元", # 匹配"减XX元" ] amounts = [] for pattern in patterns: matches = re.findall(pattern, discount_text) amounts.extend(matches) # 去重并排序 unique_amounts = list(set(amounts)) unique_amounts.sort(key=lambda x: int(x)) return unique_amounts def fill_coupon_columns(self, formatted_coupon, discount_text): """根据H列的内容填充I列到M列""" # 如果H列为空,则I列到M列保持留空 if pd.isna(formatted_coupon) or formatted_coupon == "": return [""] * 5 # 如果H列是"直接拍",则I列填充"直接拍",其他列留空 if formatted_coupon == "直接拍": return ["直接拍"] + [""] * 4 # 如果H列包含斜杠,则按斜杠分割 if "/" in formatted_coupon: coupons = formatted_coupon.split("/") # 只取前5个 return coupons[:5] + [""] * (5 - len(coupons)) # 如果H列是单个优惠券金额,则提取金额并添加"元优惠券"后缀 if formatted_coupon.endswith("元优惠券"): return [formatted_coupon] + [""] * 4 # 默认情况下,从优惠信息中提取金额并添加"元优惠券"后缀 amounts = self.extract_coupon_amounts(discount_text) formatted_amounts = [f"{amount}元优惠券" for amount in amounts] # 只取前5个 return formatted_amounts[:5] + [""] * (5 - len(formatted_amounts)) def create_hyperlinks_sheet(self, writer, hyperlink_source_path, sheet2_df): """创建超链接工作表,A列到C列来自源表,D列到H列根据Sheet2相关列创建""" # 读取超链接源表 hyperlink_df = pd.read_excel(hyperlink_source_path) # 创建新的工作表 hyperlink_sheet_name = "超链接表" # 获取工作表对象 workbook = writer.book worksheet = workbook.create_sheet(hyperlink_sheet_name) # 写入A列到C列的数据(来自超链接源表) for r_idx, row in enumerate(dataframe_to_rows(hyperlink_df, index=False, header=True), 1): for c_idx, value in enumerate(row[:3], 1): # 只取前三列 worksheet.cell(row=r_idx, column=c_idx, value=value) # 添加D列到H列的标题 headers = ["优惠券1", "优惠券2", "优惠券3", "优惠券4", "优惠券5"] for c_idx, header in enumerate(headers, 4): # 从第4列开始 worksheet.cell(row=1, column=c_idx, value=header) # 模拟VBA宏的逻辑 success_count = [0, 0, 0, 0, 0] # D, E, F, G, H列的成功计数 empty_count = [0, 0, 0, 0, 0] # D, E, F, G, H列的空计数 direct_buy_count = [0, 0, 0, 0, 0] # D, E, F, G, H列的直接拍计数 error_count = 0 # 获取最后一行 last_row = min(len(hyperlink_df), len(sheet2_df)) + 1 # +1 因为第一行是标题 # 处理每一行 for i in range(2, last_row + 1): # 从第2行开始(跳过标题行) # 获取Sheet2中对应行的数据 sheet2_row_idx = i - 2 # 转换为0-based索引 # 处理D列 (Sheet2的I列和O列) try: url = sheet2_df.iloc[sheet2_row_idx, 14] if len(sheet2_df.columns) > 14 else "" # O列 text = sheet2_df.iloc[sheet2_row_idx, 8] if len(sheet2_df.columns) > 8 else "" # I列 target_cell = worksheet.cell(row=i, column=4) # D列 self.process_single_hyperlink(target_cell, url, text, success_count, 0, empty_count, 0, direct_buy_count, 0) except Exception as e: error_count += 1 print(f"处理D列第{i}行时出错: {e}") # 处理E列 (Sheet2的J列和P列) try: url = sheet2_df.iloc[sheet2_row_idx, 15] if len(sheet2_df.columns) > 15 else "" # P列 text = sheet2_df.iloc[sheet2_row_idx, 9] if len(sheet2_df.columns) > 9 else "" # J列 target_cell = worksheet.cell(row=i, column=5) # E列 self.process_single_hyperlink(target_cell, url, text, success_count, 1, empty_count, 1, direct_buy_count, 1) except Exception as e: error_count += 1 print(f"处理E列第{i}行时出错: {e}") # 处理F列 (Sheet2的K列和Q列) try: url = sheet2_df.iloc[sheet2_row_idx, 16] if len(sheet2_df.columns) > 16 else "" # Q列 text = sheet2_df.iloc[sheet2_row_idx, 10] if len(sheet2_df.columns) > 10 else "" # K列 target_cell = worksheet.cell(row=i, column=6) # F列 self.process_single_hyperlink(target_cell, url, text, success_count, 2, empty_count, 2, direct_buy_count, 2) except Exception as e: error_count += 1 print(f"处理F列第{i}行时出错: {e}") # 处理G列 (Sheet2的L列和R列) try: url = sheet2_df.iloc[sheet2_row_idx, 17] if len(sheet2_df.columns) > 17 else "" # R列 text = sheet2_df.iloc[sheet2_row_idx, 11] if len(sheet2_df.columns) > 11 else "" # L列 target_cell = worksheet.cell(row=i, column=7) # G列 self.process_single_hyperlink(target_cell, url, text, success_count, 3, empty_count, 3, direct_buy_count, 3) except Exception as e: error_count += 1 print(f"处理G列第{i}行时出错: {e}") # 处理H列 (Sheet2的M列和S列) try: url = sheet2_df.iloc[sheet2_row_idx, 18] if len(sheet2_df.columns) > 18 else "" # S列 text = sheet2_df.iloc[sheet2_row_idx, 12] if len(sheet2_df.columns) > 12 else "" # M列 target_cell = worksheet.cell(row=i, column=8) # H列 self.process_single_hyperlink(target_cell, url, text, success_count, 4, empty_count, 4, direct_buy_count, 4) except Exception as e: error_count += 1 print(f"处理H列第{i}行时出错: {e}") # 添加处理结果统计 result_cell = worksheet.cell(row=last_row + 2, column=1) result_cell.value = "处理结果统计:" result_cell = worksheet.cell(row=last_row + 3, column=1) result_cell.value = f"D列成功: {success_count[0]} 个,空白: {empty_count[0]} 个,直接拍: {direct_buy_count[0]} 个" result_cell = worksheet.cell(row=last_row + 4, column=1) result_cell.value = f"E列成功: {success_count[1]} 个,空白: {empty_count[1]} 个,直接拍: {direct_buy_count[1]} 个" result_cell = worksheet.cell(row=last_row + 5, column=1) result_cell.value = f"F列成功: {success_count[2]} 个,空白: {empty_count[2]} 个,直接拍: {direct_buy_count[2]} 个" result_cell = worksheet.cell(row=last_row + 6, column=1) result_cell.value = f"G列成功: {success_count[3]} 个,空白: {empty_count[3]} 个,直接拍: {direct_buy_count[3]} 个" result_cell = worksheet.cell(row=last_row + 7, column=1) result_cell.value = f"H列成功: {success_count[4]} 个,空白: {empty_count[4]} 个,直接拍: {direct_buy_count[4]} 个" result_cell = worksheet.cell(row=last_row + 8, column=1) result_cell.value = f"失败: {error_count} 个" def process_single_hyperlink(self, target_cell, url, display_text, success_count, idx1, empty_count, idx2, direct_buy_count, idx3): """处理单个超链接,模拟VBA宏中的ProcessSingleHyperlink函数""" # 清除单元格内容和超链接 target_cell.value = None if target_cell.hyperlink: target_cell.hyperlink = None # 检查URL是否存在且有效 has_url = bool(url and not pd.isna(url) and str(url).strip()) # 检查文本是否存在且有效 has_text = bool(display_text and not pd.isna(display_text) and str(display_text).strip()) # 如果既没有URL也没有文本,保持单元格为空 if not has_url and not has_text: empty_count[idx2] += 1 return # 检查是否为"直接拍"文本 if has_text and str(display_text) == "直接拍": target_cell.value = display_text # 设置默认字体颜色(黑色)和无下划线 target_cell.font = Font(color="000000", underline="none") direct_buy_count[idx3] += 1 return # 如果有URL但无文本,使用URL作为显示文本 if has_url and not has_text: display_text = "链接" # 处理URL if has_url: # 添加协议前缀(如果缺失) url_str = str(url) if "://" not in url_str: url_str = "http://" + url_str # 创建超链接 target_cell.hyperlink = url_str target_cell.value = display_text # 去除下划线并设置字体颜色 target_cell.font = Font(color="0000FF", underline="none") # 蓝色字体,无下划线 success_count[idx1] += 1 else: # 只有文本没有URL,只显示文本 target_cell.value = display_text # 设置红色字体(非超链接文本) target_cell.font = Font(color="FF0000", underline="none") # 红色,无下划线 empty_count[idx2] += 1 # 设置居中对齐 target_cell.alignment = Alignment(horizontal="center", vertical="center") def process_excel_file(self, source_path, hyperlink_source_path, output_path): """处理Excel文件的主要函数""" # 读取源表 source_df = pd.read_excel(source_path, sheet_name='Sheet1') # 清除C列(优惠方式列)的空格 source_df.iloc[:, 2] = source_df.iloc[:, 2].apply(self.clean_text) # 创建目标DataFrame target_columns = [ '顺序', '商品名称', '商品链接', '商品ID', '优惠方式(xx元优惠券/拍立减xx元)', '优惠券链接', '优惠券提取', '公式放这一列', '优惠券1', '优惠券2', '优惠券3', '优惠券4', '优惠券5', '优惠券链接(提取)', '优惠券链接1', '优惠券链接2', '优惠券链接3', '优惠券链接4', '优惠券链接5' ] target_df = pd.DataFrame(columns=target_columns) # 填充数据 for idx, row in source_df.iterrows(): # 顺序 target_df.loc[idx, '顺序'] = idx + 1 # 商品名称和链接 target_df.loc[idx, '商品名称'] = row.iloc[0] # A列 target_df.loc[idx, '商品链接'] = row.iloc[1] # B列 # 商品ID item_id = self.extract_item_id(row.iloc[1]) target_df.loc[idx, '商品ID'] = item_id # 优惠方式(已清除空格) discount_info = row.iloc[2] # C列 target_df.loc[idx, '优惠方式(xx元优惠券/拍立减xx元)'] = discount_info # 优惠券链接 coupon_links = row.iloc[3] # D列 target_df.loc[idx, '优惠券链接'] = coupon_links # 应用ExtractCouponInfo函数 coupon_extract = self.extract_coupon_info(discount_info) target_df.loc[idx, '优惠券提取'] = coupon_extract # 应用FormatCoupon函数 formatted_coupon = self.format_coupon(coupon_extract) target_df.loc[idx, '公式放这一列'] = formatted_coupon # 根据H列的内容填充I列到M列 coupon_values = self.fill_coupon_columns(formatted_coupon, discount_info) for i in range(5): target_df.loc[idx, f'优惠券{i+1}'] = coupon_values[i] # 处理优惠券链接 urls, coupon_ids = self.split_coupon_links(coupon_links) # 优惠券链接提取 (用逗号分隔的URL) target_df.loc[idx, '优惠券链接(提取)'] = ", ".join(urls) if urls else "" # 填充优惠券链接1-5列(O列到S列) for i in range(5): if i < len(urls): target_df.loc[idx, f'优惠券链接{i+1}'] = urls[i] else: target_df.loc[idx, f'优惠券链接{i+1}'] = "" # 保存结果 with pd.ExcelWriter(output_path, engine='openpyxl') as writer: # 保存Sheet2 target_df.to_excel(writer, sheet_name='Sheet2', index=False) # 创建超链接工作表 self.create_hyperlinks_sheet(writer, hyperlink_source_path, target_df) # 创建主窗口并运行应用 if __name__ == "__main__": root = tk.Tk() app = ExcelProcessorApp(root) root.mainloop() 以上代码变更为C列点击后跳转网页,E列优惠方式(xx元优惠券/拍立减xx元)和F列优惠券链接预览可编辑
09-20
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值