import os
import json
from pathlib import Path
from collections import Counter
from typing import Optional, Dict, List, Any
import joblib
from sklearn.base import clone
from copy import deepcopy
import numpy as np
import matplotlib
matplotlib.use("Agg") # 服务器/无界面环境也能保存图像
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
)
# Project data-loading / feature-extraction functions (kept unchanged, as in your project)
from src.data_loader import load_dataset
from src.feature_extract import extract_dataset_features  # upgraded version that supports extra_feats / bearing_freqs
# =========================
# Utility functions
# =========================
def ensure_dir(path: str) -> str:
Path(path).mkdir(parents=True, exist_ok=True)
return path
def save_confusion_matrix(cm: np.ndarray, display_labels: List[str],
out_path_png: str, title: str) -> None:
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_labels)
    # Do not pass the colorbar argument, for compatibility with older sklearn/matplotlib versions
disp.plot(cmap="Blues", values_format="d", ax=ax)
ax.set_title(title)
plt.tight_layout()
plt.savefig(out_path_png, dpi=300)
plt.close()
def save_confusion_matrix_norm(cm: np.ndarray, display_labels: List[str],
out_path_png: str, title: str) -> None:
with np.errstate(all="ignore"):
cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
cm_norm = np.nan_to_num(cm_norm)
fig, ax = plt.subplots(figsize=(6, 5))
disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=display_labels)
disp.plot(cmap="Blues", values_format=".2f", ax=ax)
ax.set_title(title)
plt.tight_layout()
plt.savefig(out_path_png, dpi=300)
plt.close()
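# --- Illustrative sketch (added for exposition; not called anywhere in the pipeline) ---
# Shows how the two plotting helpers above are used, on a hand-made 2x2 confusion matrix.
# The output paths under /tmp and the class names are hypothetical.
def _example_plot_confusion_matrices() -> None:
    cm_demo = np.array([[8, 2], [1, 9]])
    save_confusion_matrix(cm_demo, ["normal", "fault"], "/tmp/demo_cm.png", "Demo CM")
    save_confusion_matrix_norm(cm_demo, ["normal", "fault"], "/tmp/demo_cm_norm.png", "Demo CM (Row-Norm)")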
# =========================
# Model factory and default grids
# =========================
def build_model(model_name: str, **params: Any) -> Pipeline:
name = model_name.lower()
if name == "mlp":
clf = MLPClassifier(
hidden_layer_sizes=params.get("hidden_layer_sizes", (128, 64)),
activation=params.get("activation", "relu"),
solver=params.get("solver", "adam"),
alpha=params.get("alpha", 1e-4),
max_iter=params.get("max_iter", 800),
early_stopping=params.get("early_stopping", True),
n_iter_no_change=params.get("n_iter_no_change", 15),
random_state=params.get("random_state", 42),
verbose=params.get("verbose", False),
)
return Pipeline([("scaler", StandardScaler()), ("clf", clf)])
elif name == "svm":
clf = SVC(
kernel=params.get("kernel", "rbf"),
C=params.get("C", 10.0),
gamma=params.get("gamma", "scale"),
probability=params.get("probability", True),
class_weight=params.get("class_weight", "balanced"), # ← 新增
random_state=params.get("random_state", 42),
)
return Pipeline([("scaler", StandardScaler()), ("clf", clf)])
elif name == "rf":
clf = RandomForestClassifier(
n_estimators=params.get("n_estimators", 400),
max_depth=params.get("max_depth", None),
min_samples_split=params.get("min_samples_split", 2),
min_samples_leaf=params.get("min_samples_leaf", 1),
class_weight=params.get("class_weight", "balanced_subsample"), # ← 保持这个
n_jobs=params.get("n_jobs", -1),
random_state=params.get("random_state", 42),
)
return Pipeline([("scaler", StandardScaler()), ("clf", clf)])
raise ValueError(f"Unknown model_name: {model_name}. Use 'mlp' | 'svm' | 'rf'.")
def get_param_grid(model_name: str) -> Optional[Dict[str, List[Any]]]:
name = model_name.lower()
if name == "svm":
        # Coarse-to-fine: use this grid for a first pass, then refine around the best point in a second pass
return {
"clf__C": [0.5, 1, 3, 10, 30],
"clf__gamma": ["scale", 0.3, 0.1, 0.03, 0.01],
}
if name == "rf":
return {
"clf__n_estimators": [500, 800, 1200], # 树越多越稳
"clf__max_depth": [None, 12, 20, 30],
"clf__min_samples_leaf": [1, 2, 4],
"clf__max_features": ["sqrt"], # 对低维/中维特征更合适
}
if name == "mlp":
return {
"clf__hidden_layer_sizes": [(128, 64), (128, 64, 32)],
"clf__alpha": [1e-5, 1e-4, 1e-3], # L2 正则强度
"clf__learning_rate": ["constant", "adaptive"],
}
return None
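# --- Illustrative sketch (not part of the original workflow) ---
# How build_model() and get_param_grid() are intended to combine with GridSearchCV.
# The data here is synthetic (random 9-dim features, 4 fake classes) purely so the
# snippet is self-contained; real runs go through run_pipeline() below.
def _example_grid_search_svm() -> None:
    rng = np.random.RandomState(0)
    X_demo = rng.randn(120, 9)
    y_demo = rng.randint(0, 4, size=120)
    pipe = build_model("svm")                      # Pipeline(StandardScaler -> SVC)
    grid = get_param_grid("svm")                   # default coarse grid defined above
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    search = GridSearchCV(pipe, grid, cv=cv, scoring="f1_macro", n_jobs=-1)
    search.fit(X_demo, y_demo)
    print("best params:", search.best_params_, "| best f1_macro:", search.best_score_)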
# =========================
# Main workflow (no main function)
# =========================
def run_pipeline(
data_dir: str = "data/source",
    fs: int = 32000,  # sampling rate in Hz; set this to match your data
key: str = "DE",
length: int = 2048,
step: int = 1024,
max_segments: int = 10,
standardize_each: bool = True,
model_name: str = "svm", # "mlp" | "svm" | "rf"
    model_params: Optional[Dict[str, Any]] = None,  # overrides the default hyper-parameters
use_grid_search: bool = False,
scoring: str = "f1_macro",
out_dir: str = "runs",
class_names: Optional[List[str]] = None,
split_test_size: float = 0.2,
random_state: int = 42,
    feature_kwargs: Optional[Dict[str, Any]] = None,  # newly added: passed through to feature extraction
) -> Dict[str, Any]:
"""
完整的源域诊断流程:加载→提特征→分层划分→建模训练→评估→保存产物
- 若 feature_kwargs 包含 extra_feats=True、bearing_freqs=... 等,将启用扩展特征(如 21 维)。
返回:包含关键结果与文件路径的字典
"""
ensure_dir(out_dir)
    # 1) Load raw signals
X_raw, y_raw = load_dataset(
data_dir, key=key, length=length, step=step,
max_segments=max_segments, standardize_each=standardize_each
)
if X_raw.shape[0] == 0:
raise RuntimeError("未从 {} 读取到样本,请检查路径与 data_loader 逻辑。".format(data_dir))
print("原始数据 shape:", X_raw.shape, "| 标签 shape:", y_raw.shape)
print("全体类别分布:", Counter(y_raw))
# 2) 提取特征(可 9 维,也可 21 维等,取决于 feature_kwargs)
feature_kwargs = feature_kwargs or {}
X_feat = extract_dataset_features(X_raw, fs=fs, **feature_kwargs)
if X_feat.shape[0] == 0:
raise RuntimeError("X_feat 为空,请检查特征提取与采样率 fs 是否一致。")
print("特征矩阵 shape:", X_feat.shape)
# 2.1 标签编码(若 y 不是 0..C-1 的整数)
le = None
y = y_raw
try:
is_int = np.issubdtype(np.asarray(y_raw).dtype, np.integer)
except Exception:
is_int = False
if not is_int:
le = LabelEncoder()
y = le.fit_transform(y_raw)
print("使用 LabelEncoder,映射:", dict(zip(le.classes_, list(range(len(le.classes_))))))
# class_names:若未指定,按编码顺序自动生成
if class_names is None:
if le is not None:
class_names = [str(c) for c in le.classes_]
else:
uniq = sorted(list(np.unique(y)))
class_names = [str(c) for c in uniq]
    # 3) Stratified train/test split
X_train, X_test, y_train, y_test = train_test_split(
X_feat, y, test_size=split_test_size, random_state=random_state, stratify=y
)
print("Train 分布:", Counter(y_train))
print("Test 分布:", Counter(y_test))
# 4) 构建模型 / 网格搜索
params = model_params or {}
pipe = build_model(model_name, **params)
best_est = pipe
best_params = None
cv_score = None
if use_grid_search:
grid = get_param_grid(model_name)
if grid is None:
            raise ValueError(
                "No default grid is defined for model {}; set use_grid_search=False or modify get_param_grid.".format(model_name)
            )
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
search = GridSearchCV(pipe, grid, cv=cv, n_jobs=-1, scoring=scoring, refit=True, verbose=0)
search.fit(X_train, y_train)
best_est = search.best_estimator_
best_params = search.best_params_
cv_score = float(search.best_score_)
print("[{}] Grid best params: {}".format(model_name.upper(), best_params))
print("[{}] Grid best {}: {:.4f}".format(model_name.upper(), scoring, cv_score))
else:
best_est.fit(X_train, y_train)
    # 5) Evaluate on the test set
y_pred = best_est.predict(X_test)
acc = float(accuracy_score(y_test, y_pred))
report = classification_report(y_test, y_pred, target_names=class_names, digits=4)
cm = confusion_matrix(y_test, y_pred, labels=sorted(list(np.unique(y))))
print("[{}] Test Accuracy: {:.4f}".format(model_name.upper(), acc))
print("[{}] Classification Report:\n{}".format(model_name.upper(), report))
    # 6) Save artifacts
prefix = os.path.join(out_dir, "source_{}".format(model_name.lower()))
report_path = prefix + "_report.txt"
with open(report_path, "w", encoding="utf-8") as f:
meta = {
"model": model_name,
"best_params": best_params,
"cv_score": cv_score,
"test_accuracy": acc,
"feature_shape": X_feat.shape, # 记录最终特征维度
}
f.write(json.dumps(meta, ensure_ascii=False, indent=2) + "\n\n")
f.write(report)
cm_path = prefix + "_cm.png"
cm_norm_path = prefix + "_cm_norm.png"
save_confusion_matrix(cm, class_names, cm_path,
"Confusion Matrix (Source, {})".format(model_name.upper()))
save_confusion_matrix_norm(cm, class_names, cm_norm_path,
"Confusion Matrix (Row-Norm, {})".format(model_name.upper()))
results: Dict[str, Any] = {
"model": model_name,
"best_params": best_params,
"cv_score": cv_score,
"test_accuracy": acc,
"report_path": os.path.abspath(report_path),
"cm_path": os.path.abspath(cm_path),
"cm_norm_path": os.path.abspath(cm_norm_path),
"class_names": class_names,
"label_encoder": le, # 可能为 None
"feature_shape": X_feat.shape,
"fitted_model": best_est, # ← 新增
}
return results
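# --- Illustrative usage sketch ---
# How run_pipeline() is typically invoked. The data directory, sampling rate and the
# extra_feats option are assumptions about the local project layout / feature extractor;
# adjust them to your own data before running.
def _example_run_source_pipeline() -> Dict[str, Any]:
    return run_pipeline(
        data_dir="data/source",                 # assumed location of the source-domain data
        fs=32000,                               # must match the actual sampling rate
        key="DE",
        model_name="svm",
        use_grid_search=True,                   # uses the default SVM grid from get_param_grid()
        feature_kwargs={"extra_feats": True},   # assumed to be accepted by extract_dataset_features
        out_dir="runs",
    )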
# =========================
# Transfer learning (Problem 3) additions: model persistence, fine-tuning, and the three comparison experiments
# =========================
def save_model(model, path: str) -> None:
ensure_dir(os.path.dirname(path))
joblib.dump(model, path)
print(f"[SAVE] 模型已保存: {path}")
def load_model(path: str):
print(f"[LOAD] 从 {path} 加载源域模型...")
return joblib.load(path)
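# --- Illustrative sketch ---
# Persisting the fitted pipeline returned by run_pipeline() and reloading it later for
# the transfer experiments. The file name under runs/ is hypothetical.
def _example_save_and_reload(results: Dict[str, Any]):
    model_path = os.path.join("runs", "source_svm_model.joblib")  # hypothetical path
    save_model(results["fitted_model"], model_path)
    return load_model(model_path)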
def evaluate_and_save(
model,
X_test: np.ndarray,
y_test: np.ndarray,
class_names,
tag: str,
out_dir: str = "runs/transfer",
mode: str = "fixed", # 你现在传的是 fixed;也可以试 auto 看更清爽的2×2
debug: bool = True, # ← 新增:诊断开关
):
ensure_dir(out_dir)
y_pred = model.predict(X_test)
acc = float(accuracy_score(y_test, y_pred))
if debug:
print(f"[{tag}] y_test classes: {sorted(list(np.unique(y_test)))}")
print(f"[{tag}] y_pred classes: {sorted(list(np.unique(y_pred)))}")
try:
clf = getattr(model, "named_steps", {}).get("clf", None)
if clf is not None and hasattr(clf, "classes_"):
print(f"[{tag}] clf.classes_: {clf.classes_.tolist()}")
except Exception:
pass
if mode == "fixed":
        labels_eval = list(range(len(class_names)))  # e.g. [0, 1, 2, 3]
names_eval = class_names
else: # auto
labels_eval = sorted(list(np.unique(np.concatenate([y_test, y_pred]))))
names_eval = [class_names[i] if (max(labels_eval) < len(class_names)) else str(i)
for i in labels_eval]
report = classification_report(
y_test, y_pred,
labels=labels_eval,
target_names=names_eval,
digits=4,
zero_division=0,
)
print(f"[{tag}] Test Accuracy = {acc:.4f}")
print(f"[{tag}] Classification Report:\n{report}")
cm = confusion_matrix(y_test, y_pred, labels=labels_eval)
cm_path = os.path.join(out_dir, f"{tag}_cm.png")
cm_norm_path = os.path.join(out_dir, f"{tag}_cm_norm.png")
save_confusion_matrix(cm, names_eval, cm_path, f"Confusion Matrix ({tag})")
save_confusion_matrix_norm(cm, names_eval, cm_norm_path, f"Confusion Matrix (Row-Norm, {tag})")
report_path = os.path.join(out_dir, f"{tag}_report.txt")
with open(report_path, "w", encoding="utf-8") as f:
f.write(report)
return {
"tag": tag,
"test_accuracy": acc,
"report_path": os.path.abspath(report_path),
"cm_path": os.path.abspath(cm_path),
"cm_norm_path": os.path.abspath(cm_norm_path),
}
def fine_tune_pipeline(
source_model,
X_target_train: np.ndarray,
y_target_train: np.ndarray,
freeze_feature_extractor: bool = True,
max_iter: int = 300,
adapt_mode: str = "scaler_and_head_mix", # "scaler_only" | "refit_all" | "scaler_and_head_mix"
    # Newly added: options controlling mixed (source + target) training
use_source_data: bool = True,
X_source_train: Optional[np.ndarray] = None,
y_source_train: Optional[np.ndarray] = None,
    source_ratio: float = 0.3,  # number of source samples to draw, as a fraction of the target training-set size
    rebalance_after_mix: bool = True,  # whether to apply a simple oversampling rebalance after mixing
):
"""
源域初始化 + 目标域微调
- scaler_only: 仅用目标域训练集重拟合 StandardScaler,并在目标域上重训分类器
- refit_all: 整条 Pipeline 在目标域训练集上重训(scaler+clf)
- scaler_and_head_mix: ✅ 用 “源(同类)+目标” 混合数据重拟合 scaler 与分类器
"""
model_src = source_model
assert hasattr(model_src, "named_steps") and "scaler" in model_src.named_steps and "clf" in model_src.named_steps, \
"source_model 必须是包含 ('scaler','clf') 的 Pipeline"
target_classes = np.unique(y_target_train)
print(f"[fine_tune_pipeline] 目标域类别: {target_classes.tolist()} | adapt_mode={adapt_mode}")
# --- refit_all: 整条管道在目标域重训 ---
if not freeze_feature_extractor or adapt_mode == "refit_all":
model_new = deepcopy(model_src)
model_new.fit(X_target_train, y_target_train)
return model_new
    # Extract the source model's classifier (and its hyper-parameters)
clf_src = model_src.named_steps["clf"]
    # ========== A) Target domain only (previous behaviour) ==========
if adapt_mode == "scaler_only" or not use_source_data:
        # Refit the scaler on the target domain only
scaler_tgt = StandardScaler().fit(X_target_train)
X_train_final = scaler_tgt.transform(X_target_train)
y_train_final = y_target_train
print("[fine_tune_pipeline] 使用目标域训练集进行 scaler_only 微调。")
else:
        # ========== B) Mixed training: source + target ==========
        # 1) Keep only the source samples whose labels also appear in the target domain
if X_source_train is None or y_source_train is None:
print("[fine_tune_pipeline][WARN] 未提供 X_source_train/y_source_train,回退到 scaler_only。")
scaler_tgt = StandardScaler().fit(X_target_train)
X_train_final = scaler_tgt.transform(X_target_train)
y_train_final = y_target_train
else:
mask_keep = np.isin(y_source_train, target_classes)
Xs_keep = X_source_train[mask_keep]
ys_keep = y_source_train[mask_keep]
if Xs_keep.shape[0] == 0:
print("[fine_tune_pipeline][WARN] 源域没有与目标域重叠的类别,回退到 scaler_only。")
scaler_tgt = StandardScaler().fit(X_target_train)
X_train_final = scaler_tgt.transform(X_target_train)
y_train_final = y_target_train
else:
                # 2) Sample from the source domain at the requested ratio
n_tgt = X_target_train.shape[0]
n_src = int(max(1, np.floor(n_tgt * float(source_ratio))))
                n_src = min(n_src, Xs_keep.shape[0])  # do not exceed the number of available source samples
rng = np.random.RandomState(42)
idx_src = rng.choice(np.arange(Xs_keep.shape[0]), size=n_src, replace=False)
X_src_sel, y_src_sel = Xs_keep[idx_src], ys_keep[idx_src]
                # 3) Concatenate the mixed training data
X_mix = np.vstack([X_target_train, X_src_sel])
y_mix = np.concatenate([y_target_train, y_src_sel])
print(f"[fine_tune_pipeline] mix counts: source_kept={Xs_keep.shape[0]} -> sampled={n_src}, "
f"target={n_tgt}, mix_total={X_mix.shape[0]}")
                # 4) Refit the scaler on the mixed data
scaler_tgt = StandardScaler().fit(X_mix)
X_train_final = scaler_tgt.transform(X_mix)
y_train_final = y_mix
                # 5) Simple rebalancing (optional)
if rebalance_after_mix:
X_train_final, y_train_final = oversample_minority(X_train_final, y_train_final)
    # ========== Retrain the classifier on the new training set (target-only or mixed) ==========
if isinstance(clf_src, MLPClassifier):
params = clf_src.get_params(deep=True)
mlp_new = MLPClassifier(**params)
        # Relax max_iter so the fit can converge on small samples
if "max_iter" in params:
mlp_new.set_params(max_iter=max(max_iter, int(params["max_iter"])))
else:
mlp_new.set_params(max_iter=max_iter)
mlp_new.fit(X_train_final, y_train_final)
clf_new = mlp_new
elif isinstance(clf_src, SVC):
svc_new = clone(clf_src)
svc_new.fit(X_train_final, y_train_final)
clf_new = svc_new
elif isinstance(clf_src, RandomForestClassifier):
rf_new = clone(clf_src)
rf_new.fit(X_train_final, y_train_final)
clf_new = rf_new
else:
raise TypeError(f"fine_tune_pipeline 暂不支持的分类器类型: {type(clf_src)}")
    # Align the class space (sklearn sets classes_ during fit; done here only as a safeguard)
if hasattr(clf_new, "classes_"):
clf_new.classes_ = np.unique(y_train_final)
    # Assemble the new Pipeline
model_new = Pipeline([("scaler", scaler_tgt), ("clf", clf_new)])
return model_new
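# --- Illustrative sketch ---
# A typical Case-3 style call: the source-trained pipeline is fine-tuned on the small
# (oversampled) target training set, mixed with ~30% source samples from the overlapping
# classes. The argument names here are placeholders for the arrays produced inside
# run_transfer_experiments() below.
def _example_fine_tune(source_model, Xt_tr, yt_tr, Xs_tr, ys_tr):
    return fine_tune_pipeline(
        source_model=source_model,
        X_target_train=Xt_tr,
        y_target_train=yt_tr,
        adapt_mode="scaler_and_head_mix",
        use_source_data=True,
        X_source_train=Xs_tr,
        y_source_train=ys_tr,
        source_ratio=0.3,
    )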
def prepare_target_split(
data_dir_target: str,
fs: int,
key: str = "TARGET",
length: int = 2048,
step: int = 1024,
standardize_each: bool = True,
    target_train_ratio: float = 0.2,  # "small-sample" training fraction
random_state: int = 42,
feature_kwargs: Optional[Dict[str, Any]] = None,
):
"""加载目标域数据 → 提特征 → 小样本划分(train 小,test 大)。"""
X_raw, y_raw = load_dataset(
data_dir_target, key=key, length=length, step=step, max_segments=10,
standardize_each=standardize_each
)
X_feat = extract_dataset_features(X_raw, fs=fs, **(feature_kwargs or {}))
    # If labels are not integers, encode them for consistency with run_pipeline
le = None
try:
is_int = np.issubdtype(np.asarray(y_raw).dtype, np.integer)
except Exception:
is_int = False
y = y_raw
if not is_int:
le = LabelEncoder()
y = le.fit_transform(y_raw)
    # Small-sample split (keep the train set small and the test set large to reflect the domain gap)
X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test = train_test_split(
X_feat, y, test_size=(1 - target_train_ratio), stratify=y, random_state=random_state
)
    # Class names
if le is not None:
class_names = [str(c) for c in le.classes_]
else:
class_names = [str(c) for c in sorted(list(np.unique(y)))]
print("Target Train 分布:", Counter(y_tgt_train))
print("Target Test 分布:", Counter(y_tgt_test))
return (X_tgt_train, X_tgt_test, y_tgt_train, y_tgt_test, class_names, le, X_feat.shape)
def run_transfer_experiments(
data_dir_source,
data_dir_target,
fs_source=32000,
fs_target=32000,
key_source="DE",
key_target="BOGIE",
model_names=("mlp", "svm", "rf"),
out_dir="runs/transfer_exp1",
freeze_feature_extractor=True,
feature_kwargs_source=None,
feature_kwargs_target=None,
target_train_ratio=0.2,
    # Newly added: default settings for mixed fine-tuning
use_source_data_in_ft: bool = True,
source_ratio_in_ft: float = 0.4,
adapt_mode_in_ft: str = "scaler_and_head_mix", # "scaler_and_head_mix" | "scaler_only" | "refit_all"
):
"""
迁移学习实验:Case1–Case3
"""
ensure_dir(out_dir)
summary = {}
    # ===== Source-domain training (one fitted model per name) =====
    print("====== [Step 1] Source-domain training ======")
res_src_all = {}
for model_name in model_names:
res = run_pipeline(
data_dir=data_dir_source,
fs=fs_source,
key=key_source,
model_name=model_name,
out_dir=out_dir,
split_test_size=0.2,
random_state=42,
feature_kwargs=feature_kwargs_source,
)
res_src_all[model_name] = res
    # Also prepare a source-domain train/test split (used for the Case 3 mixing)
Xs_raw, ys_raw = load_dataset(
data_dir_source, key=key_source, length=2048, step=1024,
max_segments=10, standardize_each=True
)
Xs_feat = extract_dataset_features(Xs_raw, fs=fs_source, **(feature_kwargs_source or {}))
Xs_tr, Xs_te, ys_tr, ys_te = train_test_split(
Xs_feat, ys_raw, test_size=0.2, stratify=ys_raw, random_state=42
)
    # ===== Target-domain data preparation =====
    print("\n====== [Step 2] Target-domain data preparation (small sample) ======")
Xt_tr, Xt_te, yt_tr, yt_te, class_names_tgt, le_tgt, feat_shape = prepare_target_split(
data_dir_target=data_dir_target,
fs=fs_target,
key=key_target,
target_train_ratio=target_train_ratio,
feature_kwargs=feature_kwargs_target,
)
    # Balance the small target-domain training set
Xt_tr_bal, yt_tr_bal = oversample_minority(Xt_tr, yt_tr)
    # ===== Iterate over the models =====
for model_name in model_names:
print(f"\n==================== {model_name.upper()} ====================")
summary[model_name] = {}
source_model = res_src_all[model_name]["fitted_model"]
        # Case 1: source model evaluated directly on the target domain
case1 = evaluate_and_save(
model=source_model,
X_test=Xt_te,
y_test=yt_te,
class_names=class_names_tgt,
tag=f"case1_source2target_{model_name}",
out_dir=out_dir,
mode="auto"
)
summary[model_name]["case1"] = case1
        # Case 2: train from scratch on the small target-domain sample only
fresh_model = build_model(model_name)
fresh_model.fit(Xt_tr_bal, yt_tr_bal)
case2 = evaluate_and_save(
model=fresh_model,
X_test=Xt_te,
y_test=yt_te,
class_names=class_names_tgt,
tag=f"case2_target_only_{model_name}",
out_dir=out_dir,
mode="auto"
)
summary[model_name]["case2"] = case2
        # Case 3: source initialisation + fine-tuning (mixed mode by default)
tuned_model = fine_tune_pipeline(
source_model=source_model,
X_target_train=Xt_tr_bal,
y_target_train=yt_tr_bal,
freeze_feature_extractor=freeze_feature_extractor,
max_iter=300,
adapt_mode=adapt_mode_in_ft,
use_source_data=use_source_data_in_ft,
X_source_train=Xs_tr,
y_source_train=ys_tr,
source_ratio=source_ratio_in_ft,
)
case3 = evaluate_and_save(
model=tuned_model,
X_test=Xt_te,
y_test=yt_te,
class_names=class_names_tgt,
tag=f"case3_finetune_{model_name}",
out_dir=out_dir,
mode="auto"
)
summary[model_name]["case3"] = case3
    # ===== Save the summary =====
summary_path = os.path.join(out_dir, "summary.json")
with open(summary_path, "w", encoding="utf-8") as f:
json.dump(summary, f, ensure_ascii=False, indent=2)
print("\n[OK] 多模型迁移实验完成,结果汇总:", summary_path)
return summary
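# --- Illustrative usage sketch ---
# Running the full Case 1 / Case 2 / Case 3 comparison. The directories and data keys
# are assumptions about the local project layout; adjust them before running.
def _example_run_transfer() -> Dict[str, Any]:
    return run_transfer_experiments(
        data_dir_source="data/source",     # assumed source-domain directory
        data_dir_target="data/target",     # assumed target-domain directory
        fs_source=32000,
        fs_target=32000,
        key_source="DE",
        key_target="BOGIE",
        model_names=("svm", "rf"),
        out_dir="runs/transfer_exp1",
        target_train_ratio=0.2,
    )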
def oversample_minority(X, y, random_state=42):
"""
简单过采样:把少数类样本重复到和多数类一样多
"""
rng = np.random.RandomState(random_state)
counts = Counter(y)
max_count = max(counts.values())
X_resampled, y_resampled = [], []
for cls, count in counts.items():
idxs = np.where(y == cls)[0]
if count < max_count:
            # Randomly sample (with replacement) to make up the difference
extra = rng.choice(idxs, size=max_count - count, replace=True)
all_idxs = np.concatenate([idxs, extra])
else:
all_idxs = idxs
X_resampled.append(X[all_idxs])
y_resampled.append(y[all_idxs])
X_new = np.vstack(X_resampled)
y_new = np.concatenate(y_resampled)
print("⚖️ 过采样后类别分布:", Counter(y_new))
return X_new, y_new
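# --- Illustrative sketch ---
# oversample_minority() on a tiny imbalanced toy set: class 0 has 6 samples and
# class 1 has 2, so class 1 is resampled (with replacement) up to 6.
def _example_oversample() -> None:
    X_toy = np.arange(16, dtype=float).reshape(8, 2)
    y_toy = np.array([0, 0, 0, 0, 0, 0, 1, 1])
    X_bal, y_bal = oversample_minority(X_toy, y_toy)
    print(X_bal.shape, Counter(y_bal))  # -> (12, 2) with 6 samples per class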
Explain what each line of the code above does and what it demonstrates.