Csv_robust typing

This article presents a C++ class for reading and parsing CSV files. The class reads data from a file image given by a start address and size, and provides functions to validate the file and read it line by line. A short demo at the end shows how to use the class on real files.
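A minimal usage sketch of that interface (here addr and size stand for a hypothetical CSV image already loaded into memory; the full implementation and a runnable demo follow):

Csv csv(addr, size);            // the constructor validates the first line of the image
string first;
if (csv.ReadLine(addr, first))  // read one "\r\n"-terminated line into 'first'
    cout << first << endl;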
#include <string>
#include <iostream>
#include <stdio.h>


using namespace std;


#define LogOut printf
// This class does not buffer the file data in a container, because the file can sometimes be too big.
class Csv
{
public:
    Csv();
    Csv(const char *FileAddr, int FileSize);

    bool CheckFile(const char *FileAddr, int FileSize);
    bool ReadLine();
    bool ReadLine(const char *Addr, string &Str);
    bool ReadField(int Row, int Col, string &Str);
    bool ReadField(int Row, int Col, int &Val);
    bool ReadField(int Row, int Col, float &Val);

    int FindCols(int Col, const string &Str); // find the row index of Str in the specified column
    int FindCols(int Col, int Val);
    int FindCols(int Col, float Val);

    int CntLine(const char *path);
    string CntCol(string line);
protected:
    const char *m_FileAddr;
    int m_FileSize;
    bool isFileValid;
    const char *m_Pos; // current position in the csv file
private:
    bool isGraphic(char ch);
    string m_Line;
    string m_Field;
};


Csv::Csv()
{
    m_FileAddr = NULL;
    m_FileSize = 0;
    m_Pos = NULL;
    isFileValid = false;
}


Csv::Csv(const char *FileAddr, int FileSize)
{
    // Start from safe defaults so the members are defined even when CheckFile() fails.
    m_FileAddr = NULL;
    m_FileSize = 0;
    m_Pos = NULL;
    isFileValid = false;
    if (CheckFile(FileAddr, FileSize))
    {
        m_FileAddr = FileAddr;
        m_FileSize = FileSize;
        m_Pos = m_FileAddr;
        isFileValid = true;
    }
}






// Read data from flash into a string object; the string stores its data on the heap, not on the task's stack.
const short LineSizeLimit = 100;
bool Csv::isGraphic(char ch)
{
    if (ch < 0x20 || ch > 0x7e)
        return false;
    else
        return true;
}




bool Csv::ReadLine(const char *Addr, string &Str) // read a single line starting at Addr without touching the object's state
{
    const char *pos = Addr;

    // Scan until the "\r\n" terminator, the line-size limit, or a non-printable character.
    while (!(*pos == '\r' && *(pos + 1) == '\n') && pos < Addr + LineSizeLimit && isGraphic(*pos))
    {
        pos++;
    }
    if (pos >= Addr + LineSizeLimit || !(isGraphic(*pos) || *pos == '\r' || *pos == '\n'))
    {
        Str.clear();
        return false;
    }
    Str.assign(Addr, pos - Addr);
    return true;
}


bool Csv::ReadLine() // read the next line from m_Pos into m_Line; used to walk the file line by line
{
    const char *pos = m_Pos;

    while (!(*pos == '\r' && *(pos + 1) == '\n') && pos < m_Pos + LineSizeLimit && isGraphic(*pos))
    {
        pos++;
    }
    if (pos >= m_Pos + LineSizeLimit || !(isGraphic(*pos) || *pos == '\r' || *pos == '\n'))
    {
        m_Line.clear();
        return false;
    }
    m_Line.assign(m_Pos, pos - m_Pos);
    m_Pos = pos + 2; // skip the "\r\n" and move to the start of the next line
    return true;
}
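ReadField is declared in the class but not defined in this post. A minimal sketch of one possible implementation, built on ReadLine(); it assumes plain comma-separated fields with no quoting, and both Row and Col are zero-based:

// Sketch only: rewinds to the start of the file, walks down to the wanted row,
// then scans for the Col-th comma-separated field of that line.
bool Csv::ReadField(int Row, int Col, string &Str)
{
    if (!isFileValid)
        return false;
    m_Pos = m_FileAddr;                // rewind to the first line
    for (int r = 0; r <= Row; r++)     // after the loop, m_Line holds row number Row
        if (!ReadLine())
            return false;
    size_t start = 0;
    for (int c = 0; c < Col; c++)      // skip Col separators
    {
        size_t comma = m_Line.find(',', start);
        if (comma == string::npos)
            return false;
        start = comma + 1;
    }
    size_t end = m_Line.find(',', start);
    Str = m_Line.substr(start, end == string::npos ? string::npos : end - start);
    return true;
}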


bool Csv::CheckFile(const char *FileAddr, int FileSize)
{
    string str;
    if (!FileAddr || !FileSize || !ReadLine(FileAddr, str)) // just check the first line
    {
        LogOut("Warning: Csv::CheckFile() input argument invalid!\n");
        m_FileAddr = NULL;
        m_FileSize = 0;
        isFileValid = false;
        return false;
    }
    m_FileAddr = FileAddr;
    m_FileSize = FileSize;
    m_Pos = m_FileAddr;
    isFileValid = true;
    return true;
}
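FindCols is likewise only declared. A sketch under the same assumptions, building on the ReadField sketch above; it scans column Col from the top and returns the first matching row, or -1 when no row matches:

// Sketch only: linear scan of one column; stops when ReadField() runs past the last line.
int Csv::FindCols(int Col, const string &Str)
{
    string field;
    for (int row = 0; ReadField(row, Col, field); row++)
    {
        if (field == Str)
            return row;
    }
    return -1;
}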

bool SimulateFlash(const char *FileName[], int Num, unsigned char **Addr, int *Size)
{
    const int BufSize = 1024 * 8;
    static unsigned char FileBuf[BufSize] = {0};
    FILE **fp = new FILE *[Num];
    unsigned char *BaseAddr = FileBuf;
    unsigned char *pos = FileBuf;

    for (int i = 0; i < Num; i++) {
        int ch; // fgetc() returns int; a char here would break the EOF comparison
        if ((fp[i] = fopen(FileName[i], "r")) == NULL) {
            LogOut("Can't open file:%s\n", FileName[i]);
            perror(NULL);
            *(Addr + i) = NULL;
            *(Size + i) = 0;
            continue;
        }
        while ((ch = fgetc(fp[i])) != EOF && pos < FileBuf + BufSize)
            *pos++ = (unsigned char)ch;
        fclose(fp[i]);
        if (pos >= FileBuf + BufSize) {
            LogOut("FileBuf has not enough space to contain the file!\n");
            delete[] fp;
            return false;
        }
        *(Size + i) = pos - BaseAddr;
        *(Addr + i) = BaseAddr;
        BaseAddr = pos;
    }
    delete[] fp;
    return true;
}




int main()
{
    const char *FileName[] = {"Big.csv", "Middle.csv", "Small.csv"};
    int FileSize[sizeof(FileName) / sizeof(FileName[0])];
    unsigned char *Addr[sizeof(FileName) / sizeof(FileName[0])];
    if (!SimulateFlash(FileName, sizeof(FileName) / sizeof(FileName[0]), Addr, FileSize))
        return -1;

    const char *BigWave = (const char *)Addr[0];
    const char *MiddleWave = (const char *)Addr[1];
    const char *SmallWave = (const char *)Addr[2];
    int BigWaveSize = FileSize[0];
    int MiddleWaveSize = FileSize[1];
    int SmallWaveSize = FileSize[2];

    Csv csv;
    string str;
    bool rtn = csv.ReadLine(BigWave, str);
    if (rtn)
        cout << str << endl;

    // for (int i = 0; i < BigWaveSize; i++)
    // {
    //     cout << *(BigWave + i);
    // }
    return 0;
}
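For reference, an assumed way to build and feed the demo. Only the file names come from main(); the sample content and the compiler invocation are illustrative, and each line must end with "\r\n" because that is what ReadLine() looks for:

// Example (made-up) contents for Small.csv:
//   id,name,value
//   0,alpha,1.5
//   1,beta,2.5
// Build and run, assuming the code above is saved as csv_robust.cpp:
//   g++ -std=c++11 -o csv_robust csv_robust.cpp
//   ./csv_robust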
""" Outlier Detection Toolbox ========================= This is a single-file distribution (for ease of preview) of a production-grade outlier/anomaly detection toolbox intended to be split into a small package: outlier_detection/ ├── __init__.py ├── utils.py ├── statistical.py ├── distance_density.py ├── model_based.py ├── deep_learning.py ├── ensemble.py ├── visualization.py └── cli.py --- NOTE --- This code block contains *all* modules concatenated (with file headers) so you can preview and copy each file out into separate .py files. When you save them as separate files the package will work as expected. Design goals (what you asked for): - Detailed, well-documented functions (purpose, math, applicability, edge-cases) - Robust handling of NaNs, constant columns, categorical data - Functions return structured metadata + masks + scores so you can inspect - Utilities for ensemble combining methods and producing a readable report - Optional deep learning methods (AutoEncoder/VAE) with clear dependency instructions and graceful error messages if libraries are missing. Dependencies (recommended): pip install numpy pandas scipy scikit-learn matplotlib joblib tensorflow>=2.0 If you prefer PyTorch for deep models you can adapt deep_learning.py accordingly. """ # --------------------------- # File: outlier_detection/__init__.py # --------------------------- __version__ = "0.1.0" # make it easy to import core helpers from typing import Dict from .utils import ensure_dataframe, OutlierResult, summarize_results, recommend_methods from .statistical import z_score_method, modified_z_score, iqr_method, grubbs_test from .distance_density import lof_method, mahalanobis_method, dbscan_method, knn_distance_method from .model_based import ( isolation_forest_method, one_class_svm_method, pca_reconstruction_error, gmm_method, elliptic_envelope_method, ) # deep_learning module is optional (heavy dependency) try: from .deep_learning import autoencoder_method, vae_method except Exception: # graceful: user may not have TF installed; import will raise at use time autoencoder_method = None vae_method = None from .ensemble import ensemble_methods, aggregate_scores from .visualization import plot_boxplot, plot_pair_scatter __all__ = [ "__version__", "ensure_dataframe", "OutlierResult", "summarize_results", "recommend_methods", "z_score_method", "modified_z_score", "iqr_method", "grubbs_test", "lof_method", "mahalanobis_method", "dbscan_method", "knn_distance_method", "isolation_forest_method", "one_class_svm_method", "pca_reconstruction_error", "gmm_method", "elliptic_envelope_method", "autoencoder_method", "vae_method", "ensemble_methods", "aggregate_scores", "plot_boxplot", "plot_pair_scatter", ] # --------------------------- # File: outlier_detection/utils.py # --------------------------- """ Utilities for the outlier detection package. Key responsibilities: - Input validation and type normalization - Handling numeric / categorical separation - Standardization and robust scaling helpers - A consistent result object shape used by all detectors """ from typing import Dict, Any, Tuple, Optional, List import numpy as np import pandas as pd import logging logger = logging.getLogger(__name__) # A simple, documented result schema for detector functions. 
# Each detector returns a dict with these keys (guaranteed): # - 'mask': pd.Series[bool] same index as input rows; True means OUTLIER # - 'score': pd.Series or pd.DataFrame numeric score (bigger usually means more anomalous) # - 'method': short string # - 'params': dict of parameters used # - 'explanation': short textual note about interpretation OutlierResult = Dict[str, Any] def ensure_dataframe(X) -> pd.DataFrame: """ Convert input into a pandas DataFrame with a stable integer index. Accepts: pd.DataFrame, np.ndarray, list-of-lists, pd.Series. Returns DataFrame with numeric column names if necessary. """ if isinstance(X, pd.DataFrame): df = X.copy() elif isinstance(X, pd.Series): df = X.to_frame() else: # try to coerce df = pd.DataFrame(X) # if no index or non-unique, reset if df.index is None or not df.index.is_unique: df = df.reset_index(drop=True) # name numeric columns if unnamed df.columns = [str(c) for c in df.columns] return df def numeric_only(df: pd.DataFrame, return_cols: bool = False) -> pd.DataFrame: """ Select numeric columns and warn if non-numeric columns are dropped. If no numeric columns found raises ValueError. """ df = ensure_dataframe(df) numeric_df = df.select_dtypes(include=["number"]).copy() non_numeric = [c for c in df.columns if c not in numeric_df.columns] if non_numeric: logger.debug("Dropping non-numeric columns for numeric-only detectors: %s", non_numeric) if numeric_df.shape[1] == 0: raise ValueError("No numeric columns available for numeric detectors. Consider encoding categoricals.") if return_cols: return numeric_df, list(numeric_df.columns) return numeric_df def handle_missing(df: pd.DataFrame, strategy: str = "drop", fill_value: Optional[float] = None) -> pd.DataFrame: """ Handle missing values in data before passing to detectors. Parameters ---------- df : DataFrame strategy : {'drop', 'mean', 'median', 'zero', 'constant', 'keep'} - 'drop' : drop rows with any NaN (useful when most values are present) - 'mean' : fill numeric columns with mean - 'median' : fill numeric with median - 'zero' : fill with 0 - 'constant' : fill with supplied fill_value - 'keep' : keep NaNs (many detectors can handle NaN rows implicitly) fill_value : numeric (used when strategy=='constant') Returns ------- DataFrame cleaned according to strategy. Original index preserved. Notes ----- - Some detectors (LOF, IsolationForest) do NOT accept NaNs; choose strategy accordingly. """ df = df.copy() if strategy == "drop": return df.dropna(axis=0, how="any") elif strategy == "mean": return df.fillna(df.mean()) elif strategy == "median": return df.fillna(df.median()) elif strategy == "zero": return df.fillna(0) elif strategy == "constant": if fill_value is None: raise ValueError("fill_value must be provided for strategy='constant'") return df.fillna(fill_value) elif strategy == "keep": return df else: raise ValueError(f"Unknown missing value strategy: {strategy}") def robust_scale(df: pd.DataFrame) -> pd.DataFrame: """ Scale numeric columns using median and IQR (robust to outliers). Returns a DataFrame of same shape with scaled values. """ df = numeric_only(df) med = df.median() q1 = df.quantile(0.25) q3 = df.quantile(0.75) iqr = q3 - q1 # avoid division by zero iqr_replaced = iqr.replace(0, 1.0) return (df - med) / iqr_replaced def create_result(mask: pd.Series, score: pd.Series, method: str, params: Dict[str, Any], explanation: str) -> OutlierResult: """ Wrap mask + score into the standard result dict. 
""" # ensure index alignment if not mask.index.equals(score.index): # try to reindex score = score.reindex(mask.index) return { "mask": mask.astype(bool), "score": score, "method": method, "params": params, "explanation": explanation, } def summarize_results(results: Dict[str, OutlierResult]) -> pd.DataFrame: """ Given a dict of results keyed by method name, return a single DataFrame where each column is that method's boolean flag and another column is the score (if numeric). Also returns a short per-row summary like how many detectors flagged the row. """ # Collect masks and scores masks = {} scores = {} for k, r in results.items(): masks[f"{k}_flag"] = r["mask"].astype(int) # flatten score: if DataFrame use mean across columns sc = r["score"] if isinstance(sc, pd.DataFrame): sc = sc.mean(axis=1) scores[f"{k}_score"] = sc masks_df = pd.DataFrame(masks) scores_df = pd.DataFrame(scores) combined = pd.concat([masks_df, scores_df], axis=1) combined.index = next(iter(results.values()))["mask"].index combined["n_flags"] = masks_df.sum(axis=1) combined["any_flag"] = combined["n_flags"] > 0 return combined def recommend_methods(X: pd.DataFrame) -> List[str]: """ Heuristic recommender: returns a short list of methods to try depending on data shape. Rules (simple heuristics): - single numeric column: ['iqr', 'modified_z'] - low-dimensional (n_features <= 10) and numeric: ['mahalanobis','lof','isolation_forest'] - high-dimensional (n_features > 10): ['isolation_forest','pca','autoencoder'] """ df = ensure_dataframe(X) n_features = df.select_dtypes(include=["number"]).shape[1] if n_features == 0: raise ValueError("No numeric features to recommend methods for") if n_features == 1: return ["iqr", "modified_z"] elif n_features <= 10: return ["mahalanobis", "lof", "isolation_forest"] else: return ["isolation_forest", "pca", "autoencoder"] # --------------------------- # File: outlier_detection/statistical.py # --------------------------- """ Statistical / univariate outlier detectors. Each function focuses on single-dimension input (pd.Series) or will operate column-wise if given a DataFrame (then returns DataFrame of scores / masks). """ from typing import Union import numpy as np import pandas as pd from scipy import stats from .utils import create_result, numeric_only def _as_series(x: Union[pd.Series, pd.DataFrame], col: str = None) -> pd.Series: if isinstance(x, pd.DataFrame): if col is None: raise ValueError("If passing DataFrame, must pass column name") return x[col] return x def z_score_method(x: Union[pd.Series, pd.DataFrame], threshold: float = 3.0) -> OutlierResult: """ Z-Score method (univariate) Math: z = (x - mean) / std Flag where |z| > threshold. Applicability: single numeric column, approximately normal distribution. Not robust to heavy-tailed distributions. Returns OutlierResult with score = |z| (higher => more anomalous). 
""" if isinstance(x, pd.DataFrame): # apply per-column and return a DataFrame score masks = pd.DataFrame(index=x.index) scores = pd.DataFrame(index=x.index) for c in x.columns: res = z_score_method(x[c], threshold=threshold) masks[c] = res["mask"].astype(int) scores[c] = res["score"] # Derive a combined mask: any column flagged mask_any = masks.sum(axis=1) > 0 combined_score = scores.mean(axis=1) return create_result(mask_any, combined_score, "z_score_dataframe", {"threshold": threshold}, "Applied z-score per-column and combined by mean score and any-flag") s = x.dropna() if s.shape[0] == 0: mask = pd.Series([False]*len(x), index=x.index) score = pd.Series([0.0]*len(x), index=x.index) return create_result(mask, score, "z_score", {"threshold": threshold}, "Empty or all-NaN series") mu = s.mean() sigma = s.std(ddof=0) if sigma == 0: score = pd.Series(0.0, index=x.index) mask = pd.Series(False, index=x.index) explanation = "Zero variance: no z-score possible" return create_result(mask, score, "z_score", {"threshold": threshold}, explanation) z = (x - mu) / sigma absz = z.abs() mask = absz > threshold score = absz.fillna(0.0) explanation = f"z-score with mean={mu:.4g}, std={sigma:.4g}; flag |z|>{threshold}" return create_result(mask, score, "z_score", {"threshold": threshold}, explanation) def modified_z_score(x: Union[pd.Series, pd.DataFrame], threshold: float = 3.5) -> OutlierResult: """ Modified Z-score using median and MAD (robust to extreme values). Formula: M_i = 0.6745 * (x_i - median) / MAD Where MAD = median(|x_i - median|) Recommended threshold: 3.5 (common in literature) """ if isinstance(x, pd.DataFrame): masks = pd.DataFrame(index=x.index) scores = pd.DataFrame(index=x.index) for c in x.columns: res = modified_z_score(x[c], threshold=threshold) masks[c] = res["mask"].astype(int) scores[c] = res["score"] mask_any = masks.sum(axis=1) > 0 combined_score = scores.mean(axis=1) return create_result(mask_any, combined_score, "modified_z_dataframe", {"threshold": threshold}, "Applied modified z per-column and combined") s = x.dropna() if len(s) == 0: return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "modified_z", {"threshold": threshold}, "empty") med = np.median(s) mad = np.median(np.abs(s - med)) if mad == 0: # all equal or too small score = pd.Series(0.0, index=x.index) mask = pd.Series(False, index=x.index) return create_result(mask, score, "modified_z", {"threshold": threshold}, "mad==0: no variation") M = 0.6745 * (x - med) / mad score = M.abs().fillna(0.0) mask = score > threshold return create_result(mask, score, "modified_z", {"threshold": threshold, "median": med, "mad": mad}, "Robust modified z-score; higher => more anomalous") def iqr_method(x: Union[pd.Series, pd.DataFrame], k: float = 1.5) -> OutlierResult: """ IQR (boxplot) method. Flags points outside [Q1 - k*IQR, Q3 + k*IQR]. k=1.5 is common; use larger k for fewer false positives. 
""" if isinstance(x, pd.DataFrame): masks = pd.DataFrame(index=x.index) scores = pd.DataFrame(index=x.index) for c in x.columns: res = iqr_method(x[c], k=k) masks[c] = res["mask"].astype(int) scores[c] = res["score"] mask_any = masks.sum(axis=1) > 0 combined_score = scores.mean(axis=1) return create_result(mask_any, combined_score, "iqr_dataframe", {"k": k}, "Applied IQR per column") s = x.dropna() if s.shape[0] == 0: return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "iqr", {"k": k}, "empty") q1 = np.percentile(s, 25) q3 = np.percentile(s, 75) iqr = q3 - q1 lower = q1 - k * iqr upper = q3 + k * iqr mask = (x < lower) | (x > upper) # score: distance from nearest fence normalized by iqr (if iqr==0 use abs distance) if iqr == 0: score = (x - q1).abs().fillna(0.0) else: score = pd.Series(0.0, index=x.index) score[x < lower] = ((lower - x[x < lower]) / (iqr + 1e-12)) score[x > upper] = ((x[x > upper] - upper) / (iqr + 1e-12)) return create_result(mask.fillna(False), score.fillna(0.0), "iqr", {"k": k, "q1": q1, "q3": q3}, f"IQR fences [{lower:.4g}, {upper:.4g}]") def grubbs_test(x: Union[pd.Series, pd.DataFrame], alpha: float = 0.05) -> OutlierResult: """ Grubbs' test for a single outlier (requires approx normality). This test is intended to *detect one outlier at a time*. Use iteratively (recompute after removing detected outlier) if you expect multiple outliers, but be careful with multiplicity adjustments. Returns mask with at most one True (the most extreme point) unless alpha is very large. """ # For simplicity operate only on a single series. If DataFrame provided, # run per-column and combine (like other funcs) if isinstance(x, pd.DataFrame): masks = pd.DataFrame(index=x.index) scores = pd.DataFrame(index=x.index) for c in x.columns: res = grubbs_test(x[c], alpha=alpha) masks[c] = res["mask"].astype(int) scores[c] = res["score"] mask_any = masks.sum(axis=1) > 0 combined_score = scores.mean(axis=1) return create_result(mask_any, combined_score, "grubbs_dataframe", {"alpha": alpha}, "Applied Grubbs per column") from math import sqrt s = x.dropna() n = len(s) if n < 3: return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "grubbs", {"alpha": alpha}, "n<3: cannot run") mean = s.mean() std = s.std(ddof=0) if std == 0: return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "grubbs", {"alpha": alpha}, "zero std") # compute G statistic for max dev deviations = (s - mean).abs() max_idx = deviations.idxmax() G = deviations.loc[max_idx] / std # critical value from t-distribution t_crit = stats.t.ppf(1 - alpha / (2 * n), n - 2) G_crit = ((n - 1) / sqrt(n)) * (t_crit / sqrt(n - 2 + t_crit ** 2)) mask = pd.Series(False, index=x.index) mask.loc[max_idx] = G > G_crit score = pd.Series(0.0, index=x.index) score.loc[max_idx] = float(G) explanation = f"G={G:.4g}, Gcrit={G_crit:.4g}, alpha={alpha}" return create_result(mask, score, "grubbs", {"alpha": alpha, "G": G, "Gcrit": G_crit}, explanation) # --------------------------- # File: outlier_detection/distance_density.py # --------------------------- """ Distance and density based detectors (multivariate-capable). Functions generally accept a numeric DataFrame X and return OutlierResult. 
""" from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors from sklearn.cluster import DBSCAN from sklearn.covariance import EmpiricalCovariance from .utils import ensure_dataframe, create_result, numeric_only def lof_method(X, n_neighbors: int = 20, contamination: float = 0.05) -> OutlierResult: """ Local Outlier Factor (LOF). Returns score = -lof. LOF API returns negative_outlier_factor_. We negate so higher score => more anomalous. Applicability: medium-dimensional data, clusters of varying density. Beware: LOF does not provide a predictable probabilistic threshold. """ X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < 2: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "lof", {"n_neighbors": n_neighbors}, "too few samples") lof = LocalOutlierFactor(n_neighbors=min(n_neighbors, max(1, Xnum.shape[0]-1)), contamination=contamination) y = lof.fit_predict(Xnum) negative_factor = lof.negative_outlier_factor_ # higher -> more anomalous score = (-negative_factor) score = pd.Series(score, index=Xnum.index) mask = pd.Series(y == -1, index=Xnum.index) return create_result(mask, score, "lof", {"n_neighbors": n_neighbors, "contamination": contamination}, "LOF: higher score more anomalous") def knn_distance_method(X, k: int = 5) -> OutlierResult: """ k-NN distance based scoring: compute distance to k-th nearest neighbor. Points with large k-distance are candidate outliers. Returns score = k-distance (bigger => more anomalous). """ X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < k + 1: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "knn_distance", {"k": k}, "too few samples") nbrs = NearestNeighbors(n_neighbors=k + 1).fit(Xnum) distances, _ = nbrs.kneighbors(Xnum) # distances[:, 0] is zero (self). take k-th neighbor kdist = distances[:, k] score = pd.Series(kdist, index=Xnum.index) # threshold: e.g., mean + 2*std thr = score.mean() + 2 * score.std() mask = score > thr return create_result(mask, score, "knn_distance", {"k": k, "threshold": thr}, "k-distance method") def mahalanobis_method(X, threshold_p: float = 0.01) -> OutlierResult: """ Mahalanobis distance based detection. Computes D^2 for each point. One can threshold by chi-square quantile with df=n_features: P(D^2 > thresh) = threshold_p. We return score = D^2. Applicability: data approximately elliptical (multivariate normal-ish). """ X = ensure_dataframe(X) Xnum = numeric_only(X) n, d = Xnum.shape if n <= d: # covariance ill-conditioned; apply shrinkage or PCA beforehand explanation = "n <= n_features: covariance may be singular, consider PCA or regularization" else: explanation = "" cov = EmpiricalCovariance().fit(Xnum) mahal = cov.mahalanobis(Xnum) score = pd.Series(mahal, index=Xnum.index) # default threshold: chi2 quantile from scipy.stats import chi2 thr = chi2.ppf(1 - threshold_p, df=d) if d > 0 else np.inf mask = score > thr return create_result(mask, score, "mahalanobis", {"threshold_p": threshold_p, "chi2_thr": float(thr)}, explanation) def dbscan_method(X, eps: float = 0.5, min_samples: int = 5) -> OutlierResult: """ DBSCAN clusterer: points labeled -1 are considered noise -> outliers. Applicability: non-spherical clusters, variable density; choose eps carefully. 
""" X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < min_samples: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "dbscan", {"eps": eps, "min_samples": min_samples}, "too few samples") db = DBSCAN(eps=eps, min_samples=min_samples).fit(Xnum) labels = db.labels_ mask = pd.Series(labels == -1, index=Xnum.index) # score: negative of cluster size (noise points get score 1) # To keep simple: noise -> 1, else 0 score = pd.Series((labels == -1).astype(float), index=Xnum.index) return create_result(mask, score, "dbscan", {"eps": eps, "min_samples": min_samples}, "DBSCAN noise points flagged") # --------------------------- # File: outlier_detection/model_based.py # --------------------------- """ Model-based detectors: tree ensembles, SVM boundary, PCA reconstruction, GMM These functions are intended for multivariate numeric data. """ from sklearn.ensemble import IsolationForest from sklearn.svm import OneClassSVM from sklearn.decomposition import PCA from sklearn.mixture import GaussianMixture from sklearn.covariance import EllipticEnvelope from .utils import ensure_dataframe, numeric_only, create_result def isolation_forest_method(X, contamination: float = 0.05, random_state: int = 42) -> OutlierResult: """ Isolation Forest Returns mask and anomaly score (higher => more anomalous). Good general-purpose method for medium-to-high dimensional data. """ X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < 2: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "isolation_forest", {"contamination": contamination}, "too few samples") iso = IsolationForest(contamination=contamination, random_state=random_state) iso.fit(Xnum) pred = iso.predict(Xnum) # decision_function: higher -> more normal, so we invert raw_score = -iso.decision_function(Xnum) score = pd.Series(raw_score, index=Xnum.index) mask = pd.Series(pred == -1, index=Xnum.index) return create_result(mask, score, "isolation_forest", {"contamination": contamination}, "IsolationForest: inverted decision function as score") def one_class_svm_method(X, kernel: str = "rbf", nu: float = 0.05, gamma: str = "scale") -> OutlierResult: """ One-Class SVM for boundary-based anomaly detection. Carefully tune nu and gamma; not robust to large datasets without subsampling. """ X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < 5: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "one_class_svm", {"nu": nu}, "too few samples") ocsvm = OneClassSVM(kernel=kernel, nu=nu, gamma=gamma) ocsvm.fit(Xnum) pred = ocsvm.predict(Xnum) # decision_function: positive => inside boundary (normal); invert raw_score = -ocsvm.decision_function(Xnum) score = pd.Series(raw_score, index=Xnum.index) mask = pd.Series(pred == -1, index=Xnum.index) return create_result(mask, score, "one_class_svm", {"nu": nu, "kernel": kernel}, "OneClassSVM: invert decision_function for anomaly score") def pca_reconstruction_error(X, n_components: int = None, explained_variance: float = None, threshold: float = None) -> OutlierResult: """ PCA-based reconstruction error. If n_components not set, choose the minimum components to reach explained_variance (if provided). Otherwise uses min(n_features, 2). Score: squared reconstruction error per sample. Default threshold: mean+3*std. 
""" X = ensure_dataframe(X) Xnum = numeric_only(X) n, d = Xnum.shape if n == 0 or d == 0: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "pca_recon", {}, "empty data") if n_components is None: if explained_variance is not None: temp_pca = PCA(n_components=min(n, d)) temp_pca.fit(Xnum) cum = np.cumsum(temp_pca.explained_variance_ratio_) n_components = int(np.searchsorted(cum, explained_variance) + 1) n_components = max(1, n_components) else: n_components = min(2, d) pca = PCA(n_components=n_components) proj = pca.fit_transform(Xnum) recon = pca.inverse_transform(proj) errors = ((Xnum - recon) ** 2).sum(axis=1) score = pd.Series(errors, index=Xnum.index) if threshold is None: threshold = score.mean() + 3 * score.std() mask = score > threshold return create_result(mask, score, "pca_recon", {"n_components": n_components, "threshold": float(threshold)}, "PCA reconstruction error") def gmm_method(X, n_components: int = 2, contamination: float = 0.05) -> OutlierResult: """ Gaussian Mixture Model based anomaly score (log-likelihood). Score: negative log-likelihood (bigger => less likely => more anomalous). Threshold: empirical quantile of scores. """ X = ensure_dataframe(X) Xnum = numeric_only(X) if Xnum.shape[0] < n_components: return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "gmm", {}, "too few samples") gmm = GaussianMixture(n_components=n_components) gmm.fit(Xnum) logprob = gmm.score_samples(Xnum) score = pd.Series(-logprob, index=Xnum.index) thr = score.quantile(1 - contamination) mask = score > thr return create_result(mask, score, {"n_components": n_components, "threshold": float(thr)}, "gmm", "GMM negative log-likelihood") def elliptic_envelope_method(X, contamination: float = 0.05) -> OutlierResult: """ EllipticEnvelope fits a robust covariance (assumes data come from a Gaussian-like ellipse). Flags outliers outside the ellipse. """ X = ensure_dataframe(X) Xnum = numeric_only(X) ee = EllipticEnvelope(contamination=contamination) ee.fit(Xnum) pred = ee.predict(Xnum) # decision_function: larger -> more normal; invert raw_score = -ee.decision_function(Xnum) score = pd.Series(raw_score, index=Xnum.index) mask = pd.Series(pred == -1, index=Xnum.index) return create_result(mask, score, "elliptic_envelope", {"contamination": contamination}, "EllipticEnvelope") # --------------------------- # File: outlier_detection/deep_learning.py # --------------------------- """ Deep learning based detectors (AutoEncoder, VAE). These require TensorFlow/Keras installed. If not present, importing this module will raise an informative ImportError. Design: a training function accepts X (numpy or DataFrame) and returns a callable `score_fn(X_new) -> pd.Series` plus a threshold selection helper. """ from typing import Callable import numpy as np import pandas as pd # lazy import to avoid hard TF dependency if user doesn't need it try: import tensorflow as tf from tensorflow.keras import layers, models, backend as K except Exception as e: raise ImportError("TensorFlow / Keras is required for deep_learning module. Install with `pip install tensorflow`. 
Error: " + str(e)) from .utils import ensure_dataframe, create_result def _build_autoencoder(input_dim: int, latent_dim: int = 8, hidden_units=(64, 32)) -> models.Model: inp = layers.Input(shape=(input_dim,)) x = inp for h in hidden_units: x = layers.Dense(h, activation='relu')(x) z = layers.Dense(latent_dim, activation='relu', name='latent')(x) x = z for h in reversed(hidden_units): x = layers.Dense(h, activation='relu')(x) out = layers.Dense(input_dim, activation='linear')(x) ae = models.Model(inp, out) return ae def autoencoder_method(X, latent_dim: int = 8, hidden_units=(128, 64), epochs: int = 50, batch_size: int = 32, validation_split: float = 0.1, threshold_method: str = 'quantile', threshold_val: float = 0.99, verbose: int = 0) -> OutlierResult: """ Train an AutoEncoder on X and compute reconstruction error as anomaly score. Parameters ---------- X : DataFrame or numpy array (numeric) threshold_method : 'quantile' or 'mean_std' threshold_val : if quantile -> e.g. 0.99 means top 1% flagged; if mean_std -> number of stds Returns ------- OutlierResult where score = reconstruction error and mask = score > threshold Notes ----- - This trains on the entire provided X. For actual anomaly detection, it's common to train the autoencoder only on "normal" data. If you have labels, pass only normal subset for training. - Requires careful scaling of inputs before training (robust_scale recommended). """ Xdf = ensure_dataframe(X) Xnum = Xdf.select_dtypes(include=['number']).fillna(0.0) input_dim = Xnum.shape[1] if input_dim == 0: return create_result(pd.Series(False, index=Xdf.index), pd.Series(0.0, index=Xdf.index), "autoencoder", {}, "no numeric columns") # convert to numpy arr = Xnum.values.astype(np.float32) ae = _build_autoencoder(input_dim=input_dim, latent_dim=latent_dim, hidden_units=hidden_units) ae.compile(optimizer='adam', loss='mse') ae.fit(arr, arr, epochs=epochs, batch_size=batch_size, validation_split=validation_split, verbose=verbose) recon = ae.predict(arr) errors = np.mean((arr - recon) ** 2, axis=1) score = pd.Series(errors, index=Xdf.index) if threshold_method == 'quantile': thr = float(score.quantile(threshold_val)) else: thr = float(score.mean() + threshold_val * score.std()) mask = score > thr return create_result(mask, score, "autoencoder", {"latent_dim": latent_dim, "threshold": thr}, "AutoEncoder reconstruction error") def vae_method(X, latent_dim: int = 8, hidden_units=(128, 64), epochs: int = 50, batch_size: int = 32, threshold_method: str = 'quantile', threshold_val: float = 0.99, verbose: int = 0) -> OutlierResult: """ Variational Autoencoder (VAE) anomaly detection. Implementation note: VAE is more involved; here we provide a simple implementation that uses reconstruction error as score. For strict probabilistic anomaly scoring one would use the ELBO / likelihood; this minimal implementation keeps it practical. """ # For brevity we reuse autoencoder path (a more complete VAE impl is possible) return autoencoder_method(X, latent_dim=latent_dim, hidden_units=hidden_units, epochs=epochs, batch_size=batch_size, threshold_method=threshold_method, threshold_val=threshold_val, verbose=verbose) # --------------------------- # File: outlier_detection/ensemble.py # --------------------------- """ Combine multiple detectors and produce an aggregated report. Provides strategies: union, intersection, majority voting, weighted sum of normalized scores. 
""" from typing import List, Dict import numpy as np import pandas as pd from .utils import ensure_dataframe, create_result def normalize_scores(scores: pd.DataFrame) -> pd.DataFrame: """Min-max normalize each score column to [0,1].""" sc = scores.copy() for c in sc.columns: col = sc[c] mn = col.min() mx = col.max() if mx == mn: sc[c] = 0.0 else: sc[c] = (col - mn) / (mx - mn) return sc def aggregate_scores(results: Dict[str, Dict], method: str = 'weighted', weights: Dict[str, float] = None) -> Dict: """ Aggregate multiple OutlierResult dictionaries produced by detectors. Returns an OutlierResult-like dict with: - mask (final boolean by threshold on aggregate score), - score (aggregate numeric score) Aggregation methods: - 'union' : any detector flagged => outlier (score = max of normalized scores) - 'intersection' : flagged by all detectors => outlier - 'majority' : flagged by >50% detectors - 'weighted' : weighted sum of normalized scores (weights provided or equal) """ # collect masks and scores into DataFrames masks = pd.DataFrame({k: v['mask'].astype(int) for k, v in results.items()}) raw_scores = pd.DataFrame({k: (v['score'] if isinstance(v['score'], pd.Series) else pd.Series(v['score'])) for k, v in results.items()}) raw_scores.index = masks.index norm_scores = normalize_scores(raw_scores) if method == 'union': agg_score = norm_scores.max(axis=1) elif method == 'intersection': agg_score = norm_scores.min(axis=1) elif method == 'majority': agg_score = masks.sum(axis=1) / max(1, masks.shape[1]) elif method == 'weighted': if weights is None: weights = {k: 1.0 for k in results.keys()} # align weights w = pd.Series({k: weights.get(k, 1.0) for k in results.keys()}) # make sure weights sum to 1 w = w / w.sum() agg_score = (norm_scores * w).sum(axis=1) else: raise ValueError("Unknown aggregation method") # default threshold: 0.5 mask = agg_score > 0.5 return create_result(mask, agg_score, f"ensemble_{method}", {"method": method}, "Aggregated ensemble score") def ensemble_methods(X, method_list: List[str] = None, method_params: Dict = None) -> Dict[str, Dict]: """ Convenience: run multiple detectors by name and return dict of results. method_list: list of names from ['iqr','modified_z','z_score','lof','mahalanobis','isolation_forest', ...] method_params: optional dict mapping method name to params """ from . 
import statistical, distance_density, model_based, deep_learning X = ensure_dataframe(X) if method_list is None: method_list = ['iqr', 'modified_z', 'isolation_forest', 'lof'] if method_params is None: method_params = {} results = {} for m in method_list: params = method_params.get(m, {}) try: if m == 'iqr': results[m] = statistical.iqr_method(X, **params) elif m == 'modified_z': results[m] = statistical.modified_z_score(X, **params) elif m == 'z_score': results[m] = statistical.z_score_method(X, **params) elif m == 'lof': results[m] = distance_density.lof_method(X, **params) elif m == 'mahalanobis': results[m] = distance_density.mahalanobis_method(X, **params) elif m == 'dbscan': results[m] = distance_density.dbscan_method(X, **params) elif m == 'knn': results[m] = distance_density.knn_distance_method(X, **params) elif m == 'isolation_forest': results[m] = model_based.isolation_forest_method(X, **params) elif m == 'one_class_svm': results[m] = model_based.one_class_svm_method(X, **params) elif m == 'pca': results[m] = model_based.pca_reconstruction_error(X, **params) elif m == 'gmm': results[m] = model_based.gmm_method(X, **params) elif m == 'elliptic': results[m] = model_based.elliptic_envelope_method(X, **params) elif m == 'autoencoder': results[m] = deep_learning.autoencoder_method(X, **params) else: logger.warning("Unknown method requested: %s", m) except Exception as e: logger.exception("Method %s failed: %s", m, e) return results # --------------------------- # File: outlier_detection/visualization.py # --------------------------- """ Simple plotting helpers for quick inspection. Note: plotting is intentionally minimal; for report-quality figures users can adapt styles. The functions return the matplotlib Figure object so they can be further customized. """ import matplotlib.pyplot as plt from .utils import ensure_dataframe def plot_boxplot(series: pd.Series, show: bool = True): df = ensure_dataframe(series) col = df.columns[0] fig, ax = plt.subplots() ax.boxplot(df[col].dropna()) ax.set_title(f"Boxplot: {col}") if show: plt.show() return fig def plot_pair_scatter(X, columns: list = None, show: bool = True): X = ensure_dataframe(X) if columns is not None: X = X[columns] cols = X.columns.tolist()[:4] # avoid huge plots fig, axes = plt.subplots(len(cols) - 1, len(cols) - 1, figsize=(4 * (len(cols) - 1), 4 * (len(cols) - 1))) for i in range(1, len(cols)): for j in range(i): ax = axes[i - 1, j] ax.scatter(X[cols[j]], X[cols[i]], s=8) ax.set_xlabel(cols[j]) ax.set_ylabel(cols[i]) fig.suptitle("Pairwise scatter (first 4 numeric cols)") if show: plt.show() return fig # --------------------------- # File: outlier_detection/cli.py # --------------------------- """ A very small CLI to run detectors on a CSV file and output a CSV report. 
Usage (example): python -m outlier_detection.cli detect input.csv output_report.csv --methods iqr,isolation_forest """ import argparse import pandas as pd from .ensemble import ensemble_methods, aggregate_scores def main(): parser = argparse.ArgumentParser(description='Outlier detection CLI') sub = parser.add_subparsers(dest='cmd') det = sub.add_parser('detect') det.add_argument('input_csv') det.add_argument('output_csv') det.add_argument('--methods', default='iqr,modified_z,isolation_forest,lof') args = parser.parse_args() df = pd.read_csv(args.input_csv) methods = args.methods.split(',') results = ensemble_methods(df, method_list=methods) agg = aggregate_scores(results, method='weighted') summary = pd.concat([pd.DataFrame({k: v['mask'].astype(int) for k, v in results.items()}), pd.DataFrame({k: v['score'] for k, v in results.items()})], axis=1) summary['ensemble_score'] = agg['score'] summary['ensemble_flag'] = agg['mask'].astype(int) summary.to_csv(args.output_csv, index=False) print(f"Wrote report to {args.output_csv}") if __name__ == '__main__': main()改成中文说明并返回代码给我
08-27
使用matlab工具写此代码 from pathlib import Path import re import numpy as np import pandas as pd from typing import Dict, List, Tuple, Optional %---------------------- 配置参数 ---------------------- BASE_DIR = Path("附件/附件 3")% 如果基础文件夹不同,请修改 BEFORE_DIRNAME = "姿势调整前" AFTER_DIRNAME = "姿势调整后" %时间归一化目标长度 TARGET_LEN = 200 % 选择度量方式: "mse" (时间上的平均欧氏距离) 或 "dtw" (动态时间规整) METRIC = "dtw" % 输出保存位置 OUT_DIR = Path("analysis_outputs") OUT_DIR.mkdir(parents=True, exist_ok=True) %用于解析"运动者<id>第<attempt>次"的正则表达式(支持.csv/.txt等) NAME_RE = re.compile(r"运动者(\d+)第(\d+)次", re.IGNORECASE) %---------------------- 工具函数 ---------------------- def robust_read_table(fp: Path) -> pd.DataFrame: """ 尝试读取关键点表格,支持灵活的分隔符:逗号、制表符或多个空格。 保持表头名称不变。 """ % 先尝试逗号分隔;失败则使用正则表达式分隔符 try: df = pd.read_csv(fp) if df.shape[1] <= 2: raise ValueError("使用逗号分隔时列数太少,尝试正则表达式分隔符。") except Exception: df = pd.read_csv(fp, sep=r"[,\s\t]+", engine="python") %如果存在明显的未命名索引列,则删除 drop_cols = [c for c in df.columns if str(c).lower().startswith("unnamed")] if drop_cols: df = df.drop(columns=drop_cols) return df def get_keypoint_columns(df: pd.DataFrame) -> List[Tuple[str, str]]: """ 基于表头模式(如"0_X"、"0_Y")返回关键点索引的(x_col, y_col)有序对。 如果表头使用例如"0-X"或"0X",可以在此扩展。 """ xy_pairs = [] %按索引收集 by_index = {} for col in df.columns: s = str(col).strip() %匹配数字后跟下划线和X/Y的模式 m = re.match(r"(\d+)[_\-]?[xX]$", s) if m: idx = int(m.group(1)) by_index.setdefault(idx, {})["x"] = s continue m = re.match(r"(\d+)[_\-]?[yY]$", s) if m: idx = int(m.group(1)) by_index.setdefault(idx, {})["y"] = s continue # 也接受"0_X"、"0_Y"的精确匹配 m = re.match(r"(\d+)_([XY])$", s) if m: idx = int(m.group(1)) axis = m.group(2).lower() by_index.setdefault(idx, {})[axis] = s continue # 按索引构建有序列表 for idx in sorted(by_index.keys()): cols = by_index[idx] # 确保同时有x和y坐标 if "x" in cols and "y" in cols: xy_pairs.append((cols["x"], cols["y"])) # 如果没有找到任何关键点对,抛出错误 if not xy_pairs: raise ValueError("无法检测到类似'0_X'、'0_Y'的关键点列。请检查表头。") return xy_pairs # 可以在这里继续添加其他函数,如线性重采样、空间归一化等 def linear_resample(arr: np.ndarray, target_len: int) -> np.ndarray: """ 将形状为(T, D)的数组线性重采样为(target_len, D)。 """ T, D = arr.shape if T == target_len: return arr.copy() old_idx = np.linspace(0, 1, T) new_idx = np.linspace(0, 1, target_len) out = np.empty((target_len, D), dtype=float) for d in range(D): out[:, d] = np.interp(new_idx, old_idx, arr[:, d]) return out
09-06
""" 传感器“卡值”(stuck value) 轻量级检测脚本(稳定相位+模板Z过滤版,Python 3.8+) ------------------------------------------------ 特点: - 仅依赖 numpy / pandas / matplotlib - 同时支持: 1) 绝对卡值/重复值段(平坦段) 2) 低变化段(导数接近0) 3) 周期性数据的“相位差残差”卡值(去除周期基线后再检测) - 自动估计或手动指定主周期 - 新增:跨周期“天然稳定相位”与模板Z分数过滤,显著降低对天然平台的误报 - 返回区间级别的告警(起止索引/时间、类型、置信度) - 新增周期模板可视化功能 用法示例见底部 __main__ 区域。 """ from dataclasses import dataclass from typing import List, Optional, Tuple, Dict import numpy as np import pandas as pd import matplotlib.pyplot as plt from matplotlib.patches import Rectangle # ----------------------------- # 工具函数 # ----------------------------- def _rolling_std(x: np.ndarray, window: int) -> np.ndarray: if window <= 1: return np.zeros_like(x, dtype=float) s = pd.Series(x) return s.rolling(window, min_periods=window).std(ddof=0).to_numpy() def _autocorr_via_fft(x: np.ndarray) -> np.ndarray: """快速自相关(归一化),返回与 x 等长的自相关数组。""" x = np.asarray(x, dtype=float) x = x - np.nanmean(x) x[np.isnan(x)] = 0.0 n = int(1 << (len(x) * 2 - 1).bit_length()) fx = np.fft.rfft(x, n=n) acf = np.fft.irfft(fx * np.conjugate(fx), n=n)[: len(x)] acf /= np.maximum(acf[0], 1e-12) return acf def estimate_period( x: np.ndarray, min_period: int = 5, max_period: Optional[int] = None, ) -> Optional[int]: """粗略估计主周期(样本点单位)。返回最可能的周期长度,找不到返回 None。""" n = len(x) if n < 3 * min_period: return None if max_period is None: max_period = max(min(n // 3, 2000), min_period + 1) acf = _autocorr_via_fft(x) seg = acf[min_period: max_period] if len(seg) == 0: return None k = int(np.nanargmax(seg)) + min_period if acf[k] < 0.15: return None return k def _group_runs(mask: np.ndarray) -> List[Tuple[int, int]]: """将布尔序列中为 True 的连续段转为 [start, end](含 end)。""" runs: List[Tuple[int, int]] = [] i = 0 n = len(mask) while i < n: if mask[i]: j = i while j + 1 < n and mask[j + 1]: j += 1 runs.append((i, j)) i = j + 1 else: i += 1 return runs def _phase_template_sigma(x: np.ndarray, period: int): """ 计算“同相位跨周期”的模板(相位中位数)与 σ(由MAD近似)。 返回 (template, sigma),若周期样本不足(<2个周期)则返回 (None, None)。 """ m = len(x) // period if m < 2: return None, None X = x[: m * period].reshape(m, period) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 return template, sigma # ----------------------------- # 结果数据结构 # ----------------------------- @dataclass class StuckInterval: start_idx: int end_idx: int kind: str # "flat", "low_var", "seasonal_flat" score: float # 0~1 大致置信度 value_summary: str # ----------------------------- # 主检测器 # ----------------------------- class StuckDetector: def __init__( self, min_flat_run: int = 5, value_tol: float = 0.0, deriv_window: int = 3, deriv_tol: float = 1e-6, lowvar_window: int = 15, lowvar_std_tol: float = 1e-4, seasonal_period: Optional[int] = None, seasonal_robust: bool = True, seasonal_min_run: int = 5, seasonal_value_tol: float = 0.0, # —— 稳定相位+Z过滤参数 —— stable_phase_q: float = 0.20, # σ 的分位阈值,以下视为“天然稳定相位” stable_overlap_thr: float = 0.60, # 区间与稳定相位重叠比例阈值 require_z_k: float = 3.0 # 在稳定相位里仍要报卡值的最小Z阈值 ) -> None: self.min_flat_run = min_flat_run self.value_tol = value_tol self.deriv_window = deriv_window self.deriv_tol = deriv_tol self.lowvar_window = lowvar_window self.lowvar_std_tol = lowvar_std_tol self.seasonal_period = seasonal_period self.seasonal_robust = seasonal_robust self.seasonal_min_run = seasonal_min_run self.seasonal_value_tol = seasonal_value_tol self.stable_phase_q = stable_phase_q self.stable_overlap_thr = stable_overlap_thr self.require_z_k = require_z_k # ---- 基础检测 ---- def _flat_runs(self, x: 
np.ndarray) -> List[StuckInterval]: eq = np.abs(np.diff(x, prepend=x[0])) <= self.value_tol runs = _group_runs(eq) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.min_flat_run: v = np.median(x[s: e + 1]) out.append(StuckInterval(s, e, "flat", score=0.7, value_summary="~{:.6g}".format(v))) return out def _low_variability(self, x: np.ndarray) -> List[StuckInterval]: sd = _rolling_std(x, self.lowvar_window) mask = sd <= self.lowvar_std_tol runs = _group_runs(mask) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= max(self.lowvar_window, self.min_flat_run): v = np.median(x[s: e + 1]) local_sd = float(np.nanmax(sd[s: e + 1])) if np.isfinite(sd[s: e + 1]).any() else 0.0 score = float(np.clip(1.0 - local_sd / (self.lowvar_std_tol + 1e-12), 0, 1)) out.append(StuckInterval(s, e, "low_var", score=score, value_summary="~{:.6g}".format(v))) return out def _derivative_flat(self, x: np.ndarray) -> List[StuckInterval]: dx = np.diff(x, prepend=x[0]) if self.deriv_window > 1: dx = pd.Series(dx).rolling(self.deriv_window, min_periods=1, center=True).mean().to_numpy() mask = np.abs(dx) <= self.deriv_tol runs = _group_runs(mask) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.min_flat_run: v = np.median(x[s: e + 1]) local_absmax = float(np.nanmax(np.abs(dx[s: e + 1]))) if np.isfinite(dx[s: e + 1]).any() else 0.0 score = float(np.clip(1.0 - local_absmax / (self.deriv_tol + 1e-12), 0, 1)) out.append(StuckInterval(s, e, "flat", score=score, value_summary="~{:.6g}".format(v))) return out # ---- 周期性处理 ---- def _seasonal_baseline(self, x: np.ndarray, period: int) -> np.ndarray: phase_vals = [x[i::period] for i in range(period)] if self.seasonal_robust: phase_stats = [np.nanmedian(v) for v in phase_vals] else: phase_stats = [np.nanmean(v) for v in phase_vals] baseline = np.empty_like(x, dtype=float) for i in range(period): baseline[i::period] = phase_stats[i] return baseline def _seasonal_flat_runs(self, x: np.ndarray, period: int) -> List[StuckInterval]: baseline = self._seasonal_baseline(x, period) resid = x - baseline template, sigma = _phase_template_sigma(x, period) use_z = template is not None and sigma is not None eps = 1e-12 eq = np.abs(np.diff(resid, prepend=resid[0])) <= self.seasonal_value_tol runs = _group_runs(eq) out: List[StuckInterval] = [] for s, e in runs: if e - s + 1 >= self.seasonal_min_run: v = np.median(x[s: e + 1]) rmad_series = pd.Series(resid).rolling(period, min_periods=period) \ .apply(lambda w: np.nanmedian(np.abs(w - np.nanmedian(w))), raw=False) # 取窗口末端的RMAD,防止 NaN rmad_val = rmad_series.iloc[e] if pd.isna(rmad_val): rmad_val = 0.0 rmad = float(rmad_val) flat_strength = 1.0 if self.seasonal_value_tol <= 0 else float( np.clip(1.0 - np.nanmax(np.abs(np.diff(resid[s: e + 1], prepend=resid[s]))) / ( self.seasonal_value_tol + 1e-12), 0, 1) ) season_stability = float(np.clip(1.0 - rmad / (np.nanstd(resid) + 1e-12), 0, 1)) if np.isfinite( rmad) else 0.5 score = float(np.clip(0.5 * flat_strength + 0.5 * season_stability, 0, 1)) if use_z: idx = np.arange(s, e + 1) phase = idx % period z = np.abs(x[idx] - template[phase]) / (sigma[phase] + eps) z_med = float(np.nanmedian(z)) if np.isfinite(z).any() else 0.0 if z_med < max(2.5, self.require_z_k - 0.5): continue out.append(StuckInterval(s, e, "seasonal_flat", score=score, value_summary="~{:.6g}".format(v))) return out # ---- 稳定相位+Z过滤 ---- def _filter_by_stability_and_z( self, x: np.ndarray, intervals: List[StuckInterval], period: int, template: np.ndarray, sigma: np.ndarray, stable_mask: 
np.ndarray, ) -> List[StuckInterval]: keep: List[StuckInterval] = [] eps = 1e-12 n = len(x) for it in intervals: s, e = it.start_idx, it.end_idx s = max(0, int(s)); e = min(n - 1, int(e)) if s > e: continue overlap = float(np.mean(stable_mask[s:e + 1])) if e >= s else 0.0 idx = np.arange(s, e + 1) phase = idx % period z = np.abs(x[idx] - template[phase]) / (sigma[phase] + eps) z_med = float(np.nanmedian(z)) if np.isfinite(z).any() else 0.0 if overlap >= self.stable_overlap_thr and z_med < self.require_z_k: continue keep.append(it) return keep # ---- 主入口 ---- def detect(self, series: pd.Series) -> Tuple[List[StuckInterval], Dict[str, Optional[int]]]: """ 输入:时间序列(pd.Series,索引可为时间戳或整数) 输出: - intervals: StuckInterval 列表(不重叠;若重叠会做简单合并) - meta: {"period": 周期估计} """ x = series.to_numpy(dtype=float) n = len(x) intervals: List[StuckInterval] = [] period = self.seasonal_period or estimate_period(x) template = sigma = None stable_phase = None stable_mask = np.zeros(n, dtype=bool) if period is not None and period >= 3 and period * 2 <= n: template, sigma = _phase_template_sigma(x, period) if template is not None: thr = np.nanquantile(sigma, self.stable_phase_q) stable_phase = sigma <= thr m = n // period for i in range(m): stable_mask[i * period: i * period + period] = stable_phase rem = n - m * period if rem > 0: stable_mask[m * period: m * period + rem] = stable_phase[:rem] intervals.extend(self._flat_runs(x)) intervals.extend(self._low_variability(x)) intervals.extend(self._derivative_flat(x)) if period is not None and period >= 3 and period * 2 <= n: intervals.extend(self._seasonal_flat_runs(x, period)) if (period is not None) and (template is not None) and (sigma is not None): intervals = self._filter_by_stability_and_z(x, intervals, period, template, sigma, stable_mask) intervals = self._merge_intervals(intervals) return intervals, {"period": period, "template": template, "sigma": sigma} @staticmethod def _merge_intervals(intervals: List[StuckInterval]) -> List[StuckInterval]: if not intervals: return [] intervals = sorted(intervals, key=lambda z: (z.start_idx, z.end_idx)) merged: List[StuckInterval] = [] cur = intervals[0] for nx in intervals[1:]: if nx.start_idx <= cur.end_idx + 1 and nx.kind == cur.kind: new_s = cur.start_idx new_e = max(cur.end_idx, nx.end_idx) score = max(cur.score, nx.score) cur = StuckInterval(new_s, new_e, cur.kind, score=score, value_summary=cur.value_summary) elif nx.start_idx <= cur.end_idx + 1 and nx.kind != cur.kind: if nx.score >= cur.score: cur = StuckInterval(cur.start_idx, max(cur.end_idx, nx.end_idx), nx.kind, nx.score, nx.value_summary) else: cur = StuckInterval(cur.start_idx, max(cur.end_idx, nx.end_idx), cur.kind, cur.score, cur.value_summary) else: merged.append(cur) cur = nx merged.append(cur) return merged # ----------------------------- # 便捷接口 # ----------------------------- def detect_stuck_segments( series: pd.Series, sampling_period: Optional[pd.Timedelta] = None, **kwargs, ) -> pd.DataFrame: """一次性运行并返回 DataFrame 结果。""" det = StuckDetector(**kwargs) intervals, meta = det.detect(series) rows: List[Dict[str, object]] = [] for it in intervals: start_time = end_time = None if isinstance(series.index, pd.DatetimeIndex): if sampling_period is None: start_time = series.index[it.start_idx] end_time = series.index[it.end_idx] else: start_time = series.index[0] + it.start_idx * sampling_period end_time = series.index[0] + it.end_idx * sampling_period rows.append( { "start_idx": it.start_idx, "end_idx": it.end_idx, "start_time": start_time, "end_time": 
end_time, "kind": it.kind, "score": it.score, "value_summary": it.value_summary, "length": it.end_idx - it.start_idx + 1, } ) df = pd.DataFrame(rows) if "period" in meta: df.attrs["estimated_period"] = meta["period"] return df # ----------------------------- # 进阶:基于“同周期模板”的卡值定位(周期内卡段) # ----------------------------- def detect_within_cycle_stuck( series: pd.Series, period: Optional[int] = None, # 支持自动估计 min_run: int = 5, value_tol: float = 0.0, run_std_tol: float = 1e-4, peer_z_k: float = 4.0, ) -> pd.DataFrame: x = series.to_numpy(dtype=float) n = len(x) # 自动估计周期 if period is None: period = estimate_period(x) if period is None or period < 3: return pd.DataFrame() m = n // period if m < 2: return pd.DataFrame() X = x[: m * period].reshape(m, period) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 rows: List[Dict[str, object]] = [] for ci in range(m): cyc = X[ci] diff = np.abs(cyc - template) eq = np.abs(np.diff(cyc, prepend=cyc[0])) <= value_tol run_std = pd.Series(cyc).rolling(min_run, min_periods=min_run).std(ddof=0).to_numpy() std_mask = run_std <= run_std_tol mask = np.zeros(period, dtype=bool) for i in range(period): L = max(0, i - min_run + 1) R = i if R - L + 1 >= min_run: cond_std = (not np.isnan(std_mask[R])) and bool(std_mask[R]) if eq[L:R + 1].all() and cond_std: mask[L:R + 1] = True runs = _group_runs(mask) for s, e in runs: if float(np.nanmedian(sigma[s:e + 1])) < run_std_tol: continue z = diff[s:e + 1] / sigma[s:e + 1] z_med = float(np.nanmedian(z)) if np.isfinite(z_med) and z_med >= peer_z_k: abs_s = ci * period + s abs_e = ci * period + e mean_diff = float(np.nanmean(np.sign(cyc[s:e + 1] - template[s:e + 1]) * diff[s:e + 1])) score = float(np.tanh((z_med - peer_z_k) / 2 + 1)) rows.append({ "cycle_idx": ci, "start_phase": s, "end_phase": e, "abs_start_idx": abs_s, "abs_end_idx": abs_e, "mean_diff_to_template": mean_diff, "kind": "cycle_flat", "score": score, }) return pd.DataFrame(rows) # ----------------------------- # 合成温度型周期数据(非正弦,含天然稳定段 + 可选卡值段) # ----------------------------- def generate_temperature_like_series( start_time: str = "2025-01-01", period: int = 60, cycles: int = 20, noise_std: float = 0.03, stable_plateau_len: int = 8, stable_values: Tuple[float, float] = (31.0, 34.0), temp_range: Tuple[float, float] = (30.0, 35.0), stuck_cycle_idx: Optional[int] = 8, stuck_phase_range: Tuple[int, int] = (20, 35), stuck_value: Optional[float] = None, seed: Optional[int] = 42, ) -> pd.Series: """生成更接近温度传感器的周期信号(非正弦)。""" rng = np.random.RandomState(seed) # 为兼容旧版 NumPy a = max(6, (period // 3) - stable_plateau_len) b = max(6, (period - a - 2 * stable_plateau_len)) v_low, v_high = stable_values v_low = float(np.clip(v_low, *temp_range)) v_high = float(np.clip(v_high, *temp_range)) one_cycle_parts = [] if a > 0: ramp_up = np.linspace(v_low, v_high, a, endpoint=False) ramp_up = ramp_up + rng.normal(0, noise_std, size=a) one_cycle_parts.append(ramp_up) plateau1 = np.full(stable_plateau_len, v_high) one_cycle_parts.append(plateau1) if b > 0: mid = v_low + 0.5 * (v_high - v_low) ramp_var = np.linspace(v_high, mid, b, endpoint=False) ramp_var = ramp_var + rng.normal(0, noise_std, size=b) one_cycle_parts.append(ramp_var) plateau2 = np.full(stable_plateau_len, v_low) one_cycle_parts.append(plateau2) one_cycle = np.concatenate(one_cycle_parts) if len(one_cycle) < period: pad = np.full(period - len(one_cycle), v_low) one_cycle = np.concatenate([one_cycle, pad]) else: one_cycle = one_cycle[:period] sig = 
np.tile(one_cycle, cycles) plateau_mask = np.zeros(period, dtype=bool) plateau_mask[a:a + stable_plateau_len] = True plateau_mask[a + stable_plateau_len + b: a + stable_plateau_len + b + stable_plateau_len] = True plateau_mask_full = np.tile(plateau_mask, cycles) non_plateau_idx = np.where(~plateau_mask_full)[0] sig[non_plateau_idx] += rng.normal(0, noise_std, size=len(non_plateau_idx)) if stuck_cycle_idx is not None: s = stuck_cycle_idx * period + stuck_phase_range[0] e = stuck_cycle_idx * period + stuck_phase_range[1] s = int(np.clip(s, 0, len(sig) - 1)) e = int(np.clip(e, 0, len(sig) - 1)) if s <= e: if stuck_value is None: seg = sig[s:e + 1] sv = float(np.round(np.median(seg), 3)) else: sv = float(np.clip(stuck_value, *temp_range)) sig[s:e + 1] = sv sig = np.clip(sig, *temp_range) idx = pd.date_range(start_time, periods=len(sig), freq="S") return pd.Series(sig, index=idx) # ----------------------------- # 可视化:原始数据 + 卡值区间 + 稳定点 标注 # ----------------------------- def _mask_from_intervals(n: int, intervals_df: pd.DataFrame, start_col: str, end_col: str) -> np.ndarray: m = np.zeros(n, dtype=bool) if intervals_df is None or len(intervals_df) == 0: return m for s, e in intervals_df[[start_col, end_col]].to_numpy(): s = int(max(0, s)) e = int(min(n - 1, e)) if s <= e: m[s: e + 1] = True return m def detect_stable_points( series: pd.Series, period: Optional[int] = None, stuck_mask: Optional[np.ndarray] = None, method: str = "cycle_sigma", stable_sigma_q: float = 0.2, rolling_window: int = 20, rolling_std_tol: float = 1e-3, ) -> np.ndarray: """返回布尔数组,表示“本身为稳定的点”。 method="cycle_sigma" 时若未提供 period,会尝试自动估计;失败则退回滚动标准差法。 """ x = series.to_numpy(dtype=float) n = len(x) stable = np.zeros(n, dtype=bool) if stuck_mask is None: stuck_mask = np.zeros(n, dtype=bool) if method == "cycle_sigma": p = period if p is None: p = estimate_period(x) if p is not None and n // p >= 2: m = n // p X = x[: m * p].reshape(m, p) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 thr = np.nanquantile(sigma, stable_sigma_q) stable_phase = sigma <= thr for i in range(m): idx0 = i * p stable[idx0: idx0 + p] = stable_phase rem = n - m * p if rem > 0: stable[m * p: m * p + rem] = stable_phase[:rem] else: sd = _rolling_std(x, rolling_window) stable = sd <= rolling_std_tol else: sd = _rolling_std(x, rolling_window) stable = sd <= rolling_std_tol stable = np.logical_and(stable, ~stuck_mask) return stable def plot_stuck_overview( series: pd.Series, res_basic: Optional[pd.DataFrame] = None, res_within: Optional[pd.DataFrame] = None, period: Optional[int] = None, stable_method: str = "cycle_sigma", stable_sigma_q: float = 0.2, rolling_window: int = 20, rolling_std_tol: float = 1e-3, figsize: Tuple[int, int] = (12, 5), save_path: Optional[str] = None, ): """绘制原始数据,并叠加:卡值区间(红)+ 稳定点(绿)。""" x = series.to_numpy(dtype=float) n = len(x) mask_basic = _mask_from_intervals(n, res_basic, "start_idx", "end_idx") if res_basic is not None else np.zeros(n, dtype=bool) mask_within = _mask_from_intervals(n, res_within, "abs_start_idx", "abs_end_idx") if res_within is not None else np.zeros(n, dtype=bool) stuck_mask = np.logical_or(mask_basic, mask_within) stable_mask = detect_stable_points( series, period=period, stuck_mask=stuck_mask, method=stable_method, stable_sigma_q=stable_sigma_q, rolling_window=rolling_window, rolling_std_tol=rolling_std_tol, ) fig, ax = plt.subplots(1, 1, figsize=figsize) if isinstance(series.index, pd.DatetimeIndex): t = series.index else: t = 
np.arange(n) ax.plot(t, x, linewidth=1.2, label="signal") def _add_spans(mask, color="#ff4d4f", alpha=0.25, label="stuck"): runs = _group_runs(mask) for i, (s, e) in enumerate(runs): ax.axvspan(t[s], t[e], color=color, alpha=alpha, lw=0, label=(label if i == 0 else None)) _add_spans(stuck_mask, label="stuck") idx_stable = np.where(stable_mask)[0] if len(idx_stable) > 0: ax.scatter(t[idx_stable], x[idx_stable], s=12, color="#52c41a", label="stable points", zorder=3) ax.set_title("Signal with Stuck Segments and Stable Points") ax.set_xlabel("Time" if isinstance(series.index, pd.DatetimeIndex) else "Index") ax.set_ylabel("Value") ax.legend() ax.grid(True, linestyle=":", alpha=0.5) if save_path: fig.savefig(save_path, dpi=160, bbox_inches="tight") return fig, ax # ----------------------------- # 新增:周期模板可视化 # ----------------------------- def plot_cycle_template( series: pd.Series, period: Optional[int] = None, stable_phase_q: float = 0.20, figsize: Tuple[int, int] = (10, 6), save_path: Optional[str] = None, ): """ 绘制周期模板和稳定相位区域 Args: series: 输入时间序列 period: 周期长度(自动估计或手动指定) stable_phase_q: 稳定相位判定分位数阈值 figsize: 图像尺寸 save_path: 保存路径(可选) """ x = series.to_numpy(dtype=float) n = len(x) # 自动估计周期 if period is None: period = estimate_period(x) if period is None: raise ValueError("无法自动估计周期,请手动指定周期长度") # 计算周期模板和标准差 m = n // period if m < 2: raise ValueError("数据长度不足两个完整周期") X = x[: m * period].reshape(m, period) template = np.nanmedian(X, axis=0) mad = np.nanmedian(np.abs(X - template[None, :]), axis=0) sigma = 1.4826 * mad + 1e-12 # 确定稳定相位区域 stable_phase_mask = sigma <= np.nanquantile(sigma, stable_phase_q) # 创建绘图 fig, ax = plt.subplots(figsize=figsize) # 绘制模板 phase_idx = np.arange(period) ax.plot(phase_idx, template, 'b-', linewidth=2.5, label='周期模板') # 绘制±1σ和±2σ区域 ax.fill_between(phase_idx, template - sigma, template + sigma, color='blue', alpha=0.15, label='±1σ') ax.fill_between(phase_idx, template - 2*sigma, template + 2*sigma, color='blue', alpha=0.08, label='±2σ') # 标记稳定相位区域 stable_runs = _group_runs(stable_phase_mask) for s, e in生成剩余代码
11-12
本项目采用C++编程语言结合ROS框架构建了完整的双机械臂控制系统,实现了Gazebo仿真环境下的协同运动模拟,并完成了两台实体UR10工业机器人的联动控制。该毕业设计在答辩环节获得98分的优异成绩,所有程序代码均通过系统性调试验证,保证可直接部署运行。 系统架构包含三个核心模块:基于ROS通信架构的双臂协调控制器、Gazebo物理引擎下的动力学仿真环境、以及真实UR10机器人的硬件接口层。在仿真验证阶段,开发了双臂碰撞检测算法和轨迹规划模块,通过ROS控制包实现了末端执行器的同步轨迹跟踪。硬件集成方面,建立了基于TCP/IP协议的实时通信链路,解决了双机数据同步和运动指令分发等关键技术问题。 本资源适用于自动化、机械电子、人工智能等专业方向的课程实践,可作为高年级课程设计、毕业课题的重要参考案例。系统采用模块化设计理念,控制核心与硬件接口分离架构便于功能扩展,具备工程实践能力的学习者可在现有框架基础上进行二次开发,例如集成视觉感知模块或优化运动规划算法。 项目文档详细记录了环境配置流程、参数调试方法和实验验证数据,特别说明了双机协同作业时的时序同步解决方案。所有功能模块均提供完整的API接口说明,便于使用者快速理解系统架构并进行定制化修改。 资源来源于网络分享,仅用于学习交流使用,请勿用于商业,如有侵权请联系我删除!