Size mismatch: label size and predict size do not match

This post discusses a common error encountered when using XGBoost for multi-class classification, namely a mismatch between the prediction size and the label size. It analyzes the cause of the error and presents two solutions: changing the evaluation metric, and using the Scikit-Learn wrapper.

XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'

 

I am training an XGBClassifier on my training set.

My training features are a numpy array of shape (45001, 10338), and my training labels are a numpy array of shape (45001,). There are 1161 unique labels, which I have label-encoded.

The documentation clearly says that I can create a DMatrix from a numpy array, so I am passing the training features and labels as numpy arrays directly. However, I get the following error:

---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-30-3de36245534e> in <module>()
     13  scale_pos_weight=1,
     14  seed=27)
---> 15 modelfit(xgb1, train_x, train_y)

<ipython-input-27-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
      6         xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
      7         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8             metrics='auc',early_stopping_rounds=early_stopping_rounds)
      9         alg.set_params(n_estimators=cvresult.shape[0])
     10 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
    399         for fold in cvfolds:
    400             fold.update(i, obj)
--> 401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 
    403         for key, mean, std in res:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in <listcomp>(.0)
    399         for fold in cvfolds:
    400             fold.update(i, obj)
--> 401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 
    403         for key, mean, std in res:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in eval(self, iteration, feval)
    221     def eval(self, iteration, feval):
    222         """"Evaluate the CVPack for one iteration."""
--> 223         return self.bst.eval_set(self.watchlist, iteration, feval)
    224 
    225 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in eval_set(self, evals, iteration, feval)
    865             _check_call(_LIB.XGBoosterEvalOneIter(self.handle, iteration,
    866                                                   dmats, evnames, len(evals),
--> 867                                                   ctypes.byref(msg)))
    868             return msg.value
    869         else:

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
    125     """
    126     if ret != 0:
--> 127         raise XGBoostError(_LIB.XGBGetLastError())
    128 
    129 

XGBoostError: b'[19:12:58] src/metric/rank_metric.cc:89: Check failed: (preds.size()) == (info.labels.size()) label size predict size not match'

Please find my model code below:

def modelfit(alg, train_data_features, train_labels,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgb_param['num_class'] = 1161   
        xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc',early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(train_data_features, train_labels, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(train_data_features)
    dtrain_predprob = alg.predict_proba(train_data_features)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(train_labels, dtrain_predictions))

Where am I going wrong here?

My classifier is as follows:

xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

EDIT 2: After changing the evaluation metric, I get the following error:

---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
<ipython-input-9-30c62a886c2e> in <module>()
     13  scale_pos_weight=1,
     14  seed=27)
---> 15 modelfit(xgb1, train_x_trail, train_y_trail)

<ipython-input-8-9d215eac135e> in modelfit(alg, train_data_features, train_labels, useTrainCV, cv_folds, early_stopping_rounds)
      6         xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
      7         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
----> 8             metrics='auc',early_stopping_rounds=early_stopping_rounds)
      9         alg.set_params(n_estimators=cvresult.shape[0])
     10 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in cv(params, dtrain, num_boost_round, nfold, stratified, folds, metrics, obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas, verbose_eval, show_stdv, seed, callbacks)
    398                            evaluation_result_list=None))
    399         for fold in cvfolds:
--> 400             fold.update(i, obj)
    401         res = aggcv([f.eval(i, feval) for f in cvfolds])
    402 

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/training.py in update(self, iteration, fobj)
    217     def update(self, iteration, fobj):
    218         """"Update the boosters for one iteration"""
--> 219         self.bst.update(self.dtrain, iteration, fobj)
    220 
    221     def eval(self, iteration, feval):

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in update(self, dtrain, iteration, fobj)
    804 
    805         if fobj is None:
--> 806             _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle))
    807         else:
    808             pred = self.predict(dtrain)

/home/carnd/anaconda3/envs/dl/lib/python3.5/site-packages/xgboost/core.py in _check_call(ret)
    125     """
    126     if ret != 0:
--> 127         raise XGBoostError(_LIB.XGBGetLastError())
    128 
    129 

XGBoostError: b'[03:43:03] src/objective/multiclass_obj.cc:42: Check failed: (info.labels.size()) != (0) label set cannot be empty'

Tags: python, numpy, xgboost

asked Jul 23 '17 at 4:56 by Kathiravan Natarajan (edited Aug 4 '17 at 3:45)

========================================

2 Answers

Accepted answer (score 5, +50 bounty)

The original error you get occurs because this metric (AUC) was not designed for multi-class classification (see here).

You could use the scikit-learn wrapper of xgboost to overcome this issue. I modified your code to use this wrapper and produce a similar function. I am not sure why you are doing a grid search, though, as you are not enumerating over parameters; you are simply using the parameters you specified in xgb1. Here is the modified code:

import xgboost as xgb
import sklearn
import numpy as np
from sklearn.model_selection import GridSearchCV

def modelfit(alg, train_data_features, train_labels,useTrainCV=True, cv_folds=5):

    if useTrainCV:
        params = alg.get_xgb_params()
        # wrap each value in a list so the dict can serve as a GridSearchCV parameter grid
        xgb_param = dict([(key, [params[key]]) for key in params])

        boost = xgb.sklearn.XGBClassifier()
        cvresult = GridSearchCV(boost, xgb_param, cv=cv_folds)
        # fit on the function arguments rather than the global X, y
        cvresult.fit(train_data_features, train_labels)
        alg = cvresult.best_estimator_


    #Fit the algorithm on the data
    alg.fit(train_data_features, train_labels)

    #Predict training set:
    dtrain_predictions = alg.predict(train_data_features)
    dtrain_predprob = alg.predict_proba(train_data_features)[:,1]

    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % sklearn.metrics.accuracy_score(train_labels, dtrain_predictions))

xgb1 = xgb.sklearn.XGBClassifier(
 learning_rate =0.1,
 n_estimators=50,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective='multi:softmax',
 nthread=4,
 scale_pos_weight=1,
 seed=27)    


X=np.random.normal(size=(200,30))
y=np.random.randint(0,5,200)

modelfit(xgb1, X, y)

The output that I get is

Model Report
Accuracy : 1

Note that I used a much smaller dataset here. With the size that you mentioned, the algorithm may be very slow.

answered Aug 4 '17 at 15:41 by Miriam Farber

  • In TensorFlow we create batches and run them. Can I run this algorithm batch-wise, say 100 records at a time? How can I save this model and train it again? I will accept your answer – Kathiravan Natarajan Aug 5 '17 at 0:21

  • When you train a neural network in TensorFlow you use batch gradient descent, so you can process the data in chunks. xgboost operates differently, so you cannot simply split it into chunks. However, I looked at the xgboost FAQ page, xgboost.readthedocs.io/en/latest/faq.html, and in the section about large data sets they write: "XGBoost is designed to be memory efficient. Usually it can handle problems as long as the data fit into your memory (this usually means millions of instances). If you are running out of memory, check out the external memory version or distributed version of xgboost" (see the sketch after these comments). – Miriam Farber Aug 5 '17 at 8:45

  • Thus, based on the above quote, it seems that you can try to run the code on your computer as it is. You can also set verbose=2 in GridSearchCV so that it prints more details while running. If that does not work, you could try the distributed version; they link to it from the FAQ page (the one I linked in the previous comment). You could also set useTrainCV=False: since you have only one set of parameters, you don't really need the grid search, so you can skip that part of your code (which is currently the heaviest part). – Miriam Farber Aug 5 '17 at 9:00
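A minimal sketch of the external-memory option mentioned in the comment above, for xgboost versions of that era. The file name train.libsvm is purely illustrative; the '#dtrain.cache' suffix is what switches DMatrix to an on-disk cache instead of loading everything into memory:

import xgboost as xgb

# 'train.libsvm' is a placeholder libsvm-format file on disk; appending
# '#dtrain.cache' enables xgboost's external-memory (cache file) mode.
dtrain = xgb.DMatrix('train.libsvm#dtrain.cache')

params = {'objective': 'multi:softmax', 'num_class': 1161, 'eval_metric': 'mlogloss'}
bst = xgb.train(params, dtrain, num_boost_round=50)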


========================================

Score: 2

The error occurs because you are trying to use the AUC evaluation metric for multi-class classification, but AUC is only applicable to two-class problems. In the xgboost implementation, "auc" expects the prediction size to be the same as the label size, while your multi-class prediction size would be 45001*1161. Use either the "mlogloss" or "merror" multi-class metric.
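For example, a minimal sketch of that change applied to the cv call from the question (only the metric arguments change; 'merror' would work the same way):

xgb_param = alg.get_xgb_params()
xgb_param['num_class'] = 1161
xgtrain = xgb.DMatrix(train_data_features, label=train_labels)
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=alg.get_params()['n_estimators'],
                  nfold=cv_folds,
                  metrics='mlogloss',   # multi-class log loss instead of 'auc'
                  early_stopping_rounds=early_stopping_rounds)

# and later, when fitting:
alg.fit(train_data_features, train_labels, eval_metric='mlogloss')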

P.S.: currently, xgboost would be rather slow with so many classes, as there is some inefficiency with predictions caching during training.

answered Aug 3 '17 at 2:59 by Vadim Khotilovich

  • Please check the new error above after changing the evaluation metric – Kathiravan Natarajan Aug 4 '17 at 3:44
