import pandas as pd
import spacy
import re
import html

# Load spaCy's English and Chinese models
try:
    nlp_en = spacy.load('en_core_web_lg')
    print("English model loaded successfully")
except OSError:
    print("English model not installed; run: python -m spacy download en_core_web_lg")
    nlp_en = None

try:
    nlp_zh = spacy.load('zh_core_web_lg')
    print("Chinese model loaded successfully")
except OSError:
    print("Chinese model not installed; run: python -m spacy download zh_core_web_lg")
    nlp_zh = None

# Global cleanup-statistics counters
cleanup_stats = {
    'html_tags': 0,
    'broken_tags': 0,
    'html_entities': 0,
    'xml_namespaces': 0,
    'brackets_removed': 0,
    'fr_marks_removed': 0,
    'rag_llm_marks': 0,
    'reference_texts': 0,
    'stopwords_removed': 0,
    'sentences_processed': 0,
    'mixed_text_segments': 0
}

def reset_cleanup_stats():
    """Reset all cleanup counters to zero."""
    for key in cleanup_stats:
        cleanup_stats[key] = 0

def print_cleanup_stats():
    """
    Print the detailed cleanup statistics report.
    """
    print("\n" + "=" * 60)
    print("Detailed Cleanup Statistics Report")
    print("=" * 60)
    print("\n1. HTML/XML tag cleanup:")
    print(f"   - Complete HTML/XML tags: {cleanup_stats['html_tags']}")
    print(f"   - Broken tags: {cleanup_stats['broken_tags']}")
    print(f"   - HTML entities: {cleanup_stats['html_entities']}")
    print(f"   - XML namespaces: {cleanup_stats['xml_namespaces']}")
    print("\n2. Special marker cleanup:")
    print(f"   - Bracketed test markers: {cleanup_stats['brackets_removed']}")
    print(f"   - Failure-rate markers (F/R): {cleanup_stats['fr_marks_removed']}")
    print(f"   - RAG/LLM markers: {cleanup_stats['rag_llm_marks']}")
    print(f"   - Reference texts: {cleanup_stats['reference_texts']}")
    print("\n3. Text processing statistics:")
    print(f"   - Stopwords removed: {cleanup_stats['stopwords_removed']}")
    print(f"   - Sentences processed: {cleanup_stats['sentences_processed']}")
    print(f"   - Mixed-text segments: {cleanup_stats['mixed_text_segments']}")
    total_cleaned = sum(cleanup_stats.values())
    print(f"\nTotal items cleaned: {total_cleaned}")

def analyze_text_mixed(text):
    """
    Analyze how heavily a text mixes Chinese and English and return
    fine-grained language information.
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return {'dominant_lang': 'en', 'chinese_ratio': 0, 'english_ratio': 0,
                'mixed_level': 'none', 'chinese_count': 0, 'english_count': 0}
    text = str(text)
    # Count Chinese characters
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    chinese_count = len(chinese_chars)
    # Count English letters
    english_chars = re.findall(r'[a-zA-Z]', text)
    english_count = len(english_chars)
    # Ratios are taken over all characters (digits, punctuation, spaces included)
    total_chars = len(text)
    chinese_ratio = chinese_count / max(total_chars, 1)
    english_ratio = english_count / max(total_chars, 1)
    # Decide the dominant language and the degree of mixing
    if chinese_ratio > 0.6:
        dominant_lang = 'zh'
        mixed_level = 'mostly_chinese'
    elif english_ratio > 0.6:
        dominant_lang = 'en'
        mixed_level = 'mostly_english'
    elif chinese_ratio > 0.3 and english_ratio > 0.3:
        dominant_lang = 'zh' if chinese_ratio > english_ratio else 'en'
        mixed_level = 'highly_mixed'
    elif chinese_ratio > 0.1:
        dominant_lang = 'en'  # mostly English with a little Chinese
        mixed_level = 'slightly_mixed'
    else:
        dominant_lang = 'en'
        mixed_level = 'pure_english'
    return {
        'dominant_lang': dominant_lang,
        'chinese_ratio': chinese_ratio,
        'english_ratio': english_ratio,
        'mixed_level': mixed_level,
        'chinese_count': chinese_count,
        'english_count': english_count
    }
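
# Illustrative behaviour (ratios are over all characters, spaces included):
#   analyze_text_mixed('BIOS升级后USB设备无法识别')
#   → chinese_ratio ≈ 0.56, english_ratio ≈ 0.44 → 'highly_mixed', dominant 'zh'
# Note the labels: a fully English string has english_ratio well above 0.6 and
# is tagged 'mostly_english'; 'pure_english' is only reached when letter density
# is low (e.g. strings dominated by digits and punctuation).
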
def count_html_tags_in_text(text):
    """
    Count HTML/XML tags, entities and namespace declarations in a text.
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return {'full_tags': 0, 'broken_tags': 0, 'html_entities': 0, 'xml_namespaces': 0}
    text = str(text)
    # Complete tags
    full_tags = re.findall(r'<[^>]+>', text)
    # Broken tags: an unclosed '<...' at the end, or a leading fragment ending in '>'
    broken_start_tags = re.findall(r'<[^>]*$', text)
    broken_end_tags = re.findall(r'^[^>]*>', text)
    # HTML entities, named and numeric
    html_entities = re.findall(r'&[a-zA-Z]+;', text)
    numeric_entities = re.findall(r'&#\d+;', text)
    # XML namespace declarations
    xml_namespaces = re.findall(r'xmlns:[^=]+="[^"]+"', text)
    xsi_namespaces = re.findall(r'xsi:[^=]+="[^"]+"', text)
    return {
        'full_tags': len(full_tags),
        'broken_tags': len(broken_start_tags) + len(broken_end_tags),
        'html_entities': len(html_entities) + len(numeric_entities),
        'xml_namespaces': len(xml_namespaces) + len(xsi_namespaces)
    }
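
# Illustrative count (note the over-count: complete tags are not stripped before
# the broken-tag patterns run, so the leading '<p>' also matches r'^[^>]*>'):
#   count_html_tags_in_text('<p>ok</p>')
#   → {'full_tags': 2, 'broken_tags': 1, 'html_entities': 0, 'xml_namespaces': 0}
# clean_html_xml_tags below avoids this by removing complete tags first.
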
def clean_html_xml_tags(text):
    """
    Remove complete and broken HTML/XML tags (works for Chinese and English
    text alike) and update the cleanup statistics.
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    # Unescape entities first, so that escaped markup (e.g. '&lt;b&gt;')
    # becomes a real tag and is removed by the tag patterns below
    text = html.unescape(str(text))
    # Remove complete HTML/XML tags
    text, full_tags_count = re.subn(r'<[^>]+>', '', text)
    cleanup_stats['html_tags'] += full_tags_count
    # Remove broken tags: an unclosed '<...' at the end, or a leading fragment ending in '>'
    text, broken_start_count = re.subn(r'<[^>]*$', '', text)
    text, broken_end_count = re.subn(r'^[^>]*>', '', text)
    cleanup_stats['broken_tags'] += broken_start_count + broken_end_count
    # Remove any remaining HTML entities (e.g. double-escaped ones)
    text, html_entities_count = re.subn(r'&[a-zA-Z]+;', '', text)
    text, numeric_entities_count = re.subn(r'&#\d+;', '', text)
    cleanup_stats['html_entities'] += html_entities_count + numeric_entities_count
    # Remove XML namespace declarations
    text, xmlns_count = re.subn(r'xmlns:[^=]+="[^"]+"', '', text)
    text, xsi_count = re.subn(r'xsi:[^=]+="[^"]+"', '', text)
    cleanup_stats['xml_namespaces'] += xmlns_count + xsi_count
    return text.strip()
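
# Illustrative pass (complete tags first, then the unclosed '<div' fragment):
#   clean_html_xml_tags('<p>BSOD &amp; reboot</p><div')
#   → 'BSOD & reboot'   (html_tags += 2, broken_tags += 1)
# The bare '&' survives because html.unescape has already run and the entity
# patterns only match well-formed '&name;' sequences.
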
def clean_mixed_text(text):
    """
    Clean mixed Chinese/English text, dispatching on the dominant language.
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    text = str(text)
    lang_info = analyze_text_mixed(text)
    # Count sentences (split on Chinese and English terminators)
    sentences = re.split(r'[。\.!!??;;]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)
    # Pick a processing strategy based on the degree of mixing
    if lang_info['mixed_level'] in ['pure_english', 'mostly_english']:
        # Predominantly English: use the English pipeline
        if nlp_en is not None:
            doc = nlp_en(text)
            return clean_english_text(doc)
        else:
            return basic_text_clean(text)
    elif lang_info['mixed_level'] in ['mostly_chinese']:
        # Predominantly Chinese: use the Chinese pipeline
        if nlp_zh is not None:
            doc = nlp_zh(text)
            return clean_chinese_text(doc)
        else:
            return basic_text_clean(text)
    else:
        # Highly or slightly mixed: use the mixed-text strategy
        return process_highly_mixed_text(text, lang_info)

def process_highly_mixed_text(text, lang_info):
    """
    Process heavily mixed Chinese/English text sentence by sentence.
    """
    # Strategy 1: split into sentences and route each by its own language
    sentences = re.split(r'[。\.!!??;;]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)
    cleaned_sentences = []
    for sentence in sentences:
        if not sentence:
            continue
        sent_lang_info = analyze_text_mixed(sentence)
        if sent_lang_info['mixed_level'] in ['pure_english', 'mostly_english']:
            # English sentence: English pipeline
            if nlp_en is not None:
                doc = nlp_en(sentence)
                cleaned = clean_english_text(doc)
            else:
                cleaned = basic_text_clean(sentence)
        elif sent_lang_info['mixed_level'] in ['mostly_chinese']:
            # Chinese sentence: Chinese pipeline
            if nlp_zh is not None:
                doc = nlp_zh(sentence)
                cleaned = clean_chinese_text(doc)
            else:
                cleaned = basic_text_clean(sentence)
        else:
            # Mixed sentence: segment-level handling
            cleaned = process_mixed_sentence(sentence)
        if cleaned:
            cleaned_sentences.append(cleaned)
    # Choose the joining separator from the dominant language of the whole text
    separator = '。' if lang_info['dominant_lang'] == 'zh' else '. '
    return separator.join(cleaned_sentences)
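
# Illustrative routing for a highly mixed input:
#   process_highly_mixed_text('系统出现蓝屏问题。Update the BIOS.', lang_info)
# splits into ['系统出现蓝屏问题', 'Update the BIOS']; the first sentence is
# mostly_chinese and goes through the zh pipeline, the second is mostly_english
# and goes through the en pipeline; the results are joined with '. ' because the
# whole string's dominant language works out to 'en' (english_ratio ≈ 0.52).
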
def process_mixed_sentence(sentence):
    """
    Process a single mixed-language sentence.
    """
    # Split the sentence into runs of Chinese characters and runs of
    # ASCII letters/digits (any other characters are dropped)
    segments = re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z0-9\s\.\-]+', sentence)
    cleanup_stats['mixed_text_segments'] += len(segments)
    cleaned_segments = []
    for segment in segments:
        if not segment.strip():
            continue
        # Route each segment by its script
        if re.search(r'[\u4e00-\u9fff]', segment):
            # Chinese segment
            if nlp_zh is not None:
                doc = nlp_zh(segment)
                cleaned = clean_chinese_text(doc)
            else:
                cleaned = basic_text_clean(segment)
        else:
            # English segment
            if nlp_en is not None:
                doc = nlp_en(segment)
                cleaned = clean_english_text(doc)
            else:
                cleaned = basic_text_clean(segment)
        cleaned_segments.append(cleaned)
    return ' '.join(cleaned_segments)
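
# Illustrative segmentation before any cleaning:
#   re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z0-9\s\.\-]+', '更新BIOS后出现black screen')
#   → ['更新', 'BIOS', '后出现', 'black screen']
# Each run is then cleaned by the pipeline matching its script.
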
def clean_chinese_text(doc):
    """
    Clean Chinese text (enhanced: preserves important English technical terms).
    """
    cleaned_tokens = []
    original_token_count = len(doc)
    for token in doc:
        # Keep Chinese characters, digits and letters; drop whitespace tokens
        if (token.is_alpha or token.is_digit) and not token.is_space:
            if len(token.text.strip()) > 0:
                # For English tokens, check whether they are technical acronyms
                if re.match(r'^[a-zA-Z]+$', token.text):
                    # Keep well-known technical acronyms in upper case
                    if token.text.upper() in ['BSOD', 'USB', 'TBT', 'CPU', 'GPU', 'LED', 'LCD', 'BIOS', 'OS']:
                        cleaned_tokens.append(token.text.upper())
                    elif len(token.text) > 1:  # ordinary English word: lower-case it
                        cleaned_tokens.append(token.text.lower())
                    else:  # single letter, possibly part of an abbreviation
                        cleaned_tokens.append(token.text)
                else:
                    # Chinese token: keep as-is
                    cleaned_tokens.append(token.text)
    cleaned_text = ' '.join(cleaned_tokens)
    # Strip special characters but keep Chinese and English punctuation
    cleaned_text = re.sub(r'[^\w\s\u4e00-\u9fff。,;:!?()《》\.\-]', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    # Count dropped tokens (stopwords, punctuation, whitespace) as removed
    cleanup_stats['stopwords_removed'] += (original_token_count - len(cleaned_tokens))
    return cleaned_text

def clean_english_text(doc):
    """
    Clean English text (enhanced: preserves Chinese content).
    """
    cleaned_tokens = []
    original_token_count = len(doc)
    for token in doc:
        if (token.is_alpha or token.is_digit) and not token.is_space:
            # Chinese tokens pass through untouched
            if re.search(r'[\u4e00-\u9fff]', token.text):
                cleaned_tokens.append(token.text)
            else:
                # English tokens: check acronyms before the general branch,
                # so e.g. 'OS' is kept rather than lower-cased to 'os'
                if token.text.upper() in ['BSOD', 'USB', 'TBT', 'CPU', 'GPU', 'LED', 'LCD', 'BIOS', 'OS']:
                    cleaned_tokens.append(token.text.upper())
                elif not token.is_stop and len(token.text) > 1:
                    cleaned_tokens.append(token.lemma_.lower())
                elif len(token.text) == 1 and token.text.isupper():
                    # single upper-case letter, possibly an abbreviation
                    cleaned_tokens.append(token.text)
    cleaned_text = ' '.join(cleaned_tokens)
    # Strip special characters but keep Chinese characters
    cleaned_text = re.sub(r'[^\w\s\u4e00-\u9fff\.\-]', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    # Count dropped tokens (stopwords, punctuation, whitespace) as removed
    cleanup_stats['stopwords_removed'] += (original_token_count - len(cleaned_tokens))
    return cleaned_text
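
# Roughly, with en_core_web_lg loaded (lemmas and stopwords depend on the model):
#   clean_english_text(nlp_en('The USB device was not recognized'))
#   → 'USB device recognize'
# 'The', 'was' and 'not' are stopwords, 'USB' hits the acronym list, and
# 'recognized' is reduced to its lemma.
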
def clean_text_with_spacy(text):
    """
    Main entry point: clean and normalize mixed Chinese/English text with spaCy.
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    # Strip HTML/XML tags first
    text = clean_html_xml_tags(text)
    if text == '':
        return text
    # Then run the mixed-language pipeline
    return clean_mixed_text(text)

def basic_text_clean(text):
    """
    Basic text cleanup (fallback when spaCy is unavailable).
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    text = str(text)
    # Drop special characters; keep Chinese, English, digits and basic punctuation
    text = re.sub(r'[^\w\s\u4e00-\u9fff。,;:!?()《》\.\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text
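
# Illustrative fallback behaviour (no model needed):
#   basic_text_clean('Error: 蓝屏@#$ occurred!!')
#   → 'Error 蓝屏 occurred'
# The ASCII ':' and '!' are dropped because only their full-width
# counterparts (:!) are in the keep-list.
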
def clean_subject_field(text):
    """
    Clean the subject field specifically (with enhanced mixed-text support).
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    text = str(text)
    # Count and remove bracketed test markers (ASCII and full-width brackets)
    bracket_matches = re.findall(r'[\[【][^\]】]+[\]】]', text)
    cleanup_stats['brackets_removed'] += len(bracket_matches)
    text = re.sub(r'[\[【][^\]】]+[\]】]', '', text)
    # Count and remove failure-rate markers
    fr_matches = re.findall(r'[\((]F/R:[^\))]+[\))]', text)
    fr_matches += re.findall(r'FR:[^\))]+[\))]', text)
    cleanup_stats['fr_marks_removed'] += len(fr_matches)
    text = re.sub(r'[\((]F/R:[^\))]+[\))]', '', text)
    text = re.sub(r'FR:[^\))]+[\))]', '', text)
    # Apply the generic HTML/XML cleanup
    text = clean_html_xml_tags(text)
    # Chinese and English problem keywords
    problem_keywords = ['fail', 'error', 'cannot', 'no', 'not work', 'bsod', 'black screen',
                        '失败', '错误', '无法', '不工作', '蓝屏', '黑屏', '问题', '故障']
    # Extract the core problem description
    if any(keyword in text.lower() for keyword in problem_keywords):
        # Split on Chinese and English sentence terminators
        sentences = re.split(r'[。\.!!??;;]', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        cleanup_stats['sentences_processed'] += len(sentences)
        if sentences:
            # Prefer sentences that actually describe the problem
            problem_sentences = []
            for sentence in sentences:
                if any(keyword in sentence.lower() for keyword in problem_keywords):
                    problem_sentences.append(sentence)
            if problem_sentences:
                # Join with a separator matching the dominant language
                lang_info = analyze_text_mixed(text)
                separator = '。' if lang_info['dominant_lang'] == 'zh' else '. '
                text = separator.join(problem_sentences)
            else:
                text = sentences[0]
    return clean_text_with_spacy(text)
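
# Illustrative marker stripping, before the spaCy pass:
#   '[TEST] USB fail (F/R: 2/30)' → '[TEST]' and '(F/R: 2/30)' are removed,
#   'fail' matches the problem keywords, so 'USB fail' is kept as the core
#   problem description and then handed to clean_text_with_spacy.
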
def clean_reason_field(text):
    """
    Clean the Reason field specifically (Chinese and English supported).
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text
    text = str(text)
    # Count and remove RAG/LLM markers (ASCII and full-width brackets)
    rag_matches = re.findall(r'[\[【]RAG[\]】]:\s*', text)
    llm_matches = re.findall(r'[\[【]LLM[\]】]:\s*', text)
    cleanup_stats['rag_llm_marks'] += len(rag_matches) + len(llm_matches)
    text = re.sub(r'[\[【]RAG[\]】]:\s*', '', text)
    text = re.sub(r'[\[【]LLM[\]】]:\s*', '', text)
    # Count and remove boilerplate reference texts
    ref_patterns = [
        (r'This decision was made with reference to several relevant defects including:', 'reference_texts'),
        (r'该决定参考了以下相关缺陷:', 'reference_texts'),
        (r'参考相关缺陷:', 'reference_texts')
    ]
    for pattern, stat_key in ref_patterns:
        matches = re.findall(pattern, text)
        if matches:
            cleanup_stats[stat_key] += len(matches)
            text = re.sub(pattern, '', text)
    # Split on Chinese and English sentence terminators
    sentences = re.split(r'[。\.!!??]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)
    if sentences:
        # Chinese and English technical keywords; English entries are kept
        # all-lowercase so the comparison against sentence.lower() can match
        tech_keywords = ['similar', 'match', 'category', 'driver', 'bios', 'ee', 'thermal',
                         '类似', '匹配', '类别', '驱动', '散热', '相关']
        # Keep the first sentence plus any sentence with a technical keyword
        important_sentences = [sentences[0]]
        for sentence in sentences[1:]:
            if any(keyword in sentence.lower() for keyword in tech_keywords):
                important_sentences.append(sentence)
        # Join with a separator matching the dominant language
        lang = analyze_text_mixed(text)['dominant_lang']
        separator = '。' if lang == 'zh' else '. '
        text = separator.join(important_sentences)
    return clean_text_with_spacy(text)
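
# Illustrative pass, before the spaCy stage:
#   '[RAG]: Similar to defect 123. Driver issue.'
#   → marker removed; sentence 1 is always kept and sentence 2 matches the
#     'driver' keyword, giving 'Similar to defect 123. Driver issue'.
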
def parse_csv_data(csv_content):
    """
    Parse CSV-formatted data into a DataFrame.
    """
    lines = csv_content.strip().split('\n')
    # Parse the header row
    headers_line = lines[0]
    # Detect the header format
    if 'DefectId' in headers_line:
        headers = ['DefectId', 'SendDate', 'ReceiveDate', 'VendorName', 'ProductName',
                   'subject', 'status_name', 'Category', 'FeedbackCategory', 'Reason', 'Match', 'LLM']
    else:
        headers = headers_line.split(',')
    # Parse the data rows
    data_rows = []
    for line in lines[1:]:
        if not line.strip():
            continue
        # Handle fields containing quotes and embedded commas
        row = []
        in_quotes = False
        field = ''
        for char in line:
            if char == '"':
                in_quotes = not in_quotes
            elif char == ',' and not in_quotes:
                row.append(field.strip())
                field = ''
            else:
                field += char
        row.append(field.strip())
        # Pad or trim each row to the header width
        while len(row) < len(headers):
            row.append('')
        while len(row) > len(headers):
            row.pop()
        data_rows.append(row)
    return pd.DataFrame(data_rows, columns=headers)
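
# Note: the hand-rolled parser above strips quote characters but does not handle
# escaped quotes ("") or newlines inside quoted fields. The standard library's
# csv module does; a minimal sketch of the same parse using it (keeping the
# header-detection and row-padding logic) would be:
#
#   import csv, io
#   rows = [r for r in csv.reader(io.StringIO(csv_content)) if any(f.strip() for f in r)]
#   df = pd.DataFrame(rows[1:], columns=rows[0])
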
def clean_data_with_spacy(csv_content):
    """
    Main cleaning pipeline.
    """
    print("Parsing CSV data...")
    df = parse_csv_data(csv_content)
    print(f"Original data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    # Check spaCy model availability
    if nlp_en is None and nlp_zh is None:
        print("Warning: no spaCy models found; falling back to basic text cleanup")
    # Reset the cleanup statistics
    reset_cleanup_stats()
    # Data-cleaning pipeline
    print("\nStarting data cleanup...")
    # 1. Clean the text columns
    text_columns = ['VendorName', 'ProductName', 'subject', 'status_name',
                    'Category', 'FeedbackCategory', 'Reason']
    for col in text_columns:
        if col in df.columns:
            print(f"Cleaning column: {col}")
            if col == 'subject':
                df[col] = df[col].apply(clean_subject_field)
            elif col == 'Reason':
                df[col] = df[col].apply(clean_reason_field)
            else:
                df[col] = df[col].apply(clean_text_with_spacy)
    # 2. Clean the date columns
    date_columns = ['SendDate', 'ReceiveDate']
    for col in date_columns:
        if col in df.columns:
            print(f"Normalizing date column: {col}")
            # Coerce to datetime (unparseable values become NaT),
            # then render in a uniform string format
            df[col] = pd.to_datetime(df[col], errors='coerce')
            df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')
    # 3. Clean the numeric columns
    numeric_columns = ['DefectId', 'Match', 'LLM']
    for col in numeric_columns:
        if col in df.columns:
            print(f"Cleaning numeric column: {col}")
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)
    return df

def clean_csv_file(file_path):
    """
    Top-level function: clean a CSV file on disk.
    """
    try:
        # Try several encodings when reading the file
        encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1', 'utf-8-sig']
        content = None
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
                print(f"File read successfully with encoding: {encoding}")
                break
            except UnicodeDecodeError:
                continue
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                return None
        if content is None:
            print("Could not read the file with any of the common encodings!")
            return None
        cleaned_df = clean_data_with_spacy(content)
        # Print the detailed cleanup statistics
        print_cleanup_stats()
        # Build the output path
        if file_path.endswith('.csv'):
            output_path = file_path.replace('.csv', '_cleaned.csv')
        else:
            output_path = file_path + '_cleaned.csv'
        # Save the cleaned data
        cleaned_df.to_csv(output_path, index=False, encoding='utf-8-sig')
        print("\n" + "=" * 60)
        print(f"File cleanup complete! Saved as: {output_path}")
        print(f"Cleaned data shape: {cleaned_df.shape}")
        print("=" * 60)
        # Show a before/after comparison
        print("\nCleanup comparison:")
        original_lines = content.strip().split('\n')
        print(f"Original line count: {len(original_lines)}")
        print(f"Cleaned row count: {len(cleaned_df)}")
        # Compute the character-reduction ratio (guard against empty input)
        original_chars = sum(len(line) for line in original_lines)
        cleaned_chars = sum(cleaned_df[col].astype(str).str.len().sum()
                            for col in cleaned_df.columns if cleaned_df[col].dtype == 'object')
        reduction_ratio = (original_chars - cleaned_chars) / max(original_chars, 1) * 100
        print(f"Character count reduced by: {reduction_ratio:.1f}%")
        return cleaned_df
    except Exception as e:
        print(f"File processing error: {e}")
        import traceback
        traceback.print_exc()
        return None

if __name__ == "__main__":
    file_path = "./csv/AutoAssign.csv"
    # Check that the file exists
    import os
    if os.path.exists(file_path):
        print(f"Processing file: {file_path}")
        result_df = clean_csv_file(file_path)
        if result_df is not None:
            print("\nProcessing succeeded!")
        else:
            print("Processing failed!")
    else:
        print(f"File does not exist: {file_path}")
        print("Please check that the file path is correct.")