Python使用spaCy对CSV文件数据清洗的一个示例

原创于 2025-12-08 10:18:06 发布 · 21 阅读
0 ·
CC 4.0 BY-SA版权
文章标签：
#python #人工智能 #开发语言
Python 专栏收录该内容
11 篇文章
订阅专栏
import pandas as pd
import spacy
import re
import html


# 加载spaCy的英文模型和中文模型
try:
    nlp_en = spacy.load('en_core_web_lg')
    print("英文模型加载成功")
except OSError:
    print("英文模型未安装，请运行: python -m spacy download en_core_web_lg")
    nlp_en = None

try:
    nlp_zh = spacy.load('zh_core_web_lg')
    print("中文模型加载成功")
except OSError:
    print("中文模型未安装，请运行: python -m spacy download zh_core_web_lg")
    nlp_zh = None


# 全局统计变量
cleanup_stats = {
    'html_tags': 0,
    'xml_tags': 0,
    'broken_tags': 0,
    'html_entities': 0,
    'xml_namespaces': 0,
    'brackets_removed': 0,
    'fr_marks_removed': 0,
    'rag_llm_marks': 0,
    'reference_texts': 0,
    'stopwords_removed': 0,
    'sentences_processed': 0,
    'mixed_text_segments': 0
}


def reset_cleanup_stats():
    """重置清理统计"""
    for key in cleanup_stats:
        cleanup_stats[key] = 0


def print_cleanup_stats():
    """
    打印清理统计信息
    """
    print("\n" + "=" * 60)
    print("详细清理统计报告")
    print("=" * 60)

    print(f"\n1. HTML/XML标签清理:")
    print(f"   - 完整HTML/XML标签: {cleanup_stats['html_tags']} 个")
    print(f"   - 残缺标签: {cleanup_stats['broken_tags']} 个")
    print(f"   - HTML实体: {cleanup_stats['html_entities']} 个")
    print(f"   - XML命名空间: {cleanup_stats['xml_namespaces']} 个")

    print(f"\n2. 特殊标记清理:")
    print(f"   - 方括号测试标记: {cleanup_stats['brackets_removed']} 个")
    print(f"   - 故障率标记(F/R): {cleanup_stats['fr_marks_removed']} 个")
    print(f"   - RAG/LLM标记: {cleanup_stats['rag_llm_marks']} 个")
    print(f"   - 引用文本: {cleanup_stats['reference_texts']} 个")

    print(f"\n3. 文本处理统计:")
    print(f"   - 停用词移除: {cleanup_stats['stopwords_removed']} 个")
    print(f"   - 句子处理: {cleanup_stats['sentences_processed']} 个")
    print(f"   - 混合文本分段: {cleanup_stats['mixed_text_segments']} 个")

    total_cleaned = sum(cleanup_stats.values())
    print(f"\n总计清理项目: {total_cleaned} 个")


def analyze_text_mixed(text):
    """
    分析文本的中英文混合程度，返回更细致的语言信息
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return {'dominant_lang': 'en', 'chinese_ratio': 0, 'mixed_level': 'none'}

    text = str(text)

    # 统计中文字符
    chinese_chars = re.findall(r'[\u4e00-\u9fff]', text)
    chinese_count = len(chinese_chars)

    # 统计英文字母
    english_chars = re.findall(r'[a-zA-Z]', text)
    english_count = len(english_chars)

    # 统计数字和其他字符
    total_chars = len(text)

    chinese_ratio = chinese_count / max(total_chars, 1)
    english_ratio = english_count / max(total_chars, 1)

    # 判断主导语言和混合程度
    if chinese_ratio > 0.6:
        dominant_lang = 'zh'
        mixed_level = 'mostly_chinese'
    elif english_ratio > 0.6:
        dominant_lang = 'en'
        mixed_level = 'mostly_english'
    elif chinese_ratio > 0.3 and english_ratio > 0.3:
        dominant_lang = 'zh' if chinese_ratio > english_ratio else 'en'
        mixed_level = 'highly_mixed'
    elif chinese_ratio > 0.1:
        dominant_lang = 'en'  # 英文为主，包含少量中文
        mixed_level = 'slightly_mixed'
    else:
        dominant_lang = 'en'
        mixed_level = 'pure_english'

    return {
        'dominant_lang': dominant_lang,
        'chinese_ratio': chinese_ratio,
        'english_ratio': english_ratio,
        'mixed_level': mixed_level,
        'chinese_count': chinese_count,
        'english_count': english_count
    }


def count_html_tags_in_text(text):
    """
    统计文本中的HTML/XML标签
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return 0

    text = str(text)

    # 统计完整标签
    full_tags = re.findall(r'<[^>]+>', text)

    # 统计残缺标签
    broken_start_tags = re.findall(r'<[^>]*$', text)
    broken_end_tags = re.findall(r'^[^>]*>', text)

    # 统计HTML实体
    html_entities = re.findall(r'&[a-z]+;', text)
    numeric_entities = re.findall(r'&#\d+;', text)

    # 统计XML命名空间
    xml_namespaces = re.findall(r'xmlns:[^=]+="[^"]+"', text)
    xsi_namespaces = re.findall(r'xsi:[^=]+="[^"]+"', text)

    return {
        'full_tags': len(full_tags),
        'broken_tags': len(broken_start_tags) + len(broken_end_tags),
        'html_entities': len(html_entities) + len(numeric_entities),
        'xml_namespaces': len(xml_namespaces) + len(xsi_namespaces)
    }


def clean_html_xml_tags(text):
    """
    删除残缺的HTML和XML标签（支持中英文）并统计
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    original_text = str(text)
    text = html.unescape(original_text)

    # 统计清理前的标签数量
    tag_stats_before = count_html_tags_in_text(text)

    # 记录原始长度用于统计
    original_len = len(text)

    # 删除完整的HTML/XML标签
    text, full_tags_count = re.subn(r'<[^>]+>', '', text)
    cleanup_stats['html_tags'] += full_tags_count

    # 删除残缺的标签
    text, broken_start_count = re.subn(r'<[^>]*$', '', text)
    text, broken_end_count = re.subn(r'^[^>]*>', '', text)
    cleanup_stats['broken_tags'] += broken_start_count + broken_end_count

    # 删除HTML特殊字符
    text, html_entities_count = re.subn(r'&[a-z]+;', '', text)
    text, numeric_entities_count = re.subn(r'&#\d+;', '', text)
    cleanup_stats['html_entities'] += html_entities_count + numeric_entities_count

    # 删除XML命名空间
    text, xmlns_count = re.subn(r'xmlns:[^=]+="[^"]+"', '', text)
    text, xsi_count = re.subn(r'xsi:[^=]+="[^"]+"', '', text)
    cleanup_stats['xml_namespaces'] += xmlns_count + xsi_count

    # 记录清理长度变化
    cleaned_len = len(text)
    chars_removed = original_len - cleaned_len

    return text.strip()


def clean_mixed_text(text):
    """
    专门处理中英文混合文本
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    text = str(text)
    lang_info = analyze_text_mixed(text)

    # 统计句子数量
    sentences = re.split(r'[。\.!！?？;；]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)

    # 根据混合程度选择处理策略
    if lang_info['mixed_level'] in ['pure_english', 'mostly_english']:
        # 以英文为主，使用英文处理
        if nlp_en is not None:
            doc = nlp_en(text)
            return clean_english_text(doc)
        else:
            return basic_text_clean(text)

    elif lang_info['mixed_level'] in ['mostly_chinese']:
        # 以中文为主，使用中文处理
        if nlp_zh is not None:
            doc = nlp_zh(text)
            return clean_chinese_text(doc)
        else:
            return basic_text_clean(text)

    else:
        # 高度混合或轻微混合，使用混合处理策略
        return process_highly_mixed_text(text, lang_info)


def process_highly_mixed_text(text, lang_info):
    """
    处理高度中英文混合的文本
    """
    # 策略1: 分句处理
    sentences = re.split(r'[。\.!！?？;；]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)

    cleaned_sentences = []

    for sentence in sentences:
        if not sentence:
            continue

        sent_lang_info = analyze_text_mixed(sentence)

        if sent_lang_info['mixed_level'] in ['pure_english', 'mostly_english']:
            # 英文句子用英文处理
            if nlp_en is not None:
                doc = nlp_en(sentence)
                cleaned = clean_english_text(doc)
            else:
                cleaned = basic_text_clean(sentence)

        elif sent_lang_info['mixed_level'] in ['mostly_chinese']:
            # 中文句子用中文处理
            if nlp_zh is not None:
                doc = nlp_zh(sentence)
                cleaned = clean_chinese_text(doc)
            else:
                cleaned = basic_text_clean(sentence)

        else:
            # 混合句子使用特殊处理
            cleaned = process_mixed_sentence(sentence)

        if cleaned:
            cleaned_sentences.append(cleaned)

    # 根据主导语言选择连接符
    separator = '。' if lang_info['dominant_lang'] == 'zh' else '. '
    return separator.join(cleaned_sentences)


def process_mixed_sentence(sentence):
    """
    处理单个混合句子
    """
    # 使用正则表达式分割中英文部分
    # 匹配中文字符段和英文字符段
    segments = re.findall(r'[\u4e00-\u9fff]+|[a-zA-Z0-9\s\.\-]+', sentence)
    cleanup_stats['mixed_text_segments'] += len(segments)

    cleaned_segments = []

    for segment in segments:
        if not segment.strip():
            continue

        # 判断段落的语言特性
        if re.search(r'[\u4e00-\u9fff]', segment):
            # 中文字符段
            if nlp_zh is not None:
                doc = nlp_zh(segment)
                cleaned = clean_chinese_text(doc)
            else:
                cleaned = basic_text_clean(segment)
        else:
            # 英文字符段
            if nlp_en is not None:
                doc = nlp_en(segment)
                cleaned = clean_english_text(doc)
            else:
                cleaned = basic_text_clean(segment)

        cleaned_segments.append(cleaned)

    return ' '.join(cleaned_segments)


def clean_chinese_text(doc):
    """
    中文文本清洗处理 - 增强版，保留重要英文术语
    """
    cleaned_tokens = []
    original_token_count = len([token for token in doc])

    for token in doc:
        # 中文处理：保留中文字符、数字、字母
        if (token.is_alpha or token.is_digit) and not token.is_space:
            if len(token.text.strip()) > 0:
                # 对于英文术语，检查是否是技术缩写
                if re.match(r'^[a-zA-Z]+$', token.text):
                    # 如果是常见技术缩写，保留原样
                    if token.text.upper() in ['BSOD', 'USB', 'TBT', 'CPU', 'GPU', 'LED', 'LCD', 'BIOS', 'OS']:
                        cleaned_tokens.append(token.text.upper())
                    elif len(token.text) > 1:  # 普通英文单词，转为小写
                        cleaned_tokens.append(token.text.lower())
                    else:  # 单个英文字母，可能是缩写的一部分
                        cleaned_tokens.append(token.text)
                else:
                    # 中文字符，直接保留
                    cleaned_tokens.append(token.text)

    cleaned_text = ' '.join(cleaned_tokens)

    # 清理特殊字符，保留中英文标点
    cleaned_text = re.sub(r'[^\w\s\u4e00-\u9fff。，；：！？（）《》\.\-]', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # 统计停用词移除
    cleanup_stats['stopwords_removed'] += (original_token_count - len(cleaned_tokens))

    return cleaned_text


def clean_english_text(doc):
    """
    英文文本清洗处理 - 增强版，保留中文内容
    """
    cleaned_tokens = []
    original_token_count = len([token for token in doc])

    for token in doc:
        if (token.is_alpha or token.is_digit) and not token.is_space:
            # 检查是否是中文字符
            if re.search(r'[\u4e00-\u9fff]', token.text):
                # 中文字符，直接保留
                cleaned_tokens.append(token.text)
            else:
                # 英文字符处理
                if not token.is_stop and len(token.text) > 1:
                    cleaned_tokens.append(token.lemma_.lower())
                elif token.text.upper() in ['BSOD', 'USB', 'TBT', 'CPU', 'GPU', 'LED', 'LCD', 'BIOS', 'OS']:
                    cleaned_tokens.append(token.text)
                elif len(token.text) == 1 and token.text.isupper():
                    # 单个大写字母，可能是缩写
                    cleaned_tokens.append(token.text)

    cleaned_text = ' '.join(cleaned_tokens)

    # 清理特殊字符，保留中文字符
    cleaned_text = re.sub(r'[^\w\s\u4e00-\u9fff\.\-]', ' ', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    # 统计停用词移除
    cleanup_stats['stopwords_removed'] += (original_token_count - len(cleaned_tokens))

    return cleaned_text


def clean_text_with_spacy(text):
    """
    使用spaCy进行中英文混合文本清洗和标准化 - 主入口函数
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    # 先删除HTML/XML标签
    text = clean_html_xml_tags(text)

    if text == '':
        return text

    # 使用混合文本处理
    return clean_mixed_text(text)


def basic_text_clean(text):
    """
    基础文本清理（当spaCy不可用时使用）
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    text = str(text)

    # 基本清理：移除特殊字符，保留中英文、数字、基本标点
    text = re.sub(r'[^\w\s\u4e00-\u9fff。，；：！？（）《》\.\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def clean_subject_field(text):
    """
    专门处理subject字段的清洗（增强混合文本支持）
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    text = str(text)

    # 统计方括号标记
    bracket_matches = re.findall(r'[\[【][^\]】]+[\]】]', text)
    cleanup_stats['brackets_removed'] += len(bracket_matches)

    # 删除方括号内的测试标记（支持中英文方括号）
    text = re.sub(r'[\[【][^\]】]+[\]】]', '', text)

    # 统计故障率标记
    fr_matches = re.findall(r'[\(（]F/R:[^\)）]+[\)）]', text)
    fr_matches += re.findall(r'FR:[^\)）]+[\)）]', text)
    cleanup_stats['fr_marks_removed'] += len(fr_matches)

    # 删除故障率标记
    text = re.sub(r'[\(（]F/R:[^\)）]+[\)）]', '', text)
    text = re.sub(r'FR:[^\)）]+[\)）]', '', text)

    # 应用通用的HTML/XML清理
    text = clean_html_xml_tags(text)

    # 中英文问题关键词
    problem_keywords = ['fail', 'error', 'cannot', 'no', 'not work', 'bsod', 'black screen',
                        '失败', '错误', '无法', '不工作', '蓝屏', '黑屏', '问题', '故障']

    # 提取核心问题描述
    if any(keyword in text.lower() for keyword in problem_keywords):
        # 中英文分句
        sentences = re.split(r'[。\.!！?？;；]', text)
        sentences = [s.strip() for s in sentences if s.strip()]
        cleanup_stats['sentences_processed'] += len(sentences)

        if sentences:
            # 优先保留包含问题描述的句子
            problem_sentences = []
            for sentence in sentences:
                if any(keyword in sentence.lower() for keyword in problem_keywords):
                    problem_sentences.append(sentence)

            if problem_sentences:
                # 根据主导语言选择分隔符
                lang_info = analyze_text_mixed(text)
                separator = '。' if lang_info['dominant_lang'] == 'zh' else '. '
                text = separator.join(problem_sentences)
            else:
                text = sentences[0]

    return clean_text_with_spacy(text)


def clean_reason_field(text):
    """
    专门处理Reason字段的清洗（中英文支持）
    """
    if pd.isna(text) or text == '' or not isinstance(text, str):
        return text

    text = str(text)

    # 统计RAG/LLM标记
    rag_matches = re.findall(r'[\[【]RAG[\]】]:\s*', text)
    llm_matches = re.findall(r'[\[【]LLM[\]】]:\s*', text)
    cleanup_stats['rag_llm_marks'] += len(rag_matches) + len(llm_matches)

    # 删除RAG/LLM标记（支持中英文括号）
    text = re.sub(r'[\[【]RAG[\]】]:\s*', '', text)
    text = re.sub(r'[\[【]LLM[\]】]:\s*', '', text)

    # 统计引用文本
    ref_patterns = [
        (r'This decision was made with reference to several relevant defects including:', 'reference_texts'),
        (r'该决定参考了以下相关缺陷：', 'reference_texts'),
        (r'参考相关缺陷：', 'reference_texts')
    ]

    for pattern, stat_key in ref_patterns:
        matches = re.findall(pattern, text)
        if matches:
            cleanup_stats[stat_key] += len(matches)
            text = re.sub(pattern, '', text)

    # 中英文分句
    sentences = re.split(r'[。\.!！?？]', text)
    sentences = [s.strip() for s in sentences if s.strip()]
    cleanup_stats['sentences_processed'] += len(sentences)

    if sentences:
        # 中英文技术关键词
        tech_keywords = ['similar', 'match', 'category', 'driver', 'bios', 'ee', 'thermal',
                         '类似', '匹配', '类别', '驱动', 'BIOS', 'EE', '散热', '相关']

        # 保留第一个句子和包含技术描述的句子
        important_sentences = [sentences[0]]
        for sentence in sentences[1:]:
            if any(keyword in sentence.lower() for keyword in tech_keywords):
                important_sentences.append(sentence)

        # 根据语言选择分隔符
        lang = analyze_text_mixed(text)['dominant_lang']
        separator = '。' if lang == 'zh' else '. '
        text = separator.join(important_sentences)

    return clean_text_with_spacy(text)


def parse_csv_data(csv_content):
    """
    解析CSV格式的数据
    """
    lines = csv_content.strip().split('\n')

    # 解析表头
    headers_line = lines[0]
    # 检测表头格式
    if 'DefectId' in headers_line:
        headers = ['DefectId', 'SendDate', 'ReceiveDate', 'VendorName', 'ProductName',
                   'subject', 'status_name', 'Category', 'FeedbackCategory', 'Reason', 'Match', 'LLM']
    else:
        headers = headers_line.split(',')

    # 解析数据行
    data_rows = []
    for line in lines[1:]:
        if not line.strip():
            continue

        # 处理包含引号和逗号的字段
        row = []
        in_quotes = False
        field = ''

        for char in line:
            if char == '"':
                in_quotes = not in_quotes
            elif char == ',' and not in_quotes:
                row.append(field.strip())
                field = ''
            else:
                field += char
        row.append(field.strip())

        # 确保每行有足够的列
        while len(row) < len(headers):
            row.append('')
        while len(row) > len(headers):
            row.pop()

        data_rows.append(row)

    return pd.DataFrame(data_rows, columns=headers)


def clean_data_with_spacy(csv_content):
    """
    主清洗函数
    """
    print("开始解析CSV数据...")
    df = parse_csv_data(csv_content)

    print(f"原始数据形状: {df.shape}")
    print(f"列名: {list(df.columns)}")

    # 检查spaCy模型可用性
    if nlp_en is None and nlp_zh is None:
        print("警告: 未找到spaCy模型，将使用基础文本清理")

    # 重置清理统计
    reset_cleanup_stats()

    # 数据清洗管道
    print("\n开始数据清洗...")

    # 1. 清洗文本字段
    text_columns = ['VendorName', 'ProductName', 'subject', 'status_name',
                    'Category', 'FeedbackCategory', 'Reason']

    for col in text_columns:
        if col in df.columns:
            print(f"清洗列: {col}")
            if col == 'subject':
                df[col] = df[col].apply(clean_subject_field)
            elif col == 'Reason':
                df[col] = df[col].apply(clean_reason_field)
            else:
                df[col] = df[col].apply(clean_text_with_spacy)

    # 2. 清洗日期字段
    date_columns = ['SendDate', 'ReceiveDate']
    for col in date_columns:
        if col in df.columns:
            print(f"标准化日期列: {col}")
            df[col] = pd.to_datetime(df[col], errors='coerce')
            # 处理中文日期格式
            df[col] = df[col].dt.strftime('%Y-%m-%d %H:%M:%S')

    # 3. 清洗数字字段
    numeric_columns = ['DefectId', 'Match', 'LLM']
    for col in numeric_columns:
        if col in df.columns:
            print(f"清洗数字列: {col}")
            df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(int)

    return df


def clean_csv_file(file_path):
    """
    清理CSV文件的主函数
    """
    try:
        # 尝试不同的编码方式读取文件
        encodings = ['utf-8', 'gbk', 'gb2312', 'latin-1', 'utf-8-sig']
        content = None

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as f:
                    content = f.read()
                print(f"成功使用编码: {encoding} 读取文件。")
                break
            except UnicodeDecodeError:
                continue
            except FileNotFoundError:
                print(f"文件未找到: {file_path}")
                return None

        if content is None:
            print("无法使用常见编码读取文件！")
            return None

        cleaned_df = clean_data_with_spacy(content)

        # 打印详细清理统计
        print_cleanup_stats()

        # 生成输出路径
        if file_path.endswith('.csv'):
            output_path = file_path.replace('.csv', '_cleaned.csv')
        else:
            output_path = file_path + '_cleaned.csv'

        # 保存清洗后的数据
        cleaned_df.to_csv(output_path, index=False, encoding='utf-8-sig')

        print(f"\n" + "=" * 60)
        print(f"文件清洗完成! 保存为: {output_path}")
        print(f"清洗后的数据形状: {cleaned_df.shape}")
        print("=" * 60)

        # 显示清理效果对比
        print("\n清理效果对比:")
        original_lines = content.strip().split('\n')
        print(f"原始文件行数: {len(original_lines)}")
        print(f"清洗后行数: {len(cleaned_df)}")

        # 计算字符减少比例
        original_chars = sum(len(line) for line in original_lines)
        cleaned_chars = sum(cleaned_df[col].astype(str).str.len().sum()
                            for col in cleaned_df.columns if cleaned_df[col].dtype == 'object')
        reduction_ratio = (original_chars - cleaned_chars) / original_chars * 100
        print(f"字符数减少: {reduction_ratio:.1f}%")

        return cleaned_df

    except Exception as e:
        print(f"文件处理错误: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":

    file_path = "./csv/AutoAssign.csv"

    # 检查文件是否存在
    import os

    if os.path.exists(file_path):
        print(f"开始处理文件: {file_path}")
        result_df = clean_csv_file(file_path)
        if result_df is not None:
            print("\n处理成功！")
        else:
            print("处理失败！")
    else:
        print(f"文件不存在: {file_path}")
        print("请检查文件路径是否正确。")