Python: load labeled text from a CSV and fine-tune google-bert/bert-base-chinese for Chinese text classification

import pandas as pd
import torch
from torch.optim import AdamW  # recent transformers releases no longer export AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from tqdm import tqdm

# Select the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

class TextClassificationDataset(Dataset):
    """Custom Dataset for text classification"""
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize and encode the text
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # labels must already be integer class ids in [0, num_labels)
            'labels': torch.tensor(label, dtype=torch.long)
        }

def load_and_preprocess_data(csv_path, text_column, label_column):
    """Load and preprocess the CSV data"""
    # Read the CSV file
    df = pd.read_csv(csv_path)
    print(f"Data shape: {df.shape}")
    print(f"Label distribution:\n{df[label_column].value_counts()}")

    # Make sure the text column is string typed
    df[text_column] = df[text_column].astype(str)

    # Split into train / validation / test sets (70 / 15 / 15)
    train_df, temp_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df[label_column])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df[label_column])

    print(f"Train size: {len(train_df)}")
    print(f"Validation size: {len(val_df)}")
    print(f"Test size: {len(test_df)}")
    
    return train_df, val_df, test_df
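
# NOTE (not in the original post): the pipeline assumes the label column
# already holds integer class ids in [0, num_labels), as in the sample CSV
# shown later. If your labels are strings, a small hypothetical helper like
# this one can build the mapping before splitting:
def encode_labels(df, label_column):
    """Map arbitrary label values to contiguous integer ids (and back)."""
    labels_sorted = sorted(df[label_column].unique())
    label_to_id = {label: i for i, label in enumerate(labels_sorted)}
    id_to_label = {i: label for label, i in label_to_id.items()}
    df[label_column] = df[label_column].map(label_to_id)
    return df, label_to_id, id_to_label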

def create_data_loaders(train_df, val_df, test_df, text_column, label_column, tokenizer, batch_size=16, max_length=512):
    """Create the data loaders"""
    
    train_dataset = TextClassificationDataset(
        texts=train_df[text_column].values,
        labels=train_df[label_column].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    
    val_dataset = TextClassificationDataset(
        texts=val_df[text_column].values,
        labels=val_df[label_column].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    
    test_dataset = TextClassificationDataset(
        texts=test_df[text_column].values,
        labels=test_df[label_column].values,
        tokenizer=tokenizer,
        max_length=max_length
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler=None):
    """Train for one epoch"""
    model.train()
    losses = []
    correct_predictions = 0
    total_predictions = 0

    progress_bar = tqdm(data_loader, desc="Training")

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        # Backward pass with gradient clipping
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        if scheduler:
            scheduler.step()
        optimizer.zero_grad()

        # Track statistics
        losses.append(loss.item())
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        total_predictions += len(labels)

        # Update the progress bar
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{correct_predictions.double() / total_predictions:.4f}'
        })

    accuracy = correct_predictions.double() / total_predictions
    average_loss = np.mean(losses)

    return average_loss, accuracy

def eval_model(model, data_loader, loss_fn, device):
    """Evaluate the model"""
    model.eval()
    losses = []
    correct_predictions = 0
    total_predictions = 0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            
            losses.append(loss.item())
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)
            total_predictions += len(labels)
            
            all_predictions.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    accuracy = correct_predictions.double() / total_predictions
    average_loss = np.mean(losses)
    
    return average_loss, accuracy, all_predictions, all_labels

def main():
    """Main entry point: train, validate, and test the classifier"""
    # Configuration
    CSV_PATH = "your_data.csv"    # replace with the path to your CSV file
    TEXT_COLUMN = "text"          # replace with the name of your text column
    LABEL_COLUMN = "label"        # replace with the name of your label column
    MODEL_NAME = "google-bert/bert-base-chinese"
    BATCH_SIZE = 16
    MAX_LENGTH = 256
    EPOCHS = 4
    LEARNING_RATE = 2e-5

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Tokenizer loaded")

    # Load the data
    train_df, val_df, test_df = load_and_preprocess_data(CSV_PATH, TEXT_COLUMN, LABEL_COLUMN)

    # Number of classes
    num_labels = len(train_df[LABEL_COLUMN].unique())
    print(f"Number of classes: {num_labels}")

    # Create the data loaders
    train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df, TEXT_COLUMN, LABEL_COLUMN, tokenizer, BATCH_SIZE, MAX_LENGTH
    )

    # Load the model
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False
    )
    model = model.to(device)
    print("Model loaded")

    # Optimizer, scheduler, and loss function
    # (torch.optim.AdamW; the transformers AdamW with correct_bias was removed)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    loss_fn = torch.nn.CrossEntropyLoss()

    # Training loop
    best_accuracy = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    for epoch in range(EPOCHS):
        print(f"\nEpoch {epoch + 1}/{EPOCHS}")
        print("-" * 50)

        # Train
        train_loss, train_acc = train_epoch(
            model, train_loader, loss_fn, optimizer, device, scheduler
        )

        # Validate
        val_loss, val_acc, _, _ = eval_model(model, val_loader, loss_fn, device)

        print(f"Train loss: {train_loss:.4f}, train accuracy: {train_acc:.4f}")
        print(f"Validation loss: {val_loss:.4f}, validation accuracy: {val_acc:.4f}")

        # Keep the best checkpoint
        if val_acc > best_accuracy:
            best_accuracy = val_acc
            torch.save(model.state_dict(), 'best_bert_model.bin')
            print("Saved new best model!")

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc.item())
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc.item())

    # Load the best checkpoint and evaluate on the test set
    model.load_state_dict(torch.load('best_bert_model.bin'))
    test_loss, test_acc, test_preds, test_labels = eval_model(
        model, test_loader, loss_fn, device
    )

    print("\nTest set results:")
    print(f"Test loss: {test_loss:.4f}")
    print(f"Test accuracy: {test_acc:.4f}")
    print("\nClassification report:")
    print(classification_report(test_labels, test_preds))

    # Save the final model and tokenizer
    model.save_pretrained("./my_finetuned_bert")
    tokenizer.save_pretrained("./my_finetuned_bert")
    print("\nModel and tokenizer saved to ./my_finetuned_bert")

# Prediction helper
def predict_text(text, model_path="./my_finetuned_bert"):
    """Run inference with the fine-tuned model"""
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Move the model to the same device as training so GPU inference works
    model = model.to(device)
    model.eval()
    encoding = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=256,
        return_tensors='pt'
    )

    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        prediction = torch.argmax(probabilities, dim=1)

    return prediction.item(), probabilities.squeeze(0).cpu().numpy()
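
# Example usage (hypothetical call, not in the original post; assumes the
# training run above has produced ./my_finetuned_bert):
# pred_id, probs = predict_text("这个产品非常好用,质量很赞")
# print(pred_id, probs)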

if __name__ == "__main__":
    main()

Expected CSV format:

text,label
"这个产品非常好用,质量很赞",1
"服务态度很差,体验不好",0
"中规中矩,没有特别之处",2

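To smoke-test the pipeline end to end, you can first write the sample rows above to disk. A minimal sketch (the file name your_data.csv matches the CSV_PATH placeholder in main(); note that a real dataset needs enough rows per class for the stratified 70/15/15 split to work):

import pandas as pd

sample = pd.DataFrame({
    "text": ["这个产品非常好用,质量很赞", "服务态度很差,体验不好", "中规中矩,没有特别之处"],
    "label": [1, 0, 2],
})
sample.to_csv("your_data.csv", index=False)
print(pd.read_csv("your_data.csv"))
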
An alternative, class-based version of the same pipeline follows. The original post never defines the BERTClassifier wrapper it relies on, and it called load_and_preprocess_data with a different signature, so the data loading and label encoding are inlined here to keep the snippet self-contained; a minimal sketch of BERTClassifier appears right after this main().

def main():
    # Detect the GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")
    if torch.cuda.is_available():
        print(f"GPU model: {torch.cuda.get_device_name(0)}")
        print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

    # Configuration
    CSV_PATH = "your_dataset.csv"  # replace with the path to your CSV file
    TEXT_COLUMN = "text"           # replace with the name of your text column
    LABEL_COLUMN = "label"         # replace with the name of your label column

    # Load the data and build the label maps
    print("Loading and preprocessing data...")
    df = pd.read_csv(CSV_PATH)
    df['cleaned_text'] = df[TEXT_COLUMN].astype(str)
    label_to_id = {label: i for i, label in enumerate(sorted(df[LABEL_COLUMN].unique()))}
    id_to_label = {i: label for label, i in label_to_id.items()}
    df['encoded_label'] = df[LABEL_COLUMN].map(label_to_id)

    print("Data statistics:")
    print(f"- Total samples: {len(df)}")
    print(f"- Number of classes: {len(label_to_id)}")
    print(f"- Label mapping: {label_to_id}")

    # Initialize the BERT classifier
    classifier = BERTClassifier(
        model_name='bert-base-chinese',
        num_labels=len(label_to_id),
        max_length=256
    )

    # Prepare the data
    texts = df['cleaned_text'].tolist()
    labels = df['encoded_label'].tolist()

    train_loader, val_loader = classifier.prepare_data(
        texts, labels, batch_size=16, test_size=0.2
    )

    # Train the model
    print("Training the BERT model...")
    training_stats = classifier.train(
        train_loader, val_loader, epochs=3, learning_rate=2e-5
    )

    # Load the best checkpoint for the final evaluation
    classifier.load_model('best_bert_model.pth')
    final_accuracy, report = classifier.evaluate(val_loader)

    print("\nFinal model performance:")
    print(f"Validation accuracy: {final_accuracy:.4f}")
    print("\nClassification report:")
    print(report)

    # Example inference
    test_texts = [
        "这个产品非常好用,质量很棒!",
        "服务态度很差,非常不满意",
        "一般般,没什么特别的感觉"
    ]

    print("\nExample inference results:")
    for text in test_texts:
        prediction, probabilities = classifier.predict(text)
        predicted_label = id_to_label[prediction]
        print(f"Text: {text}")
        print(f"Predicted class: {predicted_label} (probability: {probabilities[prediction]:.4f})")
        print("-" * 50)
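
The main() above depends on a BERTClassifier wrapper that the original post never shows. The sketch below is a hypothetical minimal implementation of just the interface used there (prepare_data, train, load_model, evaluate, predict); it reuses TextClassificationDataset, train_epoch, and eval_model from the first version, and checkpoints to best_bert_model.pth as the demo code expects. Treat it as a starting point, not the author's actual class.

class BERTClassifier:
    """Minimal wrapper bundling tokenizer, model, and training loop (sketch)."""
    def __init__(self, model_name='bert-base-chinese', num_labels=2, max_length=256):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name, num_labels=num_labels
        ).to(self.device)

    def prepare_data(self, texts, labels, batch_size=16, test_size=0.2):
        """Stratified train/validation split wrapped in DataLoaders."""
        X_train, X_val, y_train, y_val = train_test_split(
            texts, labels, test_size=test_size, random_state=42, stratify=labels
        )
        train_ds = TextClassificationDataset(X_train, y_train, self.tokenizer, self.max_length)
        val_ds = TextClassificationDataset(X_val, y_val, self.tokenizer, self.max_length)
        return (DataLoader(train_ds, batch_size=batch_size, shuffle=True),
                DataLoader(val_ds, batch_size=batch_size, shuffle=False))

    def train(self, train_loader, val_loader, epochs=3, learning_rate=2e-5):
        """Fine-tune; checkpoint the best model by validation accuracy."""
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0,
            num_training_steps=len(train_loader) * epochs
        )
        loss_fn = torch.nn.CrossEntropyLoss()
        best_accuracy, stats = 0.0, []
        for epoch in range(epochs):
            train_loss, train_acc = train_epoch(
                self.model, train_loader, loss_fn, optimizer, self.device, scheduler
            )
            val_loss, val_acc, _, _ = eval_model(self.model, val_loader, loss_fn, self.device)
            stats.append({'epoch': epoch + 1, 'train_loss': train_loss,
                          'val_loss': val_loss, 'val_acc': val_acc.item()})
            if val_acc > best_accuracy:
                best_accuracy = val_acc
                torch.save(self.model.state_dict(), 'best_bert_model.pth')
        return stats

    def load_model(self, path):
        """Restore a checkpoint saved by train()."""
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def evaluate(self, data_loader):
        """Return (accuracy, sklearn classification report) on a loader."""
        loss_fn = torch.nn.CrossEntropyLoss()
        _, accuracy, preds, labels = eval_model(self.model, data_loader, loss_fn, self.device)
        return accuracy.item(), classification_report(labels, preds)

    def predict(self, text):
        """Return (predicted class id, 1-D probability array) for one text."""
        self.model.eval()
        encoding = self.tokenizer(
            text, truncation=True, padding='max_length',
            max_length=self.max_length, return_tensors='pt'
        )
        with torch.no_grad():
            outputs = self.model(
                input_ids=encoding['input_ids'].to(self.device),
                attention_mask=encoding['attention_mask'].to(self.device)
            )
            probs = torch.softmax(outputs.logits, dim=-1).squeeze(0)
        return int(torch.argmax(probs).item()), probs.cpu().numpy()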

class InferenceEngine:
    """Dedicated inference engine that auto-detects the GPU"""
    def __init__(self, model_path, id_to_label, model_name='bert-base-chinese'):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Inference engine using device: {self.device}")

        # Use the Auto classes already imported above (the original referenced
        # BertTokenizer / BertForSequenceClassification without importing them)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(id_to_label)
        )
        self.model.load_state_dict(torch.load(model_path, map_location=self.device))
        self.model.to(self.device)
        self.model.eval()

        self.id_to_label = id_to_label
    
    def predict_single(self, text):
        """Predict a single text (runs on the GPU when available)"""
        # tokenizer(...) is the modern replacement for the deprecated encode_plus
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)
        
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probabilities = torch.softmax(logits, dim=1)
            prediction = torch.argmax(logits, dim=1)
        
        pred_label = self.id_to_label[prediction.cpu().numpy()[0]]
        confidence = probabilities.cpu().numpy()[0][prediction.cpu().numpy()[0]]
        
        return pred_label, confidence
    
    def predict_batch_gpu(self, texts, batch_size=32):
        """Batched prediction: tokenize and score a whole batch at once"""
        results = []

        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]

            # Tokenize the whole batch in one call so the GPU actually sees
            # batched input (the original looped over predict_single per text,
            # which gained nothing from batching)
            encoding = self.tokenizer(
                batch_texts,
                max_length=256,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            input_ids = encoding['input_ids'].to(self.device)
            attention_mask = encoding['attention_mask'].to(self.device)

            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
                probabilities = torch.softmax(outputs.logits, dim=1)
                predictions = torch.argmax(outputs.logits, dim=1)

            for text, pred, probs in zip(batch_texts, predictions.cpu().numpy(),
                                         probabilities.cpu().numpy()):
                results.append({
                    'text': text,
                    'label': self.id_to_label[int(pred)],
                    'confidence': float(probs[int(pred)])
                })

        return results

# Usage example
if __name__ == "__main__":
    # Train the model
    main()

    # Standalone inference demo
    print("\n" + "="*50)
    print("Inference engine demo")
    print("="*50)

    # Assumes a trained model and label mapping already exist
    id_to_label = {0: "负面", 1: "正面", 2: "中性"}  # replace with your actual label mapping

    inference_engine = InferenceEngine(
        model_path='best_bert_model.pth',
        id_to_label=id_to_label
    )

    # Batched inference example
    test_texts = [
        "这个电影真的很精彩,演员演技很棒",
        "产品质量很差,不建议购买",
        "服务一般,没有特别突出的地方",
        "非常满意的一次购物体验"
    ]

    results = inference_engine.predict_batch_gpu(test_texts)

    print("Batched inference results:")
    for result in results:
        print(f"Text: {result['text']}")
        print(f"Prediction: {result['label']} (confidence: {result['confidence']:.4f})")
        print("-" * 30)
