好的,以下是一个使用百度飞桨(PaddlePaddle)和Jupyter Notebook进行自然语言处理的完整代码示例。这个示例将展示如何加载CSV数据集、显示训练和测试数据集的前5组数据和数据量,并显示训练过程中的迭代次数和精度。
```python
import zlib

import pandas as pd
import paddle
from paddle.io import DataLoader, Dataset
from paddle.nn import Embedding, Linear, Softmax
from paddle.optimizer import Adam


class ToTensor:
    """Convert a raw text string into a fixed-length int64 Paddle tensor.

    Drop-in replacement for the nonexistent ``paddle.text.transform.ToTensor``:
    whitespace tokens are hashed into ``vocab_size`` buckets with CRC32 (which,
    unlike ``hash()``, is deterministic across interpreter runs) and the id
    sequence is truncated / zero-padded to ``max_len`` so batches collate.
    """

    def __init__(self, vocab_size=10000, max_len=64):
        self.vocab_size = vocab_size
        self.max_len = max_len

    def __call__(self, text):
        tokens = str(text).lower().split()
        ids = [zlib.crc32(tok.encode('utf-8')) % self.vocab_size
               for tok in tokens[:self.max_len]]
        ids += [0] * (self.max_len - len(ids))  # zero-pad to a fixed length
        return paddle.to_tensor(ids, dtype='int64')


# 1. Load the dataset.
# The CSV file 'data.csv' is expected to contain 'text' and 'label' columns.
data = pd.read_csv('data.csv')

# Seeded 80/20 train/test split for reproducibility.
train_data = data.sample(frac=0.8, random_state=0)
test_data = data.drop(train_data.index)

# Show the first 5 rows and the size of the training set.
print("训练数据集的前5组数据:")
print(train_data.head())
print(f"训练数据集的数据量: {len(train_data)}")

# Show the first 5 rows and the size of the test set.
print("\n测试数据集的前5组数据:")
print(test_data.head())
print(f"测试数据集的数据量: {len(test_data)}")
# 2. 数据预处理
class TextDataset(Dataset):
    """Expose a pandas DataFrame with 'text' and 'label' columns as a paddle Dataset.

    Each item is a ``(text, label)`` pair; when a ``transform`` callable is
    supplied, it is applied to the text before the pair is returned.
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        # Dataset size == number of DataFrame rows.
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        sample_text = row['text']
        sample_label = row['label']
        if self.transform:
            sample_text = self.transform(sample_text)
        return sample_text, sample_label
# Build datasets and batched loaders; the transform turns raw text into tensors.
# NOTE(review): `ToTensor` is imported above from `paddle.text.transform`, a
# module that does not exist in current PaddlePaddle releases — replace it with
# a real text-to-tensor transform before running this script.
transform = ToTensor()
train_dataset = TextDataset(train_data, transform=transform)
test_dataset = TextDataset(test_data, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
# 3. 定义模型
class TextClassifier(paddle.nn.Layer):
    """Bag-of-embeddings text classifier: embed -> mean-pool -> 2-layer MLP.

    ``forward`` returns raw logits. Do NOT apply Softmax before
    ``CrossEntropyLoss``: paddle's CrossEntropyLoss applies log-softmax
    internally, so softmax-then-CE double-normalizes, squashes gradients,
    and yields a wrong loss. Apply ``paddle.nn.functional.softmax`` at
    inference time if probabilities are needed; ``argmax`` over logits
    gives the same predicted class either way.

    Args:
        vocab_size: size of the token-id vocabulary.
        embedding_dim: width of each embedding vector.
        hidden_dim: width of the hidden layer.
        num_classes: number of output classes (logit dimension).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.embedding = Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.fc1 = Linear(in_features=embedding_dim, out_features=hidden_dim)
        self.fc2 = Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        # x: (batch, seq_len) int64 token ids — presumably; confirm against the transform.
        x = self.embedding(x)       # -> (batch, seq_len, embedding_dim)
        x = paddle.mean(x, axis=1)  # mean-pool over the sequence axis
        x = paddle.nn.functional.relu(self.fc1(x))
        return self.fc2(x)          # raw logits
# 4. Train the model.
vocab_size = 10000
embedding_dim = 128
hidden_dim = 128
num_classes = 2

model = TextClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)
criterion = paddle.nn.CrossEntropyLoss()
optimizer = Adam(parameters=model.parameters())

for epoch in range(10):
    # --- training pass ---
    model.train()
    total_train_loss = 0.0
    num_batches = 0
    for batch_text, batch_label in train_loader:
        output = model(batch_text)
        loss = criterion(output, batch_label)
        loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        # float(loss) is robust for 0-d tensors, unlike loss.numpy()[0].
        total_train_loss += float(loss)
        num_batches += 1

    # 5. Evaluate on the held-out test set — no gradients needed, so wrap
    # the pass in no_grad() to save memory and compute.
    model.eval()
    correct = 0
    total = 0
    with paddle.no_grad():
        for batch_text, batch_label in test_loader:
            output = model(batch_text)
            predicted = paddle.argmax(output, axis=1)
            correct += int(paddle.sum(paddle.cast(predicted == batch_label, 'int64')))
            total += len(batch_label)

    # Report the epoch-average training loss (the old code printed only the
    # loss of the last *test* batch, which was misleading) and test accuracy.
    avg_loss = total_train_loss / num_batches if num_batches else 0.0
    accuracy = correct / total if total else 0.0
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}, Accuracy: {accuracy}")