✅ 学习建议:如何从这个脚本进阶?
✅ 把 BoW 替换成 TF-IDF 或 One-Hot
✅ 添加 ReLU 激活函数
✅ 改成多分类任务(3 类情绪)
✅ 引入 sklearn 做对比实验
✅ 用 PyTorch 重写这个流程
✅ 最终迁移到 Hugging Face + Transformers
# ========================================
# 🧠 纯 Python 实现的 AI 模型全流程
# 包括:数据处理 → 模型定义 → 训练 → 保存 → 推理
# 无需 PyTorch/TensorFlow,仅使用 numpy 和标准库
# ========================================
import numpy as np
import pickle
import re
from collections import Counter
import random
# ================================
# 1. 模拟数据集(正面/负面评论)
# ================================
def create_dataset():
    """Return the toy sentiment corpus as parallel ``(texts, labels)`` lists.

    The five positive reviews come first and carry label 1; the five
    negative reviews follow and carry label 0.
    """
    labelled_groups = (
        (1, (
            "I love this movie it's amazing",
            "great acting and wonderful story",
            "best film ever watched",
            "highly recommended excellent quality",
            "beautiful cinematography and music",
        )),
        (0, (
            "this movie is terrible and boring",
            "awful acting waste of time",
            "worst film I have ever seen",
            "boring plot and bad editing",
            "not recommended at all",
        )),
    )
    texts = []
    labels = []
    for label, sentences in labelled_groups:
        texts.extend(sentences)
        # One label per sentence, in the same order as the sentences.
        labels.extend([label] * len(sentences))
    return texts, labels
# Build the corpus once at module level; later stages read `texts` and `labels`.
texts, labels = create_dataset()
print("✅ 数据集创建完成")
# Echo every sample next to its label so the data can be checked by eye.
for t, l in zip(texts, labels):
    sample_line = f" '{t}' -> {l}"
    print(sample_line)
# ================================
# 2. 文本预处理与词汇表构建
# ================================
def clean_text(text):
    """Normalise *text* for tokenisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is removed, and leading/trailing whitespace is trimmed.
    """
    lowered = text.lower()
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', lowered)
    return letters_and_spaces.strip()
def build_vocab(texts, min_freq=1):
    """Map every sufficiently frequent word in *texts* to an integer index.

    Each text is normalised with ``clean_text`` and split on whitespace.
    Indices 0 and 1 are reserved for ``'<PAD>'`` and ``'<UNK>'``; words whose
    count is at least *min_freq* receive the following indices in the order
    they are first encountered.

    Returns:
        dict mapping token -> index.
    """
    token_counts = Counter(
        token
        for sentence in texts
        for token in clean_text(sentence).split()
    )
    vocab = {'<PAD>': 0, '<UNK>': 1}
    frequent_tokens = (
        token for token, count in token_counts.items() if count >= min_freq
    )
    # Continue numbering right after the two reserved entries.
    for next_index, token in enumerate(frequent_tokens, start=len(vocab)):
        vocab[token] = next_index
    return vocab
# Build the vocabulary from the whole corpus (default min_freq keeps every word).
vocab = build_vocab(texts)
print(f"\n✅ 构建词汇表完成,大小: {len(vocab)}")
# Preview the first ten (token, index) pairs of the vocabulary.
print("示例:", dict(list(vocab.items())[:10]))
# ================================
# 3. 文本向量化(Bag-of-Words)
# ================================
def text_to_bow(text, vocab):
    """Encode *text* as a bag-of-words count vector of length ``len(vocab)``.

    The text is normalised with ``clean_text`` and split on whitespace; each
    token increments the slot given by ``vocab``. Tokens missing from
    ``vocab`` are counted in slot 1 (the ``<UNK>`` position).
    """
    unk_index = 1  # slot used for out-of-vocabulary tokens (<UNK>)
    bow = np.zeros(len(vocab))
    for token in clean_text(text).split():
        bow[vocab.get(token, unk_index)] += 1
    return bow
# Stack one bag-of-words row per text into the feature matrix.
X = np.array([text_to_bow(t, vocab) for t in texts])
# Reshape the labels into a single column, i.e. shape (n_samples, 1).
y = np.array(labels).reshape(-1, 1)
print(f"\n✅ 向量化完成 X.shape={X.shape}, y.shape={y.shape}")
# Show the first ten components of the first sample's vector.
print("示例向量:", X[0][:10])
# ================================
# 4. 定义神经网络(MLP)
# ================================
class SimpleNN:
    """Two-layer perceptron with sigmoid hidden and output activations.

    Training is plain full-batch gradient descent on the binary
    cross-entropy loss reported by ``train_step``.

    Args:
        input_size: number of input features.
        hidden_size: number of hidden units.
        output_size: number of output units.
        lr: gradient-descent step size (defaults to the previously
            hard-coded 0.01).
    """

    def __init__(self, input_size, hidden_size=8, output_size=1, lr=0.01):
        # Weights: standard-normal draws scaled by 0.5; biases start at zero.
        self.W1 = np.random.randn(input_size, hidden_size) * 0.5
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.5
        self.b2 = np.zeros((1, output_size))
        self.lr = lr

    def sigmoid(self, z):
        """Return the element-wise logistic function of *z*."""
        # Clamp the pre-activation so np.exp cannot overflow.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def sigmoid_derivative(self, a):
        """Return the sigmoid derivative expressed via its output *a*."""
        return a * (1 - a)

    def forward(self, X):
        """Run a forward pass and cache the intermediates on ``self``.

        Stores ``z1``, ``a1``, ``z2`` and ``a2`` for use by ``backward`` and
        returns ``a2``, the output activations.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.sigmoid(self.z2)
        return self.a2

    def backward(self, X, y_true):
        """Apply one gradient-descent update using the cached forward pass.

        ``forward`` must have been called on the same *X* beforehand, since
        this method reads ``self.a1`` and ``self.a2``.
        """
        m = X.shape[0]
        # Output-layer delta. For a sigmoid output trained with binary
        # cross-entropy the derivative w.r.t. z2 simplifies to (a2 - y).
        # Multiplying by sigmoid'(a2) here would instead give the gradient
        # of squared error, which does not match the loss in train_step.
        dz2 = self.a2 - y_true
        dW2 = np.dot(self.a1.T, dz2) / m
        db2 = np.sum(dz2, axis=0, keepdims=True) / m
        # Hidden-layer delta, back-propagated through W2 before it is updated.
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self.sigmoid_derivative(self.a1)
        dW1 = np.dot(X.T, dz1) / m
        db1 = np.sum(dz1, axis=0, keepdims=True) / m
        # Parameter update.
        self.W2 -= self.lr * dW2
        self.b2 -= self.lr * db2
        self.W1 -= self.lr * dW1
        self.b1 -= self.lr * db1

    def train_step(self, X, y):
        """Do one forward/backward pass and return the pre-update loss.

        The loss is the mean binary cross-entropy; 1e-8 is added inside the
        logarithms so that predictions of exactly 0 or 1 do not yield -inf.
        """
        pred = self.forward(X)
        loss = -np.mean(y * np.log(pred + 1e-8) + (1 - y) * np.log(1 - pred + 1e-8))
        self.backward(X, y)
        return loss

    def predict(self, X):
        """Return ``(labels, probabilities)`` for *X*.

        ``labels`` is an int array that is 1 where the probability exceeds
        0.5 and 0 otherwise; ``probabilities`` is the raw forward output.
        """
        prob = self.forward(X)
        return (prob > 0.5).astype(int), prob
# Initialise the model: the input width equals the number of feature columns
# in X, and the hidden layer uses 6 units.
model = SimpleNN(input_size=X.shape[1], hidden_size=6)
print(f"\n✅ 模型初始化完成 W1:{model.W1.shape} W2:{model.W2.shape}")
# ================================
# 5. 训练模型
# ================================
# Full-batch training: every epoch feeds the whole dataset through one step.
epochs = 1000
for epoch in range(epochs):
    loss = model.train_step(X, y)
    if epoch % 200:
        # Only report on every 200th epoch (0, 200, 400, ...).
        continue
    print(f"Epoch {epoch}, Loss: {loss:.4f}")
print("\n✅ 训练完成")
# ================================
# 6. 评估与推理测试
# ================================
# Score the training set itself and print one line per sample.
pred_label, pred_prob = model.predict(X)
print("\n📊 训练集预测结果:")
for i, text in enumerate(texts):
    # Each array has a single column, so take element 0 of row i.
    real, pred, prob = y[i][0], pred_label[i][0], pred_prob[i][0]
    if real == pred:
        status = "✅"
    else:
        status = "❌"
    print(f"{status} '{text}' -> 真实:{real} 预测:{pred} (置信度:{prob:.2f})")
# ================================
# 7. 保存模型和词汇表
# ================================
def save_model(model, vocab, filepath):
    """Pickle the model parameters together with the vocabulary.

    The file at *filepath* receives a dict with the keys ``'W1'``, ``'b1'``,
    ``'W2'``, ``'b2'`` (read from *model*) and ``'vocab'``. A confirmation
    line is printed once the file has been written.
    """
    payload = {name: getattr(model, name) for name in ('W1', 'b1', 'W2', 'b2')}
    payload['vocab'] = vocab
    with open(filepath, 'wb') as out_file:
        pickle.dump(payload, out_file)
    print(f"\n💾 模型已保存至 {filepath}")
# Persist the trained weights and the vocabulary to a file in the working directory.
save_model(model, vocab, "simple_nlp_model.pkl")
# ================================
# 8. 加载模型并进行新推理
# ================================
def load_and_predict(text, model_path, vocab):
    """Load pickled weights from *model_path* and classify *text*.

    The text is vectorised with ``text_to_bow`` using the *vocab* passed in
    (the vocabulary stored inside the pickle is not consulted), pushed
    through the two sigmoid layers, and thresholded at 0.5. The result is
    printed and returned.

    Returns:
        ``(pred, prob)`` where ``pred`` is 1 or 0 and ``prob`` is the
        output activation for the sentence.
    """
    # NOTE(security): pickle.load can execute arbitrary code while
    # deserialising; only point this at model files you trust.
    with open(model_path, 'rb') as f:
        data = pickle.load(f)

    # Pull out all four parameter arrays up front so that a malformed file
    # fails here, before any text processing happens.
    w1, b1, w2, b2 = (data[key] for key in ('W1', 'b1', 'W2', 'b2'))

    def _sigmoid(z):
        # Clamp first so np.exp cannot overflow.
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    features = text_to_bow(text, vocab).reshape(1, -1)
    hidden = _sigmoid(np.dot(features, w1) + b1)
    output = _sigmoid(np.dot(hidden, w2) + b2)

    prob = output[0][0]
    pred = 1 if prob > 0.5 else 0
    print(f"\n🔍 新句子推理: '{text}'")
    if pred == 1:
        print(f" 预测类别: {pred} (正面情绪)")
    else:
        print(f" 预测类别: {pred} (负面情绪)")
    print(f" 置信度: {prob:.2f}")
    return pred, prob
# Try the saved model on new sentences. Words that are not in the vocabulary
# (e.g. "really", "enjoyed", "fantastic") are counted in the <UNK> slot by
# text_to_bow.
load_and_predict("I really enjoyed this fantastic movie", "simple_nlp_model.pkl", vocab)
load_and_predict("this is a terrible and boring film", "simple_nlp_model.pkl", vocab)
把进阶的完整代码给我,并标注修改的部分以及原因
最新发布