词嵌入代码入门级理解

最新推荐文章于 2025-05-13 22:10:57 发布

原创 · 最新推荐文章于 2025-05-13 22:10:57 发布 · 159 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#nlp

import torchtext
# 隐藏警告消息
torchtext.disable_torchtext_deprecation_warning()
from torchtext.vocab import GloVe
import torch
from torch import nn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# GloVe turns words into vectors; load the pretrained "6B" release with
# 300-dimensional vectors.
glove = GloVe(dim=300, name="6B")
# Build the word-embedding layer with nn.Embedding; its weight matrix is
# initialised from the GloVe vectors loaded above.
embedding = nn.Embedding.from_pretrained(
    glove.vectors,
)

print(embedding.weight.size())
# Expected output: torch.Size([400000, 300]) -- 400,000 words, each
# represented by a 300-dimensional vector.

# Sample words whose embeddings are looked up and later plotted.
words = [
    "man",
    "woman",
    "king",
    "queen",
    "cat",
    "dog",
    "mother",
    "father",
]
indices = []
for word in words:
    # glove.stoi maps a word string to its index in the GloVe vocabulary.
    index = glove.stoi[word]
    print(word, "-->", index)
    indices.append(index)

# Convert the index list to a tensor, feed it through the embedding layer,
# then detach the result and expose it as a NumPy array.
indices = torch.tensor(indices)
embedded = embedding(indices)
vectors = embedded.detach().numpy()
# One row per word: (8, 300) for the eight words and 300 dimensions.
print("vectors.shape", vectors.shape)


# Reduce the word vectors from 300 dimensions down to 2 with PCA.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

# Draw the words in the 2-D plane: column 0 is the x coordinate and
# column 1 the y coordinate of each projected vector.
plt.scatter(*vectors_2d.T)
for word, (x, y) in zip(words, vectors_2d):
    # Place each label slightly offset from its point so it stays readable.
    plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords="offset points")

plt.show()