import torchtext
# 隐藏警告消息
torchtext.disable_torchtext_deprecation_warning()
from torchtext.vocab import GloVe
import torch
from torch import nn
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# GloVe maps each word to a dense vector; load the pretrained "6B" set with
# 300-dimensional vectors.
glove = GloVe(name="6B", dim=300)

# Build an nn.Embedding layer whose weight matrix is the GloVe vector table.
embedding = nn.Embedding.from_pretrained(glove.vectors)
print(embedding.weight.shape)
# Expected: torch.Size([400000, 300]) -- 400,000 words, each represented by
# a 300-dimensional vector.

words = ["man", "woman", "king", "queen", "cat", "dog", "mother", "father"]
indices = []
for word in words:
    # Translate the word into its row index via the GloVe vocabulary
    # (string-to-index mapping).
    index = glove.stoi[word]
    indices.append(index)
    print(word, "-->", index)

# Convert the index list to a tensor so it can be fed to the embedding layer.
indices = torch.tensor(indices)
# Look up the vectors, then detach and convert to NumPy for scikit-learn.
vectors = embedding(indices).detach().numpy()
print("vectors.shape", vectors.shape)  # (8, 300): 8 words, 300 dimensions

# Reduce the vectors from 300 dimensions down to 2 for plotting.
pca = PCA(n_components=2)
vectors_2d = pca.fit_transform(vectors)

# Plot every word as a point in the 2-D space and label it.
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1])
for i, word in enumerate(words):
    plt.annotate(
        word,
        xy=(vectors_2d[i, 0], vectors_2d[i, 1]),
        # Shift the label slightly off the marker so the two do not overlap.
        xytext=(5, 2),
        textcoords="offset points",
    )
plt.show()
词嵌入代码入门级理解
最新推荐文章于 2025-05-13 22:10:57 发布
629

被折叠的 条评论
为什么被折叠?



