import numpy as np
import torch
from torch import nn
import torch.utils.data as Data
# 统计词频
from collections import Counter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"current device:{device}")
# 句子数据
sentences = [
# 动物行为
"dogs chase cats", "cats drink milk", "dogs love bones",
"cats hate water", "dogs bark loudly", "cats climb trees",
"dogs play fetch", "cats purr softly", "dogs guard houses",
"cats hunt mice", "dogs wag tails", "cats nap all day",
"birds fly high", "fish swim fast", "birds sing in morning",
"tigers roar at night", "elephants walk slowly", "monkeys swing on trees",
# 食物特征
"apples are sweet", "bananas taste creamy", "apples grow on trees",
"bananas have peels", "apples make juice", "bananas are yellow",
"apples can be red", "bananas rich in potassium",
"oranges are juicy", "grapes are sour", "strawberries are red",
"bread is soft", "cheese tastes salty", "eggs are nutritious",
"rice is sticky", "noodles are long", "chocolate is sweet",
# 日常生活
"kids eat apples", "monkeys love bananas", "dogs enjoy meat",
"cats prefer fish", "people drink milk", "children like fruit",
"breakfast has milk", "dogs drink water", "cats avoid dogs",
"people walk to work", "buses run on time", "cars drive fast",
"students study hard", "teachers explain clearly", "doctors help patients",
"cooks prepare meals", "babies cry at night", "seniors rest peacefully",
# 情感与动作
"he feels happy", "she looks tired", "they laugh together",
"he cries softly", "she dances gracefully", "they argue loudly",
"he hugs his dog", "she kisses her baby", "they shake hands",
"he smiles kindly", "she yells in anger", "they cheer with joy",
# 自然与天气
"sun rises early", "moon shines at night", "stars twinkle above",
"clouds float in sky", "rain falls gently", "wind blows strongly",
"snow covers mountains", "ice melts in sun", "lightning flashes bright",
"thunder rumbles loudly", "fog covers the fields", "dew drops sparkle",
# 人物与职业
"doctors heal patients", "teachers teach students", "engineers build bridges",
"writers write stories", "artists draw pictures", "singers sing songs",
"actors perform plays", "pilots fly planes", "farmers grow crops",
"chefs cook meals", "drivers drive cars", "scientists make discoveries",
# 更多组合
"apple pie is delicious", "banana smoothie is healthy",
"dog food contains nutrients", "cat food has tuna flavor",
"apple and banana salad", "dog and cat live together",
"milk with banana shake", "fish for cat dinner",
"bread with cheese", "rice with chicken", "noodles with beef",
"coffee with milk", "tea with sugar", "juice with ice"
]
# 分词
word_list = "".join(sentences).split()
# 转换成set去重
vocab = set(list(word_list))
vocab_size = len(vocab)
# print(vocal)
# 构建索引
v_to_idx = {c: i for i, c in enumerate(vocab)}
# print(f"v_to_idx={v_to_idx}")
idx_to_v = {i: c for i, c in enumerate(vocab)}
# 构建词频
f_word = Counter(vocab)
# print(f"f(w)={f_word}")
f_total = len(word_list)
# print(f"\sumf(w)={f_total}")
# 计算负采样概率
prob_neg_samples = {w: (cnt / f_total) ** 0.75 for w, cnt in f_word.items()}
# print(f"prob_neg_samples = {prob_neg_samples}")
# 转换成np.array的形式,其中放的是负采样概率
prob_neg_samples = np.array([prob_neg_samples[w] for w in vocab])
# print(f"prob_neg_samples = {prob_neg_samples}")
# 归一化
prob_neg_samples /= prob_neg_samples.sum()
# 超参数
# 窗口大小
window_size = 2
# 1个正例对4个反例
neg_count = 4
# 一批训练多少个样本
batch_size = 128
# word-embedding维度
embedding_size = 20
# 生成负采样训练样本
def generate_neg_sample():
skip_gram = [] # 存放((正/反例),flag) flag=True为正例
for idx in range(window_size, len(word_list) - window_size):
centre = v_to_idx[word_list[idx]] # 中心词
# 上下文词
contexts = [v_to_idx[word_list[i]] for i in range(idx - window_size, idx + window_size + 1) if i != idx]
for c in contexts:
# 对中心词-上下文词,作为正样本加入
skip_gram.append((centre, c, True))
# 负样本对应词索引集合
negs = []
# 随机选择负样本
while len(negs) < neg_count:
neg_idx = np.random.choice(len(vocab), p=prob_neg_samples)
if neg_idx != c:
negs.append(neg_idx)
# 负样本加入
for ni in negs:
skip_gram.append((centre, ni, False))
return skip_gram
# 将负采样训练样本转换为torch.tensor
data = generate_neg_sample()
centres = torch.tensor([d[0] for d in data], dtype=torch.long).to(device)
print(f"centres={centres}")
contexts = torch.tensor([d[1] for d in data], dtype=torch.long).to(device)
print(f"contexts ={contexts}")
labels = torch.tensor([d[2] for d in data], dtype=torch.float32).to(device)
print(f"labels={labels}")
# 使用TensorDataset将三个张量合成一个数据集对象,每个样本是一个三元组(centres, contexts, labels)
dataset = Data.TensorDataset(centres, contexts, labels)
# 按批次加载对象
dataloader = Data.DataLoader(dataset, batch_size, shuffle=True)
class NegSkipGram(nn.Module):
def __init__(self, vocab_size, embedding_size):
super().__init__()
# 得到中心词词嵌入向量
self.centre_embedding = nn.Embedding(vocab_size, embedding_size)
# 得到上下文词词嵌入向量
self.context_embedding = nn.Embedding(vocab_size, embedding_size)
# 初始化两个embedding层中的权重矩阵,使用正态分布进行原地填充(直接修改原tensor的值)
nn.init.normal_(
self.centre_embedding.weight, # 输出中心词word embedding的权重矩阵
mean=0, # 均值为0
std=0.01 # 标准差
)
nn.init.normal_(
self.context_embedding.weight, # 输出上下文词word embedding的权重矩阵
mean=0,
std=0.01
)
def forward(self, centres, contexts):
# 中心词词向量
vec_centre = self.centre_embedding(centres)
# print(f"vec_centre={vec_centre}")
# 上下文词词向量
vec_context = self.context_embedding(contexts)
# print(f"vec_context={vec_context}")
# 余弦相似度
return nn.CosineSimilarity(vec_centre, vec_context, dim=1)
model = NegSkipGram(vocab_size, embedding_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimiser = torch.optim.Adam(model.parameters(), lr=0.005)
epochs = 1000
for i in range(epochs):
for batch_centre, batch_context, batch_label in dataloader:
cos_score = model(batch_centre, batch_context)
loss = criterion(cos_score, batch_label)
optimiser.zero_grad()
loss.backward()
为什么报错
D:\anaconda3\envs\pytorch\python.exe D:\PycharmProect\Word2Vec\skip-gram.py
current device:cuda
centres=tensor([143, 143, 143, ..., 122, 122, 122], device='cuda:0')
contexts =tensor([ 76, 151, 9, ..., 170, 156, 90], device='cuda:0')
labels=tensor([1., 0., 0., ..., 0., 0., 0.], device='cuda:0')
Traceback (most recent call last):
File "D:\PycharmProect\Word2Vec\skip-gram.py", line 189, in <module>
cos_score = model(batch_centre, batch_context)
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1736, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "D:\anaconda3\envs\pytorch\lib\site-packages\torch\nn\modules\module.py", line 1747, in _call_impl
return forward_call(*args, **kwargs)
File "D:\PycharmProect\Word2Vec\skip-gram.py", line 178, in forward
return nn.CosineSimilarity(vec_centre, vec_context, dim=1)
TypeError: __init__() got multiple values for argument 'dim'
进程已结束,退出代码为 1