Embedding Representation Visualization

import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


#embedding = np.loadtxt('word_embedding.txt', skiprows = 1)
with open('word_embedding.txt', 'r') as fin:
    firstline = fin.readline().strip().split()
    idx, dims = int(firstline[0]), int(firstline[1])
    embedding = np.zeros([idx, dims], dtype = np.float)
    lines = fin.readlines()
    for line in lines:
        i = 1
        line = line.strip().split(' ')
        if len(line) > 1:
            embedding[i] = line[1:]
            i = i + 1

print('idx = {}, dims = {}, embedding = {}\n'.format(idx, dims, embedding.shape))

emb = TSNE(n_components = 2).fit_transform(embedding)

plt.switch_backend('pdf')
plt.figure(1)
plt.scatter(emb[:, 0], emb[:, 1])
plt.title('word_embedding')
plt.xlabel('dim 1')
plt.ylabel('dim 2')

plt.savefig('word_embedding.jpg')


``` from bertopic import BERTopic import numpy as np import pandas as pd from umap import UMAP from hdbscan import HDBSCAN from sklearn.feature_extraction.text import CountVectorizer from bertopic.vectorizers import ClassTfidfTransformer import re import nltk from nltk.corpus import stopwords from nltk.stem import WordNetLemmatizer from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS from nltk.tokenize import word_tokenize from wordcloud import WordCloud import matplotlib.pyplot as plt # 加载原始文本数据(仍需用于主题表示) df = pd.read_csv('tokenized_abstract.csv', encoding='utf-8') sentences = df['Tokenized_Abstract'].tolist() print('文本条数: ', len(sentences)) print('预览第一条: ', sentences[0]) # 检查缺失值 print("缺失值数量:", df['Tokenized_Abstract'].isna().sum()) # 检查非字符串类型 non_str_mask = df['Tokenized_Abstract'].apply(lambda x: not isinstance(x, str)) print("非字符串样本:\n", df[non_str_mask]['Tokenized_Abstract'].head()) vectorizer_model = None from sentence_transformers import SentenceTransformer # 加载时间数据 df['Date'] = pd.to_datetime(df['Date']) # 从Date列提取年份 years = df['Date'].dt.year print(years) # 按时间段分组 # 2. 定义时间段函数 def get_time_period(date): if date.year <= 2010: return 't1' elif date.year <= 2018: return 't2' return 't3' timestamps = df['Date'].tolist() df['period'] = df['Date'].apply(get_time_period) print(df['period'].value_counts()) # Step 1 - Extract embeddings embedding_model = SentenceTransformer("C:\\Users\\18267\\.cache\\huggingface\\hub\\models--sentence-transformers--all-mpnet-base-v2\\snapshots\\9a3225965996d404b775526de6dbfe85d3368642") embeddings = np.load('clean_emb_last.npy') print(f"嵌入的形状: {embeddings.shape}") # Step 2 - Reduce dimensionality umap_model = UMAP(n_neighbors=7, n_components=10, min_dist=0.0, metric='cosine',random_state=42) # Step 3 - Cluster reduced embeddings hdbscan_model = HDBSCAN(min_samples=7, min_cluster_size=60,metric='euclidean', cluster_selection_method='eom', prediction_data=True) # Step 4 - Tokenize topics # Combine custom stop words with scikit-learn's English stop words custom_stop_words = ['h2', 'storing', 'storage', 'include', 'comprise', 'utility', 'model', 'disclosed', 'embodiment', 'invention', 'prior', 'art', 'according', 'present', 'method', 'system', 'device', 'may', 'also', 'use', 'used', 'provide', 'wherein', 'configured', 'predetermined', 'plurality', 'comprising', 'consists', 'following', 'characterized', 'claim', 'claims', 'said', 'first', 'second', 'third', 'fourth', 'fifth', 'one', 'two', 'three','hydrogen'] # Create combined stop words set all_stop_words = set(custom_stop_words).union(ENGLISH_STOP_WORDS) vectorizer_model = CountVectorizer(stop_words=list(all_stop_words)) # Step 5 - Create topic representation ctfidf_model = ClassTfidfTransformer() # All steps together topic_model = BERTopic( embedding_model=embedding_model, # Step 1 - Extract embeddings umap_model=umap_model, # Step 2 - Reduce dimensionality hdbscan_model=hdbscan_model, # Step 3 - Cluster reduced embeddings vectorizer_model=vectorizer_model, # Step 4 - Tokenize topics ctfidf_model=ctfidf_model, # Step 5 - Extract topic words top_n_words=50 ) # 拟合模型 topics, probs = topic_model.fit_transform(documents=sentences, # 仍需提供文档用于主题词生成 embeddings=embeddings # 注入预计算嵌入) ) # 获取主题聚类信息 topic_info = topic_model.get_topic_info() print(topic_info)```在代码中添加了对时间数据的加载和按照划分的3个时间阶段对时间进行了阶段划分,重新考虑代码应该如何修改
最新发布
03-16
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值