```
from bertopic import BERTopic
import numpy as np
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Load the raw text data (still needed for building topic representations)
df = pd.read_csv('tokenized_abstract.csv', encoding='utf-8')
sentences = df['Tokenized_Abstract'].tolist()
print('Number of documents: ', len(sentences))
print('Preview of the first document: ', sentences[0])
# Check for missing values
print("Number of missing values:", df['Tokenized_Abstract'].isna().sum())
# Check for non-string entries
non_str_mask = df['Tokenized_Abstract'].apply(lambda x: not isinstance(x, str))
print("Non-string samples:\n", df[non_str_mask]['Tokenized_Abstract'].head())
from sentence_transformers import SentenceTransformer
# Load the date data
df['Date'] = pd.to_datetime(df['Date'])
# Extract the year from the Date column
years = df['Date'].dt.year
print(years)
# Group by time period: map each date to one of three stages (t1/t2/t3)
def get_time_period(date):
    if date.year <= 2010:
        return 't1'
    elif date.year <= 2018:
        return 't2'
    return 't3'
timestamps = df['Date'].tolist()
df['period'] = df['Date'].apply(get_time_period)
print(df['period'].value_counts())
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("C:\\Users\\18267\\.cache\\huggingface\\hub\\models--sentence-transformers--all-mpnet-base-v2\\snapshots\\9a3225965996d404b775526de6dbfe85d3368642")
embeddings = np.load('clean_emb_last.npy')
print(f"嵌入的形状: {embeddings.shape}")
# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=7, n_components=10, min_dist=0.0, metric='cosine', random_state=42)
# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_samples=7, min_cluster_size=60, metric='euclidean',
                        cluster_selection_method='eom',
                        prediction_data=True)
# Step 4 - Tokenize topics
# Combine custom stop words with scikit-learn's English stop words
custom_stop_words = ['h2', 'storing', 'storage', 'include', 'comprise',
                     'utility', 'model', 'disclosed', 'embodiment', 'invention', 'prior', 'art',
                     'according', 'present', 'method', 'system', 'device', 'may', 'also', 'use',
                     'used', 'provide', 'wherein', 'configured', 'predetermined', 'plurality',
                     'comprising', 'consists', 'following', 'characterized', 'claim', 'claims',
                     'said', 'first', 'second', 'third', 'fourth', 'fifth', 'one', 'two', 'three', 'hydrogen']
# Create combined stop words set
all_stop_words = set(custom_stop_words).union(ENGLISH_STOP_WORDS)
vectorizer_model = CountVectorizer(stop_words=list(all_stop_words))
# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()
# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,    # Step 1 - Extract embeddings
    umap_model=umap_model,              # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
    top_n_words=50
)
# Fit the model
topics, probs = topic_model.fit_transform(
    documents=sentences,   # documents are still required for generating topic words
    embeddings=embeddings  # inject the precomputed embeddings
)
# Retrieve topic clustering information
topic_info = topic_model.get_topic_info()
print(topic_info)
```
The code above adds loading of the date data and splits the documents into the three defined time periods (t1, t2, t3); the remaining question is how the code should be modified so the model actually makes use of these periods.
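One natural way to act on those periods is BERTopic's dynamic topic modeling via `topics_over_time`. The sketch below is a minimal, hedged example rather than part of the original script: it reuses the fitted `topic_model`, `sentences`, `timestamps`, and the `period` column from the code above, and both the bin count and the period-to-integer mapping are assumptions made for illustration.
```
# Minimal sketch: track topic evolution with the fitted model above.

# Option A: pass the raw dates and let BERTopic bin them (nr_bins=20 is an assumption)
topics_over_time = topic_model.topics_over_time(sentences, timestamps, nr_bins=20)
print(topics_over_time.head())

# Option B: collapse dates to the three stages; topics_over_time accepts numeric
# timestamps, so map t1/t2/t3 to ordinal integers (a hypothetical mapping)
period_rank = {'t1': 1, 't2': 2, 't3': 3}
period_timestamps = [period_rank[p] for p in df['period']]
topics_per_period = topic_model.topics_over_time(sentences, period_timestamps)
print(topics_per_period)

# Interactive plot of topic frequencies over time (Plotly figure)
fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
fig.show()
```
With only three coarse bins, option B yields one c-TF-IDF representation per topic per period, which suits a direct t1/t2/t3 stage comparison; option A keeps finer temporal resolution for the visualization.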