
一、环境准备
pip install pymilvus python-dotenv transformers torch tqdm
二、文本分割与向量化
from glob import glob
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
import torch
_ENCODER_NAME = "bert-base-uncased"
# Lazily populated cache: model name -> (tokenizer, model). Loading the
# pretrained weights is expensive, so it must happen once per process rather
# than once per text chunk.
_ENCODER_CACHE = {}


def _load_encoder(name=_ENCODER_NAME):
    """Return the cached (tokenizer, model) pair for *name*, loading on first use."""
    if name not in _ENCODER_CACHE:
        _ENCODER_CACHE[name] = (
            AutoTokenizer.from_pretrained(name),
            AutoModel.from_pretrained(name),
        )
    return _ENCODER_CACHE[name]


def text_to_vector(text_chunk):
    """Embed a piece of text with bert-base-uncased.

    The text is tokenized (truncated to at most 512 tokens), run through the
    model without gradient tracking, and the hidden state at sequence
    position 0 (the [CLS] position for BERT tokenizers) is returned.

    Args:
        text_chunk: Text to embed. Presumably a single string; a list of
            strings of differing lengths would fail because no padding is
            requested -- TODO confirm against callers.

    Returns:
        A NumPy array holding the position-0 hidden state with size-1
        dimensions squeezed out (a 1-D vector for a single input string).
    """
    tokenizer, model = _load_encoder()
    inputs = tokenizer(
        text_chunk, return_tensors="pt", truncation=True, max_length=512
    )
    # Inference only: skip autograd bookkeeping so the output tensor can be
    # converted to NumPy directly.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy().squeeze()
def split_text_file(file_path, chunk_size=300):
with open(file_path, "r") as f:
full_text = f.read()
return [ful