大语言模型分词器
代码
import transformers
import tokenizers
from transformers import AutoTokenizer

# Load a tokenizer from the local BERT checkpoint directory.
# Bound to `tokenizer` (not `tokenizers`) so the `tokenizers` module
# imported above is not shadowed by this instance.
tokenizer = AutoTokenizer.from_pretrained("data/bert-base-uncased")

text = "This is a test sentence."
# encode() returns the list of token ids for `text`
# (special tokens such as [CLS]/[SEP] are included by default).
tokens = tokenizer.encode(text)
# Number of tokens produced for the input sentence.
token_count = len(tokens)