import json
import collections
# Load the SQuAD v1.1 training set. The context manager closes the file handle
# once parsing is done, and the explicit encoding keeps non-ASCII text from
# being decoded with a platform-dependent default codec.
with open("train-v1.1.json", encoding="utf-8") as json_file:
    data = json.load(json_file)
# Gather every whitespace-separated token found in the dataset: article
# titles, paragraph contexts, questions, and the text of every answer.
all_words = []
for article in data["data"]:
    all_words.extend(article["title"].split())
    for paragraph in article["paragraphs"]:
        all_words.extend(paragraph["context"].split())
        for qa in paragraph["qas"]:
            answers = qa["answers"]
            all_words.extend(qa["question"].split())
            if len(answers) > 1:
                # Report any question that carries more than one answer.
                print(answers)
            for answer in answers:
                all_words.extend(answer["text"].split())

# Rank tokens by descending count, breaking ties by the token string itself,
# so the id assigned to each token is deterministic.
counter = collections.Counter(all_words)
count_pairs = sorted(counter.items(), key=lambda x: (-x[1], x[0]))

# Build the tuple directly from the pairs: unpacking `zip(*count_pairs)` into
# two names raises ValueError when no tokens were collected.
words = tuple(word for word, _ in count_pairs)

# The most frequent token gets id 0, the next one id 1, and so on.
word_to_id = {word: index for index, word in enumerate(words)}
# Encode the dataset as nested lists of word ids:
#   data_vec            -> one list per article
#   article entry       -> one [context_ids, qa_pairs] list per paragraph
#   qa_pairs            -> one [question_ids, answer_ids] list per question
data_vec = []
for article in data["data"]:
    title = article["title"]  # looked up here but not used afterwards
    article_paragraphs = article["paragraphs"]
    article_vec = []
    data_vec.append(article_vec)
    for paragraph in article_paragraphs:
        context_ids = [word_to_id[token] for token in paragraph["context"].split()]
        qa_pairs = []
        for qa in paragraph["qas"]:
            # Only the first listed answer of each question is encoded.
            first_answer_text = qa["answers"][0]["text"]
            answer_ids = [word_to_id[token] for token in first_answer_text.split()]
            question_ids = [word_to_id[token] for token in qa["question"].split()]
            qa_pairs.append([question_ids, answer_ids])
        article_vec.append([context_ids, qa_pairs])
# Completion marker printed once the whole dataset has been encoded.
print("!")
# 统计 SQuAD 的词汇得到 word2id，并把词都转成 id 的 Python 代码
# 最新推荐文章于 2024-10-04 12:17:36 发布