安装必要的包：pandas 和 openpyxl
!pip install pandas openpyxl
读取数据
import pandas as pd
import re
# Load the Chinese-language training set. Each row's "选项" cell holds the
# question/choice text that is parsed further down.
df = pd.read_excel('训练集-语文.xlsx')
# Normalise full-width punctuation to ASCII so that the later regexes (which
# look for "N." numbering and split on "(") can match. The full-width
# characters are written as escapes on purpose: a bare "." pattern under
# regex=True matches *every* character, and a bare "(" is not a valid regex,
# so the ASCII look-alikes must never end up on the pattern side.
df = df.replace('\uff0e', '.', regex=True)  # FULLWIDTH FULL STOP -> "."
df = df.replace('\uff08', '(', regex=True)  # FULLWIDTH LEFT PARENTHESIS -> "("
# Sanity check: show the "选项" cell of the row labelled 2, i.e. the third data
# row under the default 0-based index produced by read_excel.
second_row_option_content = df.loc[2, '选项']
print(second_row_option_content)
处理数据
通过正则表达式,筛选所需数据,提取排列
通过dataframe链接数据特征
# One numbered question: "N." followed by everything up to the next "N." or
# the end of the text (DOTALL lets a question span several lines).
_QUESTION_PATTERN = re.compile(r'\d+\..*?(?=\d+\.|$)', re.DOTALL)
# One choice: an option letter A-D plus the text that follows it, stopping at
# the next option letter, a newline, or the end of the question.
_CHOICE_PATTERN = re.compile(r'([A-D])\s*(.*?)(?=[A-D]|$|\n)', re.DOTALL)
# Splits "N.stem" into the original number and the stem text on that line.
_QUESTION_STEM_PATTERN = re.compile(r'(\d+)\.(.*)')


def chinese_multiple_choice_questions(questions_with_answers):
    """Extract the multiple-choice questions from a block of exam text.

    The text is split into numbered questions ("1.", "2.", ...). A question
    counts as multiple-choice when it contains any of the letters A-D;
    questions without them are skipped.

    Args:
        questions_with_answers: Raw text of one "选项" cell. Anything that is
            not a ``str`` (for example the NaN pandas yields for a blank
            cell) is treated as containing no questions.

    Returns:
        A list with one dict per multiple-choice question, in order of
        appearance:

        * ``'question'``: the stem, i.e. the first line of the question up to
          the first "(", renumbered by the question's 1-based position among
          *all* questions found (skipped questions still consume a number).
        * ``'choices'``: a list of ``(letter, text)`` tuples as captured by
          the choice regex. The text keeps whatever follows the letter, so
          "A.red" yields ``('A', '.red')``.
    """
    if not isinstance(questions_with_answers, str):
        return []

    multiple_choice_questions = []
    questions = _QUESTION_PATTERN.findall(questions_with_answers)
    for position, question in enumerate(questions, start=1):
        if not re.search(r'[A-D]', question):
            # No option letters: not a multiple-choice question.
            continue
        # Keep only the stem: drop the answer bracket and any following lines.
        stem_line = question.split('(')[0].split('\n')[0]
        # Every question starts with "N.", so this search always succeeds.
        stem = _QUESTION_STEM_PATTERN.search(stem_line).group(2)
        multiple_choice_questions.append({
            'question': str(position) + '.' + stem,
            'choices': _CHOICE_PATTERN.findall(question),
        })
    return multiple_choice_questions
# Parse the "选项" cell of the first three rows into structured
# multiple-choice questions, one result list per row.
questions_list = [
    chinese_multiple_choice_questions(option_text)
    for option_text in df['选项'].head(3)
]
def chinese_multiple_choice_answers(questions_with_answers):
questions_with_answers = questions_with_answers.replace("