def get_data_train(filepath, *, samples_per_class=3000, random_state=0):
    """Load a tab-separated QA file and return a class-balanced training set.

    The file is read with ``pd.read_csv(filepath, sep='\\t')`` and must have
    the columns ``QuestionID``, ``Question``, ``Sentence`` and ``Label``.
    Only rows belonging to a question that has at least one row with
    ``Label == 1`` are kept. The kept rows are then resampled: class 0 is
    randomly under-sampled and class 1 randomly over-sampled, each to
    ``samples_per_class`` rows.

    Args:
        filepath: Path (or anything ``pd.read_csv`` accepts) of the TSV file.
        samples_per_class: Target row count for each of the classes 0 and 1
            after resampling. Defaults to 3000.
        random_state: Seed handed to both samplers. Defaults to 0.

    Returns:
        An object-dtype NumPy array with one row per resampled example,
        holding ``(question, sentence, to_categorical(label, 2))``.

    NOTE(review): the samplers presumably reject a target they cannot reach
    (e.g. fewer than ``samples_per_class`` class-0 rows available for
    under-sampling) -- confirm against the imbalanced-learn documentation.
    """
    df = pd.read_csv(filepath, sep='\t')
    ids = df['QuestionID']
    labels = df['Label']

    # One bucket per question that has at least one positive row. A dict
    # keeps keys in first-insertion order, so questions stay ordered by
    # their first positive row and membership tests are O(1).
    rows_by_question = {
        qid: [] for qid, label in zip(ids, labels) if label == 1
    }

    # Single pass over the file: rows of selected questions are appended to
    # their bucket in file order; rows of other questions are dropped.
    for qid, question, sentence, label in zip(
        ids, df['Question'], df['Sentence'], labels
    ):
        bucket = rows_by_question.get(qid)
        if bucket is not None:
            bucket.append((question, sentence, label))

    # Flatten question by question so the sampler input is grouped by
    # question, with rows in file order inside each group.
    kept_rows = [row for bucket in rows_by_question.values() for row in bucket]
    data_x = [(question, sentence) for question, sentence, _ in kept_rows]
    data_y = [label for _, _, label in kept_rows]

    # Shrink the negative class first, then grow the positive class.
    ros_under = RandomUnderSampler(
        sampling_strategy={0: samples_per_class}, random_state=random_state
    )
    x_ros_under, y_ros_under = ros_under.fit_resample(
        np.array(data_x), np.array(data_y).reshape(-1, 1)
    )
    ros_over = RandomOverSampler(
        sampling_strategy={1: samples_per_class}, random_state=random_state
    )
    x_ros, y_ros = ros_over.fit_resample(x_ros_under, y_ros_under)

    data_list = [
        (pair[0], pair[1], to_categorical(label, 2))
        for pair, label in zip(x_ros.tolist(), y_ros.tolist())
    ]

    # Each row mixes two strings with an array, so the result is ragged.
    # dtype=object must be explicit: recent NumPy versions refuse to build
    # ragged arrays implicitly.
    return np.array(data_list, dtype=object)