train.py
from __future__ import print_function

import argparse
import os
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.autograd import Variable
from torch.utils.data import DataLoader

from model import TFN
from utils import MultimodalDataset
def preprocess(options):
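    """Load the dataset, normalize the visual features, and average the
    audio and visual modalities over time. `options` is expected to carry
    the parsed command-line arguments (dataset, epochs, model_path, max_len)."""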
    dataset = options['dataset']
    epochs = options['epochs']
    model_path = options['model_path']
    max_len = options['max_len']
    model_path = os.path.join(model_path, "tfn.pt")
    print("Temp location for saving model: {}".format(model_path))
    print("Currently using {} dataset.".format(dataset))
    mosi = MultimodalDataset(dataset, max_len=max_len)
    train_set, valid_set, test_set = mosi.train_set, mosi.valid_set, mosi.test_set
    audio_dim = train_set[0][0].shape[1]
    print("Audio feature dimension is: {}".format(audio_dim))
    visual_dim = train_set[0][1].shape[1]
    print("Visual feature dimension is: {}".format(visual_dim))
    text_dim = train_set[0][2].shape[1]
    print("Text feature dimension is: {}".format(text_dim))
    input_dims = (audio_dim, visual_dim, text_dim)
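
    # Each example is a tuple of (audio, visual, text) feature matrices with
    # shape (sequence_length, feature_dim); infer the per-modality feature
    # dimensions from the first training example.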
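
    # Normalize visual features per dimension by the maximum absolute value
    # observed in the training set; all-zero dimensions are left unscaled to
    # avoid division by zero.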
    visual_max = np.max(np.max(np.abs(train_set.visual), axis=0), axis=0)
    visual_max[visual_max == 0] = 1
    train_set.visual = train_set.visual / visual_max
    valid_set.visual = valid_set.visual / visual_max
    test_set.visual = test_set.visual / visual_max
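
    # Average the audio and visual modalities over the time axis, collapsing
    # each sequence to a single feature vector per example. This assumes the
    # (max_len, num_examples, feature_dim) layout implied by the visual_max
    # reduction above, where axis 0 is time.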
    train_set.visual = np.mean(train_set.visual, axis=0, keepdims=True)
    train_set.audio = np.mean(train_set.audio, axis=0, keepdims=True)
    valid_set.visual = np.mean(valid_set.visual, axis=0, keepdims=True)
    valid_set.audio = np.mean(valid_set.audio, axis=0, keepdims=True)