FaceNet: Understanding Triplet Loss and the train_tripletloss.py Code

This article takes a close look at the core algorithm of the FaceNet face recognition model, the triplet loss function (triplet-loss), explaining how it works and how training is organized, so that readers understand how the network learns to pull samples of the same identity closer together while pushing samples of different identities apart.


In FaceNet's face feature extraction, the most central and hardest-to-understand part of the algorithm is the triplet loss function.
The objective the network learns is to make the distance from the Anchor to the Positive shorter than the distance from the Anchor to the Negative (the Anchor is a sample, the Positive is a sample of the same identity as the Anchor, and the Negative is a sample of a different identity). In other words, training should make intra-class distances smaller than inter-class distances.
Expressed mathematically, with α a constant margin:

$$\|f(x_i^a) - f(x_i^p)\|_2^2 + \alpha < \|f(x_i^a) - f(x_i^n)\|_2^2$$
The loss function is defined as:

$$L = \sum_{i=1}^{N} \max\!\left(\|f(x_i^a) - f(x_i^p)\|_2^2 - \|f(x_i^a) - f(x_i^n)\|_2^2 + \alpha,\; 0\right)$$
The loss is minimized with stochastic gradient descent. The first norm is the (a, p) distance, the second norm is the (a, n) distance, and α is a constant, so the learning process can be understood as shrinking intra-class distances while growing inter-class distances.
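To make the formula concrete, here is a minimal NumPy sketch of the loss computation (the names and shapes are illustrative; the facenet.triplet_loss call used later in the script computes the same quantity with TensorFlow ops):

import numpy as np

def triplet_loss_np(anchor, positive, negative, alpha=0.2):
    # anchor / positive / negative: (batch, embedding_size) L2-normalized embeddings
    pos_dist = np.sum(np.square(anchor - positive), axis=1)   # squared (a, p) distances
    neg_dist = np.sum(np.square(anchor - negative), axis=1)   # squared (a, n) distances
    # The loss reaches zero once every negative is farther than its positive by at least alpha
    return np.mean(np.maximum(pos_dist - neg_dist + alpha, 0.0))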
Once the loss function is fixed, how to find the negative and positive samples for each anchor during training becomes the key question. In theory we should choose the hard positive and the hard negative, as the paper defines:

$$x_i^p = \operatorname*{argmax}_{x^p} \|f(x_i^a) - f(x^p)\|_2^2, \qquad x_i^n = \operatorname*{argmin}_{x^n} \|f(x_i^a) - f(x^n)\|_2^2$$
But doing so is infeasible, and such training may fail to converge to a good result. (My understanding: the most troublesome part is that it requires searching the entire training set for each anchor's hard negative, which is extremely time-consuming.)
The authors' solution is instead to search for negatives within a mini-batch during training, selecting those that satisfy:

$$\|f(x_i^a) - f(x_i^p)\|_2^2 < \|f(x_i^a) - f(x_i^n)\|_2^2$$
Negatives found this way are called semi-hard: within the batch, it suffices that the (a, n) distance is larger than the (a, p) distance, even though it may still lie inside the margin α.
That is my understanding of the triplet loss.

For how to train the model, see the TensorFlow implementation at https://github.com/davidsandberg/facenet . Below are my annotations of train_tripletloss.py; the comments reflect my own understanding, so please point out anything that is wrong. Some parts draw on (http://www.mamicode.com/info-detail-2096766.html ).
One more note on how the program actually finds the negative:
among all samples satisfying (a, n) distance − (a, p) distance < α, one is chosen at random as the negative.
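Here is a minimal sketch of that selection rule for a single (anchor, positive) pair, assuming embeddings is the (nrof_examples, embedding_size) array computed in the forward pass and same_person indexes the anchor's own images; the names are illustrative, and the full logic lives in select_triplets below:

import numpy as np

def pick_negative(embeddings, a_idx, p_idx, same_person, alpha=0.2):
    # Squared distances from the anchor to every sampled image
    neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), axis=1)
    neg_dists_sqr[same_person] = np.nan   # same-identity images cannot be negatives
    pos_dist_sqr = np.sum(np.square(embeddings[a_idx] - embeddings[p_idx]))
    # NaN entries compare as False, so same-identity images drop out of the test
    all_neg = np.where(neg_dists_sqr - pos_dist_sqr < alpha)[0]
    if all_neg.size == 0:
        return None                       # no triplet is formed for this (a, p) pair
    return np.random.choice(all_neg)      # random pick among the margin violators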

"""Training a face recognizer with TensorFlow based on the FaceNet paper
FaceNet: A Unified Embedding for Face Recognition and Clustering: http://arxiv.org/abs/1503.03832
"""
# MIT License
# 
# Copyright (c) 2016 David Sandberg
# 
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
# 
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from datetime import datetime
import os.path
import time
import sys
import tensorflow as tf
import numpy as np
import importlib
import itertools
import argparse
import facenet
import lfw

from tensorflow.python.ops import data_flow_ops

from six.moves import xrange  # @UnresolvedImport

def main(args):
  
    network = importlib.import_module(args.model_def)

    subdir = datetime.strftime(datetime.now(), '%Y%m%d-%H%M%S')
    log_dir = os.path.join(os.path.expanduser(args.logs_base_dir), subdir)
    if not os.path.isdir(log_dir):  # Create the log directory if it doesn't exist
        os.makedirs(log_dir)
    model_dir = os.path.join(os.path.expanduser(args.models_base_dir), subdir)
    if not os.path.isdir(model_dir):  # Create the model directory if it doesn't exist
        os.makedirs(model_dir)

    # Write arguments to a text file
    facenet.write_arguments_to_file(args, os.path.join(log_dir, 'arguments.txt'))
        
    # Store some git revision info in a text file in the log directory
    src_path,_ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, log_dir, ' '.join(sys.argv))

    np.random.seed(seed=args.seed)
    train_set = facenet.get_dataset(args.data_dir)
    
    print('Model directory: %s' % model_dir)
    print('Log directory: %s' % log_dir)
    if args.pretrained_model:
        print('Pre-trained model: %s' % os.path.expanduser(args.pretrained_model))
    
    if args.lfw_dir:
        print('LFW directory: %s' % args.lfw_dir)
        # Read the file containing the pairs used for testing
        pairs = lfw.read_pairs(os.path.expanduser(args.lfw_pairs))
        # Get the paths for the corresponding images
        lfw_paths, actual_issame = lfw.get_paths(os.path.expanduser(args.lfw_dir), pairs)
        
    
    with tf.Graph().as_default():
        tf.set_random_seed(args.seed)
        global_step = tf.Variable(0, trainable=False)

        # Placeholder for the learning rate
        learning_rate_placeholder = tf.placeholder(tf.float32, name='learning_rate')
        
        batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
        
        phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')
        
        image_paths_placeholder = tf.placeholder(tf.string, shape=(None,3), name='image_paths')
        labels_placeholder = tf.placeholder(tf.int64, shape=(None,3), name='labels')
        
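        # The queue stores rows of 3 paths/labels; during training each row is an (anchor, positive, negative) triplet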
        input_queue = data_flow_ops.FIFOQueue(capacity=100000,
                                    dtypes=[tf.string, tf.int64],
                                    shapes=[(3,), (3,)],
                                    shared_name=None, name=None)
        enqueue_op = input_queue.enqueue_many([image_paths_placeholder, labels_placeholder])
        
        nrof_preprocess_threads = 4
        images_and_labels = []
        for _ in range(nrof_preprocess_threads):
            filenames, label = input_queue.dequeue()
            images = []
            for filename in tf.unstack(filenames):
                file_contents = tf.read_file(filename)
                image = tf.image.decode_image(file_contents, channels=3)
                
                if args.random_crop:
                    image = tf.random_crop(image, [args.image_size, args.image_size, 3])
                else:
                    image = tf.image.resize_image_with_crop_or_pad(image, args.image_size, args.image_size)
                if args.random_flip:
                    image = tf.image.random_flip_left_right(image)
    
                #pylint: disable=no-member
                image.set_shape((args.image_size, args.image_size, 3))
                images.append(tf.image.per_image_standardization(image))
            images_and_labels.append([images, label])
    
        image_batch, labels_batch = tf.train.batch_join(
            images_and_labels, batch_size=batch_size_placeholder, 
            shapes=[(args.image_size, args.image_size, 3), ()], enqueue_many=True,
            capacity=4 * nrof_preprocess_threads * args.batch_size,
            allow_smaller_final_batch=True)
        image_batch = tf.identity(image_batch, 'image_batch')
        image_batch = tf.identity(image_batch, 'input')
        labels_batch = tf.identity(labels_batch, 'label_batch')

        # Build the inference graph
        prelogits, _ = network.inference(image_batch, args.keep_probability, 
            phase_train=phase_train_placeholder, bottleneck_layer_size=args.embedding_size,
            weight_decay=args.weight_decay)
        
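        # L2-normalize the prelogits so the embeddings lie on the unit hypersphere, as in the FaceNet paper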
        embeddings = tf.nn.l2_normalize(prelogits, 1, 1e-10, name='embeddings')
        # Split embeddings into anchor, positive and negative and calculate triplet loss
        anchor, positive, negative = tf.unstack(tf.reshape(embeddings, [-1,3,args.embedding_size]), 3, 1)
        triplet_loss = facenet.triplet_loss(anchor, positive, negative, args.alpha)
        
        learning_rate = tf.train.exponential_decay(learning_rate_placeholder, global_step,
            args.learning_rate_decay_epochs*args.epoch_size, args.learning_rate_decay_factor, staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Calculate the total losses
        regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        total_loss = tf.add_n([triplet_loss] + regularization_losses, name='total_loss')

        # Build a Graph that trains the model with one batch of examples and updates the model parameters
        train_op = facenet.train(total_loss, global_step, args.optimizer, 
            learning_rate, args.moving_average_decay, tf.global_variables())
        
        # Create a saver
        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=3)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))        

        # Initialize variables
        sess.run(tf.global_variables_initializer(), feed_dict={phase_train_placeholder:True})
        sess.run(tf.local_variables_initializer(), feed_dict={phase_train_placeholder:True})

        summary_writer = tf.summary.FileWriter(log_dir, sess.graph)
        coord = tf.train.Coordinator()
        tf.train.start_queue_runners(coord=coord, sess=sess)

        with sess.as_default():

            if args.pretrained_model:
                print('Restoring pretrained model: %s' % args.pretrained_model)
                saver.restore(sess, os.path.expanduser(args.pretrained_model))

            # Training and validation loop
            epoch = 0
            while epoch < args.max_nrof_epochs:
                step = sess.run(global_step, feed_dict=None)
                epoch = step // args.epoch_size
                # Train for one epoch
                train(args, sess, train_set, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
                    batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step, 
                    embeddings, total_loss, train_op, summary_op, summary_writer, args.learning_rate_schedule_file,
                    args.embedding_size, anchor, positive, negative, triplet_loss)

                # Save variables and the metagraph if it doesn't exist already
                save_variables_and_metagraph(sess, saver, summary_writer, model_dir, subdir, step)

                # Evaluate on LFW
                if args.lfw_dir:
                    evaluate(sess, lfw_paths, embeddings, labels_batch, image_paths_placeholder, labels_placeholder, 
                            batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, actual_issame, args.batch_size, 
                            args.lfw_nrof_folds, log_dir, step, summary_writer, args.embedding_size)

    return model_dir


def train(args, sess, dataset, epoch, image_paths_placeholder, labels_placeholder, labels_batch,
          batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, input_queue, global_step, 
          embeddings, loss, train_op, summary_op, summary_writer, learning_rate_schedule_file,
          embedding_size, anchor, positive, negative, triplet_loss):
    batch_number = 0
    
    if args.learning_rate>0.0:
        lr = args.learning_rate
    else:
        lr = facenet.get_learning_rate_from_file(learning_rate_schedule_file, epoch)
    while batch_number < args.epoch_size:
        # Sample people randomly from the dataset
        # Randomly sample people and their images from the dataset
        image_paths, num_per_class = sample_people(dataset, args.people_per_batch, args.images_per_person)
        
        print('Running forward pass on sampled images: ', end='')
        start_time = time.time()
        # Number of examples: nrof_examples = people_per_batch * images_per_person
        nrof_examples = args.people_per_batch * args.images_per_person
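        # Group the paths and labels into rows of 3 so each queue element matches the FIFOQueue's (3,) shape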
        labels_array = np.reshape(np.arange(nrof_examples),(-1,3))
        image_paths_array = np.reshape(np.expand_dims(np.array(image_paths),1), (-1,3))
        sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
        # emb_array holds the embedding of every sampled example
        emb_array = np.zeros((nrof_examples, embedding_size))
        # Number of forward-pass batches: total examples divided by batch_size (rounded up)
        nrof_batches = int(np.ceil(nrof_examples / args.batch_size))
        for i in range(nrof_batches):
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            # Feed one batch and fetch its embeddings (emb) and labels (lab)
            emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size, 
                learning_rate_placeholder: lr, phase_train_placeholder: True})
            emb_array[lab,:] = emb
        print('%.3f' % (time.time()-start_time))

        # Select triplets based on the embeddings
        print('Selecting suitable triplets for training')
        # select_triplets searches for (anchor, positive, negative) triplets
        triplets, nrof_random_negs, nrof_triplets = select_triplets(emb_array, num_per_class, 
            image_paths, args.people_per_batch, args.alpha)
        selection_time = time.time() - start_time
        print('(nrof_random_negs, nrof_triplets) = (%d, %d): time=%.3f seconds' % 
            (nrof_random_negs, nrof_triplets, selection_time))

        # Perform training on the selected triplets
        # Number of training batches = nrof_triplets * 3 (images per triplet) / batch_size
        nrof_batches = int(np.ceil(nrof_triplets*3/args.batch_size))
        # triplet_paths is the flattened list of all selected image paths
        triplet_paths = list(itertools.chain(*triplets))
        labels_array = np.reshape(np.arange(len(triplet_paths)),(-1,3))
        triplet_paths_array = np.reshape(np.expand_dims(np.array(triplet_paths),1), (-1,3))
        sess.run(enqueue_op, {image_paths_placeholder: triplet_paths_array, labels_placeholder: labels_array})
        nrof_examples = len(triplet_paths)
        train_time = 0
        i = 0
        emb_array = np.zeros((nrof_examples, embedding_size))
        loss_array = np.zeros((nrof_triplets,))
        summary = tf.Summary()
        step = 0
        # Run the training batches one by one
        while i < nrof_batches:
            start_time = time.time()
            batch_size = min(nrof_examples-i*args.batch_size, args.batch_size)
            feed_dict = {batch_size_placeholder: batch_size, learning_rate_placeholder: lr, phase_train_placeholder: True}
            err, _, step, emb, lab = sess.run([loss, train_op, global_step, embeddings, labels_batch], feed_dict=feed_dict)
            emb_array[lab,:] = emb
            loss_array[i] = err
            duration = time.time() - start_time
            print('Epoch: [%d][%d/%d]\tTime %.3f\tLoss %2.3f' %
                  (epoch, batch_number+1, args.epoch_size, duration, err))
            batch_number += 1
            i += 1
            train_time += duration
            summary.value.add(tag='loss', simple_value=err)
            
        # Add validation loss and accuracy to summary
        #pylint: disable=maybe-no-member
        summary.value.add(tag='time/selection', simple_value=selection_time)
        summary_writer.add_summary(summary, step)
    return step
  
def select_triplets(embeddings, nrof_images_per_class, image_paths, people_per_batch, alpha):
    """ Select the triplets for training
    """
    trip_idx = 0
    # Start index in the embeddings array of the current person's images
    emb_start_idx = 0
    num_trips = 0
    triplets = []
    
    # VGG Face: Choosing good triplets is crucial and should strike a balance between
    #  selecting informative (i.e. challenging) examples and swamping training with examples that
    #  are too hard. This is achieve by extending each pair (a, p) to a triplet (a, p, n) by sampling
    #  the image n at random, but only between the ones that violate the triplet loss margin. The
    #  latter is a form of hard-negative mining, but it is not as aggressive (and much cheaper) than
    #  choosing the maximally violating example, as often done in structured output learning.
    
    # Iterate over every person
    for i in xrange(people_per_batch):
        # Number of images this person has
        nrof_images = int(nrof_images_per_class[i])
        # Iterate over the embeddings of person i's images
        for j in xrange(1,nrof_images):
            # a_idx is the position of the j-th image in the embeddings array
            a_idx = emb_start_idx + j - 1
            # neg_dists_sqr: squared Euclidean distances from this image to all other images
            neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
            for pair in xrange(j, nrof_images): # For every possible positive pair.
                # Positives are chosen among the same person's images
                p_idx = emb_start_idx + pair
                # Squared Euclidean distance between (a, p)
                pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
                # Images of the same person cannot be negatives, so set their distances to NaN
                neg_dists_sqr[emb_start_idx:emb_start_idx+nrof_images] = np.NaN
                #all_neg = np.where(np.logical_and(neg_dists_sqr-pos_dist_sqr<alpha, pos_dist_sqr<neg_dists_sqr))[0]  # FaceNet selection
                # Find other people's images satisfying: (a,n) distance - (a,p) distance < alpha
                all_neg = np.where(neg_dists_sqr-pos_dist_sqr<alpha)[0] # VGG Face selection
                # nrof_random_negs: number of candidate negatives satisfying the condition
                nrof_random_negs = all_neg.shape[0]
                # If at least one qualifying negative exists
                if nrof_random_negs>0:
                    # Pick one of them at random as the negative
                    rnd_idx = np.random.randint(nrof_random_negs)
                    n_idx = all_neg[rnd_idx]
                    # Append the anchor, positive and negative image paths as a triplet
                    triplets.append((image_paths[a_idx], image_paths[p_idx], image_paths[n_idx]))
                    #print('Triplet %d: (%d, %d, %d), pos_dist=%2.6f, neg_dist=%2.6f (%d, %d, %d, %d, %d)' % 
                    #    (trip_idx, a_idx, p_idx, n_idx, pos_dist_sqr, neg_dists_sqr[n_idx], nrof_random_negs, rnd_idx, i, j, emb_start_idx))
                    trip_idx += 1

                num_trips += 1

        emb_start_idx += nrof_images

    np.random.shuffle(triplets)
    return triplets, num_trips, len(triplets)

def sample_people(dataset, people_per_batch, images_per_person):
    # nrof_images: total number of images to sample
    # people_per_batch: how many people to sample per batch
    # images_per_person: how many images to take from each person
    nrof_images = people_per_batch * images_per_person
  
    # Sample classes from the dataset
    # nrof_classes: number of people (classes) in the dataset
    nrof_classes = len(dataset)
    # class_indices: one index per person
    class_indices = np.arange(nrof_classes)
    # Shuffle the indices to prepare for sampling
    np.random.shuffle(class_indices)
    i = 0
    # Paths of the sampled images
    image_paths = []
    # Number of images sampled from each person
    num_per_class = []
    # Which person each sampled image belongs to, used as the label
    sampled_class_indices = []
    # Sample images from these classes until we have enough
    # Keep sampling until the requested number of images is reached
    while len(image_paths)<nrof_images:
        # class_index: label of the i-th sampled person
        class_index = class_indices[i]
        # nrof_images_in_class: total number of images this person has
        nrof_images_in_class = len(dataset[class_index])
        # image_indices: indices of this person's images
        image_indices = np.arange(nrof_images_in_class)
        # Shuffle this person's image indices
        np.random.shuffle(image_indices)
        # nrof_images_from_class: number of images to take from this person
        nrof_images_from_class = min(nrof_images_in_class, images_per_person, nrof_images-len(image_paths))
        idx = image_indices[0:nrof_images_from_class]
        # Paths of the images sampled from this person
        image_paths_for_class = [dataset[class_index].image_paths[j] for j in idx]
        # Append the labels of the sampled images to sampled_class_indices
        sampled_class_indices += [class_index]*nrof_images_from_class
        # Append the sampled image paths to image_paths
        image_paths += image_paths_for_class
        # Record how many images were taken from this person in num_per_class
        num_per_class.append(nrof_images_from_class)
        i+=1
    # Return the sampled image paths and the per-person image counts
    return image_paths, num_per_class

def evaluate(sess, image_paths, embeddings, labels_batch, image_paths_placeholder, labels_placeholder, 
        batch_size_placeholder, learning_rate_placeholder, phase_train_placeholder, enqueue_op, actual_issame, batch_size, 
        nrof_folds, log_dir, step, summary_writer, embedding_size):
    start_time = time.time()
    # Run forward pass to calculate embeddings
    print('Running forward pass on LFW images: ', end='')
    
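    # Each LFW pair contributes two images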
    nrof_images = len(actual_issame)*2
    assert(len(image_paths)==nrof_images)
    labels_array = np.reshape(np.arange(nrof_images),(-1,3))
    image_paths_array = np.reshape(np.expand_dims(np.array(image_paths),1), (-1,3))
    sess.run(enqueue_op, {image_paths_placeholder: image_paths_array, labels_placeholder: labels_array})
    emb_array = np.zeros((nrof_images, embedding_size))
    nrof_batches = int(np.ceil(nrof_images / batch_size))
    label_check_array = np.zeros((nrof_images,))
    for i in xrange(nrof_batches):
        batch_size = min(nrof_images-i*batch_size, batch_size)
        emb, lab = sess.run([embeddings, labels_batch], feed_dict={batch_size_placeholder: batch_size,
            learning_rate_placeholder: 0.0, phase_train_placeholder: False})
        emb_array[lab,:] = emb
        label_check_array[lab] = 1
    print('%.3f' % (time.time()-start_time))
    
    assert(np.all(label_check_array==1))
    
    _, _, accuracy, val, val_std, far = lfw.evaluate(emb_array, actual_issame, nrof_folds=nrof_folds)
    
    print('Accuracy: %1.3f+-%1.3f' % (np.mean(accuracy), np.std(accuracy)))
    print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
    lfw_time = time.time() - start_time
    # Add validation loss and accuracy to summary
    summary = tf.Summary()
    #pylint: disable=maybe-no-member
    summary.value.add(tag='lfw/accuracy', simple_value=np.mean(accuracy))
    summary.value.add(tag='lfw/val_rate', simple_value=val)
    summary.value.add(tag='time/lfw', simple_value=lfw_time)
    summary_writer.add_summary(summary, step)
    with open(os.path.join(log_dir,'lfw_result.txt'),'at') as f:
        f.write('%d\t%.5f\t%.5f\n' % (step, np.mean(accuracy), val))

def save_variables_and_metagraph(sess, saver, summary_writer, model_dir, model_name, step):
    # Save the model checkpoint
    print('Saving variables')
    start_time = time.time()
    checkpoint_path = os.path.join(model_dir, 'model-%s.ckpt' % model_name)
    saver.save(sess, checkpoint_path, global_step=step, write_meta_graph=False)
    save_time_variables = time.time() - start_time
    print('Variables saved in %.2f seconds' % save_time_variables)
    metagraph_filename = os.path.join(model_dir, 'model-%s.meta' % model_name)
    save_time_metagraph = 0  
    if not os.path.exists(metagraph_filename):
        print('Saving metagraph')
        start_time = time.time()
        saver.export_meta_graph(metagraph_filename)
        save_time_metagraph = time.time() - start_time
        print('Metagraph saved in %.2f seconds' % save_time_metagraph)
    summary = tf.Summary()
    #pylint: disable=maybe-no-member
    summary.value.add(tag='time/save_variables', simple_value=save_time_variables)
    summary.value.add(tag='time/save_metagraph', simple_value=save_time_metagraph)
    summary_writer.add_summary(summary, step)
  
  
def get_learning_rate_from_file(filename, epoch):
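    # Each schedule line has the form '<epoch>: <learning rate>'; '#' starts a comment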
    with open(filename, 'r') as f:
        for line in f.readlines():
            line = line.split('#', 1)[0]
            if line:
                par = line.strip().split(':')
                e = int(par[0])
                lr = float(par[1])
                if e <= epoch:
                    learning_rate = lr
                else:
                    return learning_rate
    

def parse_arguments(argv):
    parser = argparse.ArgumentParser()
    
    parser.add_argument('--logs_base_dir', type=str, 
        help='Directory where to write event logs.', default='~/logs/facenet')
    parser.add_argument('--models_base_dir', type=str,
        help='Directory where to write trained models and checkpoints.', default='~/models/facenet')
    parser.add_argument('--gpu_memory_fraction', type=float,
        help='Upper bound on the amount of GPU memory that will be used by the process.', default=1.0)
    parser.add_argument('--pretrained_model', type=str,
        help='Load a pretrained model before training starts.')
    parser.add_argument('--data_dir', type=str,
        help='Path to the data directory containing aligned face patches.',
        default='~/datasets/casia/casia_maxpy_mtcnnalign_182_160')
    parser.add_argument('--model_def', type=str,
        help='Model definition. Points to a module containing the definition of the inference graph.', default='models.inception_resnet_v1')
    parser.add_argument('--max_nrof_epochs', type=int,
        help='Number of epochs to run.', default=500)
    parser.add_argument('--batch_size', type=int,
        help='Number of images to process in a batch.', default=90)
    parser.add_argument('--image_size', type=int,
        help='Image size (height, width) in pixels.', default=160)
    parser.add_argument('--people_per_batch', type=int,
        help='Number of people per batch.', default=45)
    parser.add_argument('--images_per_person', type=int,
        help='Number of images per person.', default=40)
    parser.add_argument('--epoch_size', type=int,
        help='Number of batches per epoch.', default=1000)
    parser.add_argument('--alpha', type=float,
        help='Positive to negative triplet distance margin.', default=0.2)
    parser.add_argument('--embedding_size', type=int,
        help='Dimensionality of the embedding.', default=128)
    parser.add_argument('--random_crop', 
        help='Performs random cropping of training images. If false, the center image_size pixels from the training images are used. ' +
         'If the size of the images in the data directory is equal to image_size no cropping is performed', action='store_true')
    parser.add_argument('--random_flip', 
        help='Performs random horizontal flipping of training images.', action='store_true')
    parser.add_argument('--keep_probability', type=float,
        help='Keep probability of dropout for the fully connected layer(s).', default=1.0)
    parser.add_argument('--weight_decay', type=float,
        help='L2 weight regularization.', default=0.0)
    parser.add_argument('--optimizer', type=str, choices=['ADAGRAD', 'ADADELTA', 'ADAM', 'RMSPROP', 'MOM'],
        help='The optimization algorithm to use', default='ADAGRAD')
    parser.add_argument('--learning_rate', type=float,
        help='Initial learning rate. If set to a negative value a learning rate ' +
        'schedule can be specified in the file "learning_rate_schedule.txt"', default=0.1)
    parser.add_argument('--learning_rate_decay_epochs', type=int,
        help='Number of epochs between learning rate decay.', default=100)
    parser.add_argument('--learning_rate_decay_factor', type=float,
        help='Learning rate decay factor.', default=1.0)
    parser.add_argument('--moving_average_decay', type=float,
        help='Exponential decay for tracking of training parameters.', default=0.9999)
    parser.add_argument('--seed', type=int,
        help='Random seed.', default=666)
    parser.add_argument('--learning_rate_schedule_file', type=str,
        help='File containing the learning rate schedule that is used when learning_rate is set to to -1.', default='data/learning_rate_schedule.txt')

    # Parameters for validation on LFW
    parser.add_argument('--lfw_pairs', type=str,
        help='The file containing the pairs to use for validation.', default='data/pairs.txt')
    parser.add_argument('--lfw_dir', type=str,
        help='Path to the data directory containing aligned face patches.', default='')
    parser.add_argument('--lfw_nrof_folds', type=int,
        help='Number of folds to use for cross validation. Mainly used for testing.', default=10)
    return parser.parse_args(argv)
  

if __name__ == '__main__':
    main(parse_arguments(sys.argv[1:]))
