Fine-tuning SiamRPN on your own dataset with the author's pretrained weights, for better tracking results
Full project: https://github.com/woshiwwwppp/Finetune-Simese-RPN-
The main training script, train.py:
# -*- coding: utf-8 -*-
import os
import random
import sys; sys.path.append('../')
import torch
import torch.nn as nn
import numpy as np
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
import argparse
from run_SiamRPN import TrainDataLoader
from shapely.geometry import Polygon
from tensorboardX import SummaryWriter
from os.path import realpath, dirname, join
from net import SiamRPNvot
parser = argparse.ArgumentParser(description='PyTorch SiameseRPN Training')
parser.add_argument('--train_path', default='D:\\uav_frame\\00', metavar='DIR',help='path to dataset')
parser.add_argument('--weight_dir', default='D:\\project\\py\\yolo3\\Siamese-RPN-pytorch-master', metavar='DIR',help='path to weight')
parser.add_argument('--checkpoint_path', default=None, help='resume')
parser.add_argument('--max_epoches', default=10000, type=int, metavar='N', help='number of total epochs to run')
parser.add_argument('--max_batches', default=0, type=int, metavar='N', help='number of batch in one epoch')
parser.add_argument('--init_type', default='xavier', type=str, metavar='INIT', help='init net')
parser.add_argument('--lr', default=0.0005, type=float, metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='momentum', help='momentum')
parser.add_argument('--weight_decay', '--wd', default=5e-5, type=float, metavar='W', help='weight decay (default: 5e-5)')
parser.add_argument('--debug', action='store_true', help='whether to debug')  # type=bool would treat any non-empty string as True
def main():
args = parser.parse_args()
""" compute max_batches """
for root, dirs, files in os.walk(args.train_path):
for dirnames in dirs:
dir_path = os.path.join(root, dirnames)
args.max_batches += len(os.listdir(dir_path))
""" Model on gpu """
model = SiamRPNvot()
model = model.cuda()
model.load_state_dict(torch.load(join(realpath(dirname(__file__)), 'SiamRPNVOT.model')))
    model.train()  # the model is already on the GPU; just switch to training mode
cudnn.benchmark = True
""" train dataloader """
data_loader = TrainDataLoader(args.train_path,model)
if not os.path.exists(args.weight_dir):
os.makedirs(args.weight_dir)
""" loss and optimizer """
criterion = MultiBoxLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay = args.weight_decay)
""" train phase """
closses, rlosses, tlosses = AverageMeter(), AverageMeter(), AverageMeter()
steps = 0
writer = SummaryWriter()
for epoch in range(args.max_epoches):
cur_lr = adjust_learning_rate(args.lr, optimizer, epoch, gamma=0.1)
        index_list = range(data_loader.__len__())  # one index per sub-directory (video sequence)
        losss = [0.0, 0.0, 0.0]
for example in range(args.max_batches):
ret = data_loader.__get__(random.choice(index_list))
template = ret['temple'].cuda()
detection= ret['detection'].cuda()
pos_neg_diff = ret['pos_neg_diff_tensor'].cuda()
model.temple(template)
rout,cout = model(detection)
cout = cout.squeeze().permute(1, 2, 0).reshape(-1, 2)
rout = rout.squeeze().permute(1, 2, 0).reshape(-1, 4)
predictions, targets = (cout, rout), pos_neg_diff
closs, rloss, loss, reg_pred, reg_target, pos_index, neg_index = criterion(predictions, targets)
closs_ = closs.cpu().item()
if np.isnan(closs_):
sys.exit(0)
closses.update(closs.cpu().item())
rlosses.update(rloss.cpu().item())
tlosses.update(loss.cpu().item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
steps += 1
            losss[0] = closses.avg
            losss[1] = rlosses.avg
            losss[2] = tlosses.avg
print("Epoch:{:04d}\tcloss:{:.4f}\trloss:{:.4f}\ttloss:{:.4f}".format(epoch, closses.avg, rlosses.avg, tlosses.avg ))
writer.add_scalar("closses", losss[0], epoch)
writer.add_scalar("rlosses", losss[1], epoch)
writer.add_scalar("tlosses", losss[2], epoch)
if steps % 150 == 0:
file_path = os.path.join(args.weight_dir, 'weights-{:07d}.pth'.format(steps))
state = {
'epoch' :epoch+1,
'state_dict' :model.state_dict(),
'optimizer' : optimizer.state_dict(),
}
torch.save(state, file_path)
def intersection(g, p):
g = Polygon(g[:8].reshape((4, 2)))
p = Polygon(p[:8].reshape((4, 2)))
if not g.is_valid or not p.is_valid:
return 0
    inter = g.intersection(p).area
union = g.area + p.area - inter
if union == 0:
return 0
else:
return inter/union
def standard_nms(S, thres):
""" use pre_thres to filter """
index = np.where(S[:, 8] > thres)[0]
S = S[index] # ~ 100, 4
# Then use standard nms
order = np.argsort(S[:, 8])[::-1]
keep = []
while order.size > 0:
i = order[0]
keep.append(i)
ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])
inds = np.where(ovr <= thres)[0]
order = order[inds+1]
return S[keep]
def reshape(x):
t = np.array(x, dtype = np.float32)
return t.reshape(-1, 1)
class MultiBoxLoss(nn.Module):
def __init__(self):
super(MultiBoxLoss, self).__init__()
def forward(self, predictions, targets):
cout, rout = predictions
""" class """
class_pred, class_target = cout, targets[:, 0].long()
# pos_index , neg_index = list(np.where(class_target == 1)[0]), list(np.where(class_target == 0)[0])
        # indices of the positive and negative samples
        pos_index, neg_index = list(np.where(class_target.data.cpu().numpy() == 1)[0]), list(np.where(class_target.data.cpu().numpy() == 0)[0])
pos_num, neg_num = len(pos_index), len(neg_index)
        # concatenating the two index lists selects the positives first, then the negatives
class_pred, class_target = class_pred[pos_index + neg_index], class_target[pos_index + neg_index]
        closs = F.cross_entropy(class_pred, class_target, reduction='none')  # per-sample loss; replaces the deprecated size_average/reduce args
        closs = torch.div(torch.sum(closs), 64)  # normalize by the 64 sampled anchors (16 positive + 48 negative)
""" regression """
reg_pred = rout
reg_target = targets[:, 1:]
        rloss = F.smooth_l1_loss(reg_pred, reg_target, reduction='none')  # (1805, 4)
        rloss = torch.div(torch.sum(rloss, dim=1), 4)  # mean over the 4 box coordinates
        rloss = torch.div(torch.sum(rloss[pos_index]), 16)  # average over the 16 positive samples
loss = closs + rloss
return closs, rloss, loss, reg_pred, reg_target, pos_index, neg_index
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = 0
self.avg = 0
self.sum = 0
self.count = 0
def update(self, val, n=1):
self.val = val
self.sum += val * n
self.count += n
self.avg = self.sum / self.count
def adjust_learning_rate(lr, optimizer, epoch, gamma=0.1):
"""Sets the learning rate to the initial LR decayed 0.9 every 50 epochs"""
lr = lr * (0.9 ** (epoch // 1))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
return lr
if __name__ == '__main__':
main()
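One caveat: train.py parses a --checkpoint_path argument but never uses it, so resuming from a saved checkpoint is not actually wired up. Below is a minimal sketch of what the resume logic could look like, matching the checkpoint dict saved by main() ('epoch', 'state_dict', 'optimizer'); the start_epoch variable is my own assumption, not part of the original script:
# hypothetical resume logic; would slot in right after the optimizer is created
start_epoch = 0
if args.checkpoint_path is not None and os.path.isfile(args.checkpoint_path):
    checkpoint = torch.load(args.checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])     # restore the network weights
    optimizer.load_state_dict(checkpoint['optimizer'])  # restore the SGD momentum buffers
    start_epoch = checkpoint['epoch']                   # resume the epoch counter
# ...then train with: for epoch in range(start_epoch, args.max_epoches)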
Data loading and label generation (run_SiamRPN.py, imported by train.py above):
# --------------------------------------------------------
# DaSiamRPN
# Licensed under The MIT License
# Written by Qiang Wang (wangqiang2015 at ia.ac.cn)
# --------------------------------------------------------
import numpy as np
from torch.autograd import Variable
import torch
import torch.nn.functional as F
import random
import os
import os.path as osp
from utils import get_subwindow_tracking
import xml.etree.ElementTree as ET
import cv2
import sys
from net import SiamRPNvot
def generate_anchor(total_stride, scales, ratios, score_size):
anchor_num = len(ratios) * len(scales)
anchor = np.zeros((anchor_num, 4), dtype=np.float32)
size = total_stride * total_stride
count = 0
for ratio in ratios:
# ws = int(np.sqrt(size * 1.0 / ratio))
ws = int(np.sqrt(size / ratio))
hs = int(ws * ratio)
for scale in scales:
wws = ws * scale
hhs = hs * scale
anchor[count, 0] = 0
anchor[count, 1] = 0
anchor[count, 2] = wws
anchor[count, 3] = hhs
count += 1
anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
    ori = - (score_size // 2) * total_stride  # integer division, matching the original Python 2 behavior
xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
[ori + total_stride * dy for dy in range(score_size)])
xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
np.tile(yy.flatten(), (anchor_num, 1)).flatten()
anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
return anchor
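# For the settings used below (total_stride=8, scales=[8], ratios=[0.33, 0.5, 1, 2, 3],
# score_size=19), generate_anchor returns a (5*19*19, 4) = (1805, 4) array of (cx, cy, w, h) anchors.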
class TrackerConfig(object):
# These are the default hyper-params for DaSiamRPN 0.3827
windowing = 'cosine' # to penalize large displacements [cosine/uniform]
# Params from the network architecture, have to be consistent with the training
exemplar_size = 127 # input z size
instance_size = 271 # input x size (search region)
total_stride = 8
    score_size = (instance_size - exemplar_size) // total_stride + 1  # 19; integer division keeps this an int under Python 3
context_amount = 0.5 # context amount for the exemplar
ratios = [0.33, 0.5, 1, 2, 3]
scales = [8, ]
anchor_num = len(ratios) * len(scales)
anchor = []
penalty_k = 0.055
window_influence = 0.42
lr = 0.295
# adaptive change search region #
adaptive = True
def update(self, cfg):
for k, v in cfg.items():
setattr(self, k, v)
        self.score_size = (self.instance_size - self.exemplar_size) // self.total_stride + 1
def tracker_eval(net, x_crop, target_pos, target_sz, window, scale_z, p):
delta, score = net(x_crop)
delta = delta.permute(1, 2, 3, 0).contiguous().view(4, -1).data.cpu().numpy()
score = F.softmax(score.permute(1, 2, 3, 0).contiguous().view(2, -1), dim=0).data[1, :].cpu().numpy()
delta[0, :] = delta[0, :] * p.anchor[:, 2] + p.anchor[:, 0]
delta[1, :] = delta[1, :] * p.anchor[:, 3] + p.anchor[:, 1]
delta[2, :] = np.exp(delta[2, :]) * p.anchor[:, 2]
delta[3, :] = np.exp(delta[3, :]) * p.anchor[:, 3]
def change(r):
return np.maximum(r, 1./r)
def sz(w, h):
pad = (w + h) * 0.5
sz2 = (w + pad) * (h + pad)
return np.sqrt(sz2)
def sz_wh(wh):
pad = (wh[0] + wh[1]) * 0.5
sz2 = (wh[0] + pad) * (wh[1] + pad)
return np.sqrt(sz2)
# size penalty
s_c = change(sz(delta[2, :], delta[3, :]) / (sz_wh(target_sz))) # scale penalty
r_c = change((target_sz[0] / target_sz[1]) / (delta[2, :] / delta[3, :])) # ratio penalty
penalty = np.exp(-(r_c * s_c - 1.) * p.penalty_k)
pscore = penalty * score
# window float
pscore = pscore * (1 - p.window_influence) + window * p.window_influence
best_pscore_id = np.argmax(pscore)
target = delta[:, best_pscore_id] / scale_z
target_sz = target_sz / scale_z
lr = penalty[best_pscore_id] * score[best_pscore_id] * p.lr
res_x = target[0] + target_pos[0]
res_y = target[1] + target_pos[1]
res_w = target_sz[0] * (1 - lr) + target[2] * lr
res_h = target_sz[1] * (1 - lr) + target[3] * lr
target_pos = np.array([res_x, res_y])
target_sz = np.array([res_w, res_h])
return target_pos, target_sz, score[best_pscore_id]
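# The decoding at the top of tracker_eval follows the standard RPN box parameterization:
#   x = dx * w_a + x_a,  y = dy * h_a + y_a,  w = w_a * exp(dw),  h = h_a * exp(dh)
# i.e. the inverse of the diff_anchor_gt encoding defined later in this file.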
def SiamRPN_init(im, target_pos, target_sz, net):
state = dict()
p = TrackerConfig()
p.update(net.cfg)
state['im_h'] = im.shape[0]
state['im_w'] = im.shape[1]
if p.adaptive:
if ((target_sz[0] * target_sz[1]) / float(state['im_h'] * state['im_w'])) < 0.004:
p.instance_size = 287 # small object big search region
else:
p.instance_size = 271
        p.score_size = (p.instance_size - p.exemplar_size) // p.total_stride + 1
p.anchor = generate_anchor(p.total_stride, p.scales, p.ratios, int(p.score_size))
    avg_chans = np.mean(im, axis=(0, 1))  # per-channel image mean
wc_z = target_sz[0] + p.context_amount * sum(target_sz)
hc_z = target_sz[1] + p.context_amount * sum(target_sz)
s_z = round(np.sqrt(wc_z * hc_z))
# initialize the exemplar
z_crop = get_subwindow_tracking(im, target_pos, p.exemplar_size, s_z, avg_chans)
z = Variable(z_crop.unsqueeze(0))
net.temple(z.cuda())
if p.windowing == 'cosine':
window = np.outer(np.hanning(p.score_size), np.hanning(p.score_size))
elif p.windowing == 'uniform':
window = np.ones((p.score_size, p.score_size))
window = np.tile(window.flatten(), p.anchor_num)
state['p'] = p
state['net'] = net
state['avg_chans'] = avg_chans
state['window'] = window
state['target_pos'] = target_pos
state['target_sz'] = target_sz
return state
def SiamRPN_track(state, im):
p = state['p']
net = state['net']
avg_chans = state['avg_chans']
window = state['window']
target_pos = state['target_pos']
target_sz = state['target_sz']
wc_z = target_sz[1] + p.context_amount * sum(target_sz)
hc_z = target_sz[0] + p.context_amount * sum(target_sz)
s_z = np.sqrt(wc_z * hc_z)
scale_z = p.exemplar_size / s_z
d_search = (p.instance_size - p.exemplar_size) / 2
pad = d_search / scale_z
s_x = s_z + 2 * pad
# extract scaled crops for search region x at previous target position
x_crop = Variable(get_subwindow_tracking(im, target_pos, p.instance_size, round(s_x), avg_chans).unsqueeze(0))
target_pos, target_sz, score = tracker_eval(net, x_crop.cuda(), target_pos, target_sz * scale_z, window, scale_z, p)
target_pos[0] = max(0, min(state['im_w'], target_pos[0]))
target_pos[1] = max(0, min(state['im_h'], target_pos[1]))
target_sz[0] = max(10, min(state['im_w'], target_sz[0]))
target_sz[1] = max(10, min(state['im_h'], target_sz[1]))
state['target_pos'] = target_pos
state['target_sz'] = target_sz
state['score'] = score
return state
class Anchor_ms(object):
"""
stable version for anchor generator
"""
def __init__(self, feature_w, feature_h):
self.w = feature_w
self.h = feature_h
self.base = 64 # base size for anchor box
self.stride = 15 # center point shift stride
self.scale = [1 / 3, 1 / 2, 1, 2, 3] # aspect ratio
self.anchors = self.gen_anchors() # xywh
self.eps = 0.01
def gen_single_anchor(self):
scale = np.array(self.scale, dtype=np.float32)
s = self.base * self.base
w, h = np.sqrt(s / scale), np.sqrt(s * scale)
c_x, c_y = (self.stride - 1) // 2, (self.stride - 1) // 2
anchor = np.vstack([c_x * np.ones_like(scale, dtype=np.float32), c_y * np.ones_like(scale, dtype=np.float32), w,
h]).transpose()
anchor = self.center_to_corner(anchor)
return anchor
def gen_anchors(self):
anchor = self.gen_single_anchor()
k = anchor.shape[0]
delta_x, delta_y = [x * self.stride for x in range(self.w)], [y * self.stride for y in range(self.h)]
shift_x, shift_y = np.meshgrid(delta_x, delta_y)
shifts = np.vstack([shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel()]).transpose()
a = shifts.shape[0]
anchors = (anchor.reshape((1, k, 4)) + shifts.reshape((a, 1, 4))).reshape((a * k, 4)) # corner format
anchors = self.corner_to_center(anchors)
return anchors
# float
def diff_anchor_gt(self, gt):
eps = self.eps
anchors, gt = self.anchors.copy(), gt.copy()
        diff = np.zeros_like(anchors, dtype=np.float32)  # zero matrix with the same shape as the anchors
        # encode the detection frame's ground truth against every anchor to get the RPN regression targets dx, dy, dw, dh
        diff[:, 0] = (gt[0] - anchors[:, 0]) / (anchors[:, 2] + eps)
diff[:, 1] = (gt[1] - anchors[:, 1]) / (anchors[:, 3] + eps)
diff[:, 2] = np.log((gt[2] + eps) / (anchors[:, 2] + eps))
diff[:, 3] = np.log((gt[3] + eps) / (anchors[:, 3] + eps))
return diff
# float
def center_to_corner(self, box):
box = box.copy()
box_ = np.zeros_like(box, dtype=np.float32)
box_[:, 0] = box[:, 0] - (box[:, 2] - 1) / 2
box_[:, 1] = box[:, 1] - (box[:, 3] - 1) / 2
box_[:, 2] = box[:, 0] + (box[:, 2] - 1) / 2
box_[:, 3] = box[:, 1] + (box[:, 3] - 1) / 2
box_ = box_.astype(np.float32)
return box_
# float
def corner_to_center(self, box):
box = box.copy()
box_ = np.zeros_like(box, dtype=np.float32)
box_[:, 0] = box[:, 0] + (box[:, 2] - box[:, 0]) / 2
box_[:, 1] = box[:, 1] + (box[:, 3] - box[:, 1]) / 2
box_[:, 2] = (box[:, 2] - box[:, 0])
box_[:, 3] = (box[:, 3] - box[:, 1])
box_ = box_.astype(np.float32)
return box_
def pos_neg_anchor(self, gt, pos_num=16, neg_num=48, threshold_pos=0.5, threshold_neg=0.1):
        # compute IoU between the detection frame's box and the generated anchors to split them into positive and negative samples
gt = gt.copy()
gt_corner = self.center_to_corner(np.array(gt, dtype=np.float32).reshape(1, 4))
an_corner = self.center_to_corner(np.array(self.anchors, dtype=np.float32))
        # IoU between the ground truth and all 19*19*5 = 1805 generated anchors
        iou_value = self.iou(an_corner, gt_corner).reshape(-1)  # (1805,)
max_iou = max(iou_value)
pos, neg = np.zeros_like(iou_value, dtype=np.int32), np.zeros_like(iou_value, dtype=np.int32)
        # pos
        # take the 30 anchors with the highest IoU
        pos_cand = np.argsort(iou_value)[::-1][:30]
        # randomly pick 16 of those 30 as positive samples
        pos_index = np.random.choice(pos_cand, pos_num, replace=False)
        if max_iou > threshold_pos:
            pos[pos_index] = 1  # mark the chosen positives
        # neg
        neg_cand = np.where(iou_value < threshold_neg)[0]  # anchors with IoU below threshold_neg are negative candidates
        neg_ind = np.random.choice(neg_cand, neg_num, replace=False)  # randomly pick 48 negatives
        neg[neg_ind] = 1
return pos, neg
def iou(self, box1, box2):
box1, box2 = box1.copy(), box2.copy()
N = box1.shape[0]
K = box2.shape[0]
        box1 = np.array(box1.reshape((N, 1, 4))) + np.zeros((1, K, 4))  # box1 -> [N, K, 4]
        box2 = np.array(box2.reshape((1, K, 4))) + np.zeros((N, 1, 4))  # box2 -> [N, K, 4]
x_max = np.max(np.stack((box1[:, :, 0], box2[:, :, 0]), axis=-1), axis=2)
x_min = np.min(np.stack((box1[:, :, 2], box2[:, :, 2]), axis=-1), axis=2)
y_max = np.max(np.stack((box1[:, :, 1], box2[:, :, 1]), axis=-1), axis=2)
y_min = np.min(np.stack((box1[:, :, 3], box2[:, :, 3]), axis=-1), axis=2)
tb = x_min - x_max
lr = y_min - y_max
tb[np.where(tb < 0)] = 0
lr[np.where(lr < 0)] = 0
over_square = tb * lr
all_square = (box1[:, :, 2] - box1[:, :, 0]) * (box1[:, :, 3] - box1[:, :, 1]) + (
box2[:, :, 2] - box2[:, :, 0]) * (box2[:, :, 3] - box2[:, :, 1]) - over_square
return over_square / all_square
class TrainDataLoader(object):
    # out_feature: spatial size of the final regression/classification feature maps
    def __init__(self, img_dir_path, net, out_feature=19, max_inter=80, check=False, tmp_dir='../tmp/visualization'):
assert osp.isdir(img_dir_path), 'input img_dir_path error'
self.img_dir_path = img_dir_path # this is a root dir contain subclass
self.max_inter = max_inter
self.sub_class_dir = [sub_class_dir for sub_class_dir in os.listdir(img_dir_path) if
os.path.isdir(os.path.join(img_dir_path, sub_class_dir))]
self.anchor_generator = Anchor_ms(out_feature, out_feature)
        self.anchors = self.anchor_generator.gen_anchors()  # centers; a 19x19 feature map with 5 anchors per location gives 19*19*5 = 1805 anchors
self.ret = {}
self.check = check
self.tmp_dir = self.init_dir(tmp_dir)
self.ret['tmp_dir'] = tmp_dir
self.ret['check'] = check
self.count = 0
        self.ret['p'] = TrackerConfig()
self.ret['p'].update(net.cfg)
def init_dir(self, tmp_dir):
if not osp.exists(tmp_dir):
os.makedirs(tmp_dir)
return tmp_dir
# def get_transform_for_train(self):
# transform_list = []
# transform_list.append(transforms.ToTensor())
# transform_list.append(transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))
# return transforms.Compose(transform_list)
#
    # # tuple: compute the mean of the three image channels
# def _average(self):
# assert self.ret.__contains__('template_img_path'), 'no template path'
# assert self.ret.__contains__('detection_img_path'), 'no detection path'
# template = Image.open(self.ret['template_img_path'])
# detection = Image.open(self.ret['detection_img_path'])
#
# mean_template = tuple(map(round, ImageStat.Stat(template).mean))
# mean_detection = tuple(map(round, ImageStat.Stat(detection).mean))
# self.ret['mean_template'] = mean_template
# self.ret['mean_detection'] = mean_detection
def _pick_img_pairs(self, index_of_subclass):
# img_dir_path -> sub_class_dir_path -> template_img_path
# use index_of_subclass to select a sub directory
        assert index_of_subclass < len(self.sub_class_dir), 'index_of_subclass should be less than the number of sub-directories'
sub_class_dir_basename = self.sub_class_dir[index_of_subclass]
sub_class_dir_path = os.path.join(self.img_dir_path, sub_class_dir_basename)
        sub_class_img_name = [img_name for img_name in os.listdir(sub_class_dir_path) if
                              img_name.endswith('.jpg')]
sub_class_img_name = sorted(sub_class_img_name)
sub_class_img_num = len(sub_class_img_name)
# select template, detection
        # ++++++++++++++++++++++++++++ add break in sequence [0,0,0,0] ++++++++++++++++++++++++++++++++++
if self.max_inter >= sub_class_img_num - 1:
self.max_inter = sub_class_img_num // 2
        template_index = np.clip(random.choice(range(0, max(1, sub_class_img_num - self.max_inter))), 0,
                                 sub_class_img_num - 1)  # randomly pick a template frame from the sub-directory
        detection_index = np.clip(random.choice(range(1, max(2, self.max_inter))) + template_index, 0,
                                  sub_class_img_num - 1)  # pick a detection frame at most max_inter frames after the template
template_name, detection_name = sub_class_img_name[template_index], sub_class_img_name[detection_index]
template_img_path, detection_img_path = osp.join(sub_class_dir_path, template_name), osp.join(
sub_class_dir_path, detection_name)
im_template = cv2.imread(template_img_path)
im_detection = cv2.imread(detection_img_path)
        template_xml_path = osp.splitext(template_img_path)[0] + '.xml'  # splitext is safer than split('.') for paths containing dots
        detection_xml_path = osp.splitext(detection_img_path)[0] + '.xml'
        box_template = self.get_xywh_from_xml(template_xml_path)
        box_detection = self.get_xywh_from_xml(detection_xml_path)
        # load information of template and detection
self.ret['img_template'] = im_template
self.ret['img_detection'] = im_detection
self.ret['template_target_pos'] = np.array([box_template[0],box_template[1]])
self.ret['template_target_sz'] = np.array([box_template[2],box_template[3]])
self.ret['detection_target_pos'] = np.array([box_detection[0],box_detection[1]])
self.ret['detection_target_sz'] = np.array([box_detection[2], box_detection[3]])
self.ret['anchors'] = self.anchors
        # self._average()  # compute the image means
    def get_xywh_from_xml(self, file):
        # parse a Pascal VOC style annotation; returns the box as [center_x, center_y, w, h]
        # (if the file contains several <object> entries, the last one wins)
        tree = ET.parse(file)
        root = tree.getroot()
        x, y, w, h = 0, 0, 0, 0
        for obj in root.iter('object'):
            bndbox = obj.find('bndbox')
            xmin = int(bndbox.find('xmin').text)
            xmax = int(bndbox.find('xmax').text)
            ymin = int(bndbox.find('ymin').text)
            ymax = int(bndbox.find('ymax').text)
            x = int((xmin + xmax) / 2)
            y = int((ymin + ymax) / 2)
            w = int(xmax - xmin)
            h = int(ymax - ymin)
        return [x, y, w, h]
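    # A sketch of the Pascal VOC style annotation that get_xywh_from_xml expects
    # (only the fields actually read by the parser above are shown):
    #   <annotation>
    #     <object>
    #       <bndbox>
    #         <xmin>10</xmin><ymin>20</ymin>
    #         <xmax>60</xmax><ymax>90</ymax>
    #       </bndbox>
    #     </object>
    #   </annotation>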
def _pad_crop_resize_template(self):
self.ret['im_h'] = self.ret['img_template'].shape[0]
self.ret['im_w'] = self.ret['img_template'].shape[1]
        self.ret['p'].score_size = (self.ret['p'].instance_size - self.ret['p'].exemplar_size) // self.ret['p'].total_stride + 1
self.ret['p'].anchor = generate_anchor(self.ret['p'].total_stride, self.ret['p'].scales, self.ret['p'].ratios, int(self.ret['p'].score_size))
        avg_chans = np.mean(self.ret['img_template'], axis=(0, 1))  # per-channel image mean
wc_z = self.ret['template_target_sz'][0] + self.ret['p'].context_amount * sum(self.ret['template_target_sz'])
hc_z = self.ret['template_target_sz'][1] + self.ret['p'].context_amount * sum(self.ret['template_target_sz'])
s_z = round(np.sqrt(wc_z * hc_z))
# initialize the exemplar
z_crop = get_subwindow_tracking(self.ret['img_template'], self.ret['template_target_pos'], self.ret['p'].exemplar_size, s_z, avg_chans)
z = Variable(z_crop.unsqueeze(0))
# net.temple(z.cuda())
if self.ret['p'].windowing == 'cosine':
window = np.outer(np.hanning(self.ret['p'].score_size), np.hanning(self.ret['p'].score_size))
elif self.ret['p'].windowing == 'uniform':
window = np.ones((self.ret['p'].score_size, self.ret['p'].score_size))
window = np.tile(window.flatten(), self.ret['p'].anchor_num)
self.ret['temple'] = z
self.ret['avg_chans'] = avg_chans
self.ret['window'] = window
def _pad_crop_resize_detection(self):
wc_z = self.ret['detection_target_sz'][1] + self.ret['p'].context_amount * sum(self.ret['detection_target_sz'])
hc_z = self.ret['detection_target_sz'][0] + self.ret['p'].context_amount * sum(self.ret['detection_target_sz'])
s_z = np.sqrt(wc_z * hc_z)
scale_z = self.ret['p'].exemplar_size / s_z
d_search = (self.ret['p'].instance_size - self.ret['p'].exemplar_size) / 2
pad = d_search / scale_z
s_x = s_z + 2 * pad
        avg_chans = np.mean(self.ret['img_detection'], axis=(0, 1))  # per-channel image mean
# extract scaled crops for search region x at previous target position
x_crop = Variable(get_subwindow_tracking(self.ret['img_detection'], self.ret['detection_target_pos'], self.ret['p'].instance_size, round(s_x), avg_chans).unsqueeze(0))
self.ret['detection'] = x_crop
    def _generate_pos_neg_diff(self):
        gt_box_in_detection = np.array((self.ret['detection_target_pos'][0], self.ret['detection_target_pos'][1], self.ret['detection_target_sz'][0], self.ret['detection_target_sz'][1]), dtype=np.int32)
        pos, neg = self.anchor_generator.pos_neg_anchor(gt_box_in_detection)  # positive/negative flags for every anchor
        diff = self.anchor_generator.diff_anchor_gt(gt_box_in_detection)  # regression targets dx, dy, dw, dh for every anchor
pos, neg, diff = pos.reshape((-1, 1)), neg.reshape((-1, 1)), diff.reshape((-1, 4))
class_target = np.array([-100.] * self.anchors.shape[0], np.int32)
        # pos: find the positive anchors and mark them in the class-target vector
pos_index = np.where(pos == 1)[0]
pos_num = len(pos_index)
self.ret['pos_anchors'] = np.array(self.ret['anchors'][pos_index, :],dtype=np.int32) if not pos_num == 0 else None
if pos_num > 0:
class_target[pos_index] = 1
        # neg: find the negative anchors and mark them
neg_index = np.where(neg == 1)[0]
neg_num = len(neg_index)
class_target[neg_index] = 0
        class_logits = class_target.reshape(-1, 1)  # per-anchor class labels: 1 for positive, 0 for negative, -100 for ignored anchors
        pos_neg_diff = np.hstack((class_logits, diff))  # concatenate class labels and regression targets
self.ret['pos_neg_diff'] = pos_neg_diff
self.ret['pos_neg_diff_tensor'] = torch.Tensor(pos_neg_diff)
return pos_neg_diff
def to_torch(self,ndarray):
if type(ndarray).__module__ == 'numpy':
return torch.from_numpy(ndarray)
elif not torch.is_tensor(ndarray):
raise ValueError("Cannot convert {} to torch tensor"
.format(type(ndarray)))
return ndarray
def im_to_torch(self,img):
img = np.transpose(img, (2, 0, 1)) # C*H*W
img = self.to_torch(img).float()
return img
def get_subwindow_tracking(self,im, pos, model_sz, original_sz, avg_chans, out_mode='torch', new=False):
if isinstance(pos, float):
pos = [pos, pos]
sz = original_sz
im_sz = im.shape
c = (original_sz + 1) / 2
context_xmin = round(pos[0] - c) # floor(pos(2) - sz(2) / 2);
context_xmax = context_xmin + sz - 1
context_ymin = round(pos[1] - c) # floor(pos(1) - sz(1) / 2);
context_ymax = context_ymin + sz - 1
left_pad = int(max(0., -context_xmin))
top_pad = int(max(0., -context_ymin))
right_pad = int(max(0., context_xmax - im_sz[1] + 1))
bottom_pad = int(max(0., context_ymax - im_sz[0] + 1))
context_xmin = context_xmin + left_pad
context_xmax = context_xmax + left_pad
context_ymin = context_ymin + top_pad
context_ymax = context_ymax + top_pad
# zzp: a more easy speed version
r, c, k = im.shape
if any([top_pad, bottom_pad, left_pad, right_pad]):
te_im = np.zeros((r + top_pad + bottom_pad, c + left_pad + right_pad, k),
np.uint8) # 0 is better than 1 initialization
te_im[top_pad:top_pad + r, left_pad:left_pad + c, :] = im
if top_pad:
te_im[0:top_pad, left_pad:left_pad + c, :] = avg_chans
if bottom_pad:
te_im[r + top_pad:, left_pad:left_pad + c, :] = avg_chans
if left_pad:
te_im[:, 0:left_pad, :] = avg_chans
if right_pad:
te_im[:, c + left_pad:, :] = avg_chans
im_patch_original = te_im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1),
:]
else:
im_patch_original = im[int(context_ymin):int(context_ymax + 1), int(context_xmin):int(context_xmax + 1), :]
if not np.array_equal(model_sz, original_sz):
im_patch = cv2.resize(im_patch_original, (model_sz, model_sz)) # zzp: use cv to get a better speed
else:
im_patch = im_patch_original
        return self.im_to_torch(im_patch) if out_mode == 'torch' else im_patch
def __get__(self, index):
self._pick_img_pairs(index)
self._pad_crop_resize_template()
self._pad_crop_resize_detection()
        self._generate_pos_neg_diff()  # build the (1805, 5) label tensor: class label plus dx, dy, dw, dh for every anchor
        # self._tranform()  # PIL to Tensor
self.count += 1
return self.ret
def __len__(self):
return len(self.sub_class_dir)
if __name__ == '__main__':
# we will do a test for dataloader
net = SiamRPNvot()
loader = TrainDataLoader('D:\\uav_frame\\00',net ,check = True)
#print(loader.__len__())
index_list = range(loader.__len__())
for i in range(1000):
ret = loader.__get__(random.choice(index_list))
label = ret['pos_neg_diff'][:, 0].reshape(-1)
pos_index = list(np.where(label == 1)[0])
pos_num = len(pos_index)
print(pos_index)
print(pos_num)
if pos_num != 0 and pos_num != 16:
print(pos_num)
sys.exit(0)
print(i)
This post has shown how to fine-tune a Siamese RPN model on a custom dataset starting from the author's pretrained weights to improve tracking performance, walking through the training loop, data loading, and label preprocessing in detail.
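To sanity-check a finetuned checkpoint, the tracking entry points defined above can be reused directly. A minimal sketch, assuming OpenCV-readable frames; the checkpoint and frame file names below are placeholders, and the initial box is hand-picked:
import cv2
import torch
import numpy as np
from net import SiamRPNvot
from run_SiamRPN import SiamRPN_init, SiamRPN_track

net = SiamRPNvot()
net.load_state_dict(torch.load('weights-0000150.pth')['state_dict'])  # a checkpoint saved by train.py
net.eval().cuda()

frame = cv2.imread('frame_000001.jpg')                   # placeholder first frame
target_pos = np.array([320, 240])                        # initial box center (x, y), hand-picked
target_sz = np.array([60, 40])                           # initial box size (w, h), hand-picked
state = SiamRPN_init(frame, target_pos, target_sz, net)  # crop the exemplar and set up anchors

frame = cv2.imread('frame_000002.jpg')                   # placeholder next frame
state = SiamRPN_track(state, frame)                      # updates target_pos/target_sz in state
print(state['target_pos'], state['target_sz'], state['score'])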