想一步步地实现 Diffusion VLA 论文的思路。论文中的图像输入是用 DINOv2 进行特征提取的,这里我先把这一部分换成 ResNet50。
老铁们,直接上代码:
from PIL import Image
import torch
import torchvision.models as models
from torch import nn
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.transformers import SwanLabCallback
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import (
TrainingArguments,
Trainer,
DataCollatorForSeq2Seq,
Qwen2VLForConditionalGeneration,
AutoProcessor,
)
import swanlab
import json
from torchvision import transforms
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
import torchvision.models as models
class CustomResNet(nn.Module):
def __init__(self, output_size=(256, 1176)):
    """Build an ImageNet-pretrained ResNet-50 trunk plus a conv/upsampling head.

    Args:
        output_size: Requested output size, default ``(256, 1176)``. The
            constructor only records it as ``self.output_size``; presumably
            ``forward`` resizes its result to this size -- TODO confirm
            against ``forward``.
    """
    super().__init__()

    # Keep the constructor argument instead of silently discarding it.
    self.output_size = output_size

    # Pretrained ResNet-50. `weights=` replaces the deprecated
    # `pretrained=True` flag; IMAGENET1K_V1 is the checkpoint that flag loaded.
    resnet = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

    # Drop the last two children (global average pool and the FC classifier)
    # so the trunk returns a 2048-channel spatial feature map.
    self.features = nn.Sequential(*list(resnet.children())[:-2])

    # 3x3 convolutions with stride 1 and padding 1: spatial size is preserved.
    self.conv1 = nn.Conv2d(2048, 2048, kernel_size=3, stride=1, padding=1)
    self.conv2 = nn.Conv2d(2048, 2048, kernel_size=3, stride=1, padding=1)
    self.conv3 = nn.Conv2d(2048, 2048, kernel_size=3, stride=1, padding=1)

    # Transposed convolutions with kernel 4, stride 4, padding 0: each one
    # multiplies height and width by 4.
    self.upconv1 = nn.ConvTranspose2d(2048, 2048, kernel_size=4, stride=4, padding=0)
    self.upconv2 = nn.ConvTranspose2d(2048, 2048, kernel_size=4, stride=4, padding=0)

    # 1x1 convolution collapsing the 2048 channels into a single-channel map.
    self.final_conv = nn.Conv2d(2048, 1, kernel_size=1)
def forward(self, x):
# 获取ResNet的特征图
x = self.features(x)
# 经过卷积层
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
# 上采样阶段:增加特征图的尺寸
x = self.upconv1(x) # 上采样1
x = self.upconv2(x) # 上采样2
# 使用插值进行微调输出尺寸
x = F.interpolate(x, siz