Using CLIP

Environment Setup

# 1. Install PyTorch in your environment
# 2. Install ftfy, regex and tqdm
pip install ftfy regex tqdm
# 3. Install CLIP
pip install git+https://github.com/openai/CLIP.git

# On a restricted/intranet network, install via a mirror:
# pip install git+https://github.91chi.fun/https://github.com/openai/CLIP.git
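
A quick sanity check that the install worked (listing the model names does not download any weights):

python -c "import clip; print(clip.available_models())"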

API

# 1. List the available model names
clip.available_models()
['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']

# 2. Return the model and its image preprocessing transform
model, preprocess = clip.load("ViT-B/32")

# 3. preprocess converts a PIL Image into a tensor of shape [3, 224, 224]; unsqueeze(0) then adds a batch dimension, giving [batch_size, 3, 224, 224], before it can be fed to the model
image = preprocess(Image.open("CLIP.png")).unsqueeze(0)

# 4. Tokenize a list of sentences [batch_size] into a tensor [batch_size, context_length]
#    Each sentence gets a BOS token (49406) prepended and an EOS token (49407) appended, then is padded to context_length (default 77)
#    (if a sentence is longer than context_length - 2 tokens, pass truncate=True; the result is still BOS + content + EOS, i.e. the EOS token is kept rather than cut off)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) # [3, 77]

# 5. Encode a batch of images into image features
image_features = model.encode_image(image)
   
# 6. Encode a batch of texts into text features
text_features = model.encode_text(text)

# 7. Compute the image-text similarity logits (cosine similarities scaled by the learned logit_scale; see the sketch below)
logits_per_image, logits_per_text = model(image, text)
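
For reference, a minimal sketch (continuing with the model, image and text defined above, and assuming the standard CLIP forward pass) of how those logits relate to the encoded features:

import torch

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Cosine similarity = dot product of L2-normalized features
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # model(image, text) scales these similarities by the learned logit_scale
    # (close to 100 for the released checkpoints)
    logit_scale = model.logit_scale.exp()
    logits_per_image = logit_scale * image_features @ text_features.t()   # [num_images, num_texts]
    logits_per_text = logits_per_image.t()                                # [num_texts, num_images]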

Shapes & dtypes per model

ViT-B/32
# feature dim 512, input resolution (224, 224)
image             torch.Size([B, 3, 224, 224])   torch.float32
text              torch.Size([B, 77])            torch.int32
image_features    torch.Size([B, 512])           torch.float16
text_features     torch.Size([B, 512])           torch.float16

ViT-B/16
# feature dim 512, input resolution (224, 224)
image             torch.Size([B, 3, 224, 224])   torch.float32
text              torch.Size([B, 77])            torch.int32
image_features    torch.Size([B, 512])           torch.float16
text_features     torch.Size([B, 512])           torch.float16

ViT-L/14
# feature dim 768, input resolution (224, 224)
image             torch.Size([B, 3, 224, 224])   torch.float32
text              torch.Size([B, 77])            torch.int32
image_features    torch.Size([B, 768])           torch.float16
text_features     torch.Size([B, 768])           torch.float16

ViT-L/14@336px
# feature dim 768, input resolution (336, 336)
image             torch.Size([B, 3, 336, 336])   torch.float32
text              torch.Size([B, 77])            torch.int32
image_features    torch.Size([B, 768])           torch.float16
text_features     torch.Size([B, 768])           torch.float16
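
A minimal sketch to reproduce these numbers for any of the models. Note that the float16 feature dtypes are what you see on GPU; clip.load converts the weights to float32 when loading on CPU.

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.new("RGB", (640, 480))).unsqueeze(0).to(device)  # dummy image
text = clip.tokenize(["a diagram", "a dog"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

for name, t in [("image", image), ("text", text),
                ("image_features", image_features), ("text_features", text_features)]:
    print(f"{name:<16} {tuple(t.shape)} {t.dtype}")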

preprocess & tokenize per model

# clip.tokenize returns the same result no matter which model was loaded

# preprocess differs only in the output image size:
# ViT-B/32, ViT-B/16 and ViT-L/14 share the same preprocess, while ViT-L/14@336px's preprocess resizes/crops to 336 instead of 224
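
A quick way to confirm this is to print the preprocessing pipelines themselves (a sketch; this downloads both checkpoints). Only the Resize/CenterCrop size differs:

import clip

_, preprocess_224 = clip.load("ViT-B/32", device="cpu")
_, preprocess_336 = clip.load("ViT-L/14@336px", device="cpu")

print(preprocess_224)  # Resize(224), CenterCrop(224), ToTensor, Normalize
print(preprocess_336)  # Resize(336), CenterCrop(336), ToTensor, Normalize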

Basic Usage

Example 1

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Example 2

import os
import clip
import torch
from torchvision.datasets import CIFAR100

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Download the dataset
cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False)

# Prepare the inputs
image, class_id = cifar100[3637]
image_input = preprocess(image).unsqueeze(0).to(device)
text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device)

# Calculate features
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text_inputs)

# Pick the top 5 most similar labels for the image
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
# The factor of 100 acts as a softmax temperature (the same scale CLIP applies internally via logit_scale);
# without it the softmax over cosine similarities in [-1, 1] would be nearly uniform
similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)
values, indices = similarity[0].topk(5)

# Print the result
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%")

Example 3

import os
import clip
import torch

import numpy as np
from sklearn.linear_model import LogisticRegression
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
from tqdm import tqdm

# Load the model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

# Load the dataset
root = os.path.expanduser("~/.cache")
train = CIFAR100(root, download=True, train=True, transform=preprocess)
test = CIFAR100(root, download=True, train=False, transform=preprocess)


def get_features(dataset):
    all_features = []
    all_labels = []
    
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size=100)):
            features = model.encode_image(images.to(device))

            all_features.append(features)
            all_labels.append(labels)

    return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy()

# Calculate the image features
train_features, train_labels = get_features(train)
test_features, test_labels = get_features(test)

# Perform logistic regression
classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1)
classifier.fit(train_features, train_labels)

# Evaluate using the logistic regression classifier
predictions = classifier.predict(test_features)
accuracy = np.mean((test_labels == predictions).astype(float)) * 100.
print(f"Accuracy = {accuracy:.3f}")

(Important) Freezing or Updating the CLIP Parameters

About detach

# Our model only uses CLIP's visual encoder, so it is enough to check whether the visual encoder's parameters changed
# With neither position 1 nor position 2 enabled, everything prints False, i.e. all parameters were updated
# With only position 1 enabled, CLIP's parameters print True and the Linear's print False, i.e. only the Linear layer is updated
# With only position 2 enabled, CLIP's parameters print False and the Linear's print True, i.e. only CLIP's parameters are updated


import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')
        self.linear = nn.Linear(512, 10)

        # Position 2: freeze the linear head
        # for param in self.linear.parameters():
        #     param.requires_grad = False

    def forward(self, x):
        features = self.model.encode_image(x)

        # Position 1: stop gradients from flowing back into CLIP
        # features = features.detach()

        return self.linear(features)


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
train = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(train, batch_size=8)))

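# Snapshot the initial parameters so we can later check which ones changed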
storeParam = {}
for name, param in net.model.visual.named_parameters():
    storeParam[name] = param.detach().clone()
for name, param in net.linear.named_parameters():
    storeParam[name] = param.detach().clone()

for i in range(10):
    out = net(train[0])
    loss = F.cross_entropy(out, train[1])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

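# True = parameter unchanged since the snapshot, False = parameter was updated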
for name, param in net.model.visual.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")
for name, param in net.linear.named_parameters():
    print(f"{name} {torch.equal(param, storeParam[name])}")

CLIP Layer Structure
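
The dumps below come from printing the loaded model (the "Vision/Text encoder" comments were added by hand to mark the two branches):

import clip

model, _ = clip.load("ViT-B/32", device="cpu")
print(model)          # the full module tree shown below
print(model.visual)   # vision encoder only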

ViT-B/32

CLIP(
  # Vision (image) encoder
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
    
  # Text encoder
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 512)
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

ViT-B/16

CLIP(
  # Vision (image) encoder
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): _LinearWithBias(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  
  # Text encoder
  (token_embedding): Embedding(49408, 512)
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=512, out_features=2048, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=2048, out_features=512, bias=True)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (ln_final): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
)

ViT-L/14

CLIP(
  # Vision (image) encoder
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (12): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (13): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (14): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (15): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (16): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (17): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (18): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (19): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (20): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (21): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (22): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (23): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
    
  # Text encoder
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 768)
  (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

ViT-L/14@336px

CLIP(
  # Vision (image) encoder
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (2): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (3): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (4): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (5): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (6): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (7): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (8): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (9): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (10): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (11): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (12): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (13): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (14): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (15): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (16): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (17): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (18): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (19): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (20): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (21): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (22): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (23): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
   
  # text branch
  (transformer): Transformer(
    (resblocks): Sequential(
      (0): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (1): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (2): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (3): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (4): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (5): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (6): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (7): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (8): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (9): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (10): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (11): ResidualAttentionBlock(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): QuickGELU()
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
  (token_embedding): Embedding(49408, 768)
  (ln_final): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

CLIP parameter structure
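
The module trees above are simply what print(model) shows for a loaded CLIP model, and the parameter-name lists in this section can be reproduced by iterating over model.named_parameters(). A minimal sketch (ViT-B/32 is used here only as an example):

import clip
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# print(model) reproduces a module tree like the ones shown above
for name, param in model.named_parameters():
    print(name, tuple(param.shape))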

ViT-B

Covers ViT-B/32 and ViT-B/16.

# Scale factor (temperature) applied when computing the cosine-similarity logits; the relevant code in CLIP.forward is:
# normalized features
# image_features = image_features / image_features.norm(dim=1, keepdim=True)
# text_features = text_features / text_features.norm(dim=1, keepdim=True)
# # cosine similarity as logits
# logit_scale = self.logit_scale.exp()
# logits_per_image = logit_scale * image_features @ text_features.t()
# logits_per_text = logits_per_image.t()
logit_scale


# image (visual) parameters
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj

# text parameters
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection

ViT-L

Covers ViT-L/14 and ViT-L/14@336px.
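
Compared with the ViT-B variants, the ViT-L models use 24 visual residual blocks of width 1024, while the text transformer keeps 12 blocks (width 768 here, as in the module tree above). This can be checked directly on a loaded model; a small sketch, assuming the ViT-L/14 weights are available:

import clip

model, _ = clip.load("ViT-L/14", device="cpu")

print(len(model.visual.transformer.resblocks))  # 24 visual blocks
print(model.visual.conv1.out_channels)          # 1024: visual width
print(len(model.transformer.resblocks))         # 12 text blocks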

logit_scale

# image (visual) parameters
visual.conv1.weight
visual.class_embedding
visual.positional_embedding
visual.ln_pre.weight
visual.ln_pre.bias
visual.transformer.resblocks.0.attn.in_proj_weight
visual.transformer.resblocks.0.attn.in_proj_bias
visual.transformer.resblocks.0.attn.out_proj.weight
visual.transformer.resblocks.0.attn.out_proj.bias
visual.transformer.resblocks.0.ln_1.weight
visual.transformer.resblocks.0.ln_1.bias
visual.transformer.resblocks.0.mlp.c_fc.weight
visual.transformer.resblocks.0.mlp.c_fc.bias
visual.transformer.resblocks.0.mlp.c_proj.weight
visual.transformer.resblocks.0.mlp.c_proj.bias
visual.transformer.resblocks.0.ln_2.weight
visual.transformer.resblocks.0.ln_2.bias
visual.transformer.resblocks.1.attn.in_proj_weight
visual.transformer.resblocks.1.attn.in_proj_bias
visual.transformer.resblocks.1.attn.out_proj.weight
visual.transformer.resblocks.1.attn.out_proj.bias
visual.transformer.resblocks.1.ln_1.weight
visual.transformer.resblocks.1.ln_1.bias
visual.transformer.resblocks.1.mlp.c_fc.weight
visual.transformer.resblocks.1.mlp.c_fc.bias
visual.transformer.resblocks.1.mlp.c_proj.weight
visual.transformer.resblocks.1.mlp.c_proj.bias
visual.transformer.resblocks.1.ln_2.weight
visual.transformer.resblocks.1.ln_2.bias
visual.transformer.resblocks.2.attn.in_proj_weight
visual.transformer.resblocks.2.attn.in_proj_bias
visual.transformer.resblocks.2.attn.out_proj.weight
visual.transformer.resblocks.2.attn.out_proj.bias
visual.transformer.resblocks.2.ln_1.weight
visual.transformer.resblocks.2.ln_1.bias
visual.transformer.resblocks.2.mlp.c_fc.weight
visual.transformer.resblocks.2.mlp.c_fc.bias
visual.transformer.resblocks.2.mlp.c_proj.weight
visual.transformer.resblocks.2.mlp.c_proj.bias
visual.transformer.resblocks.2.ln_2.weight
visual.transformer.resblocks.2.ln_2.bias
visual.transformer.resblocks.3.attn.in_proj_weight
visual.transformer.resblocks.3.attn.in_proj_bias
visual.transformer.resblocks.3.attn.out_proj.weight
visual.transformer.resblocks.3.attn.out_proj.bias
visual.transformer.resblocks.3.ln_1.weight
visual.transformer.resblocks.3.ln_1.bias
visual.transformer.resblocks.3.mlp.c_fc.weight
visual.transformer.resblocks.3.mlp.c_fc.bias
visual.transformer.resblocks.3.mlp.c_proj.weight
visual.transformer.resblocks.3.mlp.c_proj.bias
visual.transformer.resblocks.3.ln_2.weight
visual.transformer.resblocks.3.ln_2.bias
visual.transformer.resblocks.4.attn.in_proj_weight
visual.transformer.resblocks.4.attn.in_proj_bias
visual.transformer.resblocks.4.attn.out_proj.weight
visual.transformer.resblocks.4.attn.out_proj.bias
visual.transformer.resblocks.4.ln_1.weight
visual.transformer.resblocks.4.ln_1.bias
visual.transformer.resblocks.4.mlp.c_fc.weight
visual.transformer.resblocks.4.mlp.c_fc.bias
visual.transformer.resblocks.4.mlp.c_proj.weight
visual.transformer.resblocks.4.mlp.c_proj.bias
visual.transformer.resblocks.4.ln_2.weight
visual.transformer.resblocks.4.ln_2.bias
visual.transformer.resblocks.5.attn.in_proj_weight
visual.transformer.resblocks.5.attn.in_proj_bias
visual.transformer.resblocks.5.attn.out_proj.weight
visual.transformer.resblocks.5.attn.out_proj.bias
visual.transformer.resblocks.5.ln_1.weight
visual.transformer.resblocks.5.ln_1.bias
visual.transformer.resblocks.5.mlp.c_fc.weight
visual.transformer.resblocks.5.mlp.c_fc.bias
visual.transformer.resblocks.5.mlp.c_proj.weight
visual.transformer.resblocks.5.mlp.c_proj.bias
visual.transformer.resblocks.5.ln_2.weight
visual.transformer.resblocks.5.ln_2.bias
visual.transformer.resblocks.6.attn.in_proj_weight
visual.transformer.resblocks.6.attn.in_proj_bias
visual.transformer.resblocks.6.attn.out_proj.weight
visual.transformer.resblocks.6.attn.out_proj.bias
visual.transformer.resblocks.6.ln_1.weight
visual.transformer.resblocks.6.ln_1.bias
visual.transformer.resblocks.6.mlp.c_fc.weight
visual.transformer.resblocks.6.mlp.c_fc.bias
visual.transformer.resblocks.6.mlp.c_proj.weight
visual.transformer.resblocks.6.mlp.c_proj.bias
visual.transformer.resblocks.6.ln_2.weight
visual.transformer.resblocks.6.ln_2.bias
visual.transformer.resblocks.7.attn.in_proj_weight
visual.transformer.resblocks.7.attn.in_proj_bias
visual.transformer.resblocks.7.attn.out_proj.weight
visual.transformer.resblocks.7.attn.out_proj.bias
visual.transformer.resblocks.7.ln_1.weight
visual.transformer.resblocks.7.ln_1.bias
visual.transformer.resblocks.7.mlp.c_fc.weight
visual.transformer.resblocks.7.mlp.c_fc.bias
visual.transformer.resblocks.7.mlp.c_proj.weight
visual.transformer.resblocks.7.mlp.c_proj.bias
visual.transformer.resblocks.7.ln_2.weight
visual.transformer.resblocks.7.ln_2.bias
visual.transformer.resblocks.8.attn.in_proj_weight
visual.transformer.resblocks.8.attn.in_proj_bias
visual.transformer.resblocks.8.attn.out_proj.weight
visual.transformer.resblocks.8.attn.out_proj.bias
visual.transformer.resblocks.8.ln_1.weight
visual.transformer.resblocks.8.ln_1.bias
visual.transformer.resblocks.8.mlp.c_fc.weight
visual.transformer.resblocks.8.mlp.c_fc.bias
visual.transformer.resblocks.8.mlp.c_proj.weight
visual.transformer.resblocks.8.mlp.c_proj.bias
visual.transformer.resblocks.8.ln_2.weight
visual.transformer.resblocks.8.ln_2.bias
visual.transformer.resblocks.9.attn.in_proj_weight
visual.transformer.resblocks.9.attn.in_proj_bias
visual.transformer.resblocks.9.attn.out_proj.weight
visual.transformer.resblocks.9.attn.out_proj.bias
visual.transformer.resblocks.9.ln_1.weight
visual.transformer.resblocks.9.ln_1.bias
visual.transformer.resblocks.9.mlp.c_fc.weight
visual.transformer.resblocks.9.mlp.c_fc.bias
visual.transformer.resblocks.9.mlp.c_proj.weight
visual.transformer.resblocks.9.mlp.c_proj.bias
visual.transformer.resblocks.9.ln_2.weight
visual.transformer.resblocks.9.ln_2.bias
visual.transformer.resblocks.10.attn.in_proj_weight
visual.transformer.resblocks.10.attn.in_proj_bias
visual.transformer.resblocks.10.attn.out_proj.weight
visual.transformer.resblocks.10.attn.out_proj.bias
visual.transformer.resblocks.10.ln_1.weight
visual.transformer.resblocks.10.ln_1.bias
visual.transformer.resblocks.10.mlp.c_fc.weight
visual.transformer.resblocks.10.mlp.c_fc.bias
visual.transformer.resblocks.10.mlp.c_proj.weight
visual.transformer.resblocks.10.mlp.c_proj.bias
visual.transformer.resblocks.10.ln_2.weight
visual.transformer.resblocks.10.ln_2.bias
visual.transformer.resblocks.11.attn.in_proj_weight
visual.transformer.resblocks.11.attn.in_proj_bias
visual.transformer.resblocks.11.attn.out_proj.weight
visual.transformer.resblocks.11.attn.out_proj.bias
visual.transformer.resblocks.11.ln_1.weight
visual.transformer.resblocks.11.ln_1.bias
visual.transformer.resblocks.11.mlp.c_fc.weight
visual.transformer.resblocks.11.mlp.c_fc.bias
visual.transformer.resblocks.11.mlp.c_proj.weight
visual.transformer.resblocks.11.mlp.c_proj.bias
visual.transformer.resblocks.11.ln_2.weight
visual.transformer.resblocks.11.ln_2.bias
visual.transformer.resblocks.12.attn.in_proj_weight
visual.transformer.resblocks.12.attn.in_proj_bias
visual.transformer.resblocks.12.attn.out_proj.weight
visual.transformer.resblocks.12.attn.out_proj.bias
visual.transformer.resblocks.12.ln_1.weight
visual.transformer.resblocks.12.ln_1.bias
visual.transformer.resblocks.12.mlp.c_fc.weight
visual.transformer.resblocks.12.mlp.c_fc.bias
visual.transformer.resblocks.12.mlp.c_proj.weight
visual.transformer.resblocks.12.mlp.c_proj.bias
visual.transformer.resblocks.12.ln_2.weight
visual.transformer.resblocks.12.ln_2.bias
visual.transformer.resblocks.13.attn.in_proj_weight
visual.transformer.resblocks.13.attn.in_proj_bias
visual.transformer.resblocks.13.attn.out_proj.weight
visual.transformer.resblocks.13.attn.out_proj.bias
visual.transformer.resblocks.13.ln_1.weight
visual.transformer.resblocks.13.ln_1.bias
visual.transformer.resblocks.13.mlp.c_fc.weight
visual.transformer.resblocks.13.mlp.c_fc.bias
visual.transformer.resblocks.13.mlp.c_proj.weight
visual.transformer.resblocks.13.mlp.c_proj.bias
visual.transformer.resblocks.13.ln_2.weight
visual.transformer.resblocks.13.ln_2.bias
visual.transformer.resblocks.14.attn.in_proj_weight
visual.transformer.resblocks.14.attn.in_proj_bias
visual.transformer.resblocks.14.attn.out_proj.weight
visual.transformer.resblocks.14.attn.out_proj.bias
visual.transformer.resblocks.14.ln_1.weight
visual.transformer.resblocks.14.ln_1.bias
visual.transformer.resblocks.14.mlp.c_fc.weight
visual.transformer.resblocks.14.mlp.c_fc.bias
visual.transformer.resblocks.14.mlp.c_proj.weight
visual.transformer.resblocks.14.mlp.c_proj.bias
visual.transformer.resblocks.14.ln_2.weight
visual.transformer.resblocks.14.ln_2.bias
visual.transformer.resblocks.15.attn.in_proj_weight
visual.transformer.resblocks.15.attn.in_proj_bias
visual.transformer.resblocks.15.attn.out_proj.weight
visual.transformer.resblocks.15.attn.out_proj.bias
visual.transformer.resblocks.15.ln_1.weight
visual.transformer.resblocks.15.ln_1.bias
visual.transformer.resblocks.15.mlp.c_fc.weight
visual.transformer.resblocks.15.mlp.c_fc.bias
visual.transformer.resblocks.15.mlp.c_proj.weight
visual.transformer.resblocks.15.mlp.c_proj.bias
visual.transformer.resblocks.15.ln_2.weight
visual.transformer.resblocks.15.ln_2.bias
visual.transformer.resblocks.16.attn.in_proj_weight
visual.transformer.resblocks.16.attn.in_proj_bias
visual.transformer.resblocks.16.attn.out_proj.weight
visual.transformer.resblocks.16.attn.out_proj.bias
visual.transformer.resblocks.16.ln_1.weight
visual.transformer.resblocks.16.ln_1.bias
visual.transformer.resblocks.16.mlp.c_fc.weight
visual.transformer.resblocks.16.mlp.c_fc.bias
visual.transformer.resblocks.16.mlp.c_proj.weight
visual.transformer.resblocks.16.mlp.c_proj.bias
visual.transformer.resblocks.16.ln_2.weight
visual.transformer.resblocks.16.ln_2.bias
visual.transformer.resblocks.17.attn.in_proj_weight
visual.transformer.resblocks.17.attn.in_proj_bias
visual.transformer.resblocks.17.attn.out_proj.weight
visual.transformer.resblocks.17.attn.out_proj.bias
visual.transformer.resblocks.17.ln_1.weight
visual.transformer.resblocks.17.ln_1.bias
visual.transformer.resblocks.17.mlp.c_fc.weight
visual.transformer.resblocks.17.mlp.c_fc.bias
visual.transformer.resblocks.17.mlp.c_proj.weight
visual.transformer.resblocks.17.mlp.c_proj.bias
visual.transformer.resblocks.17.ln_2.weight
visual.transformer.resblocks.17.ln_2.bias
visual.transformer.resblocks.18.attn.in_proj_weight
visual.transformer.resblocks.18.attn.in_proj_bias
visual.transformer.resblocks.18.attn.out_proj.weight
visual.transformer.resblocks.18.attn.out_proj.bias
visual.transformer.resblocks.18.ln_1.weight
visual.transformer.resblocks.18.ln_1.bias
visual.transformer.resblocks.18.mlp.c_fc.weight
visual.transformer.resblocks.18.mlp.c_fc.bias
visual.transformer.resblocks.18.mlp.c_proj.weight
visual.transformer.resblocks.18.mlp.c_proj.bias
visual.transformer.resblocks.18.ln_2.weight
visual.transformer.resblocks.18.ln_2.bias
visual.transformer.resblocks.19.attn.in_proj_weight
visual.transformer.resblocks.19.attn.in_proj_bias
visual.transformer.resblocks.19.attn.out_proj.weight
visual.transformer.resblocks.19.attn.out_proj.bias
visual.transformer.resblocks.19.ln_1.weight
visual.transformer.resblocks.19.ln_1.bias
visual.transformer.resblocks.19.mlp.c_fc.weight
visual.transformer.resblocks.19.mlp.c_fc.bias
visual.transformer.resblocks.19.mlp.c_proj.weight
visual.transformer.resblocks.19.mlp.c_proj.bias
visual.transformer.resblocks.19.ln_2.weight
visual.transformer.resblocks.19.ln_2.bias
visual.transformer.resblocks.20.attn.in_proj_weight
visual.transformer.resblocks.20.attn.in_proj_bias
visual.transformer.resblocks.20.attn.out_proj.weight
visual.transformer.resblocks.20.attn.out_proj.bias
visual.transformer.resblocks.20.ln_1.weight
visual.transformer.resblocks.20.ln_1.bias
visual.transformer.resblocks.20.mlp.c_fc.weight
visual.transformer.resblocks.20.mlp.c_fc.bias
visual.transformer.resblocks.20.mlp.c_proj.weight
visual.transformer.resblocks.20.mlp.c_proj.bias
visual.transformer.resblocks.20.ln_2.weight
visual.transformer.resblocks.20.ln_2.bias
visual.transformer.resblocks.21.attn.in_proj_weight
visual.transformer.resblocks.21.attn.in_proj_bias
visual.transformer.resblocks.21.attn.out_proj.weight
visual.transformer.resblocks.21.attn.out_proj.bias
visual.transformer.resblocks.21.ln_1.weight
visual.transformer.resblocks.21.ln_1.bias
visual.transformer.resblocks.21.mlp.c_fc.weight
visual.transformer.resblocks.21.mlp.c_fc.bias
visual.transformer.resblocks.21.mlp.c_proj.weight
visual.transformer.resblocks.21.mlp.c_proj.bias
visual.transformer.resblocks.21.ln_2.weight
visual.transformer.resblocks.21.ln_2.bias
visual.transformer.resblocks.22.attn.in_proj_weight
visual.transformer.resblocks.22.attn.in_proj_bias
visual.transformer.resblocks.22.attn.out_proj.weight
visual.transformer.resblocks.22.attn.out_proj.bias
visual.transformer.resblocks.22.ln_1.weight
visual.transformer.resblocks.22.ln_1.bias
visual.transformer.resblocks.22.mlp.c_fc.weight
visual.transformer.resblocks.22.mlp.c_fc.bias
visual.transformer.resblocks.22.mlp.c_proj.weight
visual.transformer.resblocks.22.mlp.c_proj.bias
visual.transformer.resblocks.22.ln_2.weight
visual.transformer.resblocks.22.ln_2.bias
visual.transformer.resblocks.23.attn.in_proj_weight
visual.transformer.resblocks.23.attn.in_proj_bias
visual.transformer.resblocks.23.attn.out_proj.weight
visual.transformer.resblocks.23.attn.out_proj.bias
visual.transformer.resblocks.23.ln_1.weight
visual.transformer.resblocks.23.ln_1.bias
visual.transformer.resblocks.23.mlp.c_fc.weight
visual.transformer.resblocks.23.mlp.c_fc.bias
visual.transformer.resblocks.23.mlp.c_proj.weight
visual.transformer.resblocks.23.mlp.c_proj.bias
visual.transformer.resblocks.23.ln_2.weight
visual.transformer.resblocks.23.ln_2.bias
visual.ln_post.weight
visual.ln_post.bias
visual.proj

# text parameters
token_embedding.weight
positional_embedding
transformer.resblocks.0.attn.in_proj_weight
transformer.resblocks.0.attn.in_proj_bias
transformer.resblocks.0.attn.out_proj.weight
transformer.resblocks.0.attn.out_proj.bias
transformer.resblocks.0.ln_1.weight
transformer.resblocks.0.ln_1.bias
transformer.resblocks.0.mlp.c_fc.weight
transformer.resblocks.0.mlp.c_fc.bias
transformer.resblocks.0.mlp.c_proj.weight
transformer.resblocks.0.mlp.c_proj.bias
transformer.resblocks.0.ln_2.weight
transformer.resblocks.0.ln_2.bias
transformer.resblocks.1.attn.in_proj_weight
transformer.resblocks.1.attn.in_proj_bias
transformer.resblocks.1.attn.out_proj.weight
transformer.resblocks.1.attn.out_proj.bias
transformer.resblocks.1.ln_1.weight
transformer.resblocks.1.ln_1.bias
transformer.resblocks.1.mlp.c_fc.weight
transformer.resblocks.1.mlp.c_fc.bias
transformer.resblocks.1.mlp.c_proj.weight
transformer.resblocks.1.mlp.c_proj.bias
transformer.resblocks.1.ln_2.weight
transformer.resblocks.1.ln_2.bias
transformer.resblocks.2.attn.in_proj_weight
transformer.resblocks.2.attn.in_proj_bias
transformer.resblocks.2.attn.out_proj.weight
transformer.resblocks.2.attn.out_proj.bias
transformer.resblocks.2.ln_1.weight
transformer.resblocks.2.ln_1.bias
transformer.resblocks.2.mlp.c_fc.weight
transformer.resblocks.2.mlp.c_fc.bias
transformer.resblocks.2.mlp.c_proj.weight
transformer.resblocks.2.mlp.c_proj.bias
transformer.resblocks.2.ln_2.weight
transformer.resblocks.2.ln_2.bias
transformer.resblocks.3.attn.in_proj_weight
transformer.resblocks.3.attn.in_proj_bias
transformer.resblocks.3.attn.out_proj.weight
transformer.resblocks.3.attn.out_proj.bias
transformer.resblocks.3.ln_1.weight
transformer.resblocks.3.ln_1.bias
transformer.resblocks.3.mlp.c_fc.weight
transformer.resblocks.3.mlp.c_fc.bias
transformer.resblocks.3.mlp.c_proj.weight
transformer.resblocks.3.mlp.c_proj.bias
transformer.resblocks.3.ln_2.weight
transformer.resblocks.3.ln_2.bias
transformer.resblocks.4.attn.in_proj_weight
transformer.resblocks.4.attn.in_proj_bias
transformer.resblocks.4.attn.out_proj.weight
transformer.resblocks.4.attn.out_proj.bias
transformer.resblocks.4.ln_1.weight
transformer.resblocks.4.ln_1.bias
transformer.resblocks.4.mlp.c_fc.weight
transformer.resblocks.4.mlp.c_fc.bias
transformer.resblocks.4.mlp.c_proj.weight
transformer.resblocks.4.mlp.c_proj.bias
transformer.resblocks.4.ln_2.weight
transformer.resblocks.4.ln_2.bias
transformer.resblocks.5.attn.in_proj_weight
transformer.resblocks.5.attn.in_proj_bias
transformer.resblocks.5.attn.out_proj.weight
transformer.resblocks.5.attn.out_proj.bias
transformer.resblocks.5.ln_1.weight
transformer.resblocks.5.ln_1.bias
transformer.resblocks.5.mlp.c_fc.weight
transformer.resblocks.5.mlp.c_fc.bias
transformer.resblocks.5.mlp.c_proj.weight
transformer.resblocks.5.mlp.c_proj.bias
transformer.resblocks.5.ln_2.weight
transformer.resblocks.5.ln_2.bias
transformer.resblocks.6.attn.in_proj_weight
transformer.resblocks.6.attn.in_proj_bias
transformer.resblocks.6.attn.out_proj.weight
transformer.resblocks.6.attn.out_proj.bias
transformer.resblocks.6.ln_1.weight
transformer.resblocks.6.ln_1.bias
transformer.resblocks.6.mlp.c_fc.weight
transformer.resblocks.6.mlp.c_fc.bias
transformer.resblocks.6.mlp.c_proj.weight
transformer.resblocks.6.mlp.c_proj.bias
transformer.resblocks.6.ln_2.weight
transformer.resblocks.6.ln_2.bias
transformer.resblocks.7.attn.in_proj_weight
transformer.resblocks.7.attn.in_proj_bias
transformer.resblocks.7.attn.out_proj.weight
transformer.resblocks.7.attn.out_proj.bias
transformer.resblocks.7.ln_1.weight
transformer.resblocks.7.ln_1.bias
transformer.resblocks.7.mlp.c_fc.weight
transformer.resblocks.7.mlp.c_fc.bias
transformer.resblocks.7.mlp.c_proj.weight
transformer.resblocks.7.mlp.c_proj.bias
transformer.resblocks.7.ln_2.weight
transformer.resblocks.7.ln_2.bias
transformer.resblocks.8.attn.in_proj_weight
transformer.resblocks.8.attn.in_proj_bias
transformer.resblocks.8.attn.out_proj.weight
transformer.resblocks.8.attn.out_proj.bias
transformer.resblocks.8.ln_1.weight
transformer.resblocks.8.ln_1.bias
transformer.resblocks.8.mlp.c_fc.weight
transformer.resblocks.8.mlp.c_fc.bias
transformer.resblocks.8.mlp.c_proj.weight
transformer.resblocks.8.mlp.c_proj.bias
transformer.resblocks.8.ln_2.weight
transformer.resblocks.8.ln_2.bias
transformer.resblocks.9.attn.in_proj_weight
transformer.resblocks.9.attn.in_proj_bias
transformer.resblocks.9.attn.out_proj.weight
transformer.resblocks.9.attn.out_proj.bias
transformer.resblocks.9.ln_1.weight
transformer.resblocks.9.ln_1.bias
transformer.resblocks.9.mlp.c_fc.weight
transformer.resblocks.9.mlp.c_fc.bias
transformer.resblocks.9.mlp.c_proj.weight
transformer.resblocks.9.mlp.c_proj.bias
transformer.resblocks.9.ln_2.weight
transformer.resblocks.9.ln_2.bias
transformer.resblocks.10.attn.in_proj_weight
transformer.resblocks.10.attn.in_proj_bias
transformer.resblocks.10.attn.out_proj.weight
transformer.resblocks.10.attn.out_proj.bias
transformer.resblocks.10.ln_1.weight
transformer.resblocks.10.ln_1.bias
transformer.resblocks.10.mlp.c_fc.weight
transformer.resblocks.10.mlp.c_fc.bias
transformer.resblocks.10.mlp.c_proj.weight
transformer.resblocks.10.mlp.c_proj.bias
transformer.resblocks.10.ln_2.weight
transformer.resblocks.10.ln_2.bias
transformer.resblocks.11.attn.in_proj_weight
transformer.resblocks.11.attn.in_proj_bias
transformer.resblocks.11.attn.out_proj.weight
transformer.resblocks.11.attn.out_proj.bias
transformer.resblocks.11.ln_1.weight
transformer.resblocks.11.ln_1.bias
transformer.resblocks.11.mlp.c_fc.weight
transformer.resblocks.11.mlp.c_fc.bias
transformer.resblocks.11.mlp.c_proj.weight
transformer.resblocks.11.mlp.c_proj.bias
transformer.resblocks.11.ln_2.weight
transformer.resblocks.11.ln_2.bias
ln_final.weight
ln_final.bias
text_projection

Fine-tune only the last few layers of CLIP

The idea is to freeze everything except the last residual block of both the visual and the text transformer, together with the final LayerNorms (visual.ln_post, ln_final) and the projection matrices (visual.proj, text_projection); get_optim_params below returns exactly those parameter names.

def get_optim_params(model_name: str):
    """Return the names of the parameters to keep trainable:
    the last residual block of each tower plus the final LayerNorms and projections."""
    if model_name in ['ViT-B/32', 'ViT-B/16']:
        return ['visual.transformer.resblocks.11.attn.in_proj_weight',
                'visual.transformer.resblocks.11.attn.in_proj_bias',
                'visual.transformer.resblocks.11.attn.out_proj.weight',
                'visual.transformer.resblocks.11.attn.out_proj.bias',
                'visual.transformer.resblocks.11.ln_1.weight',
                'visual.transformer.resblocks.11.ln_1.bias',
                'visual.transformer.resblocks.11.mlp.c_fc.weight',
                'visual.transformer.resblocks.11.mlp.c_fc.bias',
                'visual.transformer.resblocks.11.mlp.c_proj.weight',
                'visual.transformer.resblocks.11.mlp.c_proj.bias',
                'visual.transformer.resblocks.11.ln_2.weight',
                'visual.transformer.resblocks.11.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    elif model_name in ['ViT-L/14', 'ViT-L/14@336px']:
        return ['visual.transformer.resblocks.23.attn.in_proj_weight',
                'visual.transformer.resblocks.23.attn.in_proj_bias',
                'visual.transformer.resblocks.23.attn.out_proj.weight',
                'visual.transformer.resblocks.23.attn.out_proj.bias',
                'visual.transformer.resblocks.23.ln_1.weight',
                'visual.transformer.resblocks.23.ln_1.bias',
                'visual.transformer.resblocks.23.mlp.c_fc.weight',
                'visual.transformer.resblocks.23.mlp.c_fc.bias',
                'visual.transformer.resblocks.23.mlp.c_proj.weight',
                'visual.transformer.resblocks.23.mlp.c_proj.bias',
                'visual.transformer.resblocks.23.ln_2.weight',
                'visual.transformer.resblocks.23.ln_2.bias',
                'visual.ln_post.weight',
                'visual.ln_post.bias',
                'visual.proj',
                'transformer.resblocks.11.attn.in_proj_weight',
                'transformer.resblocks.11.attn.in_proj_bias',
                'transformer.resblocks.11.attn.out_proj.weight',
                'transformer.resblocks.11.attn.out_proj.bias',
                'transformer.resblocks.11.ln_1.weight',
                'transformer.resblocks.11.ln_1.bias',
                'transformer.resblocks.11.mlp.c_fc.weight',
                'transformer.resblocks.11.mlp.c_fc.bias',
                'transformer.resblocks.11.mlp.c_proj.weight',
                'transformer.resblocks.11.mlp.c_proj.bias',
                'transformer.resblocks.11.ln_2.weight',
                'transformer.resblocks.11.ln_2.bias',
                'ln_final.weight',
                'ln_final.bias',
                'text_projection']
    else:
        print(f"no {model_name}")
import os
import clip
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10
from torch.nn import functional as F
import torch


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.model, self.preprocess = clip.load('ViT-B/32', 'cpu')

        optim_params = get_optim_params('ViT-B/32')

        for name, param in self.model.named_parameters():
            if name not in optim_params:
                param.requires_grad = False

    def forward(self, image, text):
        image_features = self.model.encode_image(image)
        text_features = self.model.encode_text(text)
        return image_features, text_features


net = Net()
optimizer = torch.optim.SGD(net.parameters(), lr=1e-2)

root = os.path.expanduser("~/.cache")
cifar10 = CIFAR10(root, download=True, train=True, transform=net.preprocess)
train = next(iter(DataLoader(cifar10, batch_size=8)))
images = train[0]
texts = torch.cat([clip.tokenize(f"a photo of a {cifar10.classes[c]}") for c in train[1]])

# Snapshot every parameter so we can check afterwards which ones were actually updated
storeParam = {}
for name, param in net.named_parameters():
    storeParam[name] = param.detach().clone()

# Toy training loop: pull each image embedding toward its paired text embedding with an MSE loss
for i in range(10):
    image_features, text_features = net(images, texts)
    loss = F.mse_loss(image_features, text_features)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(loss.item())

# Only the parameters returned by get_optim_params should show up here as changed
for name, param in net.named_parameters():
    if not torch.equal(param, storeParam[name]):
        print(f"{name}")
