DINOv2 复现过程

原创已于 2025-11-26 13:51:32 修改 · 388 阅读

3 ·

CC 4.0 BY-SA版权

文章标签：

#pytorch #深度学习 #人工智能

于 2025-11-25 18:30:52 首次发布

部署运行你感兴趣的模型镜像

DINOv2 是 Meta FAIR 自监督学习的视觉骨干模型仓库，核心功能一句话：无需标签，直接当“万能视觉特征提取器”用。

一、把图片变成向量

克隆 + 装包

# Fetch the official DINOv2 repository and work from its root.
git clone https://github.com/facebookresearch/dinov2.git
cd dinov2
# Create an isolated environment and activate it, so that the pip install
# below lands in "dinov2" rather than in whichever environment is active.
conda create -n dinov2 python==3.10 -y
conda activate dinov2
# torch>=1.12 + torchvision + timm. This is the most difficult step in the
# whole process and it takes a long time.
pip install -r requirements.txt
# See the Appendix for the conda environment file of a working setup.

下载预训练权重（以 ViT-S/14 为例）

# Create the checkpoints directory (no error if it already exists).
mkdir -p checkpoints
# Create the three empty script files that are filled in below.
touch download.py run.py extract.py

download.py 的作用：运行后通过 torch.hub 从源仓库下载对应的模型（model）文件，并保存到本地缓存目录。

import torch

# The first run downloads the weights and caches them under
# ~/.cache/torch/hub/ (as noted by the original author).
dinov2_vits14 = torch.hub.load(
    repo_or_dir='facebookresearch/dinov2',
    model='dinov2_vits14',
)

# download.py does not read any command-line options, so run it without flags.
python download.py

run.py 文件用于测试程序能否正常运行。
请事先准备一张名为 “xxx.png” 的图片，并将其放在运行脚本时的当前目录下（脚本按相对路径 xxx.png 读取；如果图片放在 checkpoints 目录中，需要把代码里的路径改为 checkpoints/xxx.png）。

import traceback
import warnings

import PIL.Image
import torch
import torchvision.transforms as T

# Suppress noisy warnings whose message matches one of these patterns.
warnings.filterwarnings("ignore", message=".*cutlassF.*")
warnings.filterwarnings("ignore", message=".*flshattF.*")
warnings.filterwarnings("ignore", message=".*tritonflashattF.*")
warnings.filterwarnings("ignore", message=".*smallkF.*")

# This script only supports GPU execution: stop early if CUDA is missing.
if not torch.cuda.is_available():
    print("错误: CUDA不可用，但模型需要GPU运行")
    print("请检查:")
    print("1. 是否安装了CUDA版本的PyTorch")
    print("2. 显卡驱动是否正常")
    print("3. 是否有可用的NVIDIA显卡")
    # SystemExit works even where the interactive ``exit`` helper is absent.
    raise SystemExit(1)

device = torch.device('cuda')
print(f"使用设备: {device}")
print(f"GPU名称: {torch.cuda.get_device_name()}")

# Allow TF32 kernels for matmul and cuDNN. This is a speed/precision
# trade-off, not a memory optimization.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load the ViT-S/14 backbone through torch.hub.
try:
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
    print("模型加载成功")
except Exception as e:
    print(f"模型加载失败: {e}")
    raise SystemExit(1)

# Move the model to the GPU and switch it to inference mode.
model = model.to(device).eval()

print(f"模型设备: {next(model.parameters()).device}")

# Image preprocessing. The backbone expects ImageNet-normalized inputs; without
# the Normalize step the extracted features are computed on out-of-distribution
# pixel values.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
transform = T.Compose([
    T.Resize(224),
    T.CenterCrop(224),  # 224 = 16 * 14, a multiple of the 14-pixel patch size
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

try:
    # Load the image as 3-channel RGB.
    img = PIL.Image.open("xxx.png").convert('RGB')

    # Preprocess, add the batch dimension and move the tensor to the GPU.
    x = transform(img).unsqueeze(0).to(device)

    print(f"输入张量形状: {x.shape}")
    print(f"输入张量设备: {x.device}")
    print(f"输入数据类型: {x.dtype}")

    # Mixed-precision inference: prefer bfloat16 when the GPU supports it,
    # otherwise fall back to float16.
    amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=amp_dtype):
        feat = model(x)
    print("使用bfloat16精度" if amp_dtype is torch.bfloat16 else "使用float16精度")

    print(f"特征向量形状: {feat.shape}")
    print(f"特征向量设备: {feat.device}")
    print(f"特征向量dtype: {feat.dtype}")

    # Convert to float32 so the values are easy to post-process.
    feat = feat.float()

    print("特征向量前10个值:", feat.squeeze()[:10].cpu().numpy())
    print("特征提取完成!")

except FileNotFoundError:
    print("错误: 找不到 xxx.png 文件")
    print("请确保当前目录下有 xxx.png 文件")
except Exception as e:
    # Best-effort script: report the failure with a traceback instead of
    # letting the exception propagate.
    print(f"处理过程中出错: {e}")
    traceback.print_exc()

# Run from the directory that contains xxx.png (run.py opens it by relative path).
python run.py

extract.py 文件展示的是 DINOv2 提取特征的效果

import torch
import PIL.Image
import torchvision.transforms as T
import warnings
import numpy as np
import matplotlib.pyplot as plt
from torchvision.utils import make_grid
import os

# Suppress noisy warnings whose message matches one of these patterns.
warnings.filterwarnings("ignore", message=".*cutlassF.*")
warnings.filterwarnings("ignore", message=".*flshattF.*")
warnings.filterwarnings("ignore", message=".*tritonflashattF.*")
warnings.filterwarnings("ignore", message=".*smallkF.*")

# This script only supports GPU execution: stop early if CUDA is missing.
if not torch.cuda.is_available():
    print("错误: CUDA不可用，但模型需要GPU运行")
    # SystemExit works even where the interactive ``exit`` helper is absent.
    raise SystemExit(1)

device = torch.device('cuda')
print(f"使用设备: {device}")
print(f"GPU名称: {torch.cuda.get_device_name()}")

# Allow TF32 kernels for matmul and cuDNN. This is a speed/precision
# trade-off, not a memory optimization.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Load the ViT-S/14 backbone through torch.hub.
try:
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14')
    print("模型加载成功")
except Exception as e:
    print(f"模型加载失败: {e}")
    raise SystemExit(1)

# Move the model to the GPU and switch it to inference mode.
model = model.to(device).eval()
print(f"模型设备: {next(model.parameters()).device}")

# Image preprocessing. The backbone expects ImageNet-normalized inputs; without
# the Normalize step the extracted features are computed on out-of-distribution
# pixel values.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
transform = T.Compose([
    T.Resize(224),
    T.CenterCrop(224),  # 224 = 16 * 14, a multiple of the 14-pixel patch size
    T.ToTensor(),
    T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

def extract_and_visualize_features(image_path, output_dir="feature_visualizations"):
    """Extract a feature vector from one image and write its visualizations.

    Uses the module-level ``model``, ``transform`` and ``device``.

    Args:
        image_path: Path of the image to process.
        output_dir: Directory (created if missing) that receives
            ``feature_vector.npy`` and the files written by
            ``visualize_features``.

    Returns:
        The feature vector as a NumPy array, or ``None`` when any step raised;
        in that case the error and its traceback are printed instead of being
        propagated.
    """
    os.makedirs(output_dir, exist_ok=True)

    try:
        # Close the file handle as soon as the pixels are decoded; convert()
        # returns an independent in-memory image.
        with PIL.Image.open(image_path) as source:
            img = source.convert('RGB')
        original_img = img.copy()

        # Preprocess, add the batch dimension and move the tensor to the GPU.
        x = transform(img).unsqueeze(0).to(device)

        print(f"输入张量形状: {x.shape}")
        print(f"输入张量设备: {x.device}")

        # Mixed-precision inference: prefer bfloat16 when the GPU supports it,
        # otherwise fall back to float16.
        amp_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        with torch.no_grad(), torch.autocast(device_type="cuda", dtype=amp_dtype):
            features = model(x)
        print("使用bfloat16精度" if amp_dtype is torch.bfloat16 else "使用float16精度")

        # Work in float32 from here on.
        features = features.float()
        print(f"特征向量形状: {features.shape}")

        # Persist the raw feature vector.
        feature_vector = features.squeeze().cpu().numpy()
        np.save(os.path.join(output_dir, "feature_vector.npy"), feature_vector)
        print("特征向量已保存: feature_vector.npy")

        # Produce the figures and the text report.
        visualize_features(original_img, features, output_dir, image_path)

        return feature_vector

    except Exception as e:
        # Deliberate best-effort boundary: report and signal failure with None.
        print(f"处理过程中出错: {e}")
        import traceback
        traceback.print_exc()
        return None

def visualize_features(original_img, features, output_dir, image_path):
    """Save the original image and a 2x3 analysis figure for one feature vector.

    Writes ``original_image.jpg`` and ``feature_analysis.png`` into
    ``output_dir``, then calls ``generate_advanced_visualizations`` and
    ``generate_feature_report`` for the remaining outputs. (The original
    docstring noted that this version replaced an earlier PCA-based plot
    that raised an error.)

    Args:
        original_img: Image to display and save; must provide ``save`` and be
            accepted by ``plt.imshow``.
        features: Feature tensor. It is squeezed, moved to the CPU and
            converted to a NumPy array; the plots assume the result is 1-D.
        output_dir: Existing directory that receives the files.
        image_path: Not used by this function; kept so that existing callers
            keep working.
    """
    # 1. Save the original image.
    original_img.save(os.path.join(output_dir, "original_image.jpg"))

    # 2. Build the analysis figure.
    plt.figure(figsize=(15, 10))

    feature_data = features.squeeze().cpu().numpy()
    feature_magnitude = np.abs(feature_data)
    n_dims = len(feature_data)

    # Panel 1: original image.
    plt.subplot(2, 3, 1)
    plt.imshow(original_img)
    plt.title("Original Image")
    plt.axis('off')

    # Panel 2: histogram of the feature values.
    plt.subplot(2, 3, 2)
    plt.hist(feature_data, bins=50, alpha=0.7, color='blue')
    plt.title("Feature Value Distribution")
    plt.xlabel("Feature Value")
    plt.ylabel("Frequency")

    # Panel 3: the largest-magnitude dimensions (up to 30; fewer if the vector
    # is shorter, so the bar positions and heights always match in length).
    plt.subplot(2, 3, 3)
    top_k = min(30, n_dims)
    top_indices = np.argsort(feature_magnitude)[::-1][:top_k]
    plt.bar(range(top_k), feature_magnitude[top_indices])
    plt.title(f"Top {top_k} Feature Dimensions")
    plt.xlabel("Feature Dimension Index")
    plt.ylabel("Absolute Value")
    plt.xticks(range(top_k), top_indices, rotation=45)

    # Panel 4: heatmap of the vector reshaped to 16x24 (= 384 values).
    plt.subplot(2, 3, 4)
    rows, cols = 16, 24
    if rows * cols == n_dims:
        heatmap = feature_data.reshape(rows, cols)
        im = plt.imshow(heatmap, cmap='viridis', aspect='auto')
        plt.title(f"Feature Heatmap ({rows}x{cols})")
        plt.colorbar(im)
    else:
        # The vector does not fill a 16x24 grid: fall back to a line plot.
        plt.plot(feature_magnitude)
        plt.title("Feature Magnitude Plot")
        plt.xlabel("Dimension")
        plt.ylabel("Absolute Value")

    # Panel 5: feature values sorted ascending.
    plt.subplot(2, 3, 5)
    sorted_features = np.sort(feature_data)
    plt.plot(sorted_features, 'r-', linewidth=2)
    plt.title("Sorted Feature Values")
    plt.xlabel("Rank")
    plt.ylabel("Feature Value")
    plt.grid(True, alpha=0.3)

    # Panel 6: cumulative energy. The squared values must be sorted by energy
    # (descending); sorting by signed value would put large negative features
    # last and misplace the 95% marker.
    plt.subplot(2, 3, 6)
    energy_desc = np.sort(np.square(feature_data))[::-1]
    total_energy = energy_desc.sum()
    if total_energy > 0:
        cumulative_energy = np.cumsum(energy_desc) / total_energy
    else:
        # All-zero vector: avoid 0/0 and show a flat curve.
        cumulative_energy = np.zeros_like(energy_desc)
    # x runs from 1 so that it reads as "number of components".
    component_counts = np.arange(1, n_dims + 1)
    plt.plot(component_counts, cumulative_energy, 'g-', linewidth=2)
    plt.title("Cumulative Energy Distribution")
    plt.xlabel("Number of Components")
    plt.ylabel("Cumulative Energy")
    plt.grid(True, alpha=0.3)
    # Mark the number of components needed to reach 95% of the energy.
    reached = np.flatnonzero(cumulative_energy >= 0.95)
    if reached.size:
        dims_95 = int(reached[0]) + 1  # index -> count
        plt.axvline(x=dims_95, color='red', linestyle='--', alpha=0.7)
        plt.text(dims_95, 0.5, f'95% at {dims_95} dims', rotation=90, va='center')

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "feature_analysis.png"), dpi=300, bbox_inches='tight')
    plt.close()

    print("特征分析图已保存: feature_analysis.png")

    # 3. Additional visualizations.
    generate_advanced_visualizations(feature_data, output_dir)

    # 4. Text summary report.
    generate_feature_report(features, output_dir)

def generate_advanced_visualizations(feature_data, output_dir):
    """Write ``advanced_visualizations.png``: three extra views of one vector.

    Args:
        feature_data: 1-D NumPy array of feature values.
        output_dir: Existing directory that receives the PNG.
    """
    plt.figure(figsize=(12, 4))

    # View 1: the first two dimensions drawn as a single point.
    plt.subplot(1, 3, 1)
    if len(feature_data) >= 2:
        plt.scatter(feature_data[0], feature_data[1], s=100, c='red', alpha=0.7)
        plt.axhline(y=0, color='k', linestyle='-', alpha=0.3)
        plt.axvline(x=0, color='k', linestyle='-', alpha=0.3)
        plt.title("Feature Projection (First 2 Dims)")
        plt.xlabel("Dimension 1")
        plt.ylabel("Dimension 2")

    # View 2: pairwise products of the first (up to) 20 dimensions.
    # A correlation matrix cannot be estimated from a single feature vector:
    # np.corrcoef on one observation yields only NaN. The normalized outer
    # product is well defined for one sample and stays within [-1, 1].
    plt.subplot(1, 3, 2)
    n_features = min(20, len(feature_data))
    feature_subset = feature_data[:n_features].astype(np.float64)
    peak = np.max(np.abs(feature_subset)) if n_features else 0.0
    scaled = feature_subset / peak if peak > 0 else feature_subset
    coactivation = np.outer(scaled, scaled)
    im = plt.imshow(coactivation, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title(f"Feature Co-activation (First {n_features} dims)")
    plt.colorbar(im)

    # View 3: the whole vector laid out as a matrix.
    plt.subplot(1, 3, 3)
    n_rows = 8
    n_cols = 48  # 8 * 48 = 384
    if n_rows * n_cols == len(feature_data):
        feature_matrix = feature_data.reshape(n_rows, n_cols)
        im = plt.imshow(feature_matrix, cmap='RdYlBu', aspect='auto')
        plt.title(f"Feature Matrix ({n_rows}x{n_cols})")
        plt.colorbar(im)
    else:
        # Other lengths: cut the vector into windows of 32 values and
        # zero-pad the last window so that all rows have equal length.
        window_size = 32
        feature_windows = [
            feature_data[start:start + window_size]
            for start in range(0, len(feature_data), window_size)
        ]
        if feature_windows:
            max_len = max(len(w) for w in feature_windows)
            padded_windows = [
                np.pad(w, (0, max_len - len(w)), mode='constant')
                for w in feature_windows
            ]
            window_matrix = np.array(padded_windows)
            im = plt.imshow(window_matrix, cmap='RdYlBu', aspect='auto')
            # Report the real row width (smaller than 32 for short vectors).
            plt.title(f"Feature Windows ({len(feature_windows)}x{max_len})")
            plt.colorbar(im)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, "advanced_visualizations.png"), dpi=300, bbox_inches='tight')
    plt.close()

    print("高级可视化图已保存: advanced_visualizations.png")

def generate_feature_report(features, output_dir):
    """Write ``feature_report.txt`` with summary statistics of one feature vector.

    Args:
        features: Feature tensor. It is squeezed, moved to the CPU and
            converted to a NumPy array; the report assumes the result is a
            non-empty 1-D vector.
        output_dir: Existing directory that receives the report.
    """
    feature_data = np.atleast_1d(features.squeeze().cpu().numpy())
    n_dims = feature_data.shape[0]

    # Magnitudes and the dimension indices ordered from largest to smallest.
    abs_features = np.abs(feature_data)
    sorted_indices = np.argsort(abs_features)[::-1]

    report = f"""
    DINOv2 特征提取报告
    ===================

    基本信息:
    - 特征维度: {n_dims}
    - 特征值范围: [{feature_data.min():.6f}, {feature_data.max():.6f}]
    - 特征均值: {feature_data.mean():.6f}
    - 特征标准差: {feature_data.std():.6f}
    - 特征L2范数: {np.linalg.norm(feature_data):.6f}

    统计信息:
    - 正特征数量: {np.sum(feature_data > 0)}
    - 负特征数量: {np.sum(feature_data < 0)}
    - 零特征数量: {np.sum(feature_data == 0)}
    - 特征稀疏度: {np.sum(abs_features < 1e-6) / n_dims:.2%}

    重要维度分析:
    - 最大特征值: {feature_data.max():.6f} (维度 {np.argmax(feature_data)})
    - 最小特征值: {feature_data.min():.6f} (维度 {np.argmin(feature_data)})
    - 最大绝对值: {abs_features.max():.6f} (维度 {np.argmax(abs_features)})

    最重要的5个特征维度:
    """

    # Slicing (rather than range(5)) keeps this safe for vectors shorter than 5.
    report += "".join(
        f"    - 维度 {idx}: 值 = {feature_data[idx]:.6f}, 绝对值 = {abs_features[idx]:.6f}\n"
        for idx in sorted_indices[:5]
    )

    # Energy concentration: cumulative share of the squared values, largest first.
    squared_sorted = np.sort(feature_data**2)[::-1]
    total_energy = squared_sorted.sum()
    if total_energy > 0:
        cumulative_energy = np.cumsum(squared_sorted) / total_energy
    else:
        # All-zero vector: avoid 0/0; every share is reported as 0%.
        cumulative_energy = np.zeros_like(squared_sorted)

    def energy_in_top(k):
        # Asking for more dimensions than exist returns the share of all of them.
        return cumulative_energy[min(k, n_dims) - 1]

    def dims_for(threshold):
        # Smallest count whose cumulative share reaches the threshold; falls
        # back to all dimensions if the threshold is never reached.
        reached = np.flatnonzero(cumulative_energy >= threshold)
        return int(reached[0]) + 1 if reached.size else n_dims

    report += f"""
    能量集中度分析:
    - 前10维包含 {energy_in_top(10):.2%} 的总能量
    - 前50维包含 {energy_in_top(50):.2%} 的总能量
    - 前100维包含 {energy_in_top(100):.2%} 的总能量
    - 达到95%能量所需维度: {dims_for(0.95)}
    - 达到99%能量所需维度: {dims_for(0.99)}

    生成时间: {np.datetime64('now')}
    """

    # The report contains Chinese text: write it as UTF-8 regardless of the
    # platform's default encoding.
    with open(os.path.join(output_dir, "feature_report.txt"), "w", encoding="utf-8") as f:
        f.write(report)

    print("特征报告已保存: feature_report.txt")

# Script entry point: run feature extraction and visualization for one image.
if __name__ == "__main__":
    image_path = "xxx.png"  # change this to point at your own image

    if os.path.exists(image_path):
        print(f"开始处理图片: {image_path}")
        feature_vector = extract_and_visualize_features(image_path)

        # None signals that extraction failed; the summary is printed only
        # on success.
        if feature_vector is not None:
            banner = "=" * 50
            summary_lines = (
                "\n" + banner,
                "特征提取任务完成!",
                banner,
                "生成的文件:",
                "- original_image.jpg: 原始图像",
                "- feature_vector.npy: 原始特征向量 (384维)",
                "- feature_analysis.png: 特征分析可视化",
                "- advanced_visualizations.png: 高级特征可视化",
                "- feature_report.txt: 特征统计报告",
                "所有文件保存在: feature_visualizations/ 目录",
                banner,
            )
            for line in summary_lines:
                print(line)
    else:
        print(f"错误: 找不到图片文件 {image_path}")
        print("请确保图片文件存在，或修改 image_path 变量")

# Run from the directory that contains xxx.png; results are written to
# feature_visualizations/ (the default output_dir in extract.py).
python extract.py

二、仓库目录速览

dinov2/ 核心模型代码
notebooks/ 官方 Jupyter 示例（特征提取、可视化注意力）
dinov2/eval/ 线性评估、检索、检测脚本
dinov2/train/ 自监督预训练代码（8 卡 A100 起步，一般用户用不到）

三、一句话总结：
把 dinov2_vit*14.pth 当“视觉版 BERT”用，提特征 → 建索引 / 接分类头 / 换分割骨干，基本不用调参就能涨点。

Appendix

environment.DINOv2.yaml 文件中放的是成功运行时候的 conda 环境:

# Conda environment definition; the environment is named "dinov2".
name: dinov2
# Package channels: three TUNA (Tsinghua) mirror URLs, followed by "defaults".
channels:
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free
  - https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/conda-forge
  - defaults
dependencies:
  - _libgcc_mutex=0.1=main
  - _openmp_mutex=5.1=1_gnu
  - blas=1.0=openblas
  - bzip2=1.0.8=h5eee18b_6
  - ca-certificates=2025.11.4=h06a4308_0
  - expat=2.7.3=h3385a95_0
  - ld_impl_linux-64=2.44=h153f514_2
  - libffi=3.4.4=h6a678d5_1
  - libgcc=15.2.0=h69a1729_7
  - libgcc-ng=15.2.0=h166f726_7
  - libgfortran=15.2.0=h166f726_7
  - libgfortran-ng=15.2.0=h166f726_7
  - libgfortran5=15.2.0=hc633d37_7
  - libgomp=15.2.0=h4751f2c_7
  - libnsl=2.0.0=h5eee18b_0
  - libopenblas=0.3.30=h46f56fc_2
  - libstdcxx=15.2.0=h39759b7_7
  - libstdcxx-ng=15.2.0=hc03a8fd_7
  - libuuid=1.41.5=h5eee18b_0
  - libxcb=1.17.0=h9b100fa_0
  - libzlib=1.3.1=hb25bd0a_0
  - ncurses=6.5=h7934f7d_0
  - openssl=3.0.18=hd6dcaed_0
  - pip=25.3=pyhc872135_0
  - pthread-stubs=0.3=h0ce48e5_1
  - python=3.10.19=h6fa692b_0
  - readline=8.3=hc2a1206_0
  - setuptools=80.9.0=py310h06a4308_0
  - sqlite=3.51.0=h2a70700_0
  - tk=8.6.15=h54e0aa7_0
  - wheel=0.45.1=py310h06a4308_0
  - xorg-libx11=1.8.12=h9b100fa_1
  - xorg-libxau=1.0.12=h9b100fa_0
  - xorg-libxdmcp=1.1.5=h9b100fa_0
  - xorg-xorgproto=2024.1=h5eee18b_1
  - xz=5.6.4=h5eee18b_1
  - zlib=1.3.1=hb25bd0a_0
  - pip:
      - antlr4-python3-runtime==4.9.3
      - cachetools==6.2.2
      - certifi==2025.11.12
      - charset-normalizer==3.4.4
      - click==8.3.1
      - cloudpickle==3.1.2
      - cmake==4.2.0
      - contourpy==1.3.2
      - cubinlinker-cu11==0.3.0.post3
      - cuda-bindings==11.8.7
      - cuda-python==11.8.7
      - cudf-cu11==25.4.0
      - cuml-cu11==25.4.0
      - cupy-cuda11x==13.6.0
      - cuvs-cu11==25.4.0
      - cycler==0.12.1
      - dask==2025.2.0
      - dask-cuda==25.4.0
      - dask-cudf-cu11==25.4.0
      - distributed==2025.2.0
      - distributed-ucxx-cu11==0.43.0
      - fastrlock==0.8.3
      - filelock==3.20.0
      - fonttools==4.60.1
      - fsspec==2025.10.0
      - fvcore==0.1.5.post20221221
      - idna==3.11
      - importlib-metadata==8.7.0
      - iopath==0.1.10
      - jinja2==3.1.6
      - joblib==1.5.2
      - kiwisolver==1.4.9
      - libcudf-cu11==25.4.0
      - libcuml-cu11==25.4.0
      - libcuvs-cu11==25.4.0
      - libkvikio-cu11==25.4.0
      - libraft-cu11==25.4.0
      - librmm-cu11==25.4.0
      - libucx-cu11==1.18.1
      - libucxx-cu11==0.43.0
      - lit==18.1.8
      - llvmlite==0.43.0
      - locket==1.0.0
      - markdown-it-py==4.0.0
      - markupsafe==3.0.3
      - matplotlib==3.10.7
      - mdurl==0.1.2
      - modelscope==1.32.0
      - mpmath==1.3.0
      - msgpack==1.1.2
      - mypy-extensions==1.1.0
      - networkx==3.4.2
      - numba==0.60.0
      - numba-cuda==0.4.0
      - numpy==1.26.3
      - nvidia-cublas-cu11==11.11.3.6
      - nvidia-cufft-cu11==10.9.0.58
      - nvidia-curand-cu11==10.3.0.86
      - nvidia-cusolver-cu11==11.4.1.48
      - nvidia-cusparse-cu11==11.7.5.86
      - nvidia-ml-py==12.575.51
      - nvidia-nvcomp-cu11==4.2.0.11
      - nvtx==0.2.13
      - omegaconf==2.3.0
      - packaging==25.0
      - pandas==2.2.3
      - partd==1.4.2
      - pillow==12.0.0
      - portalocker==3.2.0
      - psutil==7.1.3
      - ptxcompiler-cu11==0.8.1.post3
      - pyarrow==19.0.1
      - pygments==2.19.2
      - pylibcudf-cu11==25.4.0
      - pylibraft-cu11==25.4.0
      - pynvml==12.0.0
      - pyparsing==3.2.5
      - pyre-extensions==0.0.23
      - python-dateutil==2.9.0.post0
      - pytz==2025.2
      - pyyaml==6.0.3
      - raft-dask-cu11==25.4.0
      - rapids-dask-dependency==25.4.0
      - rapids-logger==0.1.19
      - requests==2.32.5
      - rich==14.2.0
      - rmm-cu11==25.4.0
      - scikit-learn==1.7.2
      - scipy==1.15.3
      - six==1.17.0
      - sortedcontainers==2.4.0
      - submitit==1.5.3
      - sympy==1.14.0
      - tabulate==0.9.0
      - tblib==3.2.2
      - termcolor==3.2.0
      - threadpoolctl==3.6.0
      - toolz==1.1.0
      - torch==2.0.0+cu117
      - torchmetrics==0.10.3
      - torchvision==0.15.0+cu117
      - tornado==6.5.2
      - tqdm==4.67.1
      - treelite==4.4.1
      - triton==2.0.0
      - typing-extensions==4.15.0
      - typing-inspect==0.9.0
      - tzdata==2025.2
      - ucx-py-cu11==0.43.0
      - ucxx-cu11==0.43.0
      - urllib3==2.5.0
      - xformers==0.0.18
      - yacs==0.1.8
      - zict==3.0.0
      - zipp==3.23.0
# Absolute path of this environment on the machine that produced the file.
# NOTE(review): presumably has to be changed or removed on other machines — confirm.
prefix: /home/ubuntu/miniconda3/envs/dinov2

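需要额外注意的是 numpy 的版本号是 1.26.3，这是经过多次尝试才确定的、与当前环境适配的版本。另外，torch==2.0.0+cu117、torchvision==0.15.0+cu117 这类带本地版本号的包不在默认 PyPI 源中；若用该文件重建环境，需要像官方 requirements.txt 那样为 pip 指定 --extra-index-url https://download.pytorch.org/whl/cu117（RAPIDS 的 *-cu11 系列包则对应 --extra-index-url https://pypi.nvidia.com）。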

您可能感兴趣的与本文相关的镜像

PyTorch 2.6

PyTorch

Cuda

PyTorch 是一个开源的 Python 机器学习库，基于 Torch 库，底层由 C++ 实现，应用于人工智能领域，如计算机视觉和自然语言处理