(covid_seg) (base) liulicheng@ailab-MS-7B79:~/MultiModal_MedSeg_2025$ /home/liulicheng/anaconda3/envs/covid_seg/bin/python /home/liulicheng/MultiModal_MedSeg_2025/train/train_swinunetr_clipfusion.py
使用尺寸: Resized=(128, 128, 64), Crop=(64, 64, 32)
/home/liulicheng/anaconda3/envs/covid_seg/lib/python3.8/site-packages/monai/utils/deprecate_utils.py:221: FutureWarning: monai.networks.nets.swin_unetr SwinUNETR.__init__:img_size: Argument `img_size` has been deprecated since version 1.3. It will be removed in version 1.5. The img_size argument is not required anymore and checks on the input size are run during forward().
warn_deprecated(argname, msg, warning_category)
模型结构:
SwinUNETRWithCLIPFusion(
(swinViT): SwinTransformer(
(patch_embed): PatchEmbed(
(proj): Conv3d(1, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
)
(pos_drop): Dropout(p=0.0, inplace=False)
(layers1): ModuleList(
(0): BasicLayer(
(blocks): ModuleList(
(0-1): 2 x SwinTransformerBlock(
(norm1): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=12, out_features=36, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=12, out_features=12, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
(norm2): LayerNorm((12,), eps=1e-05, elementwise_affine=True)
(mlp): MLPBlock(
(linear1): Linear(in_features=12, out_features=48, bias=True)
(linear2): Linear(in_features=48, out_features=12, bias=True)
(fn): GELU(approximate='none')
(drop1): Dropout(p=0.0, inplace=False)
(drop2): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=96, out_features=24, bias=False)
(norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
)
)
)
(layers2): ModuleList(
(0): BasicLayer(
(blocks): ModuleList(
(0-1): 2 x SwinTransformerBlock(
(norm1): LayerNorm((24,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=24, out_features=72, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=24, out_features=24, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
(norm2): LayerNorm((24,), eps=1e-05, elementwise_affine=True)
(mlp): MLPBlock(
(linear1): Linear(in_features=24, out_features=96, bias=True)
(linear2): Linear(in_features=96, out_features=24, bias=True)
(fn): GELU(approximate='none')
(drop1): Dropout(p=0.0, inplace=False)
(drop2): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=192, out_features=48, bias=False)
(norm): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
)
)
)
(layers3): ModuleList(
(0): BasicLayer(
(blocks): ModuleList(
(0-1): 2 x SwinTransformerBlock(
(norm1): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=48, out_features=144, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=48, out_features=48, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
(norm2): LayerNorm((48,), eps=1e-05, elementwise_affine=True)
(mlp): MLPBlock(
(linear1): Linear(in_features=48, out_features=192, bias=True)
(linear2): Linear(in_features=192, out_features=48, bias=True)
(fn): GELU(approximate='none')
(drop1): Dropout(p=0.0, inplace=False)
(drop2): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=384, out_features=96, bias=False)
(norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
)
)
)
(layers4): ModuleList(
(0): BasicLayer(
(blocks): ModuleList(
(0-1): 2 x SwinTransformerBlock(
(norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(attn): WindowAttention(
(qkv): Linear(in_features=96, out_features=288, bias=True)
(attn_drop): Dropout(p=0.0, inplace=False)
(proj): Linear(in_features=96, out_features=96, bias=True)
(proj_drop): Dropout(p=0.0, inplace=False)
(softmax): Softmax(dim=-1)
)
(drop_path): Identity()
(norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
(mlp): MLPBlock(
(linear1): Linear(in_features=96, out_features=384, bias=True)
(linear2): Linear(in_features=384, out_features=96, bias=True)
(fn): GELU(approximate='none')
(drop1): Dropout(p=0.0, inplace=False)
(drop2): Dropout(p=0.0, inplace=False)
)
)
)
(downsample): PatchMerging(
(reduction): Linear(in_features=768, out_features=192, bias=False)
(norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
)
)
)
(encoder1): UnetrBasicBlock(
(layer): UnetResBlock(
(conv1): Convolution(
(conv): Conv3d(1, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(conv3): Convolution(
(conv): Conv3d(1, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
)
(norm3): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(encoder2): UnetrBasicBlock(
(layer): UnetResBlock(
(conv1): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(encoder3): UnetrBasicBlock(
(layer): UnetResBlock(
(conv1): Convolution(
(conv): Conv3d(24, 24, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(24, 24, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(24, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(24, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(encoder4): UnetrBasicBlock(
(layer): UnetResBlock(
(conv1): Convolution(
(conv): Conv3d(48, 48, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(48, 48, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(48, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(48, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(encoder10): UnetrBasicBlock(
(layer): UnetResBlock(
(conv1): Convolution(
(conv): Conv3d(192, 192, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(192, 192, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(192, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(192, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(decoder5): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
(decoder4): UnetUpBlock(
(transp_conv): Convolution(
(conv): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2), bias=False)
)
(conv_block): UnetBasicBlock(
(conv1): Convolution(
(conv): Conv3d(24, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(decoder3): UnetUpBlock(
(transp_conv): Convolution(
(conv): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2), bias=False)
)
(conv_block): UnetBasicBlock(
(conv1): Convolution(
(conv): Conv3d(24, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(decoder2): UnetUpBlock(
(transp_conv): Convolution(
(conv): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2), bias=False)
)
(conv_block): UnetBasicBlock(
(conv1): Convolution(
(conv): Conv3d(24, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(decoder1): UnetUpBlock(
(transp_conv): Convolution(
(conv): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2), bias=False)
)
(conv_block): UnetBasicBlock(
(conv1): Convolution(
(conv): Conv3d(24, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(conv2): Convolution(
(conv): Conv3d(12, 12, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
)
(lrelu): LeakyReLU(negative_slope=0.01, inplace=True)
(norm1): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
(norm2): InstanceNorm3d(12, eps=1e-05, momentum=0.1, affine=False, track_running_stats=False)
)
)
(out): UnetOutBlock(
(conv): Convolution(
(conv): Conv3d(12, 3, kernel_size=(1, 1, 1), stride=(1, 1, 1))
)
)
(fusion): LightCrossAttentionFusion(
(query_proj): Linear(in_features=192, out_features=192, bias=True)
(key_proj): Linear(in_features=512, out_features=192, bias=True)
(value_proj): Linear(in_features=512, out_features=192, bias=True)
(out_proj): Linear(in_features=192, out_features=192, bias=True)
)
(fusion_reduce): Conv3d(192, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(residual_conv): Conv3d(192, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(residual_upsample): Sequential(
(0): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
(1): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
(2): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
(3): ConvTranspose3d(12, 12, kernel_size=(2, 2, 2), stride=(2, 2, 2))
)
(skip4_reduce): Conv3d(96, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(skip3_reduce): Conv3d(48, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(skip2_reduce): Conv3d(24, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(skip1_reduce): Conv3d(12, 12, kernel_size=(1, 1, 1), stride=(1, 1, 1))
(dropout): Dropout3d(p=0.2, inplace=False)
)
Epoch 1/200
Training Epoch 1: 0%| | 0/104 [00:00<?, ?it/s]输入图像维度: torch.Size([1, 1, 64, 64, 32])
标签维度: torch.Size([1, 1, 64, 64, 32])
enc_out_list 通道数: [12, 24, 48, 96, 192]
enc_out_list 各层特征图尺寸: [torch.Size([1, 12, 32, 32, 16]), torch.Size([1, 24, 16, 16, 8]), torch.Size([1, 48, 8, 8, 4]), torch.Size([1, 96, 4, 4, 2]), torch.Size([1, 192, 2, 2, 1])]
Training Epoch 1: 0%| | 0/104 [00:01<?, ?it/s]
Traceback (most recent call last):
File "/home/liulicheng/MultiModal_MedSeg_2025/train/train_swinunetr_clipfusion.py", line 344, in <module>
outputs = model(inputs, text_feat)
File "/home/liulicheng/anaconda3/envs/covid_seg/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/liulicheng/anaconda3/envs/covid_seg/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/home/liulicheng/MultiModal_MedSeg_2025/train/train_swinunetr_clipfusion.py", line 178, in forward
d5 = d5 + residual
File "/home/liulicheng/anaconda3/envs/covid_seg/lib/python3.8/site-packages/monai/data/meta_tensor.py", line 282, in __torch_function__
ret = super().__torch_function__(func, types, args, kwargs)
File "/home/liulicheng/anaconda3/envs/covid_seg/lib/python3.8/site-packages/torch/_tensor.py", line 1386, in __torch_function__
ret = func(*args, **kwargs)
RuntimeError: The size of tensor a (32) must match the size of tensor b (16) at non-singleton dimension 4
(annotation: it errored!)