How to understand softmax?
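Softmax turns a vector of real-valued scores (logits) into a probability distribution: every output lies in (0, 1) and the outputs sum to 1. For an input vector x, the i-th output is softmax(x)_i = exp(x_i) / sum_j exp(x_j). The loop-based implementation below applies this formula along a chosen dimension of a 4D tensor of shape (batch, channels, height, width) and is checked against torch.nn.Softmax at the end.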
import torch
import math

def softmax(x, dim):
    # Get the shape of the input tensor
    batch_size, num_channels, height, width = x.shape
    # Initialize the output tensor with the same shape as the input tensor
    softmax_output = torch.zeros_like(x)
    if dim == 2:  # softmax over height
        # Iterate over the batch dimension
        for b in range(batch_size):
            # Iterate over the channels dimension
            for c in range(num_channels):
                # Iterate over the width dimension
                for w in range(width):
                    # Extract the column of height values for softmax
                    height_values = x[b, c, :, w]
                    # Step 1: Compute the exponential of each value
                    exp_values = [math.exp(val.item()) for val in height_values]
                    # Step 2: Compute the sum of the exponentials
                    exp_sum = sum(exp_values)
                    # Step 3: Normalize by dividing each exponential by the sum
                    softmax_values = [exp_val / exp_sum for exp_val in exp_values]
                    # Step 4: Store the softmax values back in the output tensor
                    for h in range(height):
                        softmax_output[b, c, h, w] = softmax_values[h]
    elif dim == 3:  # softmax over width
        # Iterate over the batch dimension
        for b in range(batch_size):
            # Iterate over the channels dimension
            for c in range(num_channels):
                # Iterate over the height dimension
                for h in range(height):
                    # Extract the row of width values for softmax
                    width_values = x[b, c, h, :]
                    # Step 1: Compute the exponential of each value
                    exp_values = [math.exp(val.item()) for val in width_values]
                    # Step 2: Compute the sum of the exponentials
                    exp_sum = sum(exp_values)
                    # Step 3: Normalize by dividing each exponential by the sum
                    softmax_values = [exp_val / exp_sum for exp_val in exp_values]
                    # Step 4: Store the softmax values back in the output tensor
                    for w in range(width):
                        softmax_output[b, c, h, w] = softmax_values[w]
    elif dim == 1:  # softmax over channels
        # Iterate over the batch dimension
        for b in range(batch_size):
            # Iterate over the height dimension
            for h in range(height):
                # Iterate over the width dimension
                for w in range(width):
                    # Extract the column of channel values for softmax
                    channel_values = x[b, :, h, w]
                    # Step 1: Compute the exponential of each value
                    exp_values = [math.exp(val.item()) for val in channel_values]
                    # Step 2: Compute the sum of the exponentials
                    exp_sum = sum(exp_values)
                    # Step 3: Normalize by dividing each exponential by the sum
                    softmax_values = [exp_val / exp_sum for exp_val in exp_values]
                    # Step 4: Store the softmax values back in the output tensor
                    for c in range(num_channels):
                        softmax_output[b, c, h, w] = softmax_values[c]
    else:
        raise ValueError("Invalid dim value. Please use dim=1, dim=2, or dim=3.")
    return softmax_output
# Random tensor with shape (batch, channels, height, width)
batch, channels, height, width = 4, 3, 8, 8
x = torch.randint(-10, 10, (batch, channels, height, width), dtype=torch.float)

# Example: Apply softmax over the height dimension (dim=2)
softmax_x = softmax(x, dim=2)
softmax_layer = torch.nn.Softmax(dim=2)
softmax_outputs = softmax_layer(x)
print(torch.allclose(softmax_x, softmax_outputs, atol=1e-4))  # True
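The loop-based version exponentiates the raw values directly, which can overflow once the logits get large. The standard remedy is to subtract the maximum along the softmax dimension before exponentiating; softmax is invariant to this shift, so the result is unchanged. A minimal vectorized sketch (the helper name stable_softmax is my own):

import torch

def stable_softmax(x, dim):
    # Shift so the largest value along `dim` becomes 0, i.e. exp(0) = 1,
    # which prevents overflow; softmax(x) == softmax(x - c) for any constant c
    shifted = x - x.max(dim=dim, keepdim=True).values
    exp_values = torch.exp(shifted)
    return exp_values / exp_values.sum(dim=dim, keepdim=True)

x = 100 * torch.randn(4, 3, 8, 8)  # large-magnitude logits
print(torch.allclose(stable_softmax(x, dim=1), torch.nn.Softmax(dim=1)(x), atol=1e-6))  # expected: True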
1. Example
When torch.nn.CrossEntropyLoss() is used with a 2D output matrix and a 1D label tensor, each row of the output matrix holds the raw predictions (logits) for one sample, and each element of the label tensor holds the integer class label of the corresponding sample.
"""
Model outputs(outputs):
For the first sample: [2.0, -1.0, 0.5]
For the second sample : [-0.5, 1.0, 3.0 ]
target labels [1,2]
Softmax([2.0, -1.0, 0.5]) = [0.832, 0.017, 0.151]
Softmax([-0.5, 1.0, 3.0]) = [0.046, 0.118, 0.836]
Average Loss = (4.08 + 0.18) / 2 ≈ 2.13
"""
2. Simulating the cross-entropy computation with torch
Meaning of the outputs matrix: after softmax, each row is a probability distribution over the classes:
Sample 1: [P(class 1 | sample 1), P(class 2 | sample 1), P(class 3 | sample 1), P(class 4 | sample 1)]
Sample 2: [P(class 1 | sample 2), P(class 2 | sample 2), P(class 3 | sample 2), P(class 4 | sample 2)]
Sample 3: [P(class 1 | sample 3), P(class 2 | sample 3), P(class 3 | sample 3), P(class 4 | sample 3)]
softmax(dim=1) applies softmax along dimension 1 (dim=1 is the column dimension, dim=0 the row dimension), i.e. across the values in different columns of the same row.
After softmax, the values in each row sum to 1.
import torch
import torch.nn.functional as F

def test2():
    # Example outputs (logits) and labels
    outputs = torch.tensor([
        [ 1.2, -0.5, 0.3,  2.1],  # Raw predictions for sample 1
        [-0.8,  1.5, 2.3, -1.0],  # Raw predictions for sample 2
        [ 0.5, -1.0, 1.8,  0.2]   # Raw predictions for sample 3
    ])
    target = torch.tensor([2, 1, 3])  # Ground-truth labels
    # Step 1: Compute softmax probabilities
    softmax_layer = torch.nn.Softmax(dim=1)
    softmax_outputs = softmax_layer(outputs)
    # or: softmax_outputs = F.softmax(outputs, dim=1)
    # softmax_outputs has shape [3, 4]
    # Step 2: Extract the predicted probability of the target class for each sample
    predicted_probs = softmax_outputs[range(len(target)), target]
    # predicted_probs has shape [3]
    # Step 3: Compute the negative log probabilities of the predicted classes
    neg_log_probs = -torch.log(predicted_probs)
    # neg_log_probs has shape [3]
    # Step 4: Compute the mean of the negative log probabilities
    mean_loss = torch.mean(neg_log_probs)
    print(mean_loss.item())  # 1.851070761680603

test2()
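Steps 1–3 above (softmax, pick out the target-class probability, take the negative log) are exactly what log_softmax followed by the negative log-likelihood loss computes, and torch.nn.CrossEntropyLoss combines these two operations. A minimal cross-check on the same data:

import torch
import torch.nn.functional as F

outputs = torch.tensor([
    [ 1.2, -0.5, 0.3,  2.1],
    [-0.8,  1.5, 2.3, -1.0],
    [ 0.5, -1.0, 1.8,  0.2]
])
target = torch.tensor([2, 1, 3])
# log_softmax + NLL loss is the numerically stable form of
# softmax -> index target probability -> -log -> mean
log_probs = F.log_softmax(outputs, dim=1)
print(F.nll_loss(log_probs, target))     # tensor(1.8511)
print(F.cross_entropy(outputs, target))  # tensor(1.8511)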
3. Using torch's built-in function directly
import torch
import torch.nn.functional as F

def test1():
    # Example outputs (logits) and labels
    outputs = torch.tensor([
        [ 1.2, -0.5, 0.3,  2.1],  # Raw predictions for sample 1
        [-0.8,  1.5, 2.3, -1.0],  # Raw predictions for sample 2
        [ 0.5, -1.0, 1.8,  0.2]   # Raw predictions for sample 3
    ])
    target = torch.tensor([2, 1, 3])  # Ground-truth labels
    criterion = torch.nn.CrossEntropyLoss(reduction="mean")
    print(criterion(outputs, target))  # tensor(1.8511)

test1()
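One pitfall worth noting: torch.nn.CrossEntropyLoss applies log_softmax internally, so it expects raw logits. Feeding it probabilities that have already gone through softmax effectively applies softmax twice and silently yields a different loss. A small illustrative sketch of the mismatch:

import torch
import torch.nn.functional as F

outputs = torch.tensor([
    [ 1.2, -0.5, 0.3,  2.1],
    [-0.8,  1.5, 2.3, -1.0],
    [ 0.5, -1.0, 1.8,  0.2]
])
target = torch.tensor([2, 1, 3])
criterion = torch.nn.CrossEntropyLoss()
print(criterion(outputs, target))                    # tensor(1.8511): correct, logits go in
print(criterion(F.softmax(outputs, dim=1), target))  # different value: softmax applied twice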