sk_buff详解

本文详细介绍了Linux内核4.4.266中的sk_buff结构体,包括其在skb组织、数据存储、通用成员及标志性变量方面的内容。sk_buff用于构造复杂的双向链表,数据存储相关变量涉及传输控制块、数据长度、首部长度等。通用成员变量如dev指向虚拟网络设备的net_device结构,csum和csum_offset涉及校验和计算。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

我用的Linux内核版本4.4.266

sk_buff结构体定义在文件/linux-4.4.266/include/linux/skbuff.h

/** 
 *	struct sk_buff - socket buffer
 *	@next: Next buffer in list
 *	@prev: Previous buffer in list
 *	@tstamp: Time we arrived/left
 *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
 *	@sk: Socket we are owned by
 *	@dev: Device we arrived on/are leaving by
 *	@cb: Control buffer. Free for use by every layer. Put private vars here
 *	@_skb_refdst: destination entry (with norefcount bit)
 *	@sp: the security path, used for xfrm
 *	@len: Length of actual data
 *	@data_len: Data length
 *	@mac_len: Length of link layer header
 *	@hdr_len: writable header length of cloned skb
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@priority: Packet queueing priority
 *	@ignore_df: allow local fragmentation
 *	@cloned: Head may be cloned (check refcnt to be sure)
 *	@ip_summed: Driver fed us an IP checksum
 *	@nohdr: Payload reference only, must not modify header
 *	@nfctinfo: Relationship of this skb to the connection
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ipvs_property: skbuff is owned by ipvs
 *	@peeked: this packet has been seen already, so stats have been
 *		done for it, don't do them again
 *	@nf_trace: netfilter packet trace flag
 *	@protocol: Packet protocol from driver
 *	@destructor: Destruct function
 *	@nfct: Associated connection, if any
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
 *	@tc_index: Traffic control index
 *	@tc_verd: traffic control verdict
 *	@hash: the packet hash
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@xmit_more: More SKBs are pending for this queue
 *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 *	@ndisc_nodetype: router type (from link layer)
 *	@ooo_okay: allow the mapping of a socket to a queue to be changed
 *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *		ports.
 *	@sw_hash: indicates hash was computed in software stack
 *	@wifi_acked_valid: wifi_acked was set
 *	@wifi_acked: whether frame was acked on wifi or not
 *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *	@napi_id: id of the NAPI struct this skb came from
 *	@secmark: security marking
 *	@offload_fwd_mark: fwding offload mark
 *	@mark: Generic packet mark
 *	@vlan_proto: vlan encapsulation protocol
 *	@vlan_tci: vlan tag control information
 *	@inner_protocol: Protocol (encapsulation)
 *	@inner_transport_header: Inner transport layer header (encapsulation)
 *	@inner_network_header: Network layer header (encapsulation)
 *	@inner_mac_header: Link layer header (encapsulation)
 *	@transport_header: Transport layer header
 *	@network_header: Network layer header
 *	@mac_header: Link layer header
 *	@tail: Tail pointer
 *	@end: End pointer
 *	@head: Head of buffer
 *	@data: Data head pointer
 *	@truesize: Buffer size
 *	@users: User count - see {datagram,tcp}.c
 */

/* struct sk_buff - socket buffer. */
struct sk_buff {
	union {
		struct {
			/* These two members must be first. */
			/* next: Next buffer in list; prev: Previous buffer in list */
			struct sk_buff		*next;
			struct sk_buff		*prev;

			union {
				/* Time we arrived/left */
				ktime_t		tstamp;
				struct skb_mstamp skb_mstamp;
			};
		};
		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
	};

	union {
		/* Socket we are owned by */
		struct sock		*sk;
		int			ip_defrag_offset;
	};

	/* Device we arrived on/are leaving by */
	struct net_device	*dev;

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	/* destination entry (with norefcount bit) */
	unsigned long		_skb_refdst;
	/* Destruct function */
	void			(*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
	/* the security path, used for xfrm */
	struct	sec_path	*sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	/* Associated connection, if any */
	struct nf_conntrack	*nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	/* Saved data about a bridged frame - see br_netfilter.c */
	struct nf_bridge_info	*nf_bridge;
#endif
	/* len: Length of actual data; data_len: Data length */
	unsigned int		len,
				data_len;
	/* mac_len: Length of link layer header;
	 * hdr_len: writable header length of cloned skb
	 */
	__u16			mac_len,
				hdr_len;

	/* Following fields are _not_ copied in __copy_skb_header()
	 * Note that queue_mapping is here mostly to fill a hole.
	 */
	kmemcheck_bitfield_begin(flags1);
	/* Queue mapping for multiqueue devices */
	__u16			queue_mapping;
	/* cloned:     Head may be cloned (check refcnt to be sure)
	 * nohdr:      Payload reference only, must not modify header
	 * fclone:     skbuff clone status
	 * peeked:     this packet has been seen already, so stats have
	 *             been done for it, don't do them again
	 * xmit_more:  More SKBs are pending for this queue
	 * pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
	 */
	__u8			cloned:1,
				nohdr:1,
				fclone:2,
				peeked:1,
				head_frag:1,
				xmit_more:1,
				pfmemalloc:1;
	kmemcheck_bitfield_end(flags1);

	/* fields enclosed in headers_start/headers_end are copied
	 * using a single memcpy() in __copy_skb_header()
	 */
	/* private: */
	__u32			headers_start[0];
	/* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX	(7 << 5)
#else
#define PKT_TYPE_MAX	7
#endif
#define PKT_TYPE_OFFSET()	offsetof(struct sk_buff, __pkt_type_offset)

	/* Zero-length marker whose offset is what PKT_TYPE_OFFSET() reports. */
	__u8			__pkt_type_offset[0];
	/* pkt_type:  Packet class
	 * ignore_df: allow local fragmentation
	 * nfctinfo:  Relationship of this skb to the connection
	 * nf_trace:  netfilter packet trace flag
	 */
	__u8			pkt_type:3;
	__u8			ignore_df:1;
	__u8			nfctinfo:3;
	__u8			nf_trace:1;

	/* ip_summed:        Driver fed us an IP checksum
	 * ooo_okay:         allow the mapping of a socket to a queue to be changed
	 * l4_hash:          indicate hash is a canonical 4-tuple hash over
	 *                   transport ports
	 * sw_hash:          indicates hash was computed in software stack
	 * wifi_acked_valid: wifi_acked was set
	 * wifi_acked:       whether frame was acked on wifi or not
	 * no_fcs:           Request NIC to treat last 4 bytes as Ethernet FCS
	 */
	__u8			ip_summed:2;
	__u8			ooo_okay:1;
	__u8			l4_hash:1;
	__u8			sw_hash:1;
	__u8			wifi_acked_valid:1;
	__u8			wifi_acked:1;
	__u8			no_fcs:1;

	/* Indicates the inner headers are valid in the skbuff. */
	__u8			encapsulation:1;
	__u8			encap_hdr_csum:1;
	__u8			csum_valid:1;
	__u8			csum_complete_sw:1;
	__u8			csum_level:2;
	__u8			csum_bad:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	/* router type (from link layer) */
	__u8			ndisc_nodetype:2;
#endif
	/* skbuff is owned by ipvs */
	__u8			ipvs_property:1;

	__u8			inner_protocol_type:1;
	__u8			remcsum_offload:1;
	/* 3 or 5 bit hole */

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	union {
		/* Checksum (must include start/offset pair) */
		__wsum		csum;
		struct {
			/* Offset from skb->head where checksumming should start */
			__u16	csum_start;
			/* Offset from csum_start where checksum should be stored */
			__u16	csum_offset;
		};
	};
	/* Packet queueing priority */
	__u32			priority;
	/* ifindex of device we arrived on */
	int			skb_iif;
	/* the packet hash */
	__u32			hash;
	/* vlan encapsulation protocol */
	__be16			vlan_proto;
	/* vlan tag control information */
	__u16			vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		/* id of the NAPI struct this skb came from */
		unsigned int	napi_id;
		unsigned int	sender_cpu;
	};
#endif
	union {
#ifdef CONFIG_NETWORK_SECMARK
		/* security marking */
		__u32		secmark;
#endif
#ifdef CONFIG_NET_SWITCHDEV
		/* fwding offload mark */
		__u32		offload_fwd_mark;
#endif
	};

	union {
		/* Generic packet mark */
		__u32		mark;
		__u32		reserved_tailroom;
	};

	union {
		/* Protocol (encapsulation) */
		__be16		inner_protocol;
		__u8		inner_ipproto;
	};

	/* Inner transport / network / link layer headers (encapsulation) */
	__u16			inner_transport_header;
	__u16			inner_network_header;
	__u16			inner_mac_header;

	/* Packet protocol from driver */
	__be16			protocol;
	/* Transport / network / link layer headers */
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;

	/* private: */
	__u32			headers_end[0];
	/* public: */

	/* These elements must be at the end, see alloc_skb() for details.  */
	/* tail: Tail pointer; end: End pointer */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	/* head: Head of buffer; data: Data head pointer */
	unsigned char		*head,
				*data;
	/* Buffer size */
	unsigned int		truesize;
	/* User count - see {datagram,tcp}.c */
	atomic_t		users;
};

【skb组织相关变量】

struct sk_buff *next;
struct sk_buff *prev;

主要用来构造SKB双向链表。该双向链表比传统的双向链表复杂,要求每个skb必须能被整个链表的头部快速找到,为满足该要求,在第一个skb节点前面会插入另一个辅助的sk_buff_head结构的头结点,可以认为该sk_buff_head结构就是skb链表的头结点。

/*
 * Auxiliary head node of an skb doubly linked list. It is inserted in
 * front of the first skb so that every skb can quickly reach the head of
 * the whole list.
 */
struct sk_buff_head {

/* These two members must be first. */
/* NOTE(review): presumably so the head can be linked like an sk_buff node,
 * whose first two members are the same next/prev pair - confirm. */

struct sk_buff *next;

struct sk_buff *prev;

__u32 qlen;// number of nodes in the skb list, i.e. the queue length

spinlock_t lock;// spinlock that serializes concurrent operations on the skb list

};

skb和sk_buff_head关系如图:

【skb数据存储相关变量】

struct sock *sk

skb的宿主传输控制块在网络数据报文由本地发出或由本地接收时才有效,使传输控制块与套接口及用户应用程序相关。当一个skb仅在二层或者三层被转发时,即源IP和目的IP都不是本机地址时该指针值为NULL。

 

unsigned int len

skb中数据部分长度。该字段值随着skb从一个协议层向另一个协议层传递而改变,向上传递时下层首部就不再需要了,而向下层传递时需添加本层首部,因此len也包含了协议首部的长度。(len=线性缓冲区数据长度+SG类型的聚合分散IO数据长度+FRAGLIST类型的聚合分散IO数据长度)

 

unsigned int data_len

SG类型和FRAGLIST类型聚合分散IO存储区中的数据长度

 

__u16 mac_len

二层首部长度

 

void (*destructor)(struct sk_buff *skb);

skb析构函数指针,释放skb时被调用。在转发时如果skb没有宿主传输控制块则该指针为NULL。

 

unsigned char *data //指向数据的头

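sk_buff_data_t tail //指向数据的尾(32位系统上为 typedef unsigned char *sk_buff_data_t;64位系统上定义了NET_SKBUFF_DATA_USES_OFFSET,此时为 typedef unsigned int sk_buff_data_t,保存的是相对于head的偏移量)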

sk_buff_data_t end //指向缓冲区的尾

unsigned char *head //指向缓冲区的头

 

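truesize(声明见下):整个skb占用的内存总大小,既包括数据缓存区,也包括sk_buff描述符本身。alloc_skb()会将truesize初始化为SKB_TRUESIZE(size),即size+sizeof(struct sk_buff)+sizeof(struct skb_shared_info)(后两项按缓存行对齐)。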

 

unsigned int truesize

atomic_t users 引用计数,用来标识有多少实体引用了该skb。其主要作用是确定释放所属skb的时机,当计数器为零时,skb才能被释放。因此,每个引用该skb的实体都必须在适当的时候递增和递减引用计数,该计数器只保护skb描述符,而skb数据缓存区也有类似的计数器(skb_shared_info结构中的dataref),通常使用skb_get()和kfree_skb()操作skb描述符引用计数。skb_get()在返回前先执行atomic_inc()操作,而kfree_skb则先执行atomic_dec_and_test(),当引用计数为0时就会释放skb,否则只是简单递减计数。

 

【skb通用成员变量】

union {

ktime_t tstamp;

struct skb_mstamp skb_mstamp;

}; //接收或发送时间戳,在网络设备收到一个数据包后通过netif_receive_skb()和netif_rx调用net_timestamp()来设置。

 

struct net_device *dev; //网络设备指针,接收数据包时该指针指向收到数据包的网络设备,发送数据包时该指针指向输出数据包的网络设备。

Linux支持多种形式的虚拟网络设备并由一个虚拟网络设备驱动管理,当这个虚拟设备被使用时,dev指针指向该虚拟设备的net_device结构,在输出时虚拟设备驱动会在一组设备中选择其中的某个合适的设备,并将dev指针修改为指向这个设备的net_device,而在输入时,当原始网络设备接收到报文后,根据某种算法选择某个合适的虚拟网络设备,并将dev指针修改为指向这个虚拟设备的net_device结构。

char cb[48] __aligned(8); //skb信息控制块,由每层协议自己维护并使用,只在本层有效。



__u8 ip_summed:2; //标记传输层校验和的状态

// ip_summed takes one of the following values:

#define CHECKSUM_NONE       0 // no hardware support; the checksum is done entirely in software

#define CHECKSUM_UNNECESSARY    1 // no need to perform the checksum

#define CHECKSUM_COMPLETE   2 // the checksum has already been performed

#define CHECKSUM_PARTIAL    3 // the checksum is to be performed by hardware

 

union {
    __wsum csum;
    struct {
        __u16 csum_start;
        __u16 csum_offset;
    };
}; 

csum在校验状态为CHECKSUM_NONE时用于存放负载数据报的数据部分的校验和;csum_offset在校验状态为CHECKSUM_PARTIAL时记录传输层首部中的校验和字段的偏移.

__u8 cloned:1,//标记skb是否已克隆

__u8 pkt_type:3;//帧类型,分类由二层目的地址来决定.

// pkt_type takes one of the following values:

#define PACKET_HOST     0       /* To us        */

#define PACKET_BROADCAST    1       /* To all       */

#define PACKET_MULTICAST    2       /* To group     */

#define PACKET_OTHERHOST    3       /* To someone else  */

#define PACKET_OUTGOING     4       /* Outgoing of any type */

#define PACKET_LOOPBACK     5       /* MC/BRD frame looped back */

#define PACKET_USER     6       /* To user space    */

#define PACKET_KERNEL       7       /* To kernel space  */



__u32 priority;//发送或转发数据包QoS类别。

__be16 protocol;//从二层设备角度看到的上层协议。

 

【标志性变量】

__u8 nohdr:1,//标识payload是否被单独引用,不存在协议首部。

__u8 fclone:2,//当前克隆状态

// fclone takes one of the following values:

enum {

     SKB_FCLONE_UNAVAILABLE, /* skb has no fclone (from head_cache); not cloned */

     SKB_FCLONE_ORIG,    /* orig skb (from fclone_cache); the allocated parent skb, which can be cloned */

     SKB_FCLONE_CLONE,   /* companion fclone skb (from fclone_cache); the allocated child skb, cloned from the parent skb */

};

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

影帝sunny

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值