我用的Linux内核版本4.4.266
sk_buff结构体定义在文件/linux-4.4.266/include/linux/skbuff.h
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
* @tstamp: Time we arrived/left
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
* @sk: Socket we are owned by
* @dev: Device we arrived on/are leaving by
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
* @len: Length of actual data
* @data_len: Data length
* @mac_len: Length of link layer header
* @hdr_len: writable header length of cloned skb
* @csum: Checksum (must include start/offset pair)
* @csum_start: Offset from skb->head where checksumming should start
* @csum_offset: Offset from csum_start where checksum should be stored
* @priority: Packet queueing priority
* @ignore_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @ip_summed: Driver fed us an IP checksum
* @nohdr: Payload reference only, must not modify header
* @nfctinfo: Relationship of this skb to the connection
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs
* @peeked: this packet has been seen already, so stats have been
* done for it, don't do them again
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @nfct: Associated connection, if any
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @tc_index: Traffic control index
* @tc_verd: traffic control verdict
* @hash: the packet hash
* @queue_mapping: Queue mapping for multiqueue devices
* @xmit_more: More SKBs are pending for this queue
* @pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
* @l4_hash: indicate hash is a canonical 4-tuple hash over transport
* ports.
* @sw_hash: indicates hash was computed in software stack
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @napi_id: id of the NAPI struct this skb came from
* @secmark: security marking
* @offload_fwd_mark: fwding offload mark
* @mark: Generic packet mark
* @vlan_proto: vlan encapsulation protocol
* @vlan_tci: vlan tag control information
* @inner_protocol: Protocol (encapsulation)
* @inner_transport_header: Inner transport layer header (encapsulation)
* @inner_network_header: Network layer header (encapsulation)
* @inner_mac_header: Link layer header (encapsulation)
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
* @tail: Tail pointer
* @end: End pointer
* @head: Head of buffer
* @data: Data head pointer
* @truesize: Buffer size
* @users: User count - see {datagram,tcp}.c
*/
struct sk_buff {
/*
 * Socket buffer descriptor. The per-field notes below restate the
 * kernel-doc descriptions for this struct; members without a kernel-doc
 * entry are left unannotated.
 */
/* Queue linkage: list pointers plus timestamp, overlaid with an rb-tree node. */
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
/* Time we arrived/left; the two timestamp forms share storage. */
union {
ktime_t tstamp;
struct skb_mstamp skb_mstamp;
};
};
struct rb_node rbnode; /* used in netem, ip4 defrag, and tcp stack */
};
/* sk: socket we are owned by; ip_defrag_offset shares the same storage. */
union {
struct sock *sk;
int ip_defrag_offset;
};
/* Device we arrived on / are leaving by. */
struct net_device *dev;
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
/* Destination entry (with norefcount bit). */
unsigned long _skb_refdst;
/* Destruct function. */
void (*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
/* The security path, used for xfrm. */
struct sec_path *sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
/* Associated connection, if any. */
struct nf_conntrack *nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
/* Saved data about a bridged frame - see br_netfilter.c. */
struct nf_bridge_info *nf_bridge;
#endif
/* len: length of actual data; data_len: data length. */
unsigned int len,
data_len;
/* mac_len: length of link layer header; hdr_len: writable header length of cloned skb. */
__u16 mac_len,
hdr_len;
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
kmemcheck_bitfield_begin(flags1);
/* Queue mapping for multiqueue devices. */
__u16 queue_mapping;
/*
 * cloned:     head may be cloned (check refcnt to be sure)
 * nohdr:      payload reference only, must not modify header
 * fclone:     skbuff clone status
 * peeked:     packet has been seen already, so stats are not done again
 * xmit_more:  more SKBs are pending for this queue
 * pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
 */
__u8 cloned:1,
nohdr:1,
fclone:2,
peeked:1,
head_frag:1,
xmit_more:1,
pfmemalloc:1;
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
/* Zero-length marker: start of the region copied by that memcpy(). */
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
/* Zero-length marker whose offset PKT_TYPE_OFFSET() reports; pkt_type is declared right after it. */
__u8 __pkt_type_offset[0];
/*
 * pkt_type:         packet class
 * ignore_df:        allow local fragmentation
 * nfctinfo:         relationship of this skb to the connection
 * nf_trace:         netfilter packet trace flag
 * ip_summed:        driver fed us an IP checksum
 * ooo_okay:         allow the mapping of a socket to a queue to be changed
 * l4_hash:          hash is a canonical 4-tuple hash over transport ports
 * sw_hash:          hash was computed in software stack
 * wifi_acked_valid: wifi_acked was set
 * wifi_acked:       whether frame was acked on wifi or not
 * no_fcs:           request NIC to treat last 4 bytes as Ethernet FCS
 */
__u8 pkt_type:3;
__u8 ignore_df:1;
__u8 nfctinfo:3;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;
__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_bad:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
/* Router type (from link layer). */
__u8 ndisc_nodetype:2;
#endif
/* ipvs_property: skbuff is owned by ipvs. */
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
/* 3 or 5 bit hole */
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
/*
 * csum:        checksum (must include start/offset pair)
 * csum_start:  offset from skb->head where checksumming should start
 * csum_offset: offset from csum_start where checksum should be stored
 * The start/offset pair occupies the same storage as csum.
 */
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
/* Packet queueing priority. */
__u32 priority;
/* ifindex of device we arrived on. */
int skb_iif;
/* The packet hash. */
__u32 hash;
/* vlan encapsulation protocol. */
__be16 vlan_proto;
/* vlan tag control information. */
__u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
/* napi_id: id of the NAPI struct this skb came from; sender_cpu shares the storage. */
union {
unsigned int napi_id;
unsigned int sender_cpu;
};
#endif
/* secmark: security marking; offload_fwd_mark: fwding offload mark (same storage when both are configured). */
union {
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
#ifdef CONFIG_NET_SWITCHDEV
__u32 offload_fwd_mark;
#endif
};
/* mark: generic packet mark; reserved_tailroom shares the storage. */
union {
__u32 mark;
__u32 reserved_tailroom;
};
/* inner_protocol: protocol (encapsulation); inner_ipproto shares the storage. */
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
/* Inner transport layer header (encapsulation). */
__u16 inner_transport_header;
/* Network layer header (encapsulation). */
__u16 inner_network_header;
/* Link layer header (encapsulation). */
__u16 inner_mac_header;
/* Packet protocol from driver. */
__be16 protocol;
/* Transport layer header. */
__u16 transport_header;
/* Network layer header. */
__u16 network_header;
/* Link layer header. */
__u16 mac_header;
/* private: */
/* Zero-length marker: end of the region copied by the single memcpy(). */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
/* tail: tail pointer; end: end pointer. */
sk_buff_data_t tail;
sk_buff_data_t end;
/* head: head of buffer; data: data head pointer. */
unsigned char *head,
*data;
/* Buffer size. */
unsigned int truesize;
/* User count - see {datagram,tcp}.c. */
atomic_t users;
};
【skb组织相关变量】
struct sk_buff *next;
struct sk_buff *prev;
主要用来构造SKB双向链表。该双向链表比传统的双向链表复杂,要求每个skb必须能被整个链表的头部快速找到,为满足该要求,在第一个skb节点前面会插入另一个辅助的sk_buff_head结构的头结点,可以认为该sk_buff_head结构就是skb链表的头结点。
/*
 * Head node of an skb doubly linked list.
 * NOTE(review): next/prev presumably come first so the head can be linked
 * in place of a struct sk_buff, whose first two members are the same - confirm.
 */
struct sk_buff_head {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
__u32 qlen;// number of nodes in the skb list, i.e. the queue length
spinlock_t lock;// spinlock guarding concurrent operations on the skb list
};
skb和sk_buff_head的关系如图:
【skb数据存储相关变量】
struct sock *sk
skb的宿主传输控制块在网络数据报文由本地发出或由本地接收时才有效,使传输控制块与套接口及用户应用程序相关。当一个skb仅在二层或者三层被转发时,即源IP和目的IP都不是本机地址时该指针值为NULL。
unsigned int len
skb中数据部分长度。该字段值随着skb从一个协议层向另一个协议层传递而改变,向上传递时下层首部就不再需要了,而向下层传递时需添加本层首部,因此len也包含了协议首部的长度。(len=线性缓冲区数据长度+SG类型的聚合分散IO数据长度+FRAGLIST类型的聚合分散IO数据长度)
unsigned int data_len
SG类型和FRAGLIST类型聚合分散IO存储区中的数据长度
__u16 mac_len
二层首部长度
void (*destructor)(struct sk_buff *skb);
skb析构函数指针,释放skb时被调用。在转发时如果skb没有宿主传输控制块则该指针为NULL。
unsigned char *data //指向数据的头
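sk_buff_data_t tail //指向数据的尾(未定义NET_SKBUFF_DATA_USES_OFFSET时为typedef unsigned char *sk_buff_data_t;64位系统上定义了该宏,此时为typedef unsigned int sk_buff_data_t;,tail和end保存的是相对head的偏移量)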
sk_buff_data_t end //指向缓冲区的尾
unsigned char *head //指向缓冲区的头
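整个缓冲区占用的总长度,既包括数据缓存区,也包括sk_buff描述符本身。alloc_skb()会通过SKB_TRUESIZE(size)将truesize初始化为size+SKB_DATA_ALIGN(sizeof(struct sk_buff))+SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),其中size为线性数据区的可用大小。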
unsigned int truesize
atomic_t users 引用计数,用来标识有多少实体引用了该skb。其主要作用是确定释放所属skb的时机,当计数器为零时,skb才能被释放。因此,每个引用该skb的实体都必须在适当的时候递增和递减引用计数,该计数器只保护skb描述符,而skb数据缓存区也有类似的计数器(skb_shared_info结构中的dataref),通常使用skb_get()和kfree_skb()操作skb描述符引用计数。skb_get()在返回前先执行atomic_inc()操作,而kfree_skb则先执行atomic_dec_and_test(),当引用计数为0时就会释放skb,否则只是简单递减计数。
【skb通用成员变量】
union {
ktime_t tstamp;
struct skb_mstamp skb_mstamp;
}; //接收或发送时间戳,在网络设备收到一个数据包后,通过netif_receive_skb()和netif_rx()调用net_timestamp_check()来设置。
struct net_device *dev; //网络设备指针,接收数据包时该指针指向收到数据包的网络设备,发送数据包时该指针指向输出数据包的网络设备。
Linux支持多种形式的虚拟网络设备并由一个虚拟网络设备驱动管理,当这个虚拟设备被使用时,dev指针指向该虚拟设备的net_device结构。在输出时,虚拟设备驱动会在一组设备中选择其中某个合适的设备,并将dev指针修改为指向这个设备的net_device;而在输入时,当原始网络设备接收到报文后,会根据某种算法选择某个合适的虚拟网络设备,并将dev指针修改为指向这个虚拟设备的net_device结构。
char cb[48] __aligned(8); //skb信息控制块,由每层协议自己维护并使用,只在本层有效。
__u8 ip_summed:2; //标记传输层校验和的状态
//ip_summed取下述值:
/* Values taken by the 2-bit ip_summed field of struct sk_buff. */
#define CHECKSUM_NONE 0 // no hardware support; the checksum is computed entirely in software
#define CHECKSUM_UNNECESSARY 1 // no checksum needs to be performed
#define CHECKSUM_COMPLETE 2 // the checksum has already been computed
#define CHECKSUM_PARTIAL 3 // the checksum is to be performed by hardware
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
csum在校验状态为CHECKSUM_NONE时用于存放负载数据报的数据部分的校验和;csum_offset在校验状态为CHECKSUM_PARTIAL时记录传输层首部中的校验和字段的偏移.
__u8 cloned:1,//标记skb是否已克隆
__u8 pkt_type:3;//帧类型,分类由二层目的地址来决定.
//pkt_type取值如下:
/* Values taken by the 3-bit pkt_type field of struct sk_buff (packet class). */
#define PACKET_HOST 0 /* To us */
#define PACKET_BROADCAST 1 /* To all */
#define PACKET_MULTICAST 2 /* To group */
#define PACKET_OTHERHOST 3 /* To someone else */
#define PACKET_OUTGOING 4 /* Outgoing of any type */
#define PACKET_LOOPBACK 5 /* MC/BRD frame looped back */
#define PACKET_USER 6 /* To user space */
#define PACKET_KERNEL 7 /* To kernel space */
__u32 priority;//发送或转发数据包QoS类别。
__be16 protocol;//从二层设备角度看到的上层协议。
【标志性变量】
__u8 nohdr:1,//标识payload是否被单独引用,不存在协议首部。
__u8 fclone:2,//当前克隆状态
//fclone取值如下:
/* Clone states stored in the 2-bit fclone field of struct sk_buff. */
enum {
	/* skb has no companion fclone (came from head_cache); not cloned */
	SKB_FCLONE_UNAVAILABLE = 0,
	/* original (parent) skb from fclone_cache; it can be cloned */
	SKB_FCLONE_ORIG = 1,
	/* companion (child) skb from fclone_cache, obtained by cloning the parent */
	SKB_FCLONE_CLONE = 2,
};