1 内核中分层模型的实现
由于内核网络子系统处理大量特定于协议的细节和微妙地方,只有分层[每层功能单一]并通过层与层之间大量标准化的函数指针这种架构方式实现,才能尽可能的简洁高效,但也使得代码路径变得不清晰
自顶向下分层:
用户空间 应用程序+c标准库
内核应用层 struct socket[面向用户]
+ struct sock[面向硬件]
内核传输层       struct proto
内核网络层       struct packet_type | 特定于协议
内核主机到网络层 dev.c | struct net_device | driver.c
<--> 物理传输
2 数据传输方式的封装
一个以太网帧结构:
MAC首部|ip首部|tcp首部|http首部|html数据
<-以太网帧净荷
<-ip净荷
<-tcp净荷
3 网络命名空间
命名空间类似c++语言的namespace机制,利用命名空间,建立系统的多个虚拟视图,彼此隔离,使每个实例看起来像一台运行的linux服务器。大多数计算机目前只需要一个命名空间,由全局变量init_net描述。在linux2.6.32版本中,命名空间结构如下:
struct net
{
atomic_t count; /*
To decided when the network namespace should be
freed.*/
#ifdef NETNS_REFCNT_DEBUG
atomic_t use_count; /*
To track references we
* destroy
on demand
*/
#endif
struct list_head list; /* list of network namespaces */
头部为net_namespace_list
struct list_head cleanup_list; /* namespaces on death
row */
struct list_head exit_list; /*
Use
only net_mutex
*/
struct
proc_dir_entry *proc_net;
//该namespace
/proc/net项结构
struct
proc_dir_entry *proc_net_stat;
#ifdef
CONFIG_SYSCTL
struct
ctl_table_set sysctls;
#endif
struct net_device *loopback_dev; /* The loopback */
struct list_head dev_base_head;
//挂在该命名空间中的所有的网络设备net_device
struct
hlist_head *dev_name_head;
struct
hlist_head *dev_index_head;
/* core fib_rules */
struct list_head rules_ops;
spinlock_t rules_mod_lock;
struct sock
*rtnl; /*
rtnetlink socket
*/
struct sock *genl_sock;
struct netns_core core; //特定于协议的结构
struct netns_mib mib;
struct netns_packet packet;
struct netns_unix unx;
struct netns_ipv4 ipv4;
#if defined(CONFIG_IPV6)
|| defined(CONFIG_IPV6_MODULE)
struct netns_ipv6 ipv6;
#endif
#if defined(CONFIG_IP_DCCP)
|| defined(CONFIG_IP_DCCP_MODULE)
struct netns_dccp dccp;
#endif
#ifdef
CONFIG_NETFILTER
struct netns_xt xt;
#if defined(CONFIG_NF_CONNTRACK)
|| defined(CONFIG_NF_CONNTRACK_MODULE)
struct netns_ct ct;
#endif
struct sock *nfnl;
struct sock *nfnl_stash;
#endif
#ifdef
CONFIG_XFRM
struct netns_xfrm xfrm;
#endif
#ifdef
CONFIG_WEXT_CORE
struct sk_buff_head wext_nlevents;
#endif
struct net_generic *gen;
};
4 套接字缓冲区
每次发送或接收数据时,该数据对应一个套接字缓冲区(struct sk_buff),网络子系统各层通过它处理数据而无需来回复制,从而大幅提高性能。套接字缓冲区的基本思想:通过操作指针来增删协议首部
/**
 *	struct sk_buff - socket buffer
 *	@next: Next buffer in list
 *	@prev: Previous buffer in list
 *	@sk: Socket we are owned by
 *	@tstamp: Time we arrived
 *	@dev: Device we arrived on/are leaving by
 *	@transport_header: Transport layer header
 *	@network_header: Network layer header
 *	@mac_header: Link layer header
 *	@_skb_refdst: destination entry (with norefcount bit)
 *	@sp: the security path, used for xfrm
 *	@cb: Control buffer. Free for use by every layer. Put private vars here
 *	@len: Length of actual data
 *	@data_len: Data length
 *	@mac_len: Length of link layer header
 *	@hdr_len: writable header length of cloned skb
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@local_df: allow local fragmentation
 *	@cloned: Head may be cloned (check refcnt to be sure)
 *	@nohdr: Payload reference only, must not modify header
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ip_summed: Driver fed us an IP checksum
 *	@priority: Packet queueing priority
 *	@users: User count - see {datagram,tcp}.c
 *	@protocol: Packet protocol from driver
 *	@truesize: Buffer size
 *	@head: Head of buffer
 *	@data: Data head pointer
 *	@tail: Tail pointer
 *	@end: End pointer
 *	@destructor: Destruct function
 *	@mark: Generic packet mark
 *	@nfct: Associated connection, if any
 *	@ipvs_property: skbuff is owned by ipvs
 *	@peeked: this packet has been seen already, so stats have been
 *		done for it, don't do them again
 *	@nf_trace: netfilter packet trace flag
 *	@nfctinfo: Relationship of this skb to the connection
 *	@nfct_reasm: netfilter conntrack re-assembly pointer
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
 *	@rxhash: the packet hash computed on receive
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@tc_index: Traffic control index
 *	@tc_verd: traffic control verdict
 *	@ndisc_nodetype: router type (from link layer)
 *	@dma_cookie: a cookie to one of several possible DMA operations
 *		done by skb DMA functions
 *	@secmark: security marking
 *	@vlan_tci: vlan tag control information
 */
struct sk_buff {
	/* These two members must be first. */
	struct sk_buff		*next;	/* link into an sk_buff_head
					 * doubly-linked list */
	struct sk_buff		*prev;

	ktime_t			tstamp;	/* timestamp of packet arrival */

	struct sock		*sk;	/* owning socket; valid on transmit */
	struct net_device	*dev;	/* network interface device */

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
#ifdef CONFIG_XFRM
	struct sec_path		*sp;
#endif
	unsigned int		len,		/* linear data + sum of
						 * fragment lengths */
				data_len;	/* sum of fragment lengths
						 * only */
	__u16			mac_len,	/* MAC header length */
				hdr_len;
	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	kmemcheck_bitfield_begin(flags1);
	__u8			local_df:1,
				cloned:1,
				ip_summed:2,
				nohdr:1,
				nfctinfo:3;
	__u8			pkt_type:3,	/* packet class: this host,
						 * other host, multicast,
						 * broadcast, ... */
				fclone:2,
				ipvs_property:1,
				peeked:1,
				nf_trace:1;
	kmemcheck_bitfield_end(flags1);
	__be16			protocol;	/* frame type, e.g. ETH_P_IP
						 * (IP), ETH_P_ARP (ARP) */

	void			(*destructor)(struct sk_buff *skb);
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
	struct sk_buff		*nfct_reasm;
#endif
#ifdef CONFIG_BRIDGE_NETFILTER
	struct nf_bridge_info	*nf_bridge;
#endif

	int			skb_iif;
#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	__u32			rxhash;

	kmemcheck_bitfield_begin(flags2);
	__u16			queue_mapping:16;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2,
				deliver_no_wcard:1;
#else
	__u8			deliver_no_wcard:1;
#endif
	kmemcheck_bitfield_end(flags2);

	/* 0/14 bit hole */

#ifdef CONFIG_NET_DMA
	dma_cookie_t		dma_cookie;
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32			secmark;
#endif
	union {
		__u32		mark;
		__u32		dropcount;
	};

	__u16			vlan_tci;

	/* sk_buff_data_t is either an offset into the buffer or a raw
	 * pointer, depending on NET_SKBUFF_DATA_USES_OFFSET;
	 * typedef unsigned int sk_buff_data_t in the offset case. */
	sk_buff_data_t		transport_header; /* transport-layer header */
	sk_buff_data_t		network_header;	  /* network-layer header */
	sk_buff_data_t		mac_header;	  /* link-layer header */

	/* These elements must be at the end, see alloc_skb() for details. */
	/* data/tail bracket the valid protocol data (MAC+IP+TCP+payload);
	 * they move dynamically as the skb travels through the layers.
	 * head/end bracket the whole allocated buffer, which is larger
	 * than the data actually needed. */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;
	atomic_t		users;
};
4-1 从该套接字缓冲区中获取协议头部结构
/*
 * tcp_hdr - return the TCP header of a socket buffer.
 *
 * Uses skb_transport_header() instead of reading skb->transport_header
 * directly: when NET_SKBUFF_DATA_USES_OFFSET is set, transport_header is
 * an offset from skb->head, not a pointer, so the raw member must not be
 * cast.  (Original also lacked the statement's terminating semicolon.)
 */
static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb)
{
	return (struct tcphdr *)skb_transport_header(skb);
}
类似有:udp_hdr,ip_hdr,
4-2
操作sk_buff的标准函数:
分配一个sk_buff结构然后分配一个size大小的数据缓冲区
/*
 * alloc_skb - allocate an sk_buff plus a data buffer of @size bytes.
 * @size: requested data buffer size
 * @priority: allocation mask (GFP flags)
 *
 * Thin wrapper around __alloc_skb(): no fclone (child) skb, and -1
 * selects "any NUMA node".
 */
static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority)
{
	return __alloc_skb(size, priority, 0, -1);
}
/**
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
 *	@fclone: allocate from fclone cache instead of head cache
 *		and allocate a cloned (child) skb
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
 *	tail room of size bytes. The object has a reference count of one.
 *	The return is the buffer. On a failure the return is %NULL.
 *
 *	Buffers may only be allocated from interrupts using a @gfp_mask of
 *	%GFP_ATOMIC.
 */
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
			    int fclone, int node)
{
	struct kmem_cache *cache;
	struct skb_shared_info *shinfo;
	struct sk_buff *skb;
	u8 *data;

	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;

	/* Get the HEAD: the struct sk_buff itself normally comes from the
	 * skbuff_head_cache slab cache. */
	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
	if (!skb)
		goto out;
	prefetchw(skb);

	/* Data area: @size aligned bytes, with the struct skb_shared_info
	 * placed immediately after it (i.e. right at skb->end). */
	size = SKB_DATA_ALIGN(size);
	data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
					 gfp_mask, node);
	if (!data)
		goto nodata;
	prefetchw(data + size);

	/*
	 * Only clear those fields we need to clear, not those that we will
	 * actually initialise below. Hence, don't put any more fields after
	 * the tail pointer in struct sk_buff!
	 */
	memset(skb, 0, offsetof(struct sk_buff, tail));
	/* total footprint = data area + the sk_buff descriptor itself */
	skb->truesize = size + sizeof(struct sk_buff);
	atomic_set(&skb->users, 1);
	skb->head = data;		/* start of the data memory area */
	skb->data = data;
	skb_reset_tail_pointer(skb);	/* buffer is empty, so tail == data */
	skb->end = skb->tail + size;	/* end of the data memory area */
	kmemcheck_annotate_bitfield(skb, flags1);
	kmemcheck_annotate_bitfield(skb, flags2);
#ifdef NET_SKBUFF_DATA_USES_OFFSET
	skb->mac_header = ~0U;
#endif

	/* make sure we initialize shinfo sequentially */
	shinfo = skb_shinfo(skb);	/* skb_shared_info follows skb->end */
	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
	atomic_set(&shinfo->dataref, 1);

	if (fclone) {
		/* An fclone cache object holds the parent skb, the child
		 * skb and their shared refcount, back to back in memory. */
		struct sk_buff *child = skb + 1;
		atomic_t *fclone_ref = (atomic_t *) (child + 1);

		kmemcheck_annotate_bitfield(child, flags1);
		kmemcheck_annotate_bitfield(child, flags2);
		skb->fclone = SKB_FCLONE_ORIG;
		atomic_set(fclone_ref, 1);

		child->fclone = SKB_FCLONE_UNAVAILABLE;
	}
out:
	return skb;
nodata:
	kmem_cache_free(cache, skb);
	skb = NULL;
	goto out;
}
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t mask): 新建一个sk_buff结构,并复制数据缓冲区内容
struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t mask): 新建一个sk_buff结构,与原skb共享同一数据缓冲区
skb_tailroom(skb): 返回 (int)(skb->end - skb->tail),即数据区尾部的空闲空间
skb_headroom(skb): 返回 (int)(skb->data - skb->head),即数据区头部的空闲空间
skb_realloc_headroom()
在数据起始处创建更多空闲空间,现存数据不变