转载:
https://blog.youkuaiyun.com/eric_liufeng/article/details/10789811
https://blog.youkuaiyun.com/shichaog/article/details/44572561
https://www.cnblogs.com/aiwz/p/6333287.html
https://blog.youkuaiyun.com/shanshanpt/article/details/20699543
https://blog.youkuaiyun.com/ztguang/article/details/74938574
(图片摘自https://blog.youkuaiyun.com/eric_liufeng/article/details/10789811,为了舒服,我把水印去了,sorry。)
内核版本:linux3.9.1
1. ip_rcv()
二层报文进入三层时,首先是由ip_rcv()函数进行处理。
作用:
对ip报文数据的正确性进行判断,包括首部长度,版本,总长度,校验和等,在函数最后调用注册在NF_INET_PRE_ROUTING处的钩子函数,最后进入到ip_rcv_finish()函数。
//linux/net/ipv4/ip_input.c
/*
* Main IP Receive routine.
*
* Entry point for IPv4 packets. Sanity-checks the IP header (minimum
* length, version, header checksum, total length), trims the buffer to
* the length given in the header, clears the IP control block, and then
* passes the packet through the NF_INET_PRE_ROUTING hook with
* ip_rcv_finish() as the continuation.
*
* Returns the NF_HOOK() result on the normal path and NET_RX_DROP on every
* rejection path.
*/
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
const struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
/* Frames flagged PACKET_OTHERHOST are freed here, before any statistics
* are updated.
*/
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
/* skb_share_check() may hand back a different skb; per the original
* author's note it copies the buffer when it is shared - TODO confirm.
* A NULL result is counted as a discard and exits through 'out', which
* skips kfree_skb() because there is no buffer left to free.
*/
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto out;
}
/* Require at least a minimal IP header (sizeof(struct iphdr)). */
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
/* Header length field must be at least 5 and the version must be 4. */
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
/* Require the full header, including options (ihl*4 bytes).
* NOTE(review): this is an "at least" check rather than an equality
* check, presumably because the payload follows in the same buffer -
* confirm against pskb_may_pull().
*/
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
/* Re-read the header pointer after the pull above. */
iph = ip_hdr(skb);
/* Verify the header checksum. ihl is passed unscaled (not ihl*4);
* presumably ip_fast_csum() takes the length in 32-bit words - TODO confirm.
*/
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
len = ntohs(iph->tot_len);
/* The buffer may be longer than tot_len (see the padding note below),
* but a buffer shorter than tot_len is counted as truncated and dropped.
*/
if (skb->len < len) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
} else if (len < (iph->ihl*4)) /* total length must cover the header */
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
/* A trim failure is counted as a discard.
* NOTE(review): per the original author's note the _rcsum variant also
* invalidates a hardware-computed L4 checksum so it gets recomputed -
* confirm against pskb_trim_rcsum().
*/
if (pskb_trim_rcsum(skb, len)) {
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
/* Zero the whole struct inet_skb_parm stored in the skb control block. */
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
/* Must drop socket now because of tproxy. */
/* NOTE(review): per the original author's note skb_orphan() runs the
* current owner's destructor and leaves the skb without an owning
* socket - confirm.
*/
skb_orphan(skb);
/* Run the NF_INET_PRE_ROUTING hook; ip_rcv_finish() is passed as the
* continuation.
*/
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
/* Header errors are counted, then fall through to the common free path. */
IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
2. ip_rcv_finish()
作用:
1.查找路由,决定对报文进行本地接收还是转发,赋值skb_dst()->input(),发往本地为ip_local_deliver,转发为ip_forward()。
2.对IP的选项进行处理
//linux/net/ipv4/ip_input.c
/*
* Continuation of ip_rcv() after the NF_INET_PRE_ROUTING hook.
*
* Optionally runs the protocol's early_demux callback, resolves the input
* route when the skb has no dst yet, processes IP options, updates
* multicast/broadcast input counters, and finally passes the packet to
* dst_input(). Returns the dst_input() result, or NET_RX_DROP after
* freeing the skb on failure.
*/
static int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/* Early demux: only attempted when the sysctl is enabled and no dst is
* attached yet; the callback is looked up by IP protocol number.
*/
if (sysctl_ip_early_demux && !skb_dst(skb)) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->early_demux) {
ipprot->early_demux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
}
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
/* Still no dst attached: ask the routing code for an input route. */
if (!skb_dst(skb)) {
int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb->dev);
/* Route lookup failed: drop; -EXDEV is additionally counted as
* LINUX_MIB_IPRPFILTER.
*/
if (unlikely(err)) {
if (err == -EXDEV)
NET_INC_STATS_BH(dev_net(skb->dev),
LINUX_MIB_IPRPFILTER);
goto drop;
}
}
/* Per-CPU accounting keyed by the route's tclassid: the low byte indexes
* the "o_" counters and bits 16-23 index the "i_" counters.
*/
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
/* Options are present only when ihl > 5; a non-zero return from
* ip_rcv_options() means the packet must be dropped.
*/
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
/* Fetch the route attached to the skb and update the multicast or
* broadcast input counters according to its type.
*/
rt = skb_rtable(skb);
/* multicast */
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
skb->len);
}
/* broadcast */
else if (rt->rt_type == RTN_BROADCAST)
IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
skb->len);
/* Hand the packet to the route's input handler.
* NOTE(review): per the original author's notes dst_input() calls
* skb_dst(skb)->input(skb), which the route lookup sets to
* ip_local_deliver() for local delivery, ip_forward() for unicast
* forwarding, or ip_mr_input() for multicast forwarding - confirm.
*/
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
相关函数:
skb_dst()
/**
 * skb_dst - fetch the dst_entry attached to an skb
 * @skb: buffer
 *
 * The pointer is returned whether or not a reference is held on it; the
 * flag bits stored alongside it in _skb_refdst are masked off.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
	/* A dst stored without a reference is only expected to be read
	 * inside an rcu_read_lock (or rcu_read_lock_bh) section; warn
	 * when neither is held.
	 */
	WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
		!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}
dst_entry结构体
/* Each dst_entry has reference count and sits in some parent list(s).
* When it is removed from parent list, it is "freed" (dst_free).
* After this it enters dead state (dst->obsolete > 0) and if its refcnt
* is zero, it can be destroyed immediately, otherwise it is added
* to gc list and garbage collector periodically checks the refcnt.
*/
struct sk_buff;
struct dst_entry {
struct rcu_head rcu_head;
struct dst_entry *child;
/* Device associated with this entry; ip_output() uses it as the output device. */
struct net_device *dev;
struct dst_ops *ops;
unsigned long _metrics;
unsigned long expires;
struct dst_entry *path;
struct dst_entry *from;
#ifdef CONFIG_XFRM
/* Checked for non-NULL in ip_finish_output() to decide on re-routing. */
struct xfrm_state *xfrm;
#else
void *__pad1;
#endif
/* Per-route packet handlers.
* NOTE(review): presumably invoked via dst_input() / dst_output() - confirm.
*/
int (*input)(struct sk_buff *);
int (*output)(struct sk_buff *);
unsigned short flags;
/* Bit values for 'flags'. */
#define DST_HOST 0x0001
#define DST_NOXFRM 0x0002
#define DST_NOPOLICY 0x0004
#define DST_NOHASH 0x0008
#define DST_NOCACHE 0x0010
#define DST_NOCOUNT 0x0020
#define DST_NOPEER 0x0040
#define DST_FAKE_RTABLE 0x0080
#define DST_XFRM_TUNNEL 0x0100
#define DST_XFRM_QUEUE 0x0200
/* Consumed and cleared by dst_neigh_output(), which then refreshes n->confirmed. */
unsigned short pending_confirm;
short error;
/* A non-zero value of dst->obsolete forces by-hand validation
* of the route entry. Positive values are set by the generic
* dst layer to indicate that the entry has been forcefully
* destroyed.
*
* Negative values are used by the implementation layer code to
* force invocation of the dst_ops->check() method.
*/
short obsolete;
#define DST_OBSOLETE_NONE 0
#define DST_OBSOLETE_DEAD 2
#define DST_OBSOLETE_FORCE_CHK -1
#define DST_OBSOLETE_KILL -2
unsigned short header_len; /* more space at head required */
unsigned short trailer_len; /* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Read by ip_rcv_finish() to index the per-CPU ip_rt_acct counters. */
__u32 tclassid;
#else
__u32 __pad2;
#endif
/*
* Align __refcnt to a 64 bytes alignment
* (L1_CACHE_SIZE would be too much)
*/
#ifdef CONFIG_64BIT
long __pad_to_align_refcnt[2];
#endif
/*
* __refcnt wants to be on a different cache line from
* input/output/ops or performance tanks badly
*/
atomic_t __refcnt; /* client references */
int __use;
unsigned long lastuse;
/* Next-entry link; one union member per protocol-specific route type. */
union {
struct dst_entry *next;
struct rtable __rcu *rt_next;
struct rt6_info *rt6_next;
struct dn_route __rcu *dn_next;
};
};
skb_rtable函数
/* Return the skb's attached dst_entry viewed as an IPv4 struct rtable. */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	return (struct rtable *)dst;
}
3. ip_local_deliver()
这个函数比较简单,
作用:
1.判断ip报文是否分片,是则进行重组
2.调用注册在NF_INET_LOCAL_IN处的钩子函数,放行后,进入ip_local_deliver_finish()函数。
//linux/net/ipv4/ip_input.c
/*
 * Deliver IP Packets to the higher protocol layers.
 *
 * Fragments are first passed to ip_defrag(); a non-zero result ends
 * processing of this skb here with a return value of 0. Everything else
 * goes through the NF_INET_LOCAL_IN hook with ip_local_deliver_finish()
 * as the continuation.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/*
	 * Reassemble IP fragments.
	 * NOTE(review): presumably a non-zero ip_defrag() result means the
	 * datagram is not yet complete (or was dropped) - confirm.
	 */
	if (ip_is_fragment(ip_hdr(skb)) &&
	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}
4. ip_local_deliver_finish()
作用:
1. 处理Raw ip,如果配置了安全策略,则进行IPSec安全检查
2. 根据ip报头的protocol字段,找到对应的L4层接收函数(先找到对应的net_protocol,再调用net_protocol->handler)。对于TCP,为tcp_v4_rcv。报文由此进入L4。
//linux/net/ipv4/ip_input.c
/*
* Continuation of ip_local_deliver() after the NF_INET_LOCAL_IN hook.
*
* Strips the IP header, offers the packet to raw sockets, then dispatches
* it to the transport handler registered in inet_protos[] for the IP
* protocol number. Always returns 0.
*/
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
/* Advance past the IP header so the data pointer sits at the L4 header. */
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
/* Record the current data position as the transport header. */
skb_reset_transport_header(skb);
rcu_read_lock();
{
int protocol = ip_hdr(skb)->protocol;
const struct net_protocol *ipprot;
int raw;
resubmit:
/* Offer the packet to raw sockets bound to this protocol; 'raw' is
* tested below when no protocol handler is registered.
* NOTE(review): per the original author's note raw sockets receive a
* copy of the packet - confirm against raw_local_deliver().
*/
raw = raw_local_deliver(skb, protocol);
/* Look up the registered transport protocol (tcp_protocol for TCP). */
ipprot = rcu_dereference(inet_protos[protocol]);
/* A handler is registered: run it. */
if (ipprot != NULL) {
int ret;
/* Unless the protocol opts out via no_policy, run the inbound
* xfrm policy check; a failed check frees the skb and exits.
*/
if (!ipprot->no_policy) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
/* Call the L4 receive function (tcp_v4_rcv for tcp_protocol). */
ret = ipprot->handler(skb);
/* A negative return asks for re-dispatch with protocol number -ret. */
if (ret < 0) {
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
} else {
/* No handler registered and no raw socket took the packet. */
if (!raw) {
/* Only when the inbound policy check passes: count an unknown
* protocol and send ICMP protocol-unreachable to the sender.
*/
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
/* ICMP destination unreachable / protocol unreachable */
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
kfree_skb(skb);
} else {
/* A raw socket took it: count as delivered and release the skb. */
IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
/* release the skb */
consume_skb(skb);
}
}
}
out:
rcu_read_unlock();
return 0;
}
skb_reset_transport_header()
/* Set the skb's transport header to its current data pointer. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
skb->transport_header = skb->data;
}
net_protocol结构体
/* This is used to register protocols. */
struct net_protocol {
/* Optional; called from ip_rcv_finish() before the route lookup. */
void (*early_demux)(struct sk_buff *skb);
/* Receive entry point called from ip_local_deliver_finish(); a negative
* return makes the caller re-dispatch with protocol number -ret.
*/
int (*handler)(struct sk_buff *skb);
void (*err_handler)(struct sk_buff *skb, u32 info);
/* no_policy: when set, ip_local_deliver_finish() skips the inbound
* xfrm4_policy_check() and nf_reset() for this protocol.
*/
unsigned int no_policy:1,
netns_ok:1;
};
tcp_protocol结构体
//linux/net/ipv4/af_inet.c
/* TCP's net_protocol entry: tcp_v4_rcv() is the handler that
* ip_local_deliver_finish() calls through ipprot->handler, and
* no_policy = 1 makes that caller skip its own xfrm policy check.
* NOTE(review): presumably registered in inet_protos[] under IPPROTO_TCP -
* the registration is not shown here.
*/
static const struct net_protocol tcp_protocol = {
.early_demux = tcp_v4_early_demux,
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.no_policy = 1,
.netns_ok = 1,
};
5. ip_forward()
作用:
1. 处理Router Alert、Strict Source Routing(严格源路由选项)选项;
2. 各种检查,ttl-1;
3. 调用NF_INET_FORWARD处钩子函数,放行后,进入ip_forward_finish函数。
//linux/net/ipv4/ip_forward.c
/*
* Forward an IPv4 packet that is not addressed to this host.
*
* Runs the forwarding policy checks, handles the Router Alert and strict
* source route options, enforces TTL and MTU/DF limits (sending the
* matching ICMP errors), decrements the TTL, and passes the packet
* through the NF_INET_FORWARD hook with ip_forward_finish() as the
* continuation.
*
* Returns the NF_HOOK() result, NET_RX_SUCCESS when the router-alert chain
* consumed the packet, or NET_RX_DROP after freeing the skb.
*/
int ip_forward(struct sk_buff *skb)
{
struct iphdr *iph; /* Our header */
struct rtable *rt; /* Route we use */
struct ip_options *opt = &(IPCB(skb)->opt);
if (skb_warn_if_lro(skb))
goto drop;
/* xfrm forwarding policy check; drop on failure. */
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
goto drop;
/* Router Alert option present: offer the packet to the RA chain and
* stop forwarding when it is taken.
* NOTE(review): per the original author's note the chain delivers to
* user processes interested in router alerts - confirm.
*/
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
return NET_RX_SUCCESS;
/* Only frames whose packet type is PACKET_HOST are forwarded. */
if (skb->pkt_type != PACKET_HOST)
goto drop;
/* NOTE(review): per the original author's note this effectively sets
* skb->ip_summed = CHECKSUM_NONE so the checksum is computed in software
* on output - confirm against skb_forward_csum().
*/
skb_forward_csum(skb);
/*
* According to the RFC, we must first decrease the TTL field. If
* that reaches zero, we must reply an ICMP control message telling
* that the packet's lifetime expired.
*/
/* TTL of 0 or 1 cannot be forwarded: send ICMP time exceeded and drop. */
if (ip_hdr(skb)->ttl <= 1)
goto too_many_hops;
/* xfrm route/forward processing; drop on failure. */
if (!xfrm4_route_forward(skb))
goto drop;
/* Route attached to the skb. */
rt = skb_rtable(skb);
/* A strict source route cannot be honoured when the route goes through
* a gateway: report ICMP source-route-failed.
*/
if (opt->is_strictroute && rt->rt_uses_gateway)
goto sr_failed;
/* Packet exceeds the route MTU, is not GSO, has DF set and local_df is
* clear: count a fragmentation failure, send ICMP "fragmentation needed"
* carrying the MTU, and drop.
*/
if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
(ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(dst_mtu(&rt->dst)));
goto drop;
}
/* We are about to mangle packet. Copy it! */
/* Request headroom for the link-layer header plus the route's
* header_len before the header is modified.
* NOTE(review): per the original author's note skb_cow() copies only
* when the skb is shared or the headroom is insufficient - confirm.
*/
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
goto drop;
/* Re-read the header pointer after skb_cow(). */
iph = ip_hdr(skb);
/* Decrease ttl after skb cow done */
/* decrement TTL by one */
ip_decrease_ttl(iph);
/*
* We now generate an ICMP HOST REDIRECT giving the route
* we calculated.
*/
/* Send a redirect only when the route is flagged RTCF_DOREDIRECT, the
* packet has no source-route option, and it carries no sec path.
*/
if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
ip_rt_send_redirect(skb);
skb->priority = rt_tos2priority(iph->tos);
/* Run the NF_INET_FORWARD hook (in: skb->dev, out: rt->dst.dev);
* ip_forward_finish() is passed as the continuation.
*/
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
rt->dst.dev, ip_forward_finish);
sr_failed:
/*
* Strict routing permits no gatewaying
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
goto drop;
too_many_hops:
/* Tell the sender its packet died... */
IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}
6. ip_forward_finish()
作用:
1. IP选项的处理
2. 根据skb_dst(skb)->output(skb),调用相应的报文输出函数。如果是单播则是ip_output; 如果是多播,则是ip_mc_output。
//linux/net/ipv4/ip_forward.c
/*
 * Continuation of ip_forward() after the NF_INET_FORWARD hook.
 *
 * Bumps the forwarded-datagram and output-octet counters, processes any
 * IP options recorded in the skb control block, and passes the packet to
 * dst_output(), whose result is returned.
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	/* Options are rare on forwarded packets; handle them only if present. */
	if (unlikely(IPCB(skb)->opt.optlen))
		ip_forward_options(skb);

	/* NOTE(review): per the original author's note dst_output() calls
	 * skb_dst(skb)->output(skb), i.e. ip_output() for unicast and
	 * ip_mc_output() for multicast - confirm where the pointer is set.
	 */
	return dst_output(skb);
}
7. ip_output()
所有传输(无论是本地产生还是转发其他主机报文)都会通过dst_output上路。
作用:
1. 设置发送设备skb->dev和skb->protocol;
2. 调用NF_INET_POST_ROUTING处钩子函数,放行后,进入ip_finish_output函数。
//linux/net/ipv4/ip_output.c
/*
 * Output entry point for IPv4 packets.
 *
 * Accounts the packet, stamps the skb with the route's output device and
 * the ETH_P_IP protocol id, then runs the NF_INET_POST_ROUTING hook with
 * ip_finish_output() as the continuation. The hook condition is false for
 * packets already flagged IPSKB_REROUTED.
 */
int ip_output(struct sk_buff *skb)
{
	struct net_device *out_dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(out_dev), IPSTATS_MIB_OUT, skb->len);

	/* This is the skb's protocol id, not the IP header's protocol field. */
	skb->protocol = htons(ETH_P_IP);
	skb->dev = out_dev;

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    out_dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
8. ip_finish_output()
这个函数命名有点怪,不是应该是ip_output_finish()形式么?
1. 判断报文是否需要分片,是则调用ip_fragment()函数进行分片,结束后再调用ip_finish_output2()函数;如果不需要分片,直接进入ip_finish_output2()函数。
//linux/net/ipv4/ip_output.c
/*
 * Continuation of ip_output() after the NF_INET_POST_ROUTING hook.
 *
 * Re-routes packets whose dst carries an xfrm state (Netfilter + XFRM
 * builds only), fragments non-GSO packets larger than the path MTU, and
 * otherwise sends the packet on via ip_finish_output2().
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* Flag the skb so ip_output() does not run the hook again. */
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	/* Oversized packets are fragmented here unless they are GSO packets. */
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);

	return ip_finish_output2(skb);
}
9. ip_finish_output2()
作用:
通过邻居子系统将数据报输出到网络设备。
//linux/net/ipv4/ip_output.c
/*
* Last IPv4 output step: resolve the next-hop neighbour and transmit.
*
* Updates multicast/broadcast output counters, makes sure there is enough
* headroom for the link-layer header, looks up (or creates) the neighbour
* entry for the next hop and sends the packet via dst_neigh_output().
*
* Returns the dst_neigh_output() result, -ENOMEM when the headroom
* reallocation fails, or -EINVAL when no neighbour entry is available; the
* skb is freed on both error paths.
*/
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
/* multicast */
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) /* broadcast */
IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
/* Be paranoid, rather than too clever. */
/* Not enough headroom for the link-layer header: reallocate with more
* headroom, move socket ownership to the new skb, and release the old one.
*/
if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
struct sk_buff *skb2;
skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
if (skb2 == NULL) {
kfree_skb(skb);
return -ENOMEM;
}
if (skb->sk)
skb_set_owner_w(skb2, skb->sk);
consume_skb(skb);
skb = skb2;
}
/* The neighbour lookup below takes no reference ("noref"), so the
* lookup and the transmit both happen inside this RCU-bh section.
*/
rcu_read_lock_bh();
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
/* No existing entry: create one in the ARP table. */
if (unlikely(!neigh))
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
if (!IS_ERR(neigh)) {
/* dst_neigh_output() uses the cached link-layer header when the
* neighbour is connected and a header is cached, otherwise the
* neighbour's own output method.
*/
int res = dst_neigh_output(dst, neigh, skb);
rcu_read_unlock_bh();
return res;
}
rcu_read_unlock_bh();
/* Neighbour creation failed: the packet cannot be sent, so log
* (rate-limited) and free it.
*/
net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
__func__);
kfree_skb(skb);
return -EINVAL;
}
相关函数:
1. dst_neigh_output()
/*
 * Transmit @skb to neighbour @n on behalf of route @dst.
 *
 * A pending confirmation on the dst is consumed first, refreshing the
 * neighbour's confirmed timestamp. The packet then goes out via the
 * cached link-layer header when the neighbour is in a connected state and
 * a header is cached, otherwise via the neighbour's output method.
 */
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *cached_hdr = &n->hh;

	if (dst->pending_confirm) {
		unsigned long stamp = jiffies;

		dst->pending_confirm = 0;
		/* avoid dirtying neighbour */
		if (n->confirmed != stamp)
			n->confirmed = stamp;
	}

	if (!(n->nud_state & NUD_CONNECTED) || !cached_hdr->hh_len)
		return n->output(n, skb);

	return neigh_hh_output(cached_hdr, skb);
}
2. neigh_hh_output()
可以看到在这个函数中调用了dev_queue_xmit()函数。
/*
* Prepend the cached link-layer header @hh to @skb and queue it for
* transmission. Returns the dev_queue_xmit() result.
*/
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
unsigned int seq;
int hh_len;
/* Copy the cached header into the space just in front of skb->data.
* The copy is retried whenever read_seqretry() reports that hh_lock
* changed while it was in progress.
*/
do {
seq = read_seqbegin(&hh->hh_lock);
hh_len = hh->hh_len;
/* Short headers are copied as a fixed HH_DATA_MOD-byte block. */
if (likely(hh_len <= HH_DATA_MOD)) {
/* this is inlined by gcc */
memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD);
} else {
/* Longer headers are copied at their aligned length. */
int hh_alen = HH_DATA_ALIGN(hh_len);
memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
}
} while (read_seqretry(&hh->hh_lock, seq));
/* Extend the skb's data area by the real header length, hh_len. */
skb_push(skb, hh_len);
return dev_queue_xmit(skb);
}
10. dev_queue_xmit()
终于到了神奇的dev_queue_xmit()。
ip_mc_output,ip_fragment等一大堆函数;邻居子系统,IP选项处理,IP路由,ip_queue_xmit一条线函数以后再说。
先分析到dev_queue_xmit(),下面的dev_hard_start_xmit()以后再说,明天还要分析桥上的流程。
路还很长,太多的细节有待研究。Come on !