以Linux-5.15.153为例,IPv4报文在Linux内核中的处理流程如下图所示:
首先,对于在Linux网络内核中,网络层的处理起点是 ip_rcv,它由Linux网络内核中链路层处理完毕后调用。ip_rcv的定义为:
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
struct net_device *orig_dev)
{
struct net *net = dev_net(dev);
//做一些报文的合法性检查
skb = ip_rcv_core(skb, net);
if (skb == NULL)
return NET_RX_DROP;
return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
net, NULL, skb, dev, NULL,
ip_rcv_finish);
}
可以看到,在对报文进行合法性检查之后,立马转到 NF_INET_PRE_ROUTING 钩子处已注册的回调函数处理,处理完毕之后再送给 ip_rcv_finish 处理。ip_rcv_finish 的定义为:
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
int ret;
/* if ingress device is enslaved to an L3 master device pass the
* skb to its handler for processing
*/
skb = l3mdev_ip_rcv(skb);
if (!skb)
return NET_RX_SUCCESS;
ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
}
可以看到,如果目标网络设备是一个从设备,它将进入 l3mdev_ip_rcv 并交由主设备的处理函数进行处理,我们将在后续的文章中讨论这一部分,在此按下不表。接下来,报文将被送到 ip_rcv_finish_core ,处理完毕后若报文没有被丢弃,则进入 dst_input 处理。首先考虑 ip_rcv_finish_core ,它的定义为:
static int ip_rcv_finish_core(struct net *net, struct sock *sk,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
const struct iphdr *iph = ip_hdr(skb);
int err, drop_reason;
struct rtable *rt;
drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
//ip_rcv_finish中传入的hint为NULL,因此这里肯定返回false,因此不进入此逻辑。
if (ip_can_use_hint(skb, iph, hint)) {
err = ip_route_use_hint(skb, iph->daddr, iph->saddr, iph->tos,
dev, hint);
if (unlikely(err))
goto drop_error;
}
if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
!ip_is_fragment(iph)) {
switch (iph->protocol) {
case IPPROTO_TCP:
if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
tcp_v4_early_demux(skb);
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
break;
case IPPROTO_UDP:
if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
err = udp_v4_early_demux(skb);
if (unlikely(err))
goto drop_error;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
}
break;
}
}
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
if (!skb_valid_dst(skb)) {
//skb中未保存路由目的则查询路由信息
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, dev);
if (unlikely(err))
goto drop_error;
} else {
struct in_device *in_dev = __in_dev_get_rcu(dev);
if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
IPCB(skb)->flags |= IPSKB_NOPOLICY;
}
#ifdef CONFIG_IP_ROUTE_CLASSID
if (unlikely(skb_dst(skb)->tclassid)) {
struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
u32 idx = skb_dst(skb)->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes += skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes += skb->len;
}
#endif
if (iph->ihl > 5 && ip_rcv_options(skb, dev))
goto drop;
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
} else if (rt->rt_type == RTN_BROADCAST) {
__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
} else if (skb->pkt_type == PACKET_BROADCAST ||
skb->pkt_type == PACKET_MULTICAST) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
/* RFC 1122 3.3.6:
*
* When a host sends a datagram to a link-layer broadcast
* address, the IP destination address MUST be a legal IP
* broadcast or IP multicast address.
*
* A host SHOULD silently discard a datagram that is received
* via a link-layer broadcast (see Section 2.4) but does not
* specify an IP multicast or broadcast destination address.
*
* This doesn't explicitly say L2 *broadcast*, but broadcast is
* in a way a form of multicast and the most common use case for
* this is 802.11 protecting against cross-station spoofing (the
* so-called "hole-196" attack) so do it for both.
*/
if (in_dev &&
IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
goto drop;
}
}
return NET_RX_SUCCESS;
drop:
kfree_skb_reason(skb, drop_reason);
return NET_RX_DROP;
drop_error:
if (err == -EXDEV) {
drop_reason = SKB_DROP_REASON_IP_RPFILTER;
__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
}
goto drop;
}
可以看到,在这里首先检查当前sk_buff中是否保存有效的路由目的信息,如果没有,则调用 ip_route_input_noref 查询路由。ip_route_input_noref 的定义为:
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev)
{
struct fib_result res;
int err;
tos &= IPTOS_RT_MASK;
rcu_read_lock();
err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
rcu_read_unlock();
return err;
}
这里重要的函数是 ip_route_input_rcu,其定义为:
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, struct fib_result *res)
{
/* Multicast recognition logic is moved from route cache to here.
* The problem was that too many Ethernet cards have broken/missing
* hardware multicast filters :-( As result the host on multicasting
* network acquires a lot of useless route cache entries, sort of
* SDR messages from all the world. Now we try to get rid of them.
* Really, provided software IP multicast filter is organized
* reasonably (at least, hashed), it does not result in a slowdown
* comparing with route cache reject entries.
* Note, that multicast routers are not affected, because
* route cache entry is created eventually.
*/
if (ipv4_is_multicast(daddr)) {
struct in_device *in_dev = __in_dev_get_rcu(dev);
int our = 0;
int err = -EINVAL;
if (!in_dev)
return err;
our = ip_check_mc_rcu(in_dev, daddr, saddr,
ip_hdr(skb)->protocol);
/* check l3 master if no match yet */
if (!our && netif_is_l3_slave(dev)) {
struct in_device *l3_in_dev;
l3_in_dev = __in_dev_get_rcu(skb->dev);
if (l3_in_dev)
our = ip_check_mc_rcu(l3_in_dev,