Linux内核数据包L3层转发处理流程

转载:

https://blog.csdn.net/eric_liufeng/article/details/10789811

https://blog.csdn.net/shichaog/article/details/44572561

https://www.cnblogs.com/aiwz/p/6333287.html

https://blog.csdn.net/shanshanpt/article/details/20699543

https://blog.csdn.net/ztguang/article/details/74938574

(图片摘自https://blog.csdn.net/eric_liufeng/article/details/10789811,为了舒服,我把水印去了,sorry。)

内核版本:linux3.9.1

1. ip_rcv()

二层报文进入三层时,首先是由ip_rcv()函数进行处理。

作用:

对ip报文数据的正确性进行判断,包括首部长度,版本,总长度,校验和等,在函数最后调用注册在NF_INET_PRE_ROUTING处的钩子函数,最后进入到ip_rcv_finish()函数。

//linux/net/ipv4/ip_input.c
/*
 * 	Main IP Receive routine.
 *
 *	Entry point for IPv4 packets handed up from layer 2.  Sanity-checks
 *	the IP header (minimum length, version, header checksum, total
 *	length), trims the buffer to the length announced in the header,
 *	clears the IP control block and then runs the NF_INET_PRE_ROUTING
 *	netfilter hook with ip_rcv_finish() as the continuation.
 *
 *	@skb:      received buffer; freed here on the drop paths
 *	@dev:      device the packet arrived on (used for per-netns statistics
 *	           and passed to the hook as the input device)
 *	@pt:       not referenced in this function
 *	@orig_dev: not referenced in this function
 *
 *	Returns NET_RX_DROP on every drop path, otherwise whatever NF_HOOK()
 *	returns.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	const struct iphdr *iph;
	u32 len;

	/* When the interface is in promisc. mode, drop all the crap
	 * that it receives, do not try to analyse it.
	 */
	/* Frames marked PACKET_OTHERHOST are dropped before any statistics
	 * are updated.
	 */
	if (skb->pkt_type == PACKET_OTHERHOST)// pkt_type is already set on entry; presumably filled in by the L2 receive path
		goto drop;


	IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);
	
	/* skb_share_check() may hand back a different skb.  A NULL result is
	 * counted as INDISCARDS and jumps straight to 'out': there is no skb
	 * left to free at that point.  Presumably the helper clones the
	 * buffer when it is shared -- confirm against skb_share_check().
	 */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto out;
	}
	
	/* Require at least sizeof(struct iphdr) bytes of header before any
	 * header field is read; failure is a header error.
	 */
	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
		goto inhdr_error;

	iph = ip_hdr(skb);

	/*
	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
	 *
	 *	Is the datagram acceptable?
	 *
	 *	1.	Length at least the size of an ip header
	 *	2.	Version of 4
	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
	 *	4.	Doesn't have a bogus length
	 */
	/* Header-length field must be at least 5 words and version must be 4. */
	if (iph->ihl < 5 || iph->version != 4)
		goto inhdr_error;
	/* Now that ihl is known, repeat the availability check for the full
	 * header (ihl*4 bytes, i.e. including any options).  This is a lower
	 * bound on the data that must be present, not an equality test: the
	 * payload follows the header in the same buffer.
	 */
	if (!pskb_may_pull(skb, iph->ihl*4))
		goto inhdr_error;

	/* Re-read the header pointer after the pull.
	 * NOTE(review): presumably because pskb_may_pull() can relocate the
	 * header data -- confirm.
	 */
	iph = ip_hdr(skb);
	
	/* Verify the header checksum.  ihl is passed as-is (not multiplied by
	 * 4); presumably ip_fast_csum() takes the length in 32-bit words.
	 * A non-zero result is treated as a header error.
	 */
	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
		goto inhdr_error;

	len = ntohs(iph->tot_len);
	/* The buffer must hold at least tot_len bytes.  It may legitimately
	 * hold more (see the padding note below), but a shorter buffer is
	 * counted as truncated and dropped.  A tot_len smaller than the
	 * header length is a header error.
	 */
	if (skb->len < len) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
		goto drop;
	} else if (len < (iph->ihl*4))// total length can never be smaller than the header length
		goto inhdr_error;

	/* Our transport medium may have padded the buffer out. Now we know it
	 * is IP we can trim to the true length of the frame.
	 * Note this now means skb->len holds ntohs(iph->tot_len).
	 */
	/* A failing trim is counted as INDISCARDS and the packet is dropped.
	 * NOTE(review): the _rcsum variant presumably also adjusts or
	 * invalidates a hardware-computed receive checksum -- confirm.
	 */
	if (pskb_trim_rcsum(skb, len)) {
		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	/* Remove any debris in the socket control block */
	/* Zero the whole inet_skb_parm area so later stages start from a
	 * clean state; ip_forward(), for example, reads option state from
	 * IPCB(skb)->opt.
	 */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

	/* Must drop socket now because of tproxy. */
	/* Detach the skb from any current owner.
	 * NOTE(review): presumably this runs the owner's destructor and
	 * leaves the buffer itself alive -- confirm against skb_orphan().
	 */
	skb_orphan(skb);
	
	/* Run the hooks registered at NF_INET_PRE_ROUTING; ip_rcv_finish is
	 * passed as the continuation for packets the hooks let through.
	 */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
		       ip_rcv_finish);

inhdr_error:
	/* Header errors are counted, then fall through to the common drop. */
	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
	kfree_skb(skb);
out:
	return NET_RX_DROP;
}

2. ip_rcv_finish()

作用:

1.查找路由,决定对报文进行本地接收还是转发,赋值skb_dst()->input(),发往本地为ip_local_deliver,转发为ip_forward()。

2.对IP的选项进行处理

//linux/net/ipv4/ip_input.c
/*
 * Continuation of ip_rcv() once the NF_INET_PRE_ROUTING hooks have run.
 *
 * Optionally runs the L4 protocol's early demux, makes sure the skb has a
 * route (dst) attached, updates per-class and multicast/broadcast
 * statistics, processes IP options when the header is longer than 5 words,
 * and finally hands the packet to dst_input().
 *
 * Returns NET_RX_DROP (after freeing the skb) when the route lookup or the
 * option processing fails, otherwise the value returned by dst_input().
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct rtable *rt;
	
	
	/* Optional early demux: when enabled by sysctl and no dst is attached
	 * yet, give the L4 protocol's early_demux handler (if any) a chance
	 * to run before the route lookup.
	 */
	if (sysctl_ip_early_demux && !skb_dst(skb)) {
		const struct net_protocol *ipprot;
		int protocol = iph->protocol;

		ipprot = rcu_dereference(inet_protos[protocol]);
		if (ipprot && ipprot->early_demux) {
			ipprot->early_demux(skb);
			/* must reload iph, skb->head might have changed */
			iph = ip_hdr(skb);
		}
	}

	/*
	 *	Initialise the virtual path cache for the packet. It describes
	 *	how the packet travels inside Linux networking.
	 */
	/* If the skb still has no dst attached, ask the routing code for an
	 * input route.
	 */
	if (!skb_dst(skb)) {
		int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					       iph->tos, skb->dev);
		/* Lookup failed: drop the packet.  -EXDEV is additionally
		 * counted under LINUX_MIB_IPRPFILTER.
		 */
		if (unlikely(err)) {
			if (err == -EXDEV)
				NET_INC_STATS_BH(dev_net(skb->dev),
						 LINUX_MIB_IPRPFILTER);
			goto drop;
		}
	}

/* Per-class route accounting: the low byte of tclassid indexes the
 * "o_" counters and bits 16..23 index the "i_" counters.
 */
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (unlikely(skb_dst(skb)->tclassid)) {
		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
		u32 idx = skb_dst(skb)->tclassid;
		st[idx&0xFF].o_packets++;
		st[idx&0xFF].o_bytes += skb->len;
		st[(idx>>16)&0xFF].i_packets++;
		st[(idx>>16)&0xFF].i_bytes += skb->len;
	}
#endif
	/* Options are only present when ihl > 5.  A non-zero return from
	 * ip_rcv_options() means the packet must be dropped.
	 */
	if (iph->ihl > 5 && ip_rcv_options(skb))
		goto drop;
	/* Fetch the route attached to the skb and account multicast and
	 * broadcast input separately.
	 */
	rt = skb_rtable(skb);
	/* multicast */
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INMCAST,
				skb->len);
	} 
	/* broadcast */
	else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS_BH(dev_net(rt->dst.dev), IPSTATS_MIB_INBCAST,
				skb->len);
	/* Hand the packet to the route's input handler.
	 * NOTE(review): per the accompanying article, dst_input() calls
	 * skb_dst(skb)->input(skb), a pointer set up during the route lookup:
	 *   - ip_local_deliver() for packets addressed to this host
	 *   - ip_forward()       for unicast packets to be forwarded
	 *   - ip_mr_input()      for multicast packets to be forwarded
	 * None of that wiring is visible in this function -- confirm.
	 */
	return dst_input(skb);

drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

相关函数:

skb_dst()


/**
 * skb_dst - fetch the dst_entry attached to an skb
 * @skb: buffer
 *
 * The destination pointer shares skb->_skb_refdst with flag bits; masking
 * with SKB_DST_PTRMASK strips them.  The entry is returned whether or not
 * a reference was taken on it.
 */
static inline struct dst_entry *skb_dst(const struct sk_buff *skb)
{
	/* A dst flagged SKB_DST_NOREF should only be looked at while one of
	 * the RCU read-side sections is held; warn when neither is.
	 */
	WARN_ON((skb->_skb_refdst & SKB_DST_NOREF) &&
		!(rcu_read_lock_held() || rcu_read_lock_bh_held()));

	return (struct dst_entry *)(skb->_skb_refdst & SKB_DST_PTRMASK);
}

dst_entry结构体


/* Each dst_entry has reference count and sits in some parent list(s).
 * When it is removed from parent list, it is "freed" (dst_free).
 * After this it enters dead state (dst->obsolete > 0) and if its refcnt
 * is zero, it can be destroyed immediately, otherwise it is added
 * to gc list and garbage collector periodically checks the refcnt.
 */

struct sk_buff;

/*
 * Protocol-independent destination (route cache) entry.  skb_rtable()
 * casts a pointer to this structure to struct rtable.  Field order is
 * deliberate (see the alignment notes below); do not reorder.
 */
struct dst_entry {
	struct rcu_head		rcu_head;
	struct dst_entry	*child;
	/* Device used for this route; ip_output() uses it as the output device. */
	struct net_device       *dev;
	struct  dst_ops	        *ops;
	unsigned long		_metrics;
	unsigned long           expires;
	struct dst_entry	*path;
	struct dst_entry	*from;
#ifdef CONFIG_XFRM
	/* When non-NULL, ip_finish_output() re-routes the packet through dst_output(). */
	struct xfrm_state	*xfrm;
#else
	void			*__pad1;
#endif
	/* Per-route receive and transmit handlers taking the skb. */
	int			(*input)(struct sk_buff *);
	int			(*output)(struct sk_buff *);

	unsigned short		flags;
#define DST_HOST		0x0001
#define DST_NOXFRM		0x0002
#define DST_NOPOLICY		0x0004
#define DST_NOHASH		0x0008
#define DST_NOCACHE		0x0010
#define DST_NOCOUNT		0x0020
#define DST_NOPEER		0x0040
#define DST_FAKE_RTABLE		0x0080
#define DST_XFRM_TUNNEL		0x0100
#define DST_XFRM_QUEUE		0x0200

	/* Checked and cleared by dst_neigh_output() to refresh the neighbour's
	 * 'confirmed' timestamp.
	 */
	unsigned short		pending_confirm;

	short			error;

	/* A non-zero value of dst->obsolete forces by-hand validation
	 * of the route entry.  Positive values are set by the generic
	 * dst layer to indicate that the entry has been forcefully
	 * destroyed.
	 *
	 * Negative values are used by the implementation layer code to
	 * force invocation of the dst_ops->check() method.
	 */
	short			obsolete;
#define DST_OBSOLETE_NONE	0
#define DST_OBSOLETE_DEAD	2
#define DST_OBSOLETE_FORCE_CHK	-1
#define DST_OBSOLETE_KILL	-2
	unsigned short		header_len;	/* more space at head required */
	unsigned short		trailer_len;	/* space to reserve at tail */
#ifdef CONFIG_IP_ROUTE_CLASSID
	/* Routing class id; ip_rcv_finish() uses it to index per-CPU accounting. */
	__u32			tclassid;
#else
	__u32			__pad2;
#endif

	/*
	 * Align __refcnt to a 64 bytes alignment
	 * (L1_CACHE_SIZE would be too much)
	 */
#ifdef CONFIG_64BIT
	long			__pad_to_align_refcnt[2];
#endif
	/*
	 * __refcnt wants to be on a different cache line from
	 * input/output/ops or performance tanks badly
	 */
	atomic_t		__refcnt;	/* client references	*/
	int			__use;
	unsigned long		lastuse;
	/* Link to the next entry, typed per protocol family. */
	union {
		struct dst_entry	*next;
		struct rtable __rcu	*rt_next;
		struct rt6_info		*rt6_next;
		struct dn_route __rcu	*dn_next;
	};
};

skb_rtable函数


/* Return the skb's attached dst_entry (from skb_dst()) viewed as an IPv4 rtable. */
static inline struct rtable *skb_rtable(const struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	return (struct rtable *)dst;
}

3. ip_local_deliver()

这个函数比较简单,

作用:

1.判断ip报文是否分片,是则进行重组

2.调用注册在NF_INET_LOCAL_IN处的钩子函数,放行后,进入ip_local_deliver_finish()函数。

//linux/net/ipv4/ip_input.c
/*
 * 	Hand an IP packet addressed to this host to the upper layers.
 *
 *	Fragments are first passed to ip_defrag(); a non-zero result from it
 *	ends processing here with a return value of 0.  Otherwise the packet
 *	runs through the NF_INET_LOCAL_IN hook, continuing in
 *	ip_local_deliver_finish() when the hooks let it through.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	const int fragmented = ip_is_fragment(ip_hdr(skb));

	/* ip_defrag() is only consulted for fragments. */
	if (fragmented && ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}

4. ip_local_deliver_finish()

作用:

1. 处理Raw ip,如果配置了安全策略,则进行IPSec安全检查

2. 根据ip报头的protocol字段,找到对应的L4层接收函数:先在inet_protos数组中找到对应的net_protocol,再调用net_protocol->handler。对于TCP,handler为tcp_v4_rcv。报文由此进入L4。

//linux/net/ipv4/ip_input.c
/*
 * Continuation of ip_local_deliver() once the NF_INET_LOCAL_IN hooks have
 * run.
 *
 * Pulls the IP header off the skb, offers the packet to raw sockets, then
 * dispatches it to the handler registered in inet_protos[] for the header's
 * protocol number.  If no handler is registered and no raw socket took the
 * packet, an ICMP "protocol unreachable" is sent (subject to the xfrm
 * policy check).
 *
 * Always returns 0.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
	struct net *net = dev_net(skb->dev);

	/* Strip the IP header: advance skb->data past it by ip_hdrlen(skb) bytes. */
	__skb_pull(skb, ip_hdrlen(skb));

	/* Point into the IP datagram, just past the header. */
	/* i.e. record the current skb->data as the transport header position */
	skb_reset_transport_header(skb);

	rcu_read_lock();
	{
		int protocol = ip_hdr(skb)->protocol;
		const struct net_protocol *ipprot;
		int raw;

	resubmit:
		/* Offer the packet to raw sockets first; 'raw' is non-zero if
		 * one took it.
		 * NOTE(review): presumably each matching raw socket receives
		 * its own copy -- confirm against raw_local_deliver().
		 */
		raw = raw_local_deliver(skb, protocol);
		
		/* Look up the handler registered for this protocol number in
		 * inet_protos[] (tcp_protocol in the TCP case).
		 */
		ipprot = rcu_dereference(inet_protos[protocol]);
		/* A handler is registered: call it. */
		if (ipprot != NULL) {
			int ret;
			
			/* Unless the protocol opts out via no_policy, the
			 * packet must pass the inbound xfrm policy check;
			 * otherwise it is freed and processing stops.
			 */
			if (!ipprot->no_policy) {
				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					kfree_skb(skb);
					goto out;
				}
				nf_reset(skb);
			}
			/* Call the L4 receive handler (tcp_v4_rcv for TCP).
			 * A negative return requests another pass with
			 * protocol number -ret.
			 */
			ret = ipprot->handler(skb);
			if (ret < 0) {
				protocol = -ret;
				goto resubmit;
			}
			IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
		} else {
			/* No handler registered.  If no raw socket took the
			 * packet either, it is an unknown protocol.
			 */
			if (!raw) {
				/* Only count and answer when the xfrm policy
				 * check passes; the skb is freed either way.
				 */
				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
					IP_INC_STATS_BH(net, IPSTATS_MIB_INUNKNOWNPROTOS);
					/* tell the sender: destination unreachable, protocol unreachable */
					icmp_send(skb, ICMP_DEST_UNREACH,
						  ICMP_PROT_UNREACH, 0);
				}
				kfree_skb(skb);
			} else {
				/* A raw socket took it: count as delivered
				 * and release our skb.
				 */
				IP_INC_STATS_BH(net, IPSTATS_MIB_INDELIVERS);
				consume_skb(skb);
			}
		}
	}
 out:
	rcu_read_unlock();

	return 0;
}

skb_reset_transport_header()


/* Mark the current skb->data position as the start of the transport (L4) header. */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
	skb->transport_header = skb->data;
}

net_protocol结构体


/*
 * Registration record for an L4 protocol on top of IPv4; instances are
 * looked up through inet_protos[] by protocol number.
 */
struct net_protocol {
	/* Optional hook run from ip_rcv_finish() before the route lookup. */
	void (*early_demux)(struct sk_buff *skb);
	/* Receive handler invoked from ip_local_deliver_finish(). */
	int (*handler)(struct sk_buff *skb);
	void (*err_handler)(struct sk_buff *skb, u32 info);
	/* When set, ip_local_deliver_finish() skips the xfrm policy check. */
	unsigned int no_policy:1;
	unsigned int netns_ok:1;
};

tcp_protocol结构体

//linux/net/ipv4/af_inet.c
/* net_protocol registration record for TCP over IPv4. */
static const struct net_protocol tcp_protocol = {
	/* receive-path callbacks */
	.handler = tcp_v4_rcv,
	.err_handler = tcp_v4_err,
	.early_demux = tcp_v4_early_demux,
	/* flags */
	.netns_ok = 1,
	.no_policy = 1,
};

5. ip_forward()

作用:

1. 处理Router Alert、Strict Source Routing(严格源路由选项)选项;

2. 各种检查,ttl-1;

3. 调用NF_INET_FORWARD处钩子函数,放行后,进入ip_forward_finish函数。

//linux/net/ipv4/ip_forward.c
/*
 * Forward an IPv4 packet that is not addressed to this host.
 *
 * Runs the forwarding policy checks, handles the Router Alert and strict
 * source route options, enforces TTL and MTU/DF limits (sending the
 * matching ICMP errors), decrements the TTL on a private copy of the
 * header, optionally sends an ICMP redirect, and finally runs the
 * NF_INET_FORWARD hook with ip_forward_finish() as the continuation.
 *
 * Returns NET_RX_SUCCESS when the packet was consumed by the Router Alert
 * chain, NET_RX_DROP (after freeing the skb) on every drop path, otherwise
 * whatever NF_HOOK() returns.
 */
int ip_forward(struct sk_buff *skb)
{
	struct iphdr *iph;	/* Our header */
	struct rtable *rt;	/* Route we use */
	struct ip_options *opt	= &(IPCB(skb)->opt);

	if (skb_warn_if_lro(skb))
		goto drop;
	
	/* xfrm (IPsec) forwarding policy check */
	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
		goto drop;
	
	/* Router Alert option present: offer the packet to
	 * ip_call_ra_chain(); if it takes the packet, it is not forwarded.
	 * NOTE(review): presumably that chain delivers to sockets that asked
	 * for Router Alert packets -- confirm.
	 */
	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
		return NET_RX_SUCCESS;
	
	/* Only frames whose pkt_type is PACKET_HOST are forwarded; anything
	 * else is dropped.
	 */
	if (skb->pkt_type != PACKET_HOST)
		goto drop;
	/* NOTE(review): presumably resets skb->ip_summed so the checksum is
	 * recomputed in software on output -- confirm against
	 * skb_forward_csum().
	 */
	skb_forward_csum(skb);

	/*
	 *	According to the RFC, we must first decrease the TTL field. If
	 *	that reaches zero, we must reply an ICMP control message telling
	 *	that the packet's lifetime expired.
	 */
	/* TTL of 0 or 1 cannot survive another hop: send ICMP time exceeded
	 * and drop.
	 */
	if (ip_hdr(skb)->ttl <= 1)
		goto too_many_hops;

	/* xfrm (IPsec) route/forward processing; drop on failure */
	if (!xfrm4_route_forward(skb))
		goto drop;
	
	/* route attached to the skb */
	rt = skb_rtable(skb);
	
	/* Strict source routing does not allow going through a gateway: if
	 * the chosen route uses one, report the failure via ICMP and drop.
	 */
	if (opt->is_strictroute && rt->rt_uses_gateway)
		goto sr_failed;
	
	/* Packet larger than the route MTU, not GSO, DF bit set and
	 * local_df not set: count a fragmentation failure, send ICMP
	 * "destination unreachable / fragmentation needed" carrying the MTU,
	 * and drop.
	 */
	if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) &&
		     (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) {
		IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(dst_mtu(&rt->dst)));
		goto drop;
	}

	/* We are about to mangle packet. Copy it! */
	/* Request headroom for the output device's link-layer header plus
	 * the route's header_len; drop if that fails.
	 * NOTE(review): presumably skb_cow() only copies when the buffer is
	 * shared or the headroom is too small -- confirm.
	 */
	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
		goto drop;
	iph = ip_hdr(skb);

	/* Decrease ttl after skb cow done */
	ip_decrease_ttl(iph);

	/*
	 *	We now generate an ICMP HOST REDIRECT giving the route
	 *	we calculated.
	 */
	/* Only when the route is flagged RTCF_DOREDIRECT, the packet carries
	 * no source-route option and has no sec path.
	 */
	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr && !skb_sec_path(skb))
		ip_rt_send_redirect(skb);

	skb->priority = rt_tos2priority(iph->tos);
	
	/* Run the hooks registered at NF_INET_FORWARD (input device
	 * skb->dev, output device rt->dst.dev); ip_forward_finish is the
	 * continuation for packets the hooks let through.
	 */
	return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev,
		       rt->dst.dev, ip_forward_finish);

sr_failed:
	/*
	 *	Strict routing permits no gatewaying
	 */
	 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
	 goto drop;

too_many_hops:
	/* Tell the sender its packet died... */
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_INHDRERRORS);
	icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
drop:
	kfree_skb(skb);
	return NET_RX_DROP;
}

6. ip_forward_finish()

作用:

1. IP选项的处理

2. 根据skb_dst(skb)->output(skb),调用相应的报文输出函数。如果是单播则是ip_output; 如果是多播,则是ip_mc_output。

//linux/net/ipv4/ip_forward.c
/*
 * Continuation of ip_forward() once the NF_INET_FORWARD hooks have run.
 *
 * Accounts the forwarded datagram and its length against the output
 * device's namespace, processes IP options when the packet carries any,
 * and passes the skb to dst_output().
 *
 * Returns the value of dst_output().
 */
static int ip_forward_finish(struct sk_buff *skb)
{
	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTOCTETS, skb->len);

	/* Options are rare; only touch them when optlen is non-zero. */
	if (unlikely(IPCB(skb)->opt.optlen))
		ip_forward_options(skb);

	/* NOTE(review): per the accompanying article, dst_output() calls
	 * skb_dst(skb)->output(skb): ip_output for unicast, ip_mc_output for
	 * multicast.  The pointer is not assigned anywhere in this path;
	 * presumably the routing code sets it when the route is created --
	 * confirm.
	 */
	return dst_output(skb);
}

7. ip_output()

所有传输(无论是本地产生还是转发其他主机报文)都会通过dst_output上路。

作用:

1. 设置发送设备skb->dev和skb->protocol;

2. 调用NF_INET_POST_ROUTING处钩子函数,放行后,进入ip_finish_output函数。

//linux/net/ipv4/ip_output.c
/*
 * Output handler for IPv4 packets.
 *
 * Binds the skb to the route's output device, marks it as an IP frame and
 * runs the NF_INET_POST_ROUTING hook with ip_finish_output() as the
 * continuation.  The hook condition is false for packets flagged
 * IPSKB_REROUTED.
 *
 * Returns the value of NF_HOOK_COND().
 */
int ip_output(struct sk_buff *skb)
{
	struct net_device *out_dev = skb_dst(skb)->dev;
	int run_hooks;

	IP_UPD_PO_STATS(dev_net(out_dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = out_dev;
	/* This is the skb's frame protocol, not the protocol field of the IP header. */
	skb->protocol = htons(ETH_P_IP);

	run_hooks = !(IPCB(skb)->flags & IPSKB_REROUTED);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    out_dev, ip_finish_output, run_hooks);
}

8. ip_finish_output()

这个函数命名有点怪,不是应该是ip_output_finish()形式么?

1. 判断报文是否需要分片,是则调用ip_fragment()函数进行分片(ip_finish_output2()作为回调传入),分片结束后再调用ip_finish_output2()函数;如果不需要分片,直接进入ip_finish_output2()函数。

//linux/net/ipv4/ip_output.c
/*
 * Continuation of ip_output() once the NF_INET_POST_ROUTING hooks have run.
 *
 * Packets that fit the path MTU, and GSO packets of any size, go straight
 * to ip_finish_output2().  Anything else is handed to ip_fragment(), which
 * receives ip_finish_output2 as its output callback.
 */
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy: the dst now carries
	 * an xfrm state, so flag the packet as re-routed and send it through
	 * dst_output() again.
	 */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	/* NOTE(review): GSO packets are presumably segmented later, closer
	 * to the driver, which is why they are exempt here -- confirm.
	 */
	if (skb->len <= ip_skb_dst_mtu(skb) || skb_is_gso(skb))
		return ip_finish_output2(skb);

	return ip_fragment(skb, ip_finish_output2);
}

9. ip_finish_output2()

作用:

通过邻居子系统将数据报输出到网络设备。

//linux/net/ipv4/ip_output.c
/*
 * Last IPv4 output step: hand the packet to the neighbour subsystem.
 *
 * Accounts multicast/broadcast output, makes sure the skb has enough
 * headroom for the link-layer header, resolves the next hop to a neighbour
 * entry (creating one if necessary) and transmits through
 * dst_neigh_output().
 *
 * Returns the result of dst_neigh_output(), -ENOMEM when the headroom
 * reallocation fails, or -EINVAL when no neighbour entry could be obtained.
 * The skb is freed on both error paths.
 */
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	struct neighbour *neigh;
	u32 nexthop;
	
	/* multicast */
	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)// broadcast
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	/* If the headroom cannot hold the link-layer header (and the device
	 * has header_ops), reallocate with enough headroom, carry over the
	 * owning socket if there is one, and release the original skb.
	 */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		consume_skb(skb);
		skb = skb2;
	}

	/* The neighbour lookup takes no reference (_noref), so the entry is
	 * only used inside this RCU-bh read-side section.
	 */
	rcu_read_lock_bh();
	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
	/* No entry yet: create one in the ARP table. */
	if (unlikely(!neigh))
		neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
	if (!IS_ERR(neigh)) {
		/* dst_neigh_output() uses the cached link-layer header via
		 * neigh_hh_output() when the neighbour is connected and a
		 * header is cached, otherwise the neighbour's own output
		 * method.
		 */
		int res = dst_neigh_output(dst, neigh, skb);

		rcu_read_unlock_bh();
		return res;
	}
	rcu_read_unlock_bh();

	/* No neighbour entry could be found or created: the packet cannot be
	 * sent, so log (rate-limited) and free it.
	 */
	net_dbg_ratelimited("%s: No header cache and no neighbour!\n",
			    __func__);
	kfree_skb(skb);
	return -EINVAL;
}

相关函数:

1.  dst_neigh_output()


/*
 * Transmit @skb to neighbour @n on behalf of route @dst.
 *
 * A pending confirmation on the dst is consumed first and refreshes the
 * neighbour's 'confirmed' timestamp.  The packet then takes the cached
 * link-layer header path (neigh_hh_output) when the neighbour is in a
 * NUD_CONNECTED state and a header is cached, and the neighbour's own
 * output method otherwise.
 */
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n,
				   struct sk_buff *skb)
{
	const struct hh_cache *cached;

	if (dst->pending_confirm) {
		const unsigned long stamp = jiffies;

		dst->pending_confirm = 0;
		/* write only when the value changes, to avoid dirtying the neighbour */
		if (n->confirmed != stamp)
			n->confirmed = stamp;
	}

	cached = &n->hh;
	if (!(n->nud_state & NUD_CONNECTED) || !cached->hh_len)
		return n->output(n, skb);

	return neigh_hh_output(cached, skb);
}

2. neigh_hh_output()

可以看到在这个函数中调用了dev_queue_xmit()函数。


/*
 * Prepend the cached link-layer header @hh to @skb and queue it for
 * transmission with dev_queue_xmit().
 *
 * The header is copied into the headroom immediately in front of skb->data
 * inside a seqlock read loop, so the copy is repeated if the cache changed
 * while it was being read.  Returns the value of dev_queue_xmit().
 */
static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb)
{
	unsigned int seq;
	int hh_len;

	do {
		seq = read_seqbegin(&hh->hh_lock);
		hh_len = hh->hh_len;
		/* Short headers are copied as one fixed HH_DATA_MOD-sized
		 * chunk; longer ones use the aligned length
		 * HH_DATA_ALIGN(hh_len).
		 */
		if (likely(hh_len <= HH_DATA_MOD)) {
			/* this is inlined by gcc */
			memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD);
		} else {
			int hh_alen = HH_DATA_ALIGN(hh_len);

			memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
		}
	} while (read_seqretry(&hh->hh_lock, seq));

	/* Expose exactly hh_len bytes of the copied header as packet data. */
	skb_push(skb, hh_len);
	return dev_queue_xmit(skb);
}

10. dev_queue_xmit()

终于到了神奇的dev_queue_xmit()。

 

ip_mc_output,ip_fragment等一大堆函数;邻居子系统,IP选项处理,IP路由,ip_queue_xmit一条线函数以后再说。

先分析到dev_queue_xmit(),下面的dev_hard_start_xmit()以后再说,明天还要分析桥上的流程。

路还很长,太多的细节有待研究。Come on !

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值