虚拟网络设备veth是如何工作的

最新推荐文章于 2025-03-12 13:38:21 发布

Megahertz66

最新推荐文章于 2025-03-12 13:38:21 发布

阅读量1.2k

点赞数

分类专栏： linux networking 文章标签：虚拟机 linux kernel 网络

本文链接：https://blog.youkuaiyun.com/Megahertz66/article/details/119213598

版权

linux networking 专栏收录该内容

12 篇文章

订阅专栏

本文深入剖析了Linux内核4.9.37中的veth模块，探讨了如何通过veth在不同网络命名空间间建立连接，包括veth设备的创建过程、数据收发机制以及rtnetlink的运用。重点讲解了veth_newlink函数和数据包转发的原理。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

veth源码分析

环境

linux kernel 4.9.37

两个网络命名空间的数据是需要通过 veth 才可以进行交互的。前面分析了网络命名空间，下面看看 veth 究竟是通过什么实现的传输。

代码分析

veth 实际的代码在 linux-4.9.37\drivers\net\veth.c 中。veth 是注册到 rtnetlink 中，并借助其进行交互。
其各种操作都包含在 veth_link_ops 结构中。

/*
 * Initialization routine (marked __init): registers veth_link_ops with
 * rtnetlink and returns the result of rtnl_link_register() unchanged.
 */
static __init int veth_init(void)
{
	return rtnl_link_register(&veth_link_ops);
}

/*
 * rtnetlink link operations for the veth link type.  This is the table
 * that veth_init() hands to rtnl_link_register().
 */
static struct rtnl_link_ops veth_link_ops = {
	.kind		= DRV_NAME,	/* link kind; DRV_NAME is defined outside this excerpt */
	.priv_size	= sizeof(struct veth_priv),	/* size of the per-device private area */
	.setup		= veth_setup,
	.validate	= veth_validate,
	.newlink	= veth_newlink,	/* registers both ends of a veth pair */
	.dellink	= veth_dellink,
	.policy		= veth_policy,
	.maxtype	= VETH_INFO_MAX,
	.get_link_net	= veth_get_link_net,
};

创建 veth

在用户侧可以使用 ip link add veth0 type veth peer name veth1 创建 veth0 与 veth1 两个连通的虚拟网络设备。
下面的代码中我省略了很多内容，被省略部分的主要作用是从 rtnetlink 中获取两个虚拟设备的相关信息。保留下来的是比较关键的代码。
这里有个疑问是注释标出了注册两个设备的顺序，不知道这里是必须要这么做还是我想多了，作者只是说明一下。
从下面的代码可以看出创建新的 veth 会注册两个网络设备 dev 和 peer，并且关联了两个设备。
dev 的私有数据中的 peer 是指向 peer 的，而 peer 的私有数据中的 peer 是指向 dev 的。

/*
 * rtnetlink .newlink handler for veth (abridged listing).
 *
 * As shown here it registers the peer device, then registers dev, marks
 * dev's carrier as off, and finally points each device's private ->peer
 * at the other one with rcu_assign_pointer().  Returns 0.
 *
 * NOTE(review): in this abridged form `peer` is used without ever being
 * assigned and `err` is never checked.  The omitted code presumably
 * creates the peer device and handles the error paths -- confirm against
 * the full drivers/net/veth.c.
 */
static int veth_newlink(struct net *src_net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	int err;
	struct net_device *peer;
	struct veth_priv *priv;

	/*
	 * create and register peer first
	 */
	err = register_netdevice(peer);


	/*
	 * register dev last
	 *
	 * note, that since we've registered new device the dev's name
	 * should be re-allocated
	 */

	err = register_netdevice(dev);

	
	netif_carrier_off(dev);

	/*
	 * tie the devices together
	 */

	/* dev's private data points at peer ... */
	priv = netdev_priv(dev);
	rcu_assign_pointer(priv->peer, peer);

	/* ... and peer's private data points back at dev. */
	priv = netdev_priv(peer);
	rcu_assign_pointer(priv->peer, dev);
	return 0;
}

数据收发

下面的代码也省略了关于 dev 的参数设置。veth_netdev_ops 包含了设备的初始化、open、发包、修改 mtu 等各种操作。

/*
 * Setup callback for a veth net_device (abridged listing); installed as
 * .setup in veth_link_ops.
 *
 * Calls ether_setup() on the device, then installs the veth netdev and
 * ethtool operation tables and sets veth_dev_free as the destructor.
 */
static void veth_setup(struct net_device *dev)
{
	ether_setup(dev);

	/* Assigned after ether_setup() has run on dev. */
	dev->netdev_ops = &veth_netdev_ops;
	dev->ethtool_ops = &veth_ethtool_ops;

	dev->destructor = veth_dev_free;

}

/*
 * net_device operations table for veth devices; veth_setup() stores its
 * address in dev->netdev_ops.
 */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init            = veth_dev_init,
	.ndo_open            = veth_open,
	.ndo_stop            = veth_close,
	.ndo_start_xmit      = veth_xmit,	/* transmit path: forwards the skb to the peer */
	.ndo_change_mtu      = veth_change_mtu,
	.ndo_get_stats64     = veth_get_stats64,
	.ndo_set_rx_mode     = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
	/* The poll-controller hook exists only when CONFIG_NET_POLL_CONTROLLER is defined. */
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller	= veth_poll_controller,
#endif
	.ndo_get_iflink		= veth_get_iflink,
	.ndo_features_check	= passthru_features_check,
	.ndo_set_rx_headroom	= veth_set_rx_headroom,
};

看一下 veth_xmit 函数，根据名字可以看出，veth 设备就是用这个函数进行发送数据的。函数不长就没有省略。
根据私有数据中的 peer 指针找到和这个设备对应的对端虚拟设备。dev_forward_skb 执行成功后会更新发送字节数和包数量的统计信息；如果找不到对端设备或者转发失败，则只增加 dropped 计数。无论哪种情况，函数最后都返回 NETDEV_TX_OK。

/*
 * Transmit handler for a veth device.
 *
 * Looks up the peer device under rcu_read_lock() and passes @skb to it
 * through dev_forward_skb().  On success the per-CPU byte and packet
 * counters of @dev are updated; if there is no peer or the forward does
 * not return NET_RX_SUCCESS, priv->dropped is incremented instead.
 *
 * Always returns NETDEV_TX_OK.
 */
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct pcpu_vstats *stats;
	struct net_device *peer;
	/* Sample the length first: dev_forward_skb() may free or queue the skb. */
	int tx_len = skb->len;

	rcu_read_lock();

	peer = rcu_dereference(priv->peer);
	if (unlikely(!peer)) {
		/* No peer to receive it, so the skb is released here. */
		kfree_skb(skb);
		goto drop;
	}

	if (unlikely(dev_forward_skb(peer, skb) != NET_RX_SUCCESS))
		goto drop;

	stats = this_cpu_ptr(dev->vstats);
	u64_stats_update_begin(&stats->syncp);
	stats->bytes += tx_len;
	stats->packets++;
	u64_stats_update_end(&stats->syncp);
	goto out;

drop:
	atomic64_inc(&priv->dropped);
out:
	rcu_read_unlock();
	return NETDEV_TX_OK;
}

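下面的代码可以看出 __dev_forward_skb 函数不是返回 NET_RX_DROP 就是返回 0。返回 0 时 dev_forward_skb 会继续调用 netif_rx_internal，所以说 veth 的数据最终还是要通过 netif_rx_internal 函数进行处理。
关于 CONFIG_RPS 宏可以看参考中给出的网址。RPS（Receive Packet Steering）是在软件层面把收包处理分发到多个 CPU 上的机制，可以看作是用软件模拟多队列网卡的效果。
除了宏中的代码，剩下的就是调用 enqueue_to_backlog 函数。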

/**
 * dev_forward_skb - hand an skb over to another netif's receive path
 *
 * @dev: network device that should receive the buffer
 * @skb: buffer being forwarded
 *
 * Returns NET_RX_SUCCESS when the buffer was queued without congestion,
 * or NET_RX_DROP when the packet was dropped (it has been freed then).
 *
 * Meant for the start_xmit function of one device that wants to inject
 * an skb into the receive queue of a different device.  Since the
 * receiver may live in another namespace, all information in the skb
 * that could impact namespace isolation has to be cleared.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int err = __dev_forward_skb(dev, skb);

	/* A non-zero result means the skb was rejected; pass it through. */
	if (err)
		return err;

	return netif_rx_internal(skb);
}

/*
 * Prepare @skb for reception on @dev without queueing it.
 *
 * Runs ____dev_forward_skb() and, when that returns 0, sets
 * skb->protocol from eth_type_trans() and calls skb_postpull_rcsum()
 * for the ETH_HLEN bytes at eth_hdr(skb).  Returns whatever
 * ____dev_forward_skb() returned.
 */
int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	int rc;

	rc = ____dev_forward_skb(dev, skb);
	if (likely(rc == 0)) {
		skb->protocol = eth_type_trans(skb, dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	}

	return rc;
}

/*
 * Check whether @skb may be forwarded to @dev and scrub it if so.
 *
 * If skb_orphan_frags() returns non-zero or is_skb_forwardable() says
 * no, dev->rx_dropped is incremented, the skb is freed and NET_RX_DROP
 * is returned.  Otherwise the packet is scrubbed, its priority is reset
 * to 0 and 0 is returned.
 */
static __always_inline int ____dev_forward_skb(struct net_device *dev,
					       struct sk_buff *skb)
{
	/* Same evaluation order as a single "a || b" test. */
	if (skb_orphan_frags(skb, GFP_ATOMIC))
		goto drop;
	if (unlikely(!is_skb_forwardable(dev, skb)))
		goto drop;

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	return 0;

drop:
	atomic_long_inc(&dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

/*
 * Queue @skb onto a per-CPU backlog via enqueue_to_backlog() and return
 * that call's result.
 *
 * When CONFIG_RPS is defined and static_key_false(&rps_needed) is true,
 * the target CPU comes from get_rps_cpu(), falling back to the current
 * CPU if it returns a negative value.  Otherwise the skb is queued on
 * the CPU returned by get_cpu().
 */
static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		/* No CPU was selected: use the one we are running on. */
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	/*
	 * With CONFIG_RPS this block is the else-branch of the test above;
	 * without it, this is the only path.
	 */
	{
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

注释也说得比较清晰了，就是将数据放入指定 CPU 的 backlog 队列（input_pkt_queue）中。之后收包时就按照正常的协议栈收包流程进行包的接收处理。

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 *
 * @skb:   buffer to queue
 * @cpu:   CPU whose softnet_data backlog receives the buffer
 * @qtail: passed through to input_queue_tail_incr_save()
 *
 * Returns NET_RX_SUCCESS once the skb is on input_pkt_queue.  Returns
 * NET_RX_DROP, after freeing the skb and bumping both sd->dropped and
 * the device's rx_dropped, when the device is not running, the queue
 * is longer than netdev_max_backlog, or skb_flow_limit() returns
 * non-zero.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	/* Queue manipulation below runs with irqs saved and rps_lock held. */
	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		/* Non-empty queue: append directly, skipping the NAPI scheduling below. */
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		/* Queue was empty: jump back into the branch above to append the skb. */
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	/* Done after the lock is released and irqs are restored. */
	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

参考

Linux系统中RPS/RFS介绍