DPDK之l2fwd源码解析
引言
l2fwd是dpdk第二层(链路层)的数据包转发示例。该例子将序号相邻两个网卡相互转发数据,例如有6个端口,那么0与1、2与3、4与5成对相互转发数据。
1 源码概述
1.1 数据结构
- l2fwd_enabled_port_mask
表示开启的eth网卡。例如port为N的网卡若开启,l2fwd_enabled_port_mask从低到高第N位置1,否则置0;
- l2fwd_dst_ports
表示转发目的网卡的port。例如port为N的网卡的转发目的网卡port为l2fwd_dst_ports[N];
- port_pair_params
表示相互转发的两个网卡的信息。例如M与N是相互转发的两个网卡的port号,那么它们的第0位不同,其他位相同。满足:
M >> 1 等于 N >> 1
port_pair_params[M >> 1].port[M&0x1] 等于 port_pair_params[N >> 1].port[M&0x1] 等于 M
port_pair_params[M >> 1].port[N&0x1] 等于 port_pair_params[N >> 1].port[N&0x1] 等于 N
注:上述关系仅在端口对按(0,1)、(2,3)……的顺序配置时成立。一般情况下,数组下标是端口对的序号(即2.1节代码中的idx >> 1),port[0]与port[1]是该对中相互转发的两个port号,二者不要求相邻;
- l2fwd_rx_queue_per_lcore
表示每个核上绑定处理的接收网卡端口最大数量,本程序默认为1(可通过命令行参数-q修改);
- lcore_queue_conf
表示每个核处理哪个网卡的接收数据。例如lcore_queue_conf[K].rx_port_list[0]=N表示核K处理网卡N的接收数据;
- l2fwd_ports_eth_addr
网卡号与mac地址的映射关系。
1.2 主要API
- rte_eal_init / rte_eal_cleanup
在程序启动/退出时执行资源初始化和清理;
- rte_eth_dev_stop / rte_eth_dev_close
停止/关闭网卡设备;
- rte_eal_mp_remote_launch / rte_eal_wait_lcore
前者在各个核上拉起线程执行指定函数;后者等待指定核上的线程结束;
- rte_pktmbuf_pool_create / rte_zmalloc_socket
前者创建mbuf池;后者在指定NUMA结点上分配一块清零的内存;
- rte_eth_dev_count_avail
获取可用网卡数量;
- rte_socket_id / rte_eth_dev_socket_id
前者获取当前CPU的NUMA结点ID;后者获取网卡设备的NUMA结点ID;
- rte_eth_dev_info_get / rte_eth_dev_adjust_nb_rx_tx_desc / rte_eth_macaddr_get
rte_eth_dev_info_get用于获取网卡设备信息;rte_eth_dev_adjust_nb_rx_tx_desc用于将网卡待配置的接收/发送描述符数量调整到合法值(使其在网卡支持的范围内并满足对齐要求);rte_eth_macaddr_get用于获取网卡mac地址;
- rte_eth_dev_configure / rte_eth_rx_queue_setup / rte_eth_tx_queue_setup
rte_eth_dev_configure用于配置网卡;rte_eth_rx_queue_setup用于配置接收队列;rte_eth_tx_queue_setup用于配置发送队列;
- rte_eth_dev_set_ptypes / rte_eth_dev_start / rte_eth_promiscuous_enable
rte_eth_dev_set_ptypes用于设置应用关心的数据包类型(ptype);rte_eth_dev_start用于启动网卡;rte_eth_promiscuous_enable用于开启网卡混杂模式;
- rte_eth_tx_buffer_init / rte_eth_tx_buffer_set_err_callback / rte_eth_tx_buffer / rte_eth_tx_buffer_flush
rte_eth_tx_buffer_init用于初始化发送缓存;rte_eth_tx_buffer_set_err_callback用于设置发送失败的回调;rte_eth_tx_buffer用于追加mbuf到发送缓存(缓存满时会将缓存中的mbuf发送到网卡);rte_eth_tx_buffer_flush将发送缓存中的mbuf发送到指定的网卡;
- rte_eth_rx_burst
从网卡取收到的数据包;
- rte_prefetch0
从内存预取数据到cpu缓存;
2 源码流程
2.1 主函数流程
整个main函数总结为如下过程:
- 首先,调用rte_eal_init做初始化
- 然后,解析命令行参数;
- 检查和配置第1节中各类数据结构;
- 配置和启动各个网卡;
- 拉起工作线程;
- 清理和退出。
/*
 * Entry point of the l2fwd example, shown here as an abridged excerpt:
 * lines beginning with '#' mark code the article left out (declarations,
 * error checks, ...).
 *
 * Visible steps, matching the numbered markers below:
 *   1. initialise the EAL;
 *   2. parse the application's own arguments;
 *   3. validate and fill the forwarding tables (port pairs, port mask,
 *      l2fwd_dst_ports, lcore_queue_conf);
 *   4. configure, allocate TX buffers for, and start every port;
 *   5. launch l2fwd_launch_one_lcore and wait for the workers;
 *   6. stop/close the ports and clean up the EAL.
 */
int
main(int argc, char **argv)
{
# 略略略... ...
///>>>>>>>>>1 EAL initialization
/* Init EAL. 8< */
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid EAL arguments\n");
# 略略略... ...
///>>>>>>>>>2 Parse arguments
/* parse application arguments (after the EAL ones) */
ret = l2fwd_parse_args(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n");
/* >8 End of init EAL. */
# 略略略... ...
///>>>>>>>>>3.1 Check port_pair_params
/* Only validated when an explicit port-pair configuration was given. */
if (port_pair_params != NULL) {
if (check_port_pair_config() < 0)
rte_exit(EXIT_FAILURE, "Invalid port pair config\n");
}
///>>>>>>>>>3.2 Check l2fwd_enabled_port_mask
/* check port mask to possible port mask */
/*
 * Any bit set at position nb_ports or above is rejected.
 * NOTE(review): "1 << nb_ports" is an int shift; presumably nb_ports is
 * always smaller than the width of int here -- confirm.
 */
if (l2fwd_enabled_port_mask & ~((1 << nb_ports) - 1))
rte_exit(EXIT_FAILURE, "Invalid portmask; possible (0x%x)\n",
(1 << nb_ports) - 1);
# 略略略... ...
///>>>>>>>>>3.3 Populate l2fwd_dst_ports
/* reset l2fwd_dst_ports */
for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++)
l2fwd_dst_ports[portid] = 0;
last_port = 0;
/* populate destination port details */
if (port_pair_params != NULL) {
uint16_t idx, p;
/*
 * idx walks both members of every pair: idx >> 1 selects the pair,
 * idx & 1 selects the member, and the destination of that member is
 * the other member of the same pair (p ^ 1).
 */
for (idx = 0; idx < (nb_port_pair_params << 1); idx++) {
p = idx & 1;
portid = port_pair_params[idx >> 1].port[p];
l2fwd_dst_ports[portid] =
port_pair_params[idx >> 1].port[p ^ 1];
}
} else {
/*
 * No explicit pairs: enabled ports are paired in iteration order.
 * Every second enabled port is linked both ways with the enabled
 * port that preceded it (last_port).
 */
RTE_ETH_FOREACH_DEV(portid) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
continue;
if (nb_ports_in_mask % 2) {
l2fwd_dst_ports[portid] = last_port;
l2fwd_dst_ports[last_port] = portid;
} else {
last_port = portid;
}
nb_ports_in_mask++;
}
/* An unpaired trailing port forwards to itself. */
if (nb_ports_in_mask % 2) {
printf("Notice: odd number of ports in portmask.\n");
l2fwd_dst_ports[last_port] = last_port;
}
}
/* >8 End of initialization of the driver. */
# 略略略... ...
///>>>>>>>>>3.4 Populate lcore_queue_conf
/* Initialize the port/queue configuration of each logical core */
RTE_ETH_FOREACH_DEV(portid) {
/* skip ports that are not enabled */
if ((l2fwd_enabled_port_mask & (1 << portid)) == 0)
continue;
/* get the lcore_id for this port */
/*
 * Advance past lcores that are disabled or that already hold
 * l2fwd_rx_queue_per_lcore ports; abort when no lcore is left.
 */
while (rte_lcore_is_enabled(rx_lcore_id) == 0 ||
lcore_queue_conf[rx_lcore_id].n_rx_port ==
l2fwd_rx_queue_per_lcore) {
rx_lcore_id++;
if (rx_lcore_id >= RTE_MAX_LCORE)
rte_exit(EXIT_FAILURE, "Not enough cores\n");
}
if (qconf != &lcore_queue_conf[rx_lcore_id]) {
/* Assigned a new logical core in the loop above. */
qconf = &lcore_queue_conf[rx_lcore_id];
nb_lcores++;
}
/* Append this port to the chosen lcore's RX port list. */
qconf->rx_port_list[qconf->n_rx_port] = portid;
qconf->n_rx_port++;
printf("Lcore %u: RX port %u TX port %u\n", rx_lcore_id,
portid, l2fwd_dst_ports[portid]);
}
# 略略略... ...
/* Create the mbuf pool. 8< */
/* The pool is created on the socket reported by rte_socket_id(). */
l2fwd_pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", nb_mbufs,
MEMPOOL_CACHE_SIZE, 0, RTE_MBUF_DEFAULT_BUF_SIZE,
rte_socket_id());
# 略略略... ...
/* Initialise each port */
RTE_ETH_FOREACH_DEV(portid) {
# 略略略... ...
///>>>>>>>>>4.1 Configure the port
ret = rte_eth_dev_info_get(portid, &dev_info);
# 略略略... ...
/* Configure the number of queues for a port. */
ret = rte_eth_dev_configure(portid, 1, 1, &local_port_conf);
# 略略略... ...
/* Let the driver adjust the requested RX/TX descriptor counts. */
ret = rte_eth_dev_adjust_nb_rx_tx_desc(portid, &nb_rxd,
&nb_txd);
# 略略略... ...
/* Cache the port's MAC address in l2fwd_ports_eth_addr. */
ret = rte_eth_macaddr_get(portid,
&l2fwd_ports_eth_addr[portid]);
# 略略略... ...
/* RX queue setup. 8< */
/* Queue 0, on the port's own socket, fed from l2fwd_pktmbuf_pool. */
ret = rte_eth_rx_queue_setup(portid, 0, nb_rxd,
rte_eth_dev_socket_id(portid),
&rxq_conf,
l2fwd_pktmbuf_pool);
# 略略略... ...
/* TX queue 0, also on the port's own socket. */
ret = rte_eth_tx_queue_setup(portid, 0, nb_txd,
rte_eth_dev_socket_id(portid),
&txq_conf);
# 略略略... ...
///>>>>>>>>>4.2 Allocate the TX buffer
/* Initialize TX buffers */
/* Sized for MAX_PKT_BURST packets and allocated on the port's socket. */
tx_buffer[portid] = rte_zmalloc_socket("tx_buffer",
RTE_ETH_TX_BUFFER_SIZE(MAX_PKT_BURST), 0,
rte_eth_dev_socket_id(portid));
# 略略略... ...
rte_eth_tx_buffer_init(tx_buffer[portid], MAX_PKT_BURST);
/* The error callback is handed this port's "dropped" counter. */
ret = rte_eth_tx_buffer_set_err_callback(tx_buffer[portid],
rte_eth_tx_buffer_count_callback,
&port_statistics[portid].dropped);
# 略略略... ...
///>>>>>>>>>4.3 Start the port
/*
 * NOTE(review): a ptype mask of RTE_PTYPE_UNKNOWN with no output array
 * presumably tells the driver that no packet-type parsing is needed --
 * confirm against the rte_eth_dev_set_ptypes documentation.
 */
ret = rte_eth_dev_set_ptypes(portid, RTE_PTYPE_UNKNOWN, NULL,
0);
# 略略略... ...
/* Start device */
ret = rte_eth_dev_start(portid);
# 略略略... ...
/* Promiscuous mode is only enabled when promiscuous_on is set. */
if (promiscuous_on) {
ret = rte_eth_promiscuous_enable(portid);
# 略略略... ...
}
# 略略略... ...
}
# 略略略... ...
///>>>>>>>>>5 Launch the threads
/* launch per-lcore init on every lcore */
rte_eal_mp_remote_launch(l2fwd_launch_one_lcore, NULL, CALL_MAIN);
/* Wait for each worker; the first negative result stops the wait. */
RTE_LCORE_FOREACH_WORKER(lcore_id) {
if (rte_eal_wait_lcore(lcore_id) < 0) {
ret = -1;
break;
}
}
///>>>>>>>>>6 Release resources
RTE_ETH_FOREACH_DEV(portid) {
# 略略略... ...
ret = rte_eth_dev_stop(portid);
# 略略略... ...
rte_eth_dev_close(portid);
# 略略略... ...
}
/* clean up the EAL */
rte_eal_cleanup();
# 略略略... ...
}
2.1.1 检查和配置数据结构
- 检查port_pair_params:
通过check_port_pair_config检查port_pair_params是否有效;
- 检查l2fwd_enabled_port_mask:
系统网卡数量为nb_ports,如果l2fwd_enabled_port_mask中第nb_ports位及更高的bit位为1,则非法;
- 配置l2fwd_dst_ports:
首先,对l2fwd_dst_ports全部默认置0;
然后,若port_pair_params不为NULL,则通过port_pair_params计算l2fwd_dst_ports;否则,将已开启的网卡按顺序两两配对,相互设置为目的网卡;若开启的网卡数量为奇数,则最后一个网卡的目的网卡是它自己;
- 配置lcore_queue_conf:
将已开启的网卡按port号依次分配给可用的cpu核,每个核最多分配l2fwd_rx_queue_per_lcore个网卡。在默认每核1个网卡、且网卡全部开启的情况下,如果cpu核都可用,那么port号为N的网卡由序号为N的核处理接收数据;特别地,如果有核不可用,序号依次退后。
2.1.2 配置和启动各个网卡
- 配置网卡
配置网卡主要配置网卡的收发队列,同时获取网卡mac地址,并缓存到l2fwd_ports_eth_addr中;
- 分配发送缓存
每个网卡分配的内存结点是网卡的亲和结点;
- 启动网卡
启动网卡前,通过rte_eth_dev_set_ptypes将ptype掩码设置为RTE_PTYPE_UNKNOWN,即告知驱动应用不需要解析数据包类型;启动后,若promiscuous_on为真,则开启网卡混杂模式;
2.2 工作流程
在main函数拉起线程时,传递了l2fwd_launch_one_lcore函数作为线程回调,同时主线程也会执行该函数。该函数直接调用了l2fwd_main_loop:
/*
 * Per-lcore forwarding loop, shown as an abridged excerpt: lines beginning
 * with '#' mark code the article left out (local declarations, ...).
 *
 * The lcore looks up its own lcore_queue_conf entry and returns at once if
 * it has no RX port. Otherwise it loops until force_quit is set, doing two
 * things per iteration: periodically flushing the TX buffers of its
 * destination ports, and reading bursts from its RX ports and passing each
 * packet to l2fwd_simple_forward.
 */
static void
l2fwd_main_loop(void)
{
# 略略略... ...
///>>>>>>>>>1 Fetch this lcore's lcore_queue_conf entry
lcore_id = rte_lcore_id();
qconf = &lcore_queue_conf[lcore_id];
/* An lcore with no RX port assigned has no work and leaves immediately. */
if (qconf->n_rx_port == 0) {
RTE_LOG(INFO, L2FWD, "lcore %u has nothing to do\n", lcore_id);
return;
}
///>>>>>>>>>2 Walk (and log) the RX ports assigned to this lcore
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
RTE_LOG(INFO, L2FWD, " -- lcoreid=%u portid=%u\n", lcore_id,
portid);
}
while (!force_quit) {
/* Drains TX queue in its main loop. 8< */
cur_tsc = rte_rdtsc();
/*
* TX burst queue drain
*/
///>>>>>>>>>3 Flush the TX buffers of the destination ports
/* Runs only once more than drain_tsc cycles have passed since prev_tsc. */
diff_tsc = cur_tsc - prev_tsc;
if (unlikely(diff_tsc > drain_tsc)) {
for (i = 0; i < qconf->n_rx_port; i++) {
/* The buffer flushed is that of the RX port's destination port. */
portid = l2fwd_dst_ports[qconf->rx_port_list[i]];
buffer = tx_buffer[portid];
sent = rte_eth_tx_buffer_flush(portid, 0, buffer);
if (sent)
port_statistics[portid].tx += sent;
}
/* if timer is enabled */
if (timer_period > 0) {
/* advance the timer */
timer_tsc += diff_tsc;
/* if timer has reached its timeout */
if (unlikely(timer_tsc >= timer_period)) {
/* do this only on main core */
if (lcore_id == rte_get_main_lcore()) {
print_stats();
/* reset the timer */
timer_tsc = 0;
}
}
}
prev_tsc = cur_tsc;
}
/* >8 End of draining TX queue. */
///>>>>>>>>>4 Hand packets received on the RX ports to l2fwd_simple_forward
/*
 * NOTE(review): l2fwd_simple_forward is not shown in this excerpt;
 * presumably it queues the packet on the destination port's TX buffer
 * -- confirm.
 */
/* Read packet from RX queues. 8< */
for (i = 0; i < qconf->n_rx_port; i++) {
portid = qconf->rx_port_list[i];
/* Read up to MAX_PKT_BURST packets from queue 0 of this port. */
nb_rx = rte_eth_rx_burst(portid, 0,
pkts_burst, MAX_PKT_BURST);
if (unlikely(nb_rx == 0))
continue;
port_statistics[portid].rx += nb_rx;
for (j = 0; j < nb_rx; j++) {
m = pkts_burst[j];
/* Prefetch the packet data before it is processed. */
rte_prefetch0(rte_pktmbuf_mtod(m, void *));
l2fwd_simple_forward(m, portid);
}
}
/* >8 End of read packet from RX queues. */
}
}
主要流程如下:
- 获取lcore_queue_conf;
- 获取待处理的接收网卡号;
- 刷新目标网卡发送缓存;
- 将接收网卡数据投递到目标网卡的发送缓存;
注:默认配置下(l2fwd_rx_queue_per_lcore为1),qconf->n_rx_port等于1
2.2.1 将接收网卡数据投递到目标网卡的发送缓存
- 通过rte_eth_rx_burst读取接收网卡的数据包
- rte_prefetch0预取接收数据到cpu缓存
- 通过l2fwd_simple_forward将接收包追加到目标网卡的发送队列
2.2.2 刷新网卡发送缓存
每隔一定时间,执行:
- 将目标网卡的待发送包发送到网卡;
- 若开启了定时器且定时器到期,主线程(main lcore)会调用print_stats打印统计数据(统计数据port_statistics按网卡端口记录);