STP相关先不管,以后有兴趣了再专门研究。
一、net_bridge、net_bridge_port、net_bridge_fdb_entry数据结构
1. 网桥设备net_bridge数据结构
/*
 * Private data of a bridge device: port list, forwarding hash table,
 * STP parameters and timers, and (when configured in) VLAN, netfilter,
 * multicast-snooping and switchdev state.
 */
struct net_bridge {
/* spinlock */
spinlock_t lock;
/* lock used to manage the hash table below */
spinlock_t hash_lock;
/* list of bridge ports */
struct list_head port_list;
/* the bridge's own net_device */
struct net_device *dev;
struct pcpu_sw_netstats __percpu *stats;
/* These fields are accessed on each packet */
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
u8 vlan_enabled;
u8 vlan_stats_enabled;
__be16 vlan_proto;
u16 default_pvid;
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
/* hash table of list heads; each bucket chains net_bridge_fdb_entry
 * items. This is the CAM table, i.e. the forwarding table. */
struct hlist_head hash[BR_HASH_SIZE];
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
union {
struct rtable fake_rtable;
struct rt6_info fake_rt6_info;
};
bool nf_call_iptables;
bool nf_call_ip6tables;
bool nf_call_arptables;
#endif
u16 group_fwd_mask;
u16 group_fwd_mask_required;
/* STP */
bridge_id designated_root;
bridge_id bridge_id;
u32 root_path_cost;
unsigned char topology_change;
unsigned char topology_change_detected;
u16 root_port;
unsigned long max_age;
unsigned long hello_time;
unsigned long forward_delay;
unsigned long ageing_time;
unsigned long bridge_max_age;
unsigned long bridge_hello_time;
unsigned long bridge_forward_delay;
unsigned long bridge_ageing_time;
u8 group_addr[ETH_ALEN];
bool group_addr_set;
enum {
BR_NO_STP, /* no spanning tree */
BR_KERNEL_STP, /* old STP in kernel */
BR_USER_STP, /* new RSTP in userspace */
} stp_enabled;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
unsigned char multicast_router;
u8 multicast_disabled:1;
u8 multicast_querier:1;
u8 multicast_query_use_ifaddr:1;
u8 has_ipv6_addr:1;
u8 multicast_stats_enabled:1;
u32 hash_elasticity;
u32 hash_max;
u32 multicast_last_member_count;
u32 multicast_startup_query_count;
u8 multicast_igmp_version;
unsigned long multicast_last_member_interval;
unsigned long multicast_membership_interval;
unsigned long multicast_querier_interval;
unsigned long multicast_query_interval;
unsigned long multicast_query_response_interval;
unsigned long multicast_startup_query_interval;
spinlock_t multicast_lock;
/* mdb points to the bridge's multicast forwarding database */
struct net_bridge_mdb_htable __rcu *mdb;
struct hlist_head router_list;
struct timer_list multicast_router_timer;
struct bridge_mcast_other_query ip4_other_query;
struct bridge_mcast_own_query ip4_own_query;
struct bridge_mcast_querier ip4_querier;
struct bridge_mcast_stats __percpu *mcast_stats;
#if IS_ENABLED(CONFIG_IPV6)
struct bridge_mcast_other_query ip6_other_query;
struct bridge_mcast_own_query ip6_own_query;
struct bridge_mcast_querier ip6_querier;
u8 multicast_mld_version;
#endif /* IS_ENABLED(CONFIG_IPV6) */
#endif
struct timer_list hello_timer;
struct timer_list tcn_timer;
struct timer_list topology_change_timer;
struct delayed_work gc_work;
struct kobject *ifobj;
u32 auto_cnt;
#ifdef CONFIG_NET_SWITCHDEV
int offload_fwd_mark;
#endif
bool neigh_suppress_enabled;
};
2. 网桥端口net_bridge_port数据结构
//net/bridge/br_private.h
/*
 * One port of a bridge: ties an enslaved net_device to its bridge and
 * carries the per-port STP state, flag bits and optional multicast,
 * sysfs, netpoll and VLAN data.
 */
struct net_bridge_port
{
/* the bridge device this port belongs to */
struct net_bridge *br;
/* the device that was added to the bridge */
struct net_device *dev;
/* node in the bridge's port list, linked into net_bridge->port_list */
struct list_head list;
/* STP */
u8 priority;/* port priority */
u8 state;/* port state (BR_STATE_*) */
u16 port_no;/* port number */
unsigned char topology_change_ack;
unsigned char config_pending;
port_id port_id;/* port ID, built from port priority and port number */
port_id designated_port;
bridge_id designated_root;
bridge_id designated_bridge;
u32 path_cost;
u32 designated_cost;
unsigned long designated_age;
struct timer_list forward_delay_timer;
struct timer_list hold_timer;
struct timer_list message_age_timer;
struct kobject kobj;
struct rcu_head rcu;
/* port flag bits (BR_LEARNING, BR_FLOOD, ...); important for the
 * learning and flooding decisions */
unsigned long flags;
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
struct bridge_mcast_own_query ip4_own_query;
#if IS_ENABLED(CONFIG_IPV6)
struct bridge_mcast_own_query ip6_own_query;
#endif /* IS_ENABLED(CONFIG_IPV6) */
unsigned char multicast_router;
struct timer_list multicast_router_timer;
struct hlist_head mglist;
struct hlist_node rlist;
#endif
#ifdef CONFIG_SYSFS
char sysfs_name[IFNAMSIZ];
#endif
#ifdef CONFIG_NET_POLL_CONTROLLER
struct netpoll *np;
#endif
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_port_vlans __rcu *vlan_info;
#endif
};
感觉就前面的三个br,dev,list有用。还有一个flags(老版本没有这个参数)。
(1). 关于flags参数:
参考:https://blog.youkuaiyun.com/sinat_20184565/article/details/80852155
网桥添加网络设备时,新接口的flags赋值为BR_LEARNING | BR_FLOOD,即新接口为自动状态接口
自动状态接口(AUTO_PORT):
内核中对自动状态接口定义如下:即设置了学习|单播洪泛的接口。
#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
#define BR_HAIRPIN_MODE BIT(0)
#define BR_BPDU_GUARD BIT(1)
#define BR_ROOT_BLOCK BIT(2)
#define BR_MULTICAST_FAST_LEAVE BIT(3)
#define BR_ADMIN_COST BIT(4)
#define BR_LEARNING BIT(5)
#define BR_FLOOD BIT(6)
#define BR_AUTO_MASK (BR_FLOOD | BR_LEARNING)
#define BR_PROMISC BIT(7)
#define BR_PROXYARP BIT(8)
#define BR_LEARNING_SYNC BIT(9)
#define BR_PROXYARP_WIFI BIT(10)
/* called with RTNL but without bridge lock */
/*
 * Allocate and initialise a bridge port for @dev on bridge @br.
 * Returns the new port, or an ERR_PTR(): the negative value from
 * find_portno(), or -ENOMEM when the allocation fails.
 */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
struct net_device *dev)
{
int index;
struct net_bridge_port *p;
/* pick a port number; a negative result is passed back as the error */
index = find_portno(br);
if (index < 0)
return ERR_PTR(index);
/* zeroed allocation, so every field not set below starts at 0 */
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (p == NULL)
return ERR_PTR(-ENOMEM);
p->br = br;
/* take a reference on the device before storing the pointer */
dev_hold(dev);
p->dev = dev;
p->path_cost = port_cost(dev);
p->priority = 0x8000 >> BR_PORT_BITS;
p->port_no = index;
/* a new port starts with learning and flooding both enabled */
p->flags = BR_LEARNING | BR_FLOOD;
br_init_port(p);
/* the port starts in the disabled state */
br_set_state(p, BR_STATE_DISABLED);
br_stp_port_timer_init(p);
br_multicast_add_port(p);
return p;
}
BR_LEARNING标志,在函数br_handle_frame_finish中判断,用来决定是否使用数据包的源MAC地址更新网桥的FDB转发表。
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
struct net_bridge *br;
br = p->br;
if (p->flags & BR_LEARNING)
br_fdb_update(br, p, eth_hdr(skb)->h_source, vid, false);
}
BR_FLOOD标志用来控制是否在接口上泛洪单播数据包,如果没有设置此标志,不能在此接口泛洪单播包。
void br_flood(struct net_bridge *br, struct sk_buff *skb, ...)
{
struct net_bridge_port *p;
list_for_each_entry_rcu(p, &br->port_list, list) {
switch (pkt_type) {
case BR_PKT_UNICAST:
if (!(p->flags & BR_FLOOD))
continue;
break;
}
(2). 端口状态:
//linux/uapi/linux/if_bridge.h
#define BR_STATE_DISABLED 0
#define BR_STATE_LISTENING 1
#define BR_STATE_LEARNING 2
#define BR_STATE_FORWARDING 3
#define BR_STATE_BLOCKING 4
关闭:不收发任何报文
阻塞:只能接收BPDU,不能发送BPDU,不能收发数据帧,不进行地址学习
监听:可以收发BPDU,不能收发数据帧,不进行地址学习
学习:可以收发BPDU,不收发数据帧,进行地址学习
转发:可以收发BPDU,可以收发数据帧,进行地址学习
记得在br_handle_frame_finish中用到了端口状态:
if (!p || p->state == BR_STATE_DISABLED)
...
if (p->state == BR_STATE_LEARNING)
...
3. net_bridge_fdb_entry数据结构
fdb:forward database
https://blog.youkuaiyun.com/qq_25077833/article/details/52834418
端口-MAC地址表
/*
 * One entry of the forwarding database (FDB): maps a MAC address
 * (plus VLAN id) to the bridge port it is reached through.
 */
struct net_bridge_fdb_entry
{
/* links the entry into net_bridge's hash[BR_HASH_SIZE] */
struct hlist_node hlist;
/* points to the bridge port */
struct net_bridge_port *dst;
struct rcu_head rcu;
unsigned long updated;
unsigned long used;// NOTE(review): not a reference count; same type as 'updated', presumably a jiffies timestamp of last use - confirm in br_input.c
/* MAC address */
mac_addr addr;
/* whether the MAC address is a local address, 1 = yes */
unsigned char is_local:1,
/* the MAC address is static, i.e. it never ages out; local addresses are always static */
is_static:1,
added_by_user:1, // configured by the user
added_by_external_learn:1;// learned externally
__u16 vlan_id;// VLAN the MAC address belongs to
};
这个is_local标志在br_handle_frame_finish()函数中查找fdb转发表之后会用来进行判断:如果查到的fdb表项的is_local=1,表示数据包是发往本机的,需要送往三层进行进一步处理。
二、网桥相关函数
1.br_init()
网桥初始化函数
桥接程序既可以集成在内核中,也可以编译成单独模块。初始化函数为br_init(),清理函数为br_deinit()函数。定义在/net/bridge/br.c中。就是个模块,br_init()为初始化函数,br_deinit()为exit函数。
/*
 * Module init for the bridge: registers each sub-component in turn.
 * If a step fails, the steps already done are undone in reverse order
 * through the err_out* labels and the error code is returned.
 */
static int __init br_init(void)
{
int err;
/* compile-time check: the bridge's skb control block must fit in skb->cb */
BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb));
/* register the STP protocol */
err = stp_proto_register(&br_stp_proto);
if (err < 0) {
pr_err("bridge: can't register sap for STP\n");
return err;
}
/* FDB initialisation (NOTE(review): presumably sets up the allocator
 * for FDB entries - confirm in br_fdb.c) */
err = br_fdb_init();
if (err)
goto err_out;
/* register_pernet_subsys - register a network namespace subsystem */
err = register_pernet_subsys(&br_net_ops);
if (err)
goto err_out1;
/* netfilter initialisation */
err = br_nf_core_init();
if (err)
goto err_out2;
/* register on the netdevice notifier chain */
err = register_netdevice_notifier(&br_device_notifier);
if (err)
goto err_out3;
/* register on the netdev_switch notifier chain */
err = register_netdev_switch_notifier(&br_netdev_switch_notifier);
if (err)
goto err_out4;
/* netlink initialisation */
err = br_netlink_init();
if (err)
goto err_out5;
// install the bridge ioctl hook, i.e. the ioctl entry point offered to user space
brioctl_set(br_ioctl_deviceless_stub);
#if IS_ENABLED(CONFIG_ATM_LANE)
br_fdb_test_addr_hook = br_fdb_test_addr;
#endif
pr_info("bridge: automatic filtering via arp/ip/ip6tables has been "
"deprecated. Update your scripts to load br_netfilter if you "
"need this.\n");
return 0;
/* error unwinding: each label undoes the step registered just before
 * the failing one, then falls through to the earlier ones */
err_out5:
unregister_netdev_switch_notifier(&br_netdev_switch_notifier);
err_out4:
unregister_netdevice_notifier(&br_device_notifier);
err_out3:
br_nf_core_fini();
err_out2:
unregister_pernet_subsys(&br_net_ops);
err_out1:
br_fdb_fini();
err_out:
stp_proto_unregister(&br_stp_proto);
return err;
}
(1). register_netdevice_notifier(&br_device_notifier);
https://www.cnblogs.com/3me-linux/p/6566750.html
注册netdevice通知链,网桥设备是建立在其他设备之上的,那些设备的状态(UP/DOWN),地址改变等消息,会影响网桥设备的内部数据结构,如端口表,FDB等,因此需要关注netdev_chain,这些事件的处理由br_device_event()完成。
(2)brioctl_set(br_ioctl_deviceless_stub);
http://blog.sina.com.cn/s/blog_67cc0c8f0101oh33.html
用户空间程序使用网桥相关的命令来调用ioctl函数时,它经kernel依据命令所属的分类分派到compat_sock_ioctl_trans()函数,在compat_sock_ioctl_trans()函数中,根据cmd类型,调用相应函数,例如sock_ioctl()函数,dev_ifsioc()函数。sock_ioctl()函数,在sock_ioctl()函数里,当ioctl命令为SIOCGIFBR、SIOCSIFBR、SIOCBRADDBR、SIOCBRDELBR时,会调用br_ioctl_deviceless_stub()函数进行处理;在dev_ifsioc()函数中,调用设备的ndo_do_ioctl()函数,网桥对应的就是br_dev_ioctl()函数。
(3). compat_sock_ioctl_trans()函数
static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
unsigned int cmd, unsigned long arg)
{
...
switch (cmd) {
case SIOCSIFBR:
case SIOCGIFBR:
return old_bridge_ioctl(argp);
...
case SIOCBRADDBR:
case SIOCBRDELBR:
return sock_ioctl(file, cmd, arg);
case SIOCBRADDIF:
case SIOCBRDELIF:
...
return dev_ifsioc(net, sock, cmd, argp);
}
}
(4). sock_ioctl()函数
static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
switch (cmd) {
...
case SIOCGIFBR:
case SIOCSIFBR:
case SIOCBRADDBR:
case SIOCBRDELBR:
err = -ENOPKG;
if (!br_ioctl_hook)
request_module("bridge");
mutex_lock(&br_ioctl_mutex);
if (br_ioctl_hook)
err = br_ioctl_hook(net, cmd, argp);
mutex_unlock(&br_ioctl_mutex);
break;
...
}
return err;
}
(5). dev_ifsioc()函数
/*
* Perform the SIOCxIFxxx calls, inside rtnl_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
{
...
switch (cmd) {
...
/*
* Unknown or private ioctl
*/
default:
if ((cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15) ||
...
cmd == SIOCBRADDIF ||
cmd == SIOCBRDELIF ||
...
cmd == SIOCWANDEV) {
err = -EOPNOTSUPP;
if (ops->ndo_do_ioctl) {
if (netif_device_present(dev))
err = ops->ndo_do_ioctl(dev, ifr, cmd);
else
err = -ENODEV;
}
} else
err = -EINVAL;
}
return err;
}
(6). br_ioctl_deviceless_stub()函数
/*
 * Handler for the bridge ioctls that are not addressed to an existing
 * bridge device. SIOCGIFBR/SIOCSIFBR go to old_deviceless();
 * SIOCBRADDBR/SIOCBRDELBR create or delete the bridge named in @uarg.
 * Any other command returns -EOPNOTSUPP.
 */
int br_ioctl_deviceless_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
switch (cmd) {
case SIOCGIFBR:
case SIOCSIFBR:
return old_deviceless(net, uarg);
case SIOCBRADDBR:
case SIOCBRDELBR:
{
char buf[IFNAMSIZ];
/* creating/deleting a bridge requires CAP_NET_ADMIN in the namespace */
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
/* copy the bridge name in from user space */
if (copy_from_user(buf, uarg, IFNAMSIZ))
return -EFAULT;
/* force NUL termination: the user buffer is not trusted */
buf[IFNAMSIZ-1] = 0;
if (cmd == SIOCBRADDBR)
return br_add_bridge(net, buf);
return br_del_bridge(net, buf);
}
}
return -EOPNOTSUPP;
}
(7) . br_dev_ioctl()函数
/*
 * ioctl handler of a bridge device (installed as .ndo_do_ioctl).
 * SIOCDEVPRIVATE goes to old_dev_ioctl(); SIOCBRADDIF/SIOCBRDELIF add
 * or remove the interface whose ifindex is in @rq. Any other command
 * is logged and rejected with -EOPNOTSUPP.
 */
int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
{
/* the bridge state lives in the device's private area */
struct net_bridge *br = netdev_priv(dev);
switch (cmd) {
case SIOCDEVPRIVATE:
return old_dev_ioctl(dev, rq, cmd);
case SIOCBRADDIF:
case SIOCBRDELIF:
/* third argument is non-zero for "add", zero for "delete" */
return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF);
}
br_debug(br, "Bridge does not support ioctl 0x%x\n", cmd);
return -EOPNOTSUPP;
}
2. br_add_bridge()
添加网桥函数。这个函数中调用alloc_netdev()分配网桥设备及私有数据内存,调用register_netdev()注册网桥设备。在alloc_netdev()函数中,调用了br_dev_setup()函数对新建立网桥进行初始化工作。
在alloc_netdev()中创建设备的时候,把参数name直接拷贝到dev->name,这个时候只用BUG_ON检查了name的长度,没有对name的合法性和是否重复进行检验。
在register_netdev()注册设备的时候,对参数name进行校验,包括name合法性,name是否重复,name的形式是否为"前缀%d"形式,如果是,系统会根据前缀,顺序分配一个名称。
/*
 * Create and register a bridge device called @name in namespace @net.
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or
 * the error from register_netdev().
 */
int br_add_bridge(struct net *net, const char *name)
{
struct net_device *dev;
int res;
/* allocate the bridge device; its private area holds a struct
 * net_bridge and br_dev_setup is the setup callback */
dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN,
br_dev_setup);
if (!dev)
return -ENOMEM;
/* set the network namespace the device lives in */
dev_net_set(dev, net);
dev->rtnl_link_ops = &br_link_ops;
/* register the network device */
res = register_netdev(dev);
/* registration failed: release the device allocated above */
if (res)
free_netdev(dev);
return res;
}
(1). alloc_netdev宏定义
alloc_netdev():给网络设备分配空间,alloc_netdev()调用的是alloc_netdev_mqs()函数。
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
sizeof_priv:网络设备私有数据大小
name:设备名称,或者“设备名前缀%d”形式,相同前缀的设备会进行统一编号,以确保设备名唯一
name_assign_type:设备名来源
setup:网络设备初始化函数,回调函数。
name_assign_type类型:
这几个类型什么意思,还不清楚。
/* interface name assignment types (sysfs name_assign_type attribute) */
#define NET_NAME_UNKNOWN 0 /* unknown origin (not exposed to userspace) */
/*由内核枚举*/
#define NET_NAME_ENUM 1 /* enumerated by kernel */
#define NET_NAME_PREDICTABLE 2 /* predictably named by the kernel */
#define NET_NAME_USER 3 /* provided by user-space */
#define NET_NAME_RENAMED 4 /* renamed by user-space */
(2). alloc_netdev_mqs()函数:
alloc_netdev_mqs()函数,分配了net_device和私有数据的内存空间,对net_device进行了基本的初始化,并且分配了设备接收、发送队列空间。
txqs:分配的传输队列的数量
rxqs:分配的接收队列的数量
/**
* alloc_netdev_mqs - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @name_assign_type: origin of device name
* @setup: callback to initialize device
* @txqs: the number of TX subqueues to allocate
* @rxqs: the number of RX subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subqueue structs
* for each queue on the device.
*/
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
/* the name plus its terminating NUL must fit in dev->name, i.e.
 * strlen(name) must be strictly less than sizeof(dev->name) */
BUG_ON(strlen(name) >= sizeof(dev->name));
/* a device must have at least one TX queue */
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
#ifdef CONFIG_SYSFS
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
#endif
/* total size = sizeof(struct net_device) rounded up to NETDEV_ALIGN
 * (a 32-byte boundary, not 32-bit) plus the private data size */
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
/* rounding the net_device size up to NETDEV_ALIGN makes the private
 * area that follows it start on an aligned boundary too */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
/* reserve NETDEV_ALIGN - 1 spare bytes so that the start address can
 * be moved up to an aligned boundary below */
alloc_size += NETDEV_ALIGN - 1;
p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
/* fall back to vzalloc() when kzalloc() fails */
if (!p)
p = vzalloc(alloc_size);
if (!p)
return NULL;
/* dev is the aligned address actually used as the net_device; the
 * offset from the real allocation start p is stored in dev->padded
 * so the original address can be recovered when freeing */
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
/* default namespace; callers may override it with dev_net_set() */
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->gso_min_segs = 0;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
/* driver-specific initialisation callback; it runs before the
 * queues are allocated and before the name is copied in */
setup(dev);
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
goto free_all;
#ifdef CONFIG_SYSFS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
#endif
strcpy(dev->name, name);// name is copied into dev->name as-is, no validation here
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
/* use the default ethtool ops unless setup() installed its own */
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
插入一句,最近在看代码的时候,创建设备的时候用的是alloc_etherdev()函数,功能和alloc_netdev()的一样,只是封装了一下:
alloc_etherdev()
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)
/**
* alloc_etherdev_mqs - Allocates and sets up an Ethernet device
* @sizeof_priv: Size of additional driver-private structure to be allocated
* for this Ethernet device
* @txqs: The number of TX queues this device has.
* @rxqs: The number of RX queues this device has.
*
* Fill in the fields of the device structure with Ethernet-generic
* values. Basically does everything except registering the device.
*
* Constructs a new net device, complete with a private data area of
* size (sizeof_priv). A 32-byte (not bit) alignment is enforced for
* this private data area.
*/
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
/* thin wrapper: fixed "eth%d" name template and ether_setup as the
 * setup callback; everything else is passed through */
return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
ether_setup, txqs, rxqs);
}
(3)register_netdevice
这里只简单看一下name的赋值过程。
/**
* register_netdevice - register a network device
* @dev: device to register
*
* Take a completed network device structure and add it to the kernel
* interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
* chain. 0 is returned on success. A negative errno code is returned
* on a failure to set up the device, or if the name is a duplicate.
*
* Callers must hold the rtnl semaphore. You may want
* register_netdev() instead of this.
*
* BUGS:
* The locking appears insufficient to guarantee two parallel registers
* will not get the same name.
*/
int register_netdevice(struct net_device *dev)
{
int ret;
struct net *net = dev_net(dev);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
might_sleep();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
/* name handling: validate dev->name and, when it is a "%" template,
 * have a concrete name written into dev->name; a negative result
 * aborts the registration */
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
/* a positive return from ndo_init is reported as -EIO */
if (ret > 0)
ret = -EIO;
goto out;
}
}
if (((dev->hw_features | dev->features) &
NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
goto err_uninit;
}
/* ret is preset to -EBUSY for the case where a caller-supplied
 * ifindex is already in use in this namespace */
ret = -EBUSY;
if (!dev->ifindex)
dev->ifindex = dev_new_index(net);
else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
/* Transfer changeable features to wanted_features and enable
 * software offloads (GSO and GRO).
 */
dev->hw_features |= NETIF_F_SOFT_FEATURES;
dev->features |= NETIF_F_SOFT_FEATURES;
dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK)) {
dev->hw_features |= NETIF_F_NOCACHE_COPY;
}
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
 */
dev->vlan_features |= NETIF_F_HIGHDMA;
/* Make NETIF_F_SG inheritable to tunnel devices.
 */
dev->hw_enc_features |= NETIF_F_SG;
/* Make NETIF_F_SG inheritable to MPLS.
 */
dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
goto err_uninit;
ret = netdev_register_kobject(dev);
if (ret)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
/*
 * Default initial state at registry is that the
 * device is present.
 */
set_bit(__LINK_STATE_PRESENT, &dev->state);
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
 * set dev_addr and also addr_assign_type should be set to
 * NET_ADDR_PERM (default value).
 */
if (dev->addr_assign_type == NET_ADDR_PERM)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
/* a notifier vetoed the registration: roll it back */
if (ret) {
rollback_registered(dev);
dev->reg_state = NETREG_UNREGISTERED;
}
/*
 * Prevent userspace races by waiting until the network
 * device is fully setup before sending notifications.
 */
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
/* failure after ndo_init succeeded (or was absent): give the driver a
 * chance to undo it through ndo_uninit */
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
goto out;
}
(4). dev_get_valid_name
/*
 * Validate @name for @dev in namespace @net and store the final name in
 * dev->name. Returns -EINVAL for an invalid name, -EEXIST when the
 * name is already found in @net, the result of dev_alloc_name_ns() for
 * a '%' template, and 0 otherwise.
 */
static int dev_get_valid_name(struct net *net,
struct net_device *dev,
const char *name)
{
BUG_ON(!net);
if (!dev_valid_name(name))
return -EINVAL;
/* a '%' marks a template such as "br%d": let dev_alloc_name_ns()
 * choose the concrete name */
if (strchr(name, '%'))
return dev_alloc_name_ns(net, dev, name);
/* a fixed name must not already be found in this namespace */
else if (__dev_get_by_name(net, name))
return -EEXIST;
/* skip the copy when name already is dev->name's own buffer */
else if (dev->name != name)
strlcpy(dev->name, name, IFNAMSIZ);
return 0;
}
(5). dev_alloc_name_ns
由系统根据前缀赋值名称。
/*
 * Resolve the name template @name through __dev_alloc_name() and, when
 * that returns a non-negative value, copy the resulting name into
 * dev->name. Returns whatever __dev_alloc_name() returned.
 */
static int dev_alloc_name_ns(struct net *net,
struct net_device *dev,
const char *name)
{
/* scratch buffer: dev->name is only overwritten on success */
char buf[IFNAMSIZ];
int ret;
ret = __dev_alloc_name(net, name, buf);
if (ret >= 0)
strlcpy(dev->name, buf, IFNAMSIZ);
return ret;
}
例如我的设备上目前有一个br0,我想再创建一个br1:
3. br_dev_setup()函数
网桥初始化函数,网桥设备的net_device相应字段,网桥私有数据net_bridge字段设置。
/*
 * Setup callback for a new bridge device: fills in the bridge-specific
 * fields of @dev and initialises the struct net_bridge kept in the
 * device's private area.
 */
void br_dev_setup(struct net_device *dev)
{
/* netdev_priv() returns the start of the device's private data */
struct net_bridge *br = netdev_priv(dev);
/* generate a random MAC address */
eth_hw_addr_random(dev);
/* Ethernet device init: fill dev with Ethernet-generic values */
ether_setup(dev);
/* operations of the bridge device: br_netdev_ops */
dev->netdev_ops = &br_netdev_ops;
/* the destructor is br_dev_free */
dev->destructor = br_dev_free;
/* ethtool operations of the bridge device */
dev->ethtool_ops = &br_ethtool_ops;
/* set the device type of the network device */
SET_NETDEV_DEVTYPE(dev, &br_type);
/* transmit queue length is 0 (overrides the value set by ether_setup) */
dev->tx_queue_len = 0;
/* mark the device as a bridge */
dev->priv_flags = IFF_EBRIDGE;
dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL |
NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_STAG_TX;
dev->vlan_features = COMMON_FEATURES;
/* from here on: initialisation of the bridge private data */
/* dev points back to the bridge's own net_device */
br->dev = dev;
/* lock initialisation */
spin_lock_init(&br->lock);
/* bridge port list initialisation */
INIT_LIST_HEAD(&br->port_list);
/* spinlock of the CAM (forwarding) table */
spin_lock_init(&br->hash_lock);
/* default bridge priority 0x8000, i.e. 32768 */
br->bridge_id.prio[0] = 0x80;
br->bridge_id.prio[1] = 0x00;
/* STP-related initialisation */
/* 802.1D (STP) multicast address 01:80:C2:00:00:00 */
ether_addr_copy(br->group_addr, eth_reserved_addr_base);
/* STP is off by default; no multicast packets are blocked */
br->stp_enabled = BR_NO_STP;
br->group_fwd_mask = BR_GROUPFWD_DEFAULT;
br->group_fwd_mask_required = BR_GROUPFWD_DEFAULT;
br->designated_root = br->bridge_id;
/* 20 s: BPDU max age */
br->bridge_max_age = br->max_age = 20 * HZ;
/* 2 s: hello timer interval */
br->bridge_hello_time = br->hello_time = 2 * HZ;
/* 15 s forward delay, used for Block->Learning->Forwarding */
br->bridge_forward_delay = br->forward_delay = 15 * HZ;
/* ageing time of MAC addresses kept in the FDB: 5 minutes */
br->ageing_time = 300 * HZ;
/* netfilter initialisation for the bridge */
br_netfilter_rtable_init(br);
br_stp_timer_init(br);
/* multicast initialisation */
br_multicast_init(br);
}
(1). netdev_priv()函数
这个函数很简短,但是很重要,功能就一个。取出网桥设备私有数据起始地址,就是net_bridge结构地址。它也是32字节对齐的。
/**
* netdev_priv - access network device private data
* @dev: network device
*
* Get network device private data
*/
static inline void *netdev_priv(const struct net_device *dev)
{
/* the private area starts right after struct net_device, whose size
 * is rounded up to a NETDEV_ALIGN boundary */
return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}
(2). 宏SET_NETDEV_DEVTYPE(dev, &br_type);
/* Set the sysfs device type for the network logical device to allow
* fine-grained identification of different network device types. For
* example Ethernet, Wirelss LAN, Bluetooth, WiMAX etc.
*/
#define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype))
相当于dev->dev.type = &br_type
br_type结构体定义为:
static struct device_type br_type = {
.name = "bridge",
};
net_device数据结构中有个元素struct device dev; dev结构体定义为:当中有个type元素。
struct device
{
struct device *parent;
struct device_private *p;
struct kobject kobj;
const char *init_name; /* initial name of the device */
const struct device_type *type;
...
}
(3). ether_setup()
以太网设备初始化,使用以太网通用参数初始化net_device中以太网相关参数。包括L2头长度为14,MAC地址长度为6,MTU为1500等。
/**
* ether_setup - setup Ethernet network device
* @dev: network device
*
* Fill in the fields of the device structure with Ethernet-generic values.
*/
void ether_setup(struct net_device *dev)
{
/*L2层头操作函数,包括create,parse,cache等
这个参数初始化的地方没找到,以后再研究。*/
dev->header_ops = ð_header_ops;
/*设置ARP协议硬件标识符,ARPHRD_ETHER表示标准以太网*/
dev->type = ARPHRD_ETHER;
/*L2头长度为14*/
dev->hard_header_len = ETH_HLEN;
/*数据长度最大为1500*/
dev->mtu = ETH_DATA_LEN;
/*MAC地址长度为6*/
dev->addr_len = ETH_ALEN;
/*发送队列长度为1000*/
dev->tx_queue_len = 1000; /* Ethernet wants good queues */
/*?*/
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
/*Like 'flags' but invisible to userspace,see if.h for the definitions*/
dev->priv_flags |= IFF_TX_SKB_SHARING;
/*分配广播地址,全1*/
eth_broadcast_addr(dev->broadcast);
}
关于dev->type = ARPHRD_ETHER;
设置arp协议硬件标识符, ARPHRD_ETHER表示标准以太网。
/* ARP protocol HARDWARE identifiers. */
#define ARPHRD_NETROM 0 /* from KA9Q: NET/ROM pseudo */
#define ARPHRD_ETHER 1 /* Ethernet 10Mbps */
#define ARPHRD_EETHER 2 /* Experimental Ethernet */
#define ARPHRD_AX25 3 /* AX.25 Level 2 */
#define ARPHRD_PRONET 4 /* PROnet token ring */
#define ARPHRD_CHAOS 5 /* Chaosnet */
#define ARPHRD_IEEE802 6 /* IEEE 802.2 Ethernet/TR/TB */
#define ARPHRD_ARCNET 7 /* ARCnet */
#define ARPHRD_APPLETLK 8 /* APPLEtalk */
#define ARPHRD_DLCI 15 /* Frame Relay DLCI */
#define ARPHRD_ATM 19 /* ATM */
#define ARPHRD_METRICOM 23 /* Metricom STRIP (new IANA id) */
#define ARPHRD_IEEE1394 24 /* IEEE 1394 IPv4 - RFC 2734 */
#define ARPHRD_EUI64 27 /* EUI-64 */
#define ARPHRD_INFINIBAND 32 /* InfiniBand */
(4). br_netdev_ops操作函数
/* net_device operations of a bridge device (installed by br_dev_setup) */
static const struct net_device_ops br_netdev_ops = {
/* called when the bridge is opened */
.ndo_open = br_dev_open,
.ndo_stop = br_dev_stop,
.ndo_init = br_dev_init,
/* bridge transmit function */
.ndo_start_xmit = br_dev_xmit,
.ndo_get_stats64 = br_get_stats64,
/* set the bridge MAC address */
.ndo_set_mac_address = br_set_mac_address,
.ndo_set_rx_mode = br_dev_set_multicast_list,
.ndo_change_rx_flags = br_dev_change_rx_flags,
/* when changing the bridge MTU, the new value must not exceed the
 * smallest MTU among the enslaved devices */
.ndo_change_mtu = br_change_mtu,
/* handler for ioctl commands sent to the bridge device */
.ndo_do_ioctl = br_dev_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_netpoll_setup = br_netpoll_setup,
.ndo_netpoll_cleanup = br_netpoll_cleanup,
.ndo_poll_controller = br_poll_controller,
#endif
/* add a port to the bridge */
.ndo_add_slave = br_add_slave,
.ndo_del_slave = br_del_slave,
.ndo_fix_features = br_fix_features,
/* add an FDB forwarding entry */
.ndo_fdb_add = br_fdb_add,
.ndo_fdb_del = br_fdb_delete,
.ndo_fdb_dump = br_fdb_dump,
.ndo_bridge_getlink = br_getlink,
.ndo_bridge_setlink = br_setlink,
.ndo_bridge_dellink = br_dellink,
};
看看定义是哪样的:
/*
* This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are
* optional and can be filled with a null pointer.
*
* int (*ndo_init)(struct net_device *dev);
* This function is called once when network device is registered.
* The network device can use this for any late stage initialization
* or semantic validation. It can fail with an error code which will
* be propagated back to register_netdev.
*
* void (*ndo_uninit)(struct net_device *dev);
* This function is called when device is unregistered or when registration
* fails. It is not called if init fails.
*
* int (*ndo_open)(struct net_device *dev);
* This function is called when network device transitions to the up
* state.
*
* int (*ndo_stop)(struct net_device *dev);
* This function is called when network device transitions to the down
* state.
*
* netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb,
* struct net_device *dev);
* Called when a packet needs to be transmitted.
* Returns NETDEV_TX_OK. Can return NETDEV_TX_BUSY, but you should stop
* the queue before that can happen; it's for obsolete devices and weird
* corner cases, but the stack really does a non-trivial amount
* of useless work if you return NETDEV_TX_BUSY.
* (can also return NETDEV_TX_LOCKED iff NETIF_F_LLTX)
* Required can not be NULL.
*
* u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb,
* void *accel_priv, select_queue_fallback_t fallback);
* Called to decide which queue to use when device supports multiple
* transmit queues.
*
* void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
* This function is called to allow device receiver to make
* changes to configuration when multicast or promiscuous is enabled.
*
* void (*ndo_set_rx_mode)(struct net_device *dev);
* This function is called when device changes address list filtering.
* If driver handles unicast address filtering, it should set
* IFF_UNICAST_FLT to its priv_flags.
*
* int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
* This function is called when the Media Access Control address
* needs to be changed. If this interface is not defined, the
* mac address can not be changed.
*
* int (*ndo_validate_addr)(struct net_device *dev);
* Test if Media Access Control address is valid for the device.
*
* int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
* Called when a user request an ioctl which can't be handled by
* the generic interface code. If not defined ioctl's return
* not supported error code.
*
* int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
* Used to set network devices bus interface parameters. This interface
* is retained for legacy reason, new devices should use the bus
* interface (PCI) for low level management.
*
* int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
* Called when a user wants to change the Maximum Transfer Unit
* of a device. If not defined, any request to change MTU will
* return an error.
*
* void (*ndo_tx_timeout)(struct net_device *dev);
* Callback uses when the transmitter has not made any progress
* for dev->watchdog ticks.
*
* struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
* struct rtnl_link_stats64 *storage);
* struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
* Called when a user wants to get the network device usage
* statistics. Drivers must do one of the following:
* 1. Define @ndo_get_stats64 to fill in a zero-initialised
* rtnl_link_stats64 structure passed by the caller.
* 2. Define @ndo_get_stats to update a net_device_stats structure
* (which should normally be dev->stats) and return a pointer to
* it. The structure may be changed asynchronously only if each
* field is written atomically.
* 3. Update dev->stats asynchronously and atomically, and define
* neither operation.
*
* int (*ndo_vlan_rx_add_vid)(struct net_device *dev, __be16 proto, u16 vid);
* If device support VLAN filtering this function is called when a
* VLAN id is registered.
*
* int (*ndo_vlan_rx_kill_vid)(struct net_device *dev, __be16 proto, u16 vid);
* If device support VLAN filtering this function is called when a
* VLAN id is unregistered.
*
* void (*ndo_poll_controller)(struct net_device *dev);
*
* SR-IOV management functions.
* int (*ndo_set_vf_mac)(struct net_device *dev, int vf, u8* mac);
* int (*ndo_set_vf_vlan)(struct net_device *dev, int vf, u16 vlan, u8 qos);
* int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
* int max_tx_rate);
* int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
* int (*ndo_get_vf_config)(struct net_device *dev,
* int vf, struct ifla_vf_info *ivf);
* int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int link_state);
* int (*ndo_set_vf_port)(struct net_device *dev, int vf,
* struct nlattr *port[]);
*
* Enable or disable the VF ability to query its RSS Redirection Table and
* Hash Key. This is needed since on some devices VF share this information
* with PF and querying it may adduce a theoretical security risk.
* int (*ndo_set_vf_rss_query_en)(struct net_device *dev, int vf, bool setting);
* int (*ndo_get_vf_port)(struct net_device *dev, int vf, struct sk_buff *skb);
* int (*ndo_setup_tc)(struct net_device *dev, u8 tc)
* Called to setup 'tc' number of traffic classes in the net device. This
* is always called from the stack with the rtnl lock held and netif tx
* queues stopped. This allows the netdevice to perform queue management
* safely.
*
* Fiber Channel over Ethernet (FCoE) offload functions.
* int (*ndo_fcoe_enable)(struct net_device *dev);
* Called when the FCoE protocol stack wants to start using LLD for FCoE
* so the underlying device can perform whatever needed configuration or
* initialization to support acceleration of FCoE traffic.
*
* int (*ndo_fcoe_disable)(struct net_device *dev);
* Called when the FCoE protocol stack wants to stop using LLD for FCoE
* so the underlying device can perform whatever needed clean-ups to
* stop supporting acceleration of FCoE traffic.
*
* int (*ndo_fcoe_ddp_setup)(struct net_device *dev, u16 xid,
* struct scatterlist *sgl, unsigned int sgc);
* Called when the FCoE Initiator wants to initialize an I/O that
* is a possible candidate for Direct Data Placement (DDP). The LLD can
* perform necessary setup and returns 1 to indicate the device is set up
* successfully to perform DDP on this I/O, otherwise this returns 0.
*
* int (*ndo_fcoe_ddp_done)(struct net_device *dev, u16 xid);
* Called when the FCoE Initiator/Target is done with the DDPed I/O as
* indicated by the FC exchange id 'xid', so the underlying device can
* clean up and reuse resources for later DDP requests.
*
* int (*ndo_fcoe_ddp_target)(struct net_device *dev, u16 xid,
* struct scatterlist *sgl, unsigned int sgc);
* Called when the FCoE Target wants to initialize an I/O that
* is a possible candidate for Direct Data Placement (DDP). The LLD can
* perform necessary setup and returns 1 to indicate the device is set up
* successfully to perform DDP on this I/O, otherwise this returns 0.
*
* int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
* struct netdev_fcoe_hbainfo *hbainfo);
* Called when the FCoE Protocol stack wants information on the underlying
* device. This information is utilized by the FCoE protocol stack to
* register attributes with Fiber Channel management service as per the
* FC-GS Fabric Device Management Information(FDMI) specification.
*
* int (*ndo_fcoe_get_wwn)(struct net_device *dev, u64 *wwn, int type);
* Called when the underlying device wants to override default World Wide
* Name (WWN) generation mechanism in FCoE protocol stack to pass its own
* World Wide Port Name (WWPN) or World Wide Node Name (WWNN) to the FCoE
* protocol stack to use.
*
* RFS acceleration.
* int (*ndo_rx_flow_steer)(struct net_device *dev, const struct sk_buff *skb,
* u16 rxq_index, u32 flow_id);
* Set hardware filter for RFS. rxq_index is the target queue index;
* flow_id is a flow ID to be passed to rps_may_expire_flow() later.
* Return the filter ID on success, or a negative error code.
*
* Slave management functions (for bridge, bonding, etc).
* int (*ndo_add_slave)(struct net_device *dev, struct net_device *slave_dev);
* Called to make another netdev an underling.
*
* int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev);
* Called to release previously enslaved netdev.
*
* Feature/offload setting functions.
* netdev_features_t (*ndo_fix_features)(struct net_device *dev,
* netdev_features_t features);
* Adjusts the requested feature flags according to device-specific
* constraints, and returns the resulting flags. Must not modify
* the device state.
*
* int (*ndo_set_features)(struct net_device *dev, netdev_features_t features);
* Called to update device configuration to new features. Passed
* feature set might be less than what was returned by ndo_fix_features()).
* Must return >0 or -errno if it changed dev->features itself.
*
* int (*ndo_fdb_add)(struct ndmsg *ndm, struct nlattr *tb[],
* struct net_device *dev,
* const unsigned char *addr, u16 vid, u16 flags)
* Adds an FDB entry to dev for addr.
* int (*ndo_fdb_del)(struct ndmsg *ndm, struct nlattr *tb[],
* struct net_device *dev,
* const unsigned char *addr, u16 vid)
* Deletes the FDB entry from dev corresponding to addr.
* int (*ndo_fdb_dump)(struct sk_buff *skb, struct netlink_callback *cb,
* struct net_device *dev, struct net_device *filter_dev,
* int idx)
* Used to add FDB entries to dump requests. Implementers should add
* entries to skb and update idx with the number of entries.
*
* int (*ndo_bridge_setlink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags)
* int (*ndo_bridge_getlink)(struct sk_buff *skb, u32 pid, u32 seq,
* struct net_device *dev, u32 filter_mask,
* int nlflags)
* int (*ndo_bridge_dellink)(struct net_device *dev, struct nlmsghdr *nlh,
* u16 flags);
*
* int (*ndo_change_carrier)(struct net_device *dev, bool new_carrier);
* Called to change device carrier. Soft-devices (like dummy, team, etc)
* which do not represent real hardware may define this to allow their
* userspace components to manage their virtual carrier state. Devices
* that determine carrier state from physical hardware properties (eg
* network cables) or protocol-dependent mechanisms (eg
* USB_CDC_NOTIFY_NETWORK_CONNECTION) should NOT implement this function.
*
* int (*ndo_get_phys_port_id)(struct net_device *dev,
* struct netdev_phys_item_id *ppid);
* Called to get ID of physical port of this device. If driver does
* not implement this, it is assumed that the hw is not able to have
* multiple net devices on single physical port.
*
* void (*ndo_add_vxlan_port)(struct net_device *dev,
* sa_family_t sa_family, __be16 port);
* Called by vxlan to notify a driver about the UDP port and socket
* address family that vxlan is listening to. It is called only when
* a new port starts listening. The operation is protected by the
* vxlan_net->sock_lock.
*
* void (*ndo_del_vxlan_port)(struct net_device *dev,
* sa_family_t sa_family, __be16 port);
* Called by vxlan to notify the driver about a UDP port and socket
* address family that vxlan is not listening to anymore. The operation
* is protected by the vxlan_net->sock_lock.
*
* void* (*ndo_dfwd_add_station)(struct net_device *pdev,
* struct net_device *dev)
* Called by upper layer devices to accelerate switching or other
* station functionality into hardware. 'pdev' is the lowerdev
* to use for the offload and 'dev' is the net device that will
* back the offload. Returns a pointer to the private structure
* the upper layer will maintain.
* void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
* Called by upper layer device to delete the station created
* by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
* the station and priv is the structure returned by the add
* operation.
* netdev_tx_t (*ndo_dfwd_start_xmit)(struct sk_buff *skb,
* struct net_device *dev,
* void *priv);
* Callback to use for xmit over the accelerated station. This
* is used in place of ndo_start_xmit on accelerated net
* devices.
* netdev_features_t (*ndo_features_check) (struct sk_buff *skb,
* struct net_device *dev,
* netdev_features_t features);
* Called by core transmit path to determine if device is capable of
* performing offload operations on a given packet. This is to give
* the device an opportunity to implement any restrictions that cannot
* be otherwise expressed by feature flags. The check is called with
* the set of features that the stack has calculated and it returns
* those the driver believes to be appropriate.
* int (*ndo_set_tx_maxrate)(struct net_device *dev,
* int queue_index, u32 maxrate);
* Called when a user wants to set a max-rate limitation of specific
* TX queue.
* int (*ndo_get_iflink)(const struct net_device *dev);
* Called to get the iflink value of this device.
*/
/*
 * Table of management hooks a driver may provide for a network device.
 * Per the comment block preceding this struct, hooks are optional (may be
 * NULL) unless noted otherwise; ndo_start_xmit is marked as required.
 * The section comments below follow the grouping used in that comment block.
 */
struct net_device_ops {
/* Registration and up/down transitions */
int (*ndo_init)(struct net_device *dev);
void (*ndo_uninit)(struct net_device *dev);
int (*ndo_open)(struct net_device *dev);
int (*ndo_stop)(struct net_device *dev);
/* Transmit path */
netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb,
struct net_device *dev);
u16 (*ndo_select_queue)(struct net_device *dev,
struct sk_buff *skb,
void *accel_priv,
select_queue_fallback_t fallback);
/* Receive filtering and addressing */
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
/* Configuration: ioctl, bus parameters, MTU */
int (*ndo_do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd);
int (*ndo_set_config)(struct net_device *dev,
struct ifmap *map);
int (*ndo_change_mtu)(struct net_device *dev,
int new_mtu);
/* NOTE(review): ndo_neigh_setup is not described in the comment block above */
int (*ndo_neigh_setup)(struct net_device *dev,
struct neigh_parms *);
void (*ndo_tx_timeout) (struct net_device *dev);
/* Statistics: a driver defines ndo_get_stats64, ndo_get_stats, or neither */
struct rtnl_link_stats64* (*ndo_get_stats64)(struct net_device *dev,
struct rtnl_link_stats64 *storage);
struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);
/* VLAN id registration / unregistration for devices with VLAN filtering */
int (*ndo_vlan_rx_add_vid)(struct net_device *dev,
__be16 proto, u16 vid);
int (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
__be16 proto, u16 vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*ndo_poll_controller)(struct net_device *dev);
int (*ndo_netpoll_setup)(struct net_device *dev,
struct netpoll_info *info);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
#ifdef CONFIG_NET_RX_BUSY_POLL
/* NOTE(review): ndo_busy_poll is not described in the comment block above */
int (*ndo_busy_poll)(struct napi_struct *dev);
#endif
/* SR-IOV management functions. Note: the second parameter of the first two
 * hooks is named 'queue' here, while the comment block above calls it 'vf'. */
int (*ndo_set_vf_mac)(struct net_device *dev,
int queue, u8 *mac);
int (*ndo_set_vf_vlan)(struct net_device *dev,
int queue, u16 vlan, u8 qos);
int (*ndo_set_vf_rate)(struct net_device *dev,
int vf, int min_tx_rate,
int max_tx_rate);
int (*ndo_set_vf_spoofchk)(struct net_device *dev,
int vf, bool setting);
int (*ndo_get_vf_config)(struct net_device *dev,
int vf,
struct ifla_vf_info *ivf);
int (*ndo_set_vf_link_state)(struct net_device *dev,
int vf, int link_state);
int (*ndo_set_vf_port)(struct net_device *dev,
int vf,
struct nlattr *port[]);
int (*ndo_get_vf_port)(struct net_device *dev,
int vf, struct sk_buff *skb);
int (*ndo_set_vf_rss_query_en)(
struct net_device *dev,
int vf, bool setting);
/* Set up 'tc' traffic classes on the device */
int (*ndo_setup_tc)(struct net_device *dev, u8 tc);
#if IS_ENABLED(CONFIG_FCOE)
/* Fiber Channel over Ethernet (FCoE) offload functions */
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
int (*ndo_fcoe_ddp_target)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_get_hbainfo)(struct net_device *dev,
struct netdev_fcoe_hbainfo *hbainfo);
#endif
#if IS_ENABLED(CONFIG_LIBFCOE)
/* Constants defined next to ndo_fcoe_get_wwn; presumably the values passed
 * as its 'type' argument - TODO confirm */
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
#endif
#ifdef CONFIG_RFS_ACCEL
/* RFS acceleration */
int (*ndo_rx_flow_steer)(struct net_device *dev,
const struct sk_buff *skb,
u16 rxq_index,
u32 flow_id);
#endif
/* Slave management functions (for bridge, bonding, etc) */
int (*ndo_add_slave)(struct net_device *dev,
struct net_device *slave_dev);
int (*ndo_del_slave)(struct net_device *dev,
struct net_device *slave_dev);
/* Feature/offload setting functions */
netdev_features_t (*ndo_fix_features)(struct net_device *dev,
netdev_features_t features);
int (*ndo_set_features)(struct net_device *dev,
netdev_features_t features);
/* NOTE(review): the two neigh hooks are not described in the comment block above */
int (*ndo_neigh_construct)(struct neighbour *n);
void (*ndo_neigh_destroy)(struct neighbour *n);
/* FDB (forwarding database) entry add / delete / dump */
int (*ndo_fdb_add)(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid,
u16 flags);
int (*ndo_fdb_del)(struct ndmsg *ndm,
struct nlattr *tb[],
struct net_device *dev,
const unsigned char *addr,
u16 vid);
int (*ndo_fdb_dump)(struct sk_buff *skb,
struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev,
int idx);
/* Bridge link set / get / delete */
int (*ndo_bridge_setlink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);
int (*ndo_bridge_getlink)(struct sk_buff *skb,
u32 pid, u32 seq,
struct net_device *dev,
u32 filter_mask,
int nlflags);
int (*ndo_bridge_dellink)(struct net_device *dev,
struct nlmsghdr *nlh,
u16 flags);
/* Carrier state change, intended for soft devices */
int (*ndo_change_carrier)(struct net_device *dev,
bool new_carrier);
int (*ndo_get_phys_port_id)(struct net_device *dev,
struct netdev_phys_item_id *ppid);
/* NOTE(review): ndo_get_phys_port_name is not described in the comment block above */
int (*ndo_get_phys_port_name)(struct net_device *dev,
char *name, size_t len);
/* vxlan UDP port notifications */
void (*ndo_add_vxlan_port)(struct net_device *dev,
sa_family_t sa_family,
__be16 port);
void (*ndo_del_vxlan_port)(struct net_device *dev,
sa_family_t sa_family,
__be16 port);
/* Accelerated station (dfwd) offload hooks used by upper layer devices */
void* (*ndo_dfwd_add_station)(struct net_device *pdev,
struct net_device *dev);
void (*ndo_dfwd_del_station)(struct net_device *pdev,
void *priv);
netdev_tx_t (*ndo_dfwd_start_xmit) (struct sk_buff *skb,
struct net_device *dev,
void *priv);
/* NOTE(review): ndo_get_lock_subclass is not described in the comment block above */
int (*ndo_get_lock_subclass)(struct net_device *dev);
netdev_features_t (*ndo_features_check) (struct sk_buff *skb,
struct net_device *dev,
netdev_features_t features);
int (*ndo_set_tx_maxrate)(struct net_device *dev,
int queue_index,
u32 maxrate);
int (*ndo_get_iflink)(const struct net_device *dev);
};
(5). br_ethtool_ops操作函数
/*
 * ethtool operations of the bridge device: driver info comes from
 * br_getinfo, link state from ethtool_op_get_link. All other ethtool
 * hooks are left unset.
 */
static const struct ethtool_ops br_ethtool_ops = {
.get_drvinfo = br_getinfo,
.get_link = ethtool_op_get_link,
};
ethtool_ops结构体定义,...以后需要再单独分析。
4. 删除网桥br_del_bridge()函数
先根据name找到网桥设备,调用br_dev_delete()函数删除网桥。
/*
 * Delete the bridge device called @name in network namespace @net.
 *
 * The lookup and the deletion both run with the RTNL lock held.
 *
 * Returns 0 on success, or
 *   -ENXIO  no device with that name exists,
 *   -EPERM  the device is not a bridge (IFF_EBRIDGE not set in priv_flags),
 *   -EBUSY  the device is still up (IFF_UP set); it must be shut down first.
 */
int br_del_bridge(struct net *net, const char *name)
{
	struct net_device *bridge_dev;
	int err;

	rtnl_lock();

	/* Look the device up by name inside this namespace. */
	bridge_dev = __dev_get_by_name(net, name);
	if (!bridge_dev) {
		/* Could not find device */
		err = -ENXIO;
		goto unlock;
	}

	/* Attempt to delete non bridge device! */
	if (!(bridge_dev->priv_flags & IFF_EBRIDGE)) {
		err = -EPERM;
		goto unlock;
	}

	/* Not shutdown yet. */
	if (bridge_dev->flags & IFF_UP) {
		err = -EBUSY;
		goto unlock;
	}

	br_dev_delete(bridge_dev, NULL);
	err = 0;

unlock:
	rtnl_unlock();
	return err;
}
(1). 关于priv_flags:设备的私有标志位,只在内核内部由驱动设置和使用,对用户空间不可见;其中一部分标志(如IFF_EBRIDGE、IFF_BRIDGE_PORT)可用来标识设备的类型/角色
/**
* enum netdev_priv_flags - &struct net_device priv_flags
*
* These are the &struct net_device priv_flags; they are only set internally
* by drivers and used in the kernel. These flags are invisible to
* userspace, this means that the order of these flags can change
* during any kernel release.
*
* You should have a pretty good reason to be extending these flags.
*
* @IFF_802_1Q_VLAN: 802.1Q VLAN device
* @IFF_EBRIDGE: Ethernet bridging device
* @IFF_SLAVE_INACTIVE: bonding slave not the curr. active
* @IFF_MASTER_8023AD: bonding master, 802.3ad
* @IFF_MASTER_ALB: bonding master, balance-alb
* @IFF_BONDING: bonding master or slave
* @IFF_SLAVE_NEEDARP: need ARPs for validation
* @IFF_ISATAP: ISATAP interface (RFC4214)
* @IFF_MASTER_ARPMON: bonding master, ARP mon in use
* @IFF_WAN_HDLC: WAN HDLC device
* @IFF_XMIT_DST_RELEASE: dev_hard_start_xmit() is allowed to
* release skb->dst
* @IFF_DONT_BRIDGE: disallow bridging this ether dev
* @IFF_DISABLE_NETPOLL: disable netpoll at run-time
* @IFF_MACVLAN_PORT: device used as macvlan port
* @IFF_BRIDGE_PORT: device used as bridge port
* @IFF_OVS_DATAPATH: device used as Open vSwitch datapath port
* @IFF_TX_SKB_SHARING: The interface supports sharing skbs on transmit
* @IFF_UNICAST_FLT: Supports unicast filtering
* @IFF_TEAM_PORT: device used as team port
* @IFF_SUPP_NOFCS: device supports sending custom FCS
* @IFF_LIVE_ADDR_CHANGE: device supports hardware address
* change when it's running
* @IFF_MACVLAN: Macvlan device
* @IFF_XMIT_DST_RELEASE_PERM: not described in the original comment;
* presumably a permanent form of IFF_XMIT_DST_RELEASE - TODO confirm
* @IFF_IPVLAN_MASTER: not described in the original comment;
* presumably an ipvlan master device - TODO confirm
* @IFF_IPVLAN_SLAVE: not described in the original comment;
* presumably an ipvlan slave device - TODO confirm
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
IFF_EBRIDGE = 1<<1,
IFF_SLAVE_INACTIVE = 1<<2,
IFF_MASTER_8023AD = 1<<3,
IFF_MASTER_ALB = 1<<4,
IFF_BONDING = 1<<5,
IFF_SLAVE_NEEDARP = 1<<6,
IFF_ISATAP = 1<<7,
IFF_MASTER_ARPMON = 1<<8,
IFF_WAN_HDLC = 1<<9,
IFF_XMIT_DST_RELEASE = 1<<10,
IFF_DONT_BRIDGE = 1<<11,
IFF_DISABLE_NETPOLL = 1<<12,
IFF_MACVLAN_PORT = 1<<13,
IFF_BRIDGE_PORT = 1<<14,
IFF_OVS_DATAPATH = 1<<15,
IFF_TX_SKB_SHARING = 1<<16,
IFF_UNICAST_FLT = 1<<17,
IFF_TEAM_PORT = 1<<18,
IFF_SUPP_NOFCS = 1<<19,
IFF_LIVE_ADDR_CHANGE = 1<<20,
IFF_MACVLAN = 1<<21,
IFF_XMIT_DST_RELEASE_PERM = 1<<22,
IFF_IPVLAN_MASTER = 1<<23,
IFF_IPVLAN_SLAVE = 1<<24,
};
(2). br_fdb_delete_by_port()函数
删除网桥br上目的端口为p的fdb项;do_all为0时保留静态(is_static)项,do_all非0时静态项也一并删除
/*
 * Flush the fdb entries of @br whose destination port (->dst) is @p.
 *
 * Static entries are kept unless @do_all is non-zero. Entries flagged
 * is_local are removed through fdb_delete_local(), all others through
 * fdb_delete(). The whole walk runs under br->hash_lock with bottom
 * halves disabled.
 */
void br_fdb_delete_by_port(struct net_bridge *br,
			   const struct net_bridge_port *p,
			   int do_all)
{
	int bucket;

	spin_lock_bh(&br->hash_lock);

	/* Visit every bucket of the hash table. */
	for (bucket = 0; bucket < BR_HASH_SIZE; bucket++) {
		struct hlist_node *cur, *next;

		/* Safe iteration: the current entry may be deleted. */
		hlist_for_each_safe(cur, next, &br->hash[bucket]) {
			struct net_bridge_fdb_entry *entry =
				hlist_entry(cur, struct net_bridge_fdb_entry,
					    hlist);

			/* Skip other ports' entries, and static ones
			 * unless a full flush was requested.
			 */
			if (entry->dst != p ||
			    (entry->is_static && !do_all))
				continue;

			if (!entry->is_local) {
				fdb_delete(br, entry);
				continue;
			}

			fdb_delete_local(br, p, entry);
		}
	}

	spin_unlock_bh(&br->hash_lock);
}
(3). br_dev_delete()函数
删除网桥所有端口,删除网桥所有fdb项等。
/* Delete bridge device */
void br_dev_delete(struct net_device *dev, struct list_head *head)
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p, *n;
/*遍历删除网桥所有端口*/
list_for_each_entry_safe(p, n, &br->port_list, list) {
del_nbp(p);
}
/*删除目的地址为指定端口的fdb项,这里端口为NULL,表示删除网桥br的所有fdb项*/
br_fdb_delete_by_port(br, NULL, 1);
br_vlan_flush(br);
del_timer_sync(&br->gc_timer);
br_sysfs_delbr(br->dev);
/*从内核中删除设备*/
unregister_netdevice_queue(br->dev, head);
}
这个函数中有很多函数还没有分析,以后再分析,实在是太多了,分析不过来了。
5. br_add_if()
给网桥添加端口
对待添加的设备进行合法性判断,然后创建网桥端口,进行相关参数初始化,把设备的rx_handler设置为br_handle_frame()、插入fdb项等。
/*
 * Attach @dev to bridge @br as a new bridge port.
 *
 * Validates the device, allocates the port with new_nbp(), registers
 * sysfs/netpoll state, installs br_handle_frame as the device's
 * rx_handler, links the device under the bridge, then updates the fdb,
 * VLAN state, bridge id and MTU.
 *
 * Returns 0 on success. On failure returns -EINVAL, -ELOOP, -EBUSY or
 * -EOPNOTSUPP from the initial checks, the error encoded by new_nbp(),
 * or the error of the setup step that failed (after unwinding through
 * the err5..put_back labels).
 *
 * called with RTNL
 */
int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
* master network devices since the bridge layer rx_handler prevents
* the DSA fake ethertype handler to be invoked, so we do not strip off
* the DSA switch tag protocol header and the bridge layer just return
* RX_HANDLER_CONSUMED, stopping RX processing for these frames.
*/
/* Reject loopback devices, non-ARPHRD_ETHER devices, address lengths
 * other than ETH_ALEN, invalid ethernet addresses and DSA users. */
if ((dev->flags & IFF_LOOPBACK) ||
dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN ||
!is_valid_ether_addr(dev->dev_addr) ||
netdev_uses_dsa(dev))
return -EINVAL;
/* No bridging of bridges */
/* A device whose xmit hook is br_dev_xmit is itself a bridge. */
if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit)
return -ELOOP;
/* Device is already being bridged */
/* A device can be a port of only one bridge at a time. */
if (br_port_exists(dev))
return -EBUSY;
/* No bridging devices that dislike that (e.g. wireless) */
/* Such devices have IFF_DONT_BRIDGE set in priv_flags. */
if (dev->priv_flags & IFF_DONT_BRIDGE)
return -EOPNOTSUPP;
/* Allocate and initialise the net_bridge_port for this device. */
p = new_nbp(br, dev);
if (IS_ERR(p))
return PTR_ERR(p);
call_netdevice_notifiers(NETDEV_JOIN, dev);
err = dev_set_allmulti(dev, 1);
if (err)
goto put_back;
err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj),
SYSFS_BRIDGE_PORT_ATTR);
if (err)
goto err1;
err = br_sysfs_addif(p);
if (err)
goto err2;
err = br_netpoll_enable(p);
if (err)
goto err3;
/* Key step: install br_handle_frame as the rx_handler of the port
 * device, with the port as its rx_handler_data. Per the kernel-doc of
 * netdev_rx_handler_register(), the handler is then called from
 * __netif_receive_skb. */
err = netdev_rx_handler_register(dev, br_handle_frame, p);
if (err)
goto err4;
/* Mark the device as a bridge port in its priv_flags. */
dev->priv_flags |= IFF_BRIDGE_PORT;
err = netdev_master_upper_dev_link(dev, br->dev);
if (err)
goto err5;
/* Disable LRO (large receive offload) on the port device. */
dev_disable_lro(dev);
/* Add the port to the bridge's port_list. */
list_add_rcu(&p->list, &br->port_list);
nbp_update_port_count(br);
/* Recompute the feature flags of the bridge device. */
netdev_update_features(br->dev);
if (br->dev->needed_headroom < dev->needed_headroom)
br->dev->needed_headroom = dev->needed_headroom;
/* Insert an fdb entry for the port device's own MAC address. Per the
 * original notes, entries created by br_fdb_insert have is_local and
 * is_static both set (br_fdb_insert is not shown here - TODO confirm). */
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
/* Initialise the port's VLAN configuration. Per the original notes, if
 * the bridge has a default PVID it becomes the port's PVID, untagged
 * (nbp_vlan_init is not shown here - TODO confirm). */
if (nbp_vlan_init(p))
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
spin_lock_bh(&br->lock);
/* Recompute the bridge MAC address and bridge id now that a port has
 * been added; the port's MAC may become the bridge's MAC. */
changed_addr = br_stp_recalculate_bridge_id(br);
/* Enable the port if the port device is running and operationally up
 * and the bridge device itself is IFF_UP. */
if (netif_running(dev) && netif_oper_up(dev) &&
(br->dev->flags & IFF_UP))
br_stp_enable_port(p);
spin_unlock_bh(&br->lock);
br_ifinfo_notify(RTM_NEWLINK, p);
/* The bridge MAC changed: tell the netdevice notifier chain. */
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
/* Set the bridge MTU to br_min_mtu(br) (per the original notes, the
 * smallest MTU among all ports). */
dev_set_mtu(br->dev, br_min_mtu(br));
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
/* Error unwinding: each label undoes the step that had succeeded just
 * before the failing one, then falls through to the earlier labels. */
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
err4:
br_netpoll_disable(p);
err3:
sysfs_remove_link(br->ifobj, p->dev->name);
err2:
kobject_put(&p->kobj);
p = NULL; /* kobject_put frees */
err1:
dev_set_allmulti(dev, -1);
put_back:
dev_put(dev);
/* p is NULL here when coming through err2, making this kfree a no-op. */
kfree(p);
return err;
}
(1). new_nbp()函数
创建网桥端口函数,net_bridge_port结构初始化。
/*
 * Allocate and initialise a bridge port for @dev on bridge @br.
 *
 * Takes a reference on @dev (dev_hold) that the caller must drop if it
 * later abandons the port. Returns the new port, or an ERR_PTR: the
 * negative value from find_portno() or -ENOMEM.
 *
 * called with RTNL but without bridge lock
 */
static struct net_bridge_port *new_nbp(struct net_bridge *br,
				       struct net_device *dev)
{
	struct net_bridge_port *port;
	int port_no = find_portno(br);

	if (port_no < 0)
		return ERR_PTR(port_no);

	port = kzalloc(sizeof(*port), GFP_KERNEL);
	if (!port)
		return ERR_PTR(-ENOMEM);

	/* Bind the port to its bridge and to the (now referenced) device. */
	dev_hold(dev);
	port->br = br;
	port->dev = dev;

	/* Default port parameters. */
	port->path_cost = port_cost(dev);
	port->priority = 0x8000 >> BR_PORT_BITS;
	port->port_no = port_no;
	port->flags = BR_LEARNING | BR_FLOOD;

	/* The port starts out in the disabled state. */
	br_init_port(port);
	br_set_state(port, BR_STATE_DISABLED);
	br_stp_port_timer_init(port);
	br_multicast_add_port(port);

	return port;
}
(2). netdev_rx_handler_register()
注册设备的rx_handler。
/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Install @rx_handler as the receive handler of @dev; it will then be
 * called from __netif_receive_skb. A device can have only one handler:
 * if one is already registered, -EBUSY is returned and nothing changes.
 * Returns 0 on success.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (!dev->rx_handler) {
		/* Note: rx_handler_data must be set before rx_handler */
		rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
		rcu_assign_pointer(dev->rx_handler, rx_handler);
		return 0;
	}

	return -EBUSY;
}
(3). 关于br_stp_recalculate_bridge_id(br)
https://www.cnblogs.com/3me-linux/p/6566750.html
重新计算网桥MAC,Bridge ID
当一个网桥设备(不是端口设备)刚刚创建的时候,其MAC地址是随机的(见 br_dev_setup,旧实现是空MAC),这也会影响网桥ID(Prio+MAC),没有端口时网桥ID的MAC部分为0。当有设备作为其端口加入后,是个合适的机会重新为网桥选一个MAC,并重新计算网桥ID。如果用户已经手动设置过网桥MAC(addr_assign_type == NET_ADDR_SET),则保持不变;否则遍历网桥的所有端口,选取其中最小的MAC地址作为网桥的MAC(STP中ID小是好事,因为其他因素一样的情况下MAC越小ID越小,优先级就越高)。端口MAC是否合适(例如不是0、长度是48Bits)已经在br_add_if()中检查过。
/*
 * Re-derive the bridge MAC address / bridge id from the current ports.
 *
 * Picks the numerically smallest port MAC (all-zero if there are no
 * ports) and, if it differs from the address in br->bridge_id, applies
 * it through br_stp_change_bridge_id().
 *
 * Returns true if the bridge id was changed, false otherwise.
 *
 * called under bridge lock
 */
bool br_stp_recalculate_bridge_id(struct net_bridge *br)
{
	const unsigned char *zero_mac =
		(const unsigned char *)br_mac_zero_aligned;
	const unsigned char *lowest = zero_mac;
	struct net_bridge_port *port;

	/* user has chosen a value so keep it */
	if (br->dev->addr_assign_type == NET_ADDR_SET)
		return false;

	/* Track the smallest MAC seen; the first port always wins over
	 * the all-zero placeholder.
	 */
	list_for_each_entry(port, &br->port_list, list) {
		const unsigned char *candidate = port->dev->dev_addr;

		if (lowest != zero_mac &&
		    memcmp(candidate, lowest, ETH_ALEN) >= 0)
			continue;

		lowest = candidate;
	}

	/* no change */
	if (ether_addr_equal(br->bridge_id.addr, lowest))
		return false;

	/* Apply the new bridge MAC and bridge id. */
	br_stp_change_bridge_id(br, lowest);
	return true;
}
6. br_del_if()
删除网桥端口,重新设置网桥MTU,并在需要时更新网桥MAC和网桥ID。
/* called with RTNL */
int br_del_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
bool changed_addr;
/*根据dev获取对应网桥端口*/
p = br_port_get_rtnl(dev);
/*端口不存在或者端口不属于这个网桥*/
if (!p || p->br != br)
return -EINVAL;
/* Since more than one interface can be attached to a bridge,
* there still maybe an alternate path for netconsole to use;
* therefore there is no reason for a NETDEV_RELEASE event.
*/
/*删除网桥端口*/
del_nbp(p);
/*重新设置网桥的MTU*/
dev_set_mtu(br->dev, br_min_mtu(br));
spin_lock_bh(&br->lock);
/*重新计算网桥MAC和网桥ID*/
changed_addr = br_stp_recalculate_bridge_id(br);
spin_unlock_bh(&br->lock);
/*如果网桥地址发生改变,调用netdevice_notifiers,这个通知链还不懂。*/
if (changed_addr)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
//从新设置br->dev->feature字段。
netdev_update_features(br->dev);
return 0;
}