网络设备驱动的注册
net_device的生成和成员的赋值可由alloc_netdev来生成一个net_device结构体,对其成员赋值并返回该结构体的指针。第一个参数为设备私有成员的大小,第二个参数为设备名,第三个参数为net_device的setup()函数指针,第四、五个参数为要分配的发送和接收子队列的数量。setup()函数接收的参数也为struct net_device指针,用于预置net_device成员的值
#define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
#define alloc_netdev_mq(sizeof_priv, name, name_assign_type, setup, count) \
alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, count, \
count)
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
size_t alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name));
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
#ifdef CONFIG_SYSFS
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
#endif
alloc_size = sizeof(struct net_device);
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
alloc_size += sizeof_priv;
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1;
p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN);
dev->padded = (char *)dev - (char *)p;
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev))
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
INIT_LIST_HEAD(&dev->napi_list);
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all);
INIT_LIST_HEAD(&dev->ptype_specific);
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev))
goto free_all;
#ifdef CONFIG_SYSFS
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev))
goto free_all;
#endif
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops;
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
setup函数一般就是初始化net_device的一些成员
#define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1)
#define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count)
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
ether_setup, txqs, rxqs);
}
void ether_setup(struct net_device *dev)
{
dev->header_ops = ð_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->min_mtu = ETH_MIN_MTU;
dev->max_mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
eth_broadcast_addr(dev->broadcast);
}
网络设备的注册由register_netdev完成
int register_netdev(struct net_device *dev)
{
int err;
rtnl_lock();
err = register_netdevice(dev);
rtnl_unlock();
return err;
}
int register_netdevice(struct net_device *dev)
{
int ret;
struct net *net = dev_net(dev);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
might_sleep();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
ret = dev_get_valid_name(net, dev, dev->name);
if (ret < 0)
goto out;
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
if (ret > 0)
ret = -EIO;
goto out;
}
}
if (((dev->hw_features | dev->features) &
NETIF_F_HW_VLAN_CTAG_FILTER) &&
(!dev->netdev_ops->ndo_vlan_rx_add_vid ||
!dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
ret = -EINVAL;
goto err_uninit;
}
ret = -EBUSY;
if (!dev->ifindex)
dev->ifindex = dev_new_index(net);
else if (__dev_get_by_index(net, dev->ifindex))
goto err_uninit;
/* Transfer changeable features to wanted_features and enable
* software offloads (GSO and GRO).
*/
dev->hw_features |= NETIF_F_SOFT_FEATURES;
dev->features |= NETIF_F_SOFT_FEATURES;
if (dev->netdev_ops->ndo_udp_tunnel_add) {
dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
}
dev->wanted_features = dev->features & dev->hw_features;
if (!(dev->flags & IFF_LOOPBACK))
dev->hw_features |= NETIF_F_NOCACHE_COPY;
/* If IPv4 TCP segmentation offload is supported we should also
* allow the device to enable segmenting the frame with the option
* of ignoring a static IP ID value. This doesn't enable the
* feature itself but allows the user to enable it later.
*/
if (dev->hw_features & NETIF_F_TSO)
dev->hw_features |= NETIF_F_TSO_MANGLEID;
if (dev->vlan_features & NETIF_F_TSO)
dev->vlan_features |= NETIF_F_TSO_MANGLEID;
if (dev->mpls_features & NETIF_F_TSO)
dev->mpls_features |= NETIF_F_TSO_MANGLEID;
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
/* Make NETIF_F_SG inheritable to tunnel devices.
*/
dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
/* Make NETIF_F_SG inheritable to MPLS.
*/
dev->mpls_features |= NETIF_F_SG;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
goto err_uninit;
ret = netdev_register_kobject(dev);
if (ret)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
__netdev_update_features(dev);
/*
* Default initial state at registry is that the
* device is present.
*/
set_bit(__LINK_STATE_PRESENT, &dev->state);
linkwatch_init_dev(dev);
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
/* If the device has permanent device address, driver should
* set dev_addr and also addr_assign_type should be set to
* NET_ADDR_PERM (default value).
*/
if (dev->addr_assign_type == NET_ADDR_PERM)
memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
dev->reg_state = NETREG_UNREGISTERED;
}
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
out:
return ret;
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
if (dev->priv_destructor)
dev->priv_destructor(dev);
goto out;
}
网络设备的初始化
一般就是对硬件资源初始化,初始化net_device的公有成员,设置成员函数,获取私有信息并初始化
void xxx_init(struct net_device *dev)
{
/* 设备的私有信息结构体 */
struct xxx_priv *priv;
/* 检查设备是否存在和设备所使用的硬件资源 */
xxx_hw_init();
/* 初始化以太网设备的公用成员 */
ether_setup(dev);
/* 设置设备的成员函数指针 */
ndev->netdev_ops = &xxx_netdev_ops;
ndev->ethtool_ops = &xxx_ethtool_ops;
dev->watchdog_timeo = timeout;
/* 取得私有信息,并初始化它 */
priv = netdev_priv(dev);
... /* 初始化设备私有数据区 */
}
网络设备的打开与释放
使能设备使用的硬件资源,申请I/O区域、中断和DMA通道等;
调用Linux内核提供的netif_start_queue()函数,激活设备发送队
static int hisi_femac_net_open(struct net_device *dev)
{
struct hisi_femac_priv *priv = netdev_priv(dev);
hisi_femac_port_reset(priv);
hisi_femac_set_hw_mac_addr(priv, dev->dev_addr);
hisi_femac_rx_refill(priv);
netif_carrier_off(dev);
netdev_reset_queue(dev);
netif_start_queue(dev);
napi_enable(&priv->napi);
priv->link_status = 0;
if (dev->phydev)
phy_start(dev->phydev);
writel(IRQ_ENA_PORT0_MASK, priv->glb_base + GLB_IRQ_RAW);
hisi_femac_irq_enable(priv, IRQ_ENA_ALL | IRQ_ENA_PORT0 | DEF_INT_MASK);
return 0;
}
调用Linux内核提供的netif_stop_queue()函数,停止设备传输包。
释放设备所使用的I/O区域、中断和DMA资源
static int hisi_femac_net_close(struct net_device *dev)
{
struct hisi_femac_priv *priv = netdev_priv(dev);
hisi_femac_irq_disable(priv, IRQ_ENA_PORT0);
if (dev->phydev)
phy_stop(dev->phydev);
netif_stop_queue(dev);
napi_disable(&priv->napi);
hisi_femac_free_skb_rings(priv);
return 0;
}
数据发送流程
一般就是net_device_ops的ndo_start_xmit成员的实例,从上层协议传递过来的sk_buff参数获得数据包的有效数据和长度,将有效数据放入临时缓冲区,设置硬件的寄存器,驱使网络设备进行数据发送操作;当发送队列为满或因其他原因来不及发送当前上层传下来的数据包时,对netif_stop_queue阻止上层继续向网络设备驱动传递数据包,当忙于发送的数据包被发送完成后,在以TX结束的中断处理中,应该调用netif_wake_queue()唤醒被阻塞的上层,以启动它继续向网络设备驱动传送数据包
int xxx_tx(struct sk_buff *skb, struct net_device *dev)
{
int len;
char *data, shortpkt[ETH_ZLEN];
if (xxx_send_available(...)) { /* 发送队列未满,可以发送 */
/* 获得有效数据指针和长度 */
data = skb->data;
len = skb->len;
if (len < ETH_ZLEN) {
/* 如果帧长小于以太网帧最小长度,补0 */
memset(shortpkt, 0, ETH_ZLEN);
memcpy(shortpkt, skb->data, skb->len);
len = ETH_ZLEN;
data = shortpkt;
}
dev->trans_start = jiffies; /* 记录发送时间戳 */
if (avail) {/* 设置硬件寄存器,让硬件把数据包发送出去 */
xxx_hw_tx(data, len, dev);
} else {
netif_stop_queue(dev);
...
}
}
数据接收流程
网络设备接收数据的主要方法是由中断引发设备的中断处理函数里,判断中断事件为接收中断,则读取接收到的数据,分配sk_buffer数据结构和数据缓冲区,将接收到的数据复制到数据缓冲区,并调用netif_rx()函数将sk_buffer传递给上层协议
static void xxx_interrupt(int irq, void *dev_id)
{
...
switch (status &ISQ_EVENT_MASK) {
case ISQ_RECEIVER_EVENT:
/* 获取数据包 */
xxx_rx(dev);
break;
/* 其他类型的中断 */
}
}
static void xxx_rx(struct xxx_device *dev)
{
...
length = get_rev_len (...);
/* 分配新的套接字缓冲区 */
skb = dev_alloc_skb(length + 2);
skb_reserve(skb, 2); /* 对齐 */
skb->dev = dev;
/* 读取硬件上接收到的数据 */
insw(ioaddr + RX_FRAME_PORT, skb_put(skb, length), length >> 1);
if (length &1)
skb->data[length - 1] = inw(ioaddr + RX_FRAME_PORT);
/* 获取上层协议类型 */
skb->protocol = eth_type_trans(skb, dev);
/* 把数据包交给上层 */
netif_rx(skb);
/* 记录接收时间戳 */
dev->last_rx = jiffies;
...
}
如果是NAPI兼容的设备驱动,则可以通过poll方式接收数据包。在这种情况下,需要为该设备驱
动提供作为netif_napi_add()参数的xxx_poll()函数
static void xxx_interrupt(int irq, void *dev_id)
{
switch (status &ISQ_EVENT_MASK) {
case ISQ_RECEIVER_EVENT:
… /* 获取数据包 */
xxx_disable_rx_int(...); /* 禁止接收中断 */
napi_schedule(&priv->napi);
break;
… /* 其他类型的中断 */
}
}
static int xxx_poll(struct napi_struct *napi, int budget)
{
int npackets = 0;
struct sk_buff *skb;
struct xxx_priv *priv = container_of(napi, struct xxx_priv, napi);
struct xxx_packet *pkt;
while (npackets < budget && priv->rx_queue) {
/* 从队列中取出数据包 */
pkt = xxx_dequeue_buf(dev);
/* 接下来的处理和中断触发的数据包接收一致 */
skb = dev_alloc_skb(pkt->datalen + 2);
...
skb_reserve(skb, 2);
memcpy(skb_put(skb, pkt->datalen), pkt->data, pkt->datalen);
skb->dev = dev;
skb->protocol = eth_type_trans(skb, dev);
/* 调用netif_receive_skb,而不是net_rx, 将数据包交给上层协议 */
netif_receive_skb(skb);
/* 更改统计数据 */
priv->stats.rx_packets++;
priv->stats.rx_bytes += pkt->datalen;
xxx_release_buffer(pkt);
npackets++;
}
if (npackets < budget) {
napi_complete(napi);
xxx_enable_rx_int (…); /* 再次启动网络设备的接收中断 */
}
return npackets;
}
当第一次中断发生后,中断处理程序要禁止设备的数据包接收中断并调度NAPI
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{
list_add_tail(&napi->poll_list, &sd->poll_list);
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
网络连接状态
可以通过netif_carrier_on()和netif_carrier_off()函数改变设备的连接状态,如果驱动检测到连接状态发生变化,也应该以netif_carrier_on()和netif_carrier_off()函数显式地通知内核;netif_carrier_ok可判断载波是否存
void netif_carrier_on(struct net_device *dev)
{
if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
if (netif_running(dev))
__netdev_watchdog_up(dev);
}
}
void netif_carrier_off(struct net_device *dev)
{
if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
if (dev->reg_state == NETREG_UNINITIALIZED)
return;
atomic_inc(&dev->carrier_changes);
linkwatch_fire_event(dev);
}
}
static inline bool netif_carrier_ok(const struct net_device *dev)
{
return !test_bit(__LINK_STATE_NOCARRIER, &dev->state);
}
驱动还可能会提供一些供系统对设备的参数进行设置或读取设备相关信息,比如设置mac地址,流量统计等,详细可参考/drivers/net/ethernet/的各半导体厂商的eth驱动