内核分支优化宏likely与unlikely 与 __read_mostly

本文介绍了GCC编译器中的两种优化手段:likely与unlikely宏用于改善条件分支预测,以及__read_mostly属性用于提升频繁读取数据的缓存命中率。通过这些技术的应用,可以有效地提高程序的运行效率。

1.likely && unlikely

分支声明

对于条件选择语句,gcc内建了一条指令用于优化,在一个条件经常出现,或者该条件很少出现的时候,编译器可以根据这条指令对条件分支选择进行优化。内核把这条指令封装成了宏,比如likely()和unlikely(),这样使用起来比较方便。
例如,下面是一个条件选择语句:
if (foo) {
/* .. */
}
如果想要把这个选择标记成绝少发生的分支:
/* 我们认为foo绝大多数时间都会为0 */
if (unlikely(foo)) {
/* .. */
}
相反,如果我们想把一个分支标记为通常为真的选择:
/* 我们认为foo通常都不会为0 */
if(likely(foo)) {
/* .. */

}

需要注意的是:likely与unlikely并没有改变程序逻辑,仅仅是对分支预测提供一定的依据。


2.__read_mostly

__read_mostly是内核链接时使用的属性,与体系结构的cache机制密切相关。在Linux内核中,它定义在x86、ia64、powerpc、parisc、s390、sh、sparc等体系结构的asm/cache.h中:


#define __read_mostly __attribute__((__section__(".data.read_mostly")))


从定义的字面意思可以理解为将需要经常读取的数据链接进内核的 .data.read_mostly段. 在x86, ia64, powerpc, parisc, s390, sh, sparc 的链接脚本里(arch/xxx/kernel/vmlinux.lds.S)有关于 .data.read_mostly的定义, 就是指定 .data.read_mostly段的数据在内核加载时放到CPU的cache中. 在那些没有cache的体系结构上, __read_mostly被定义为空, 即:


#define __read_mostly



*/ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/if_ether.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> #include <net/flow_dissector.h> #include <linux/uaccess.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/icmp.h> #include <linux/proc_fs.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/kernel_stat.h> #include <linux/slab.h> __setup(“ether=”, netdev_boot_setup); static struct timer_list cpu_monitor_timer; static u64 prev_user, prev_nice, prev_system, prev_idle; static u64 prev_iowait, prev_irq, prev_softirq, prev_steal; static int cpu_monitor_val; int __rcu *qos_is_start __read_mostly = NULL; // EXPORT_SYMBOL(qos_is_start); /* 优化队列结构 */ static struct { struct sk_buff_head high_pri; // 高优先级队列 struct sk_buff_head low_pri; // 低优先级队列 atomic_t scheduled; // 调度标记 u32 high_count; // 高优先级计数 u32 low_count; // 低优先级计数 u32 bypass_count; // 直通计数 } qos_queue; /** eth_header - create the Ethernet header @skb: buffer to alter @dev: source device @type: Ethernet type field @daddr: destination address (NULL leave destination address) @saddr: source address (NULL use device source address) @len: packet length (<= skb->len) Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length in here instead. 
*/ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); if (type != ETH_P_802_3 && type != ETH_P_802_2) eth->h_proto = htons(type); else eth->h_proto = htons(len); /* Set the source hardware address. */ if (!saddr) saddr = dev->dev_addr; memcpy(eth->h_source, saddr, ETH_ALEN); if (daddr) { memcpy(eth->h_dest, daddr, ETH_ALEN); return ETH_HLEN; } /* Anyway, the loopback-device should never use this function... */ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { eth_zero_addr(eth->h_dest); return ETH_HLEN; } return -ETH_HLEN; } EXPORT_SYMBOL(eth_header); /** eth_get_headlen - determine the length of header for an ethernet frame @data: pointer to start of frame @len: total length of frame Make a best effort attempt to pull the length for all of the headers for a given frame in a linear buffer. */ u32 eth_get_headlen(void *data, unsigned int len) { const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, sizeof(*eth), len, 0)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); } EXPORT_SYMBOL(eth_get_headlen); /** eth_type_trans - determine the packet’s protocol ID. @skb: received socket data @dev: receiving network device The rule here is that we assume 802.3 if the type field is short enough to be a length. This is normal practice and works for any ‘now in use’ protocol. 
*/ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { unsigned short _service_access_point; const unsigned short *sap; const struct ethhdr *eth; skb->dev = dev; skb_reset_mac_header(skb); eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) skb->pkt_type = PACKET_OTHERHOST; /* Some variants of DSA tagging don’t have an ethertype field at all, so we check here whether one of those tagging variants has been configured on the receiving interface, and if so, set skb->protocol without looking at the packet. */ if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* This is a magic hack to spot IPX packets. Older Novell breaks the protocol design and runs IPX over 802.3 without an 802.2 LLC layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This won't work for fault tolerant netware but does for the rest. */ sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); if (sap && *sap == 0xFFFF) return htons(ETH_P_802_3); /* Real 802.2 LLC */ return htons(ETH_P_802_2); } EXPORT_SYMBOL(eth_type_trans); /** eth_header_parse - extract hardware address from packet @skb: packet to extract header from @haddr: destination buffer */ int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const struct ethhdr *eth = eth_hdr(skb); memcpy(haddr, eth->h_source, ETH_ALEN); return ETH_ALEN; } EXPORT_SYMBOL(eth_header_parse); /** eth_header_cache - fill cache entry from neighbour @neigh: source neighbour @hh: destination cache entry @type: Ethernet type field Create an Ethernet header template from the neighbour. 
*/ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type) { struct ethhdr *eth; const struct net_device *dev = neigh->dev; eth = (struct ethhdr *) (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); if (type == htons(ETH_P_802_3)) return -1; eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); hh->hh_len = ETH_HLEN; return 0; } EXPORT_SYMBOL(eth_header_cache); /** eth_header_cache_update - update cache entry @hh: destination cache entry @dev: network device @haddr: new hardware address Called by Address Resolution module to notify changes in address. */ void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr) { memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), haddr, ETH_ALEN); } EXPORT_SYMBOL(eth_header_cache_update); /** eth_prepare_mac_addr_change - prepare for mac change @dev: network device @p: socket address */ int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_prepare_mac_addr_change); /** eth_commit_mac_addr_change - commit mac change @dev: network device @p: socket address */ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); } EXPORT_SYMBOL(eth_commit_mac_addr_change); /** eth_mac_addr - set new Ethernet hardware address @dev: network device @p: socket address Change hardware address of device. This doesn’t change hardware matching, so needs to be overridden for most real devices. 
*/ int eth_mac_addr(struct net_device *dev, void *p) { int ret; ret = eth_prepare_mac_addr_change(dev, p); if (ret < 0) return ret; eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); /** eth_change_mtu - set new MTU size @dev: network device @new_mtu: new Maximum Transfer Unit Allow changing MTU size. Needs to be overridden for devices supporting jumbo frames. */ int eth_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) return -EINVAL; dev->mtu = new_mtu; return 0; } EXPORT_SYMBOL(eth_change_mtu); int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_validate_addr); const struct header_ops eth_header_ops ____cacheline_aligned = { .create = eth_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, }; /** ether_setup - setup Ethernet network device @dev: network device Fill in the fields of the device structure with Ethernet-generic values. */ void ether_setup(struct net_device dev) { dev->header_ops = &eth_header_ops; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = 1000; / Ethernet wants good queues */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; eth_broadcast_addr(dev->broadcast); } EXPORT_SYMBOL(ether_setup); /** alloc_etherdev_mqs - Allocates and sets up an Ethernet device @sizeof_priv: Size of additional driver-private structure to be allocated for this Ethernet device @txqs: The number of TX queues this device has. @rxqs: The number of RX queues this device has. Fill in the fields of the device structure with Ethernet-generic values. Basically does everything except registering the device. Constructs a new net device, complete with a private data area of size (sizeof_priv). 
A 32-byte (not bit) alignment is enforced for this private data area. */ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { return alloc_netdev_mqs(sizeof_priv, “eth%d”, NET_NAME_UNKNOWN, ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, “%*phC\n”, len, addr); } EXPORT_SYMBOL(sysfs_format_mac); struct sk_buff **eth_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff *p, **pp = NULL; struct ethhdr *eh, *eh2; unsigned int hlen, off_eth; const struct packet_offload *ptype; __be16 type; int flush = 1; off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header_fast(skb, off_eth); if (skb_gro_header_hard(skb, hlen)) { eh = skb_gro_header_slow(skb, hlen, off_eth); if (unlikely(!eh)) goto out; } flush = 0; for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (ptype == NULL) { flush = 1; goto out_unlock; } skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; return pp; } EXPORT_SYMBOL(eth_gro_receive); int eth_gro_complete(struct sk_buff *skb, int nhoff) { struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); __be16 type = eh->h_proto; struct packet_offload *ptype; int err = -ENOSYS; if (skb->encapsulation) skb_set_inner_mac_header(skb, nhoff); rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype != NULL) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct ethhdr)); rcu_read_unlock(); return err; } EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload 
eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, }, }; static bool is_critical_packet(const struct sk_buff skb) { / L1: 以太网层过滤 */ if (unlikely(skb->protocol != htons(ETH_P_IP))) return false; /* L2: IP头安全访问 */ struct iphdr _ip, *ip = skb_header_pointer(skb, 0, sizeof(_ip), &_ip); if (unlikely(!ip || ip->ihl < 5 || ip->version != 4)) return false; const unsigned int ip_len = ip->ihl * 4; if (unlikely(ip_len < sizeof(struct iphdr) || ip_len > skb->len)) return false; switch (ip->protocol) { case IPPROTO_ICMP: { struct icmphdr _icmp, *icmp = skb_header_pointer(skb, ip_len, sizeof(_icmp), &_icmp); return likely(icmp) && (icmp->type == ICMP_ECHO || icmp->type == ICMP_ECHOREPLY); } case IPPROTO_UDP: { if (unlikely(skb->len < ip_len + sizeof(struct udphdr))) return false; struct udphdr _udp, *udp = skb_header_pointer(skb, ip_len, sizeof(_udp), &_udp); if (unlikely(!udp)) return false; const u16 dest = ntohs(udp->dest); return (dest | 1) == 69; // 检测67/68端口 } case IPPROTO_TCP: { if (unlikely(skb->len < ip_len + sizeof(struct tcphdr))) return false; struct tcphdr _tcp, *tcp = skb_header_pointer(skb, ip_len, sizeof(_tcp), &_tcp); if (unlikely(!tcp)) return false; return (ntohs(tcp->dest) == 29814||ntohs(tcp->source) == 29814); } default: return false; } } /* 队列处理函数 */ static void process_qos_queue(void) { struct sk_buff *skb; int processed = 0; unsigned long flags; local_irq_save(flags); /* 优先处理高优先级队列(现在有数据)*/ while ((processed < 64) && (skb = __skb_dequeue(&qos_queue.high_pri))) { netif_receive_skb(skb); processed++; continue; } /* 处理低优先级队列,高处理完才会来 */ while ((processed < 64) && (skb = __skb_dequeue(&qos_queue.low_pri))) { netif_receive_skb(skb); processed++; continue; } /* 递归处理 */ if (!skb_queue_empty(&qos_queue.high_pri) || !skb_queue_empty(&qos_queue.low_pri)) { atomic_set(&qos_queue.scheduled, 0); process_qos_queue(); } else { 
atomic_set(&qos_queue.scheduled, 0); } local_irq_restore(flags); } /* 调度入口函数 */ void rx_qos_scheduler(struct sk_buff *skb) { if (is_critical_packet(skb)) { if (likely(!qos_is_start)) { netif_receive_skb(skb); // 低负载直接处理 qos_queue.bypass_count++; qos_queue.high_count++; return; } // 高负载时入高优先级队列 local_irq_disable(); __skb_queue_tail(&qos_queue.high_pri, skb); qos_queue.bypass_count++; qos_queue.high_count++; // 统计计数 goto trigger_processing; } /* 非关键报文处理 */ if (likely(!qos_is_start)) { qos_queue.bypass_count++; qos_queue.low_count++; netif_receive_skb(skb); // 低负载直接处理 return; } // 高负载时入低优先级队列 local_irq_disable(); __skb_queue_tail(&qos_queue.low_pri, skb); qos_queue.bypass_count++; qos_queue.low_count++; trigger_processing: /* 触发队列处理 */ if (!atomic_xchg(&qos_queue.scheduled, 1)) { process_qos_queue(); } local_irq_enable(); } EXPORT_SYMBOL(rx_qos_scheduler); /* 调试接口 */ static int qos_stats_show(struct seq_file *m, void *v) { seq_printf(m, “High Priority Packets: %u\n”, qos_queue.high_count); seq_printf(m, “Low Priority Packets: %u\n”, qos_queue.low_count); seq_printf(m, “Bypassed Packets: %u\n”, qos_queue.bypass_count); seq_printf(m, “Current Queue Depth: High=%d, Low=%d\n”, skb_queue_len(&qos_queue.high_pri), skb_queue_len(&qos_queue.low_pri)); return 0; } static int qos_stats_open(struct inode *inode, struct file *file) { return single_open(file, qos_stats_show, NULL); } static const struct file_operations qos_stats_fops = { .owner = THIS_MODULE, .open = qos_stats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; void cpu_timer_callback(struct timer_list *t) { int cpu_usage = 0; struct kernel_cpustat *kstat = &kcpustat_cpu(0); u64 cur_user = kstat->cpustat[CPUTIME_USER]; u64 cur_nice = kstat->cpustat[CPUTIME_NICE]; u64 cur_system = kstat->cpustat[CPUTIME_SYSTEM]; u64 cur_idle = kstat->cpustat[CPUTIME_IDLE]; u64 cur_iowait = kstat->cpustat[CPUTIME_IOWAIT]; u64 cur_irq = kstat->cpustat[CPUTIME_IRQ]; u64 cur_softirq = 
kstat->cpustat[CPUTIME_SOFTIRQ]; u64 cur_steal = kstat->cpustat[CPUTIME_STEAL]; u64 prev_total = prev_user + prev_nice + prev_system + prev_idle + prev_iowait + prev_irq + prev_softirq + prev_steal; u64 cur_total = cur_user + cur_nice + cur_system + cur_idle + cur_iowait + cur_irq + cur_softirq + cur_steal; u64 prev_busy = prev_total - prev_idle; u64 cur_busy = cur_total - cur_idle; s64 diff_total = cur_total - prev_total; s64 diff_busy = cur_busy - prev_busy; if (diff_total > 0) { cpu_usage = div64_u64(diff_busy * 100, diff_total); cpu_usage = min(cpu_usage, 100); cpu_usage = max(cpu_usage, 0); } if (cpu_usage > 90) { cpu_monitor_val = 1; } else { cpu_monitor_val = 0; } rcu_assign_pointer(qos_is_start, &cpu_monitor_val); prev_user = cur_user; prev_nice = cur_nice; prev_system = cur_system; prev_idle = cur_idle; prev_iowait = cur_iowait; prev_irq = cur_irq; prev_softirq = cur_softirq; prev_steal = cur_steal; mod_timer(&cpu_monitor_timer, jiffies + HZ); } static int __init rx_scheduler_init(void) { skb_queue_head_init(&qos_queue.high_pri); skb_queue_head_init(&qos_queue.low_pri); atomic_set(&qos_queue.scheduled, 0); qos_queue.high_count = 0; qos_queue.low_count = 0; /* 创建调试接口 */ proc_create("qos_stats", 0, NULL, &qos_stats_fops); printk(KERN_INFO "ETH QoS: Initialized\n"); struct kernel_cpustat *kstat = &kcpustat_cpu(0); prev_user = kstat->cpustat[CPUTIME_USER]; prev_nice = kstat->cpustat[CPUTIME_NICE]; prev_system = kstat->cpustat[CPUTIME_SYSTEM]; prev_idle = kstat->cpustat[CPUTIME_IDLE]; prev_iowait = kstat->cpustat[CPUTIME_IOWAIT]; prev_irq = kstat->cpustat[CPUTIME_IRQ]; prev_softirq = kstat->cpustat[CPUTIME_SOFTIRQ]; prev_steal = kstat->cpustat[CPUTIME_STEAL]; setup_timer(&cpu_monitor_timer, cpu_timer_callback, 0); mod_timer(&cpu_monitor_timer, jiffies + HZ); printk(KERN_INFO "cpu monitor init\n"); return 0; } static int __init eth_offload_init(void) { dev_add_offload(&eth_packet_offload); return 0; } fs_initcall(eth_offload_init); 
subsys_initcall(rx_scheduler_init);

测试时的 ping 输出如下:

来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=4ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=1ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=1ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=2ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=6ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间<1ms TTL=64
来自 192.168.0.254 的回复: 字节=32 时间=1ms TTL=64

可以看到延迟偶现抖动(个别回复达到 4ms/6ms),要如何消除这种抖动呢?
10-11
*/ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/ip.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/if_ether.h> #include <net/dst.h> #include <net/arp.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip.h> #include <net/dsa.h> #include <net/flow_dissector.h> #include <linux/uaccess.h> #include <linux/tcp.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <linux/icmp.h> #include <linux/proc_fs.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/kernel_stat.h> #include <linux/slab.h> __setup(“ether=”, netdev_boot_setup); static struct timer_list cpu_monitor_timer; static u64 prev_user, prev_nice, prev_system, prev_idle; static u64 prev_iowait, prev_irq, prev_softirq, prev_steal; static int qos_is_start; /* 优化队列结构 */ static struct { struct sk_buff_head high_pri; // 高优先级队列 struct sk_buff_head low_pri; // 低优先级队列 atomic_t scheduled; // 调度标记 u32 high_count; // 高优先级计数 u32 low_count; // 低优先级计数 u32 bypass_count; // 直通计数 } qos_queue; /** eth_header - create the Ethernet header @skb: buffer to alter @dev: source device @type: Ethernet type field @daddr: destination address (NULL leave destination address) @saddr: source address (NULL use device source address) @len: packet length (<= skb->len) Set the protocol type. For a packet of type ETH_P_802_3/2 we put the length in here instead. */ int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN); if (type != ETH_P_802_3 && type != ETH_P_802_2) eth->h_proto = htons(type); else eth->h_proto = htons(len); /* Set the source hardware address. 
*/ if (!saddr) saddr = dev->dev_addr; memcpy(eth->h_source, saddr, ETH_ALEN); if (daddr) { memcpy(eth->h_dest, daddr, ETH_ALEN); return ETH_HLEN; } /* Anyway, the loopback-device should never use this function... */ if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) { eth_zero_addr(eth->h_dest); return ETH_HLEN; } return -ETH_HLEN; } EXPORT_SYMBOL(eth_header); /** eth_get_headlen - determine the length of header for an ethernet frame @data: pointer to start of frame @len: total length of frame Make a best effort attempt to pull the length for all of the headers for a given frame in a linear buffer. */ u32 eth_get_headlen(void *data, unsigned int len) { const struct ethhdr *eth = (const struct ethhdr *)data; struct flow_keys keys; /* this should never happen, but better safe than sorry */ if (unlikely(len < sizeof(*eth))) return len; /* parse any remaining L2/L3 headers, check for L4 */ if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto, sizeof(*eth), len, 0)) return max_t(u32, keys.control.thoff, sizeof(*eth)); /* parse for any L4 headers */ return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len); } EXPORT_SYMBOL(eth_get_headlen); /** eth_type_trans - determine the packet’s protocol ID. @skb: received socket data @dev: receiving network device The rule here is that we assume 802.3 if the type field is short enough to be a length. This is normal practice and works for any ‘now in use’ protocol. 
*/ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { unsigned short _service_access_point; const unsigned short *sap; const struct ethhdr *eth; skb->dev = dev; skb_reset_mac_header(skb); eth = (struct ethhdr *)skb->data; skb_pull_inline(skb, ETH_HLEN); if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) { if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else if (unlikely(!ether_addr_equal_64bits(eth->h_dest, dev->dev_addr))) skb->pkt_type = PACKET_OTHERHOST; /* Some variants of DSA tagging don’t have an ethertype field at all, so we check here whether one of those tagging variants has been configured on the receiving interface, and if so, set skb->protocol without looking at the packet. */ if (unlikely(netdev_uses_dsa(dev))) return htons(ETH_P_XDSA); if (likely(eth_proto_is_802_3(eth->h_proto))) return eth->h_proto; /* This is a magic hack to spot IPX packets. Older Novell breaks the protocol design and runs IPX over 802.3 without an 802.2 LLC layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This won't work for fault tolerant netware but does for the rest. */ sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); if (sap && *sap == 0xFFFF) return htons(ETH_P_802_3); /* Real 802.2 LLC */ return htons(ETH_P_802_2); } EXPORT_SYMBOL(eth_type_trans); /** eth_header_parse - extract hardware address from packet @skb: packet to extract header from @haddr: destination buffer */ int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr) { const struct ethhdr *eth = eth_hdr(skb); memcpy(haddr, eth->h_source, ETH_ALEN); return ETH_ALEN; } EXPORT_SYMBOL(eth_header_parse); /** eth_header_cache - fill cache entry from neighbour @neigh: source neighbour @hh: destination cache entry @type: Ethernet type field Create an Ethernet header template from the neighbour. 
*/ int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type) { struct ethhdr *eth; const struct net_device *dev = neigh->dev; eth = (struct ethhdr *) (((u8 *) hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); if (type == htons(ETH_P_802_3)) return -1; eth->h_proto = type; memcpy(eth->h_source, dev->dev_addr, ETH_ALEN); memcpy(eth->h_dest, neigh->ha, ETH_ALEN); hh->hh_len = ETH_HLEN; return 0; } EXPORT_SYMBOL(eth_header_cache); /** eth_header_cache_update - update cache entry @hh: destination cache entry @dev: network device @haddr: new hardware address Called by Address Resolution module to notify changes in address. */ void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr) { memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), haddr, ETH_ALEN); } EXPORT_SYMBOL(eth_header_cache_update); /** eth_prepare_mac_addr_change - prepare for mac change @dev: network device @p: socket address */ int eth_prepare_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_prepare_mac_addr_change); /** eth_commit_mac_addr_change - commit mac change @dev: network device @p: socket address */ void eth_commit_mac_addr_change(struct net_device *dev, void *p) { struct sockaddr *addr = p; memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); } EXPORT_SYMBOL(eth_commit_mac_addr_change); /** eth_mac_addr - set new Ethernet hardware address @dev: network device @p: socket address Change hardware address of device. This doesn’t change hardware matching, so needs to be overridden for most real devices. 
*/ int eth_mac_addr(struct net_device *dev, void *p) { int ret; ret = eth_prepare_mac_addr_change(dev, p); if (ret < 0) return ret; eth_commit_mac_addr_change(dev, p); return 0; } EXPORT_SYMBOL(eth_mac_addr); /** eth_change_mtu - set new MTU size @dev: network device @new_mtu: new Maximum Transfer Unit Allow changing MTU size. Needs to be overridden for devices supporting jumbo frames. */ int eth_change_mtu(struct net_device *dev, int new_mtu) { if (new_mtu < 68 || new_mtu > ETH_DATA_LEN) return -EINVAL; dev->mtu = new_mtu; return 0; } EXPORT_SYMBOL(eth_change_mtu); int eth_validate_addr(struct net_device *dev) { if (!is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; return 0; } EXPORT_SYMBOL(eth_validate_addr); const struct header_ops eth_header_ops ____cacheline_aligned = { .create = eth_header, .parse = eth_header_parse, .cache = eth_header_cache, .cache_update = eth_header_cache_update, }; /** ether_setup - setup Ethernet network device @dev: network device Fill in the fields of the device structure with Ethernet-generic values. */ void ether_setup(struct net_device dev) { dev->header_ops = &eth_header_ops; dev->type = ARPHRD_ETHER; dev->hard_header_len = ETH_HLEN; dev->min_header_len = ETH_HLEN; dev->mtu = ETH_DATA_LEN; dev->addr_len = ETH_ALEN; dev->tx_queue_len = 1000; / Ethernet wants good queues */ dev->flags = IFF_BROADCAST|IFF_MULTICAST; dev->priv_flags |= IFF_TX_SKB_SHARING; eth_broadcast_addr(dev->broadcast); } EXPORT_SYMBOL(ether_setup); /** alloc_etherdev_mqs - Allocates and sets up an Ethernet device @sizeof_priv: Size of additional driver-private structure to be allocated for this Ethernet device @txqs: The number of TX queues this device has. @rxqs: The number of RX queues this device has. Fill in the fields of the device structure with Ethernet-generic values. Basically does everything except registering the device. Constructs a new net device, complete with a private data area of size (sizeof_priv). 
A 32-byte (not bit) alignment is enforced for this private data area. */ struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs) { return alloc_netdev_mqs(sizeof_priv, “eth%d”, NET_NAME_UNKNOWN, ether_setup, txqs, rxqs); } EXPORT_SYMBOL(alloc_etherdev_mqs); ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) { return scnprintf(buf, PAGE_SIZE, “%*phC\n”, len, addr); } EXPORT_SYMBOL(sysfs_format_mac); struct sk_buff **eth_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff *p, **pp = NULL; struct ethhdr *eh, *eh2; unsigned int hlen, off_eth; const struct packet_offload *ptype; __be16 type; int flush = 1; off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header_fast(skb, off_eth); if (skb_gro_header_hard(skb, hlen)) { eh = skb_gro_header_slow(skb, hlen, off_eth); if (unlikely(!eh)) goto out; } flush = 0; for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (ptype == NULL) { flush = 1; goto out_unlock; } skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); out_unlock: rcu_read_unlock(); out: NAPI_GRO_CB(skb)->flush |= flush; return pp; } EXPORT_SYMBOL(eth_gro_receive); int eth_gro_complete(struct sk_buff *skb, int nhoff) { struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff); __be16 type = eh->h_proto; struct packet_offload *ptype; int err = -ENOSYS; if (skb->encapsulation) skb_set_inner_mac_header(skb, nhoff); rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype != NULL) err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(struct ethhdr)); rcu_read_unlock(); return err; } EXPORT_SYMBOL(eth_gro_complete); static struct packet_offload 
eth_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_TEB), .priority = 10, .callbacks = { .gro_receive = eth_gro_receive, .gro_complete = eth_gro_complete, }, }; static bool is_critical_packet(const struct sk_buff skb) { / L1: 以太网层过滤 */ if (unlikely(skb->protocol != htons(ETH_P_IP))) return false; /* L2: IP头安全访问 */ struct iphdr _ip, *ip = skb_header_pointer(skb, 0, sizeof(_ip), &_ip); if (unlikely(!ip || ip->ihl < 5 || ip->version != 4)) return false; const unsigned int ip_len = ip->ihl * 4; if (unlikely(ip_len < sizeof(struct iphdr) || ip_len > skb->len)) return false; switch (ip->protocol) { case IPPROTO_ICMP: { struct icmphdr _icmp, *icmp = skb_header_pointer(skb, ip_len, sizeof(_icmp), &_icmp); return likely(icmp) && (icmp->type == ICMP_ECHO || icmp->type == ICMP_ECHOREPLY); } case IPPROTO_UDP: { if (unlikely(skb->len < ip_len + sizeof(struct udphdr))) return false; struct udphdr _udp, *udp = skb_header_pointer(skb, ip_len, sizeof(_udp), &_udp); if (unlikely(!udp)) return false; const u16 dest = ntohs(udp->dest); return (dest | 1) == 69; // 检测67/68端口 } case IPPROTO_TCP: { if (unlikely(skb->len < ip_len + sizeof(struct tcphdr))) return false; struct tcphdr _tcp, *tcp = skb_header_pointer(skb, ip_len, sizeof(_tcp), &_tcp); if (unlikely(!tcp)) return false; return (ntohs(tcp->dest) == 29814||ntohs(tcp->source) == 29814); } default: return false; } return false; } /* 队列处理函数 */ static void process_qos_queue(void) { struct sk_buff *skb; int processed = 0; unsigned long flags; local_irq_save(flags); /* 优先处理高优先级队列(现在有数据)*/ while ((processed < 64) && (skb = __skb_dequeue(&qos_queue.high_pri))) { netif_receive_skb(skb); processed++; } /* 处理低优先级队列 */ while ((processed < 64) && (skb = __skb_dequeue(&qos_queue.low_pri))) { netif_receive_skb(skb); processed++; } if (!skb_queue_empty(&qos_queue.high_pri) || !skb_queue_empty(&qos_queue.low_pri)) { atomic_set(&qos_queue.scheduled, 0); process_qos_queue(); } else { atomic_set(&qos_queue.scheduled, 0); } 
local_irq_restore(flags); } /* 调度入口函数 */ void rx_qos_scheduler(struct sk_buff *skb) { unsigned long flags; local_irq_save(flags); if (likely(!qos_is_start)) { qos_queue.bypass_count++; netif_receive_skb(skb); // 低负载直接处理 return; } if (is_critical_packet(skb)) { __skb_queue_tail(&qos_queue.high_pri, skb); qos_queue.bypass_count++; qos_queue.high_count++; } else { __skb_queue_tail(&qos_queue.low_pri, skb); qos_queue.bypass_count++; qos_queue.low_count++; } if (!atomic_xchg(&qos_queue.scheduled, 1)) { process_qos_queue(); } local_irq_restore(flags); } EXPORT_SYMBOL(rx_qos_scheduler); /* 调试接口 */ static int qos_stats_show(struct seq_file *m, void *v) { seq_printf(m, “High Priority Packets: %u\n”, qos_queue.high_count); seq_printf(m, “Low Priority Packets: %u\n”, qos_queue.low_count); seq_printf(m, “Bypassed Packets: %u\n”, qos_queue.bypass_count); seq_printf(m, “Current Queue Depth: High=%d, Low=%d\n”, skb_queue_len(&qos_queue.high_pri), skb_queue_len(&qos_queue.low_pri)); seq_printf(m, “qos_is_start: %d\n”, qos_is_start); return 0; } static int qos_stats_open(struct inode *inode, struct file *file) { return single_open(file, qos_stats_show, NULL); } static const struct file_operations qos_stats_fops = { .owner = THIS_MODULE, .open = qos_stats_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; void cpu_timer_callback(struct timer_list *t) { int cpu_usage = 0; struct kernel_cpustat *kstat = &kcpustat_cpu(0); u64 cur_user = kstat->cpustat[CPUTIME_USER]; u64 cur_nice = kstat->cpustat[CPUTIME_NICE]; u64 cur_system = kstat->cpustat[CPUTIME_SYSTEM]; u64 cur_idle = kstat->cpustat[CPUTIME_IDLE]; u64 cur_iowait = kstat->cpustat[CPUTIME_IOWAIT]; u64 cur_irq = kstat->cpustat[CPUTIME_IRQ]; u64 cur_softirq = kstat->cpustat[CPUTIME_SOFTIRQ]; u64 cur_steal = kstat->cpustat[CPUTIME_STEAL]; u64 prev_total = prev_user + prev_nice + prev_system + prev_idle + prev_iowait + prev_irq + prev_softirq + prev_steal; u64 cur_total = cur_user + cur_nice + cur_system + 
cur_idle + cur_iowait + cur_irq + cur_softirq + cur_steal; u64 prev_busy = prev_total - prev_idle; u64 cur_busy = cur_total - cur_idle; s64 diff_total = cur_total - prev_total; s64 diff_busy = cur_busy - prev_busy; if (diff_total > 0) { cpu_usage = div64_u64(diff_busy * 100, diff_total); cpu_usage = min(cpu_usage, 100); cpu_usage = max(cpu_usage, 0); } if (cpu_usage > 90) { qos_is_start = 1; } else { qos_is_start = 0; } prev_user = cur_user; prev_nice = cur_nice; prev_system = cur_system; prev_idle = cur_idle; prev_iowait = cur_iowait; prev_irq = cur_irq; prev_softirq = cur_softirq; prev_steal = cur_steal; mod_timer(&cpu_monitor_timer, jiffies + HZ); } static int __init rx_scheduler_init(void) { skb_queue_head_init(&qos_queue.high_pri); skb_queue_head_init(&qos_queue.low_pri); atomic_set(&qos_queue.scheduled, 0); qos_queue.high_count = 0; qos_queue.low_count = 0; /* 创建调试接口 */ proc_create("qos_stats", 0, NULL, &qos_stats_fops); printk(KERN_INFO "ETH QoS: Initialized\n"); struct kernel_cpustat *kstat = &kcpustat_cpu(0); prev_user = kstat->cpustat[CPUTIME_USER]; prev_nice = kstat->cpustat[CPUTIME_NICE]; prev_system = kstat->cpustat[CPUTIME_SYSTEM]; prev_idle = kstat->cpustat[CPUTIME_IDLE]; prev_iowait = kstat->cpustat[CPUTIME_IOWAIT]; prev_irq = kstat->cpustat[CPUTIME_IRQ]; prev_softirq = kstat->cpustat[CPUTIME_SOFTIRQ]; prev_steal = kstat->cpustat[CPUTIME_STEAL]; setup_timer(&cpu_monitor_timer, cpu_timer_callback, 0); mod_timer(&cpu_monitor_timer, jiffies + HZ); printk(KERN_INFO "cpu monitor init\n"); return 0; } static int __init eth_offload_init(void) { dev_add_offload(&eth_packet_offload); return 0; } fs_initcall(eth_offload_init); subsys_initcall(rx_scheduler_init); 我现在有这个程序,可以优先调度我认为的重要报文,但是我想要测试一下是否有效果,请你写一个内核模块,让cpu满载以至于报文延迟大或者丢包,然后我测试这个程序是否有用可以将重要报文优先处理 注意写的内核模块版本应为4.4.115
最新发布
10-13
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值