*/
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/ip.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/ip.h>
#include <net/dsa.h>
#include <net/flow_dissector.h>
#include <linux/uaccess.h>
#include <linux/tcp.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/icmp.h>
#include <linux/proc_fs.h>
#include <linux/time.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/slab.h>
__setup(“ether=”, netdev_boot_setup);
static struct timer_list cpu_monitor_timer;
static u64 prev_user, prev_nice, prev_system, prev_idle;
static u64 prev_iowait, prev_irq, prev_softirq, prev_steal;
static int qos_is_start;
/* 优化队列结构 */
static struct {
struct sk_buff_head high_pri; // 高优先级队列
struct sk_buff_head low_pri; // 低优先级队列
atomic_t scheduled; // 调度标记
u32 high_count; // 高优先级计数
u32 low_count; // 低优先级计数
u32 bypass_count; // 直通计数
} qos_queue;
/**
 * eth_header - create the Ethernet header
 * @skb: buffer to alter
 * @dev: source device
 * @type: Ethernet type field
 * @daddr: destination address (NULL leave destination address)
 * @saddr: source address (NULL use device source address)
 * @len: packet length (<= skb->len)
 *
 * Set the protocol type. For a packet of type ETH_P_802_3/2 we put the
 * length in here instead.
 *
 * Returns ETH_HLEN when the header is complete, or -ETH_HLEN when the
 * destination address still has to be resolved (e.g. via ARP).
 */
int eth_header(struct sk_buff *skb, struct net_device *dev,
	       unsigned short type,
	       const void *daddr, const void *saddr, unsigned int len)
{
	struct ethhdr *eth = (struct ethhdr *)skb_push(skb, ETH_HLEN);

	/* 802.3/802.2 carry the frame length where the type normally goes */
	if (type != ETH_P_802_3 && type != ETH_P_802_2)
		eth->h_proto = htons(type);
	else
		eth->h_proto = htons(len);

	/* Fill in the source hardware address. */
	if (!saddr)
		saddr = dev->dev_addr;
	memcpy(eth->h_source, saddr, ETH_ALEN);

	/* Destination known up front: header is complete. */
	if (daddr) {
		memcpy(eth->h_dest, daddr, ETH_ALEN);
		return ETH_HLEN;
	}

	/* The loopback device should never need address resolution. */
	if (dev->flags & (IFF_LOOPBACK | IFF_NOARP)) {
		eth_zero_addr(eth->h_dest);
		return ETH_HLEN;
	}

	/* Destination unresolved; caller must run address resolution. */
	return -ETH_HLEN;
}
EXPORT_SYMBOL(eth_header);
/**
 * eth_get_headlen - determine the length of header for an ethernet frame
 * @data: pointer to start of frame
 * @len: total length of frame
 *
 * Make a best effort attempt to pull the length for all of the headers
 * for a given frame in a linear buffer.
 */
u32 eth_get_headlen(void *data, unsigned int len)
{
	const struct ethhdr *eth = (const struct ethhdr *)data;
	struct flow_keys keys;

	/* Frame shorter than an Ethernet header: nothing more to pull. */
	if (unlikely(len < sizeof(*eth)))
		return len;

	/* Dissection failed: return however far the dissector got. */
	if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
					    sizeof(*eth), len, 0))
		return max_t(u32, keys.control.thoff, sizeof(*eth));

	/* Dissection succeeded: account for any L4 header too. */
	return min_t(u32, __skb_get_poff(NULL, data, &keys, len), len);
}
EXPORT_SYMBOL(eth_get_headlen);
/**
 * eth_type_trans - determine the packet's protocol ID.
 * @skb: received socket data
 * @dev: receiving network device
 *
 * The rule here is that we assume 802.3 if the type field is short
 * enough to be a length. This is normal practice and works for any
 * 'now in use' protocol.
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
	unsigned short _service_access_point;
	const unsigned short *sap;
	const struct ethhdr *eth;

	skb->dev = dev;
	skb_reset_mac_header(skb);
	eth = (struct ethhdr *)skb->data;
	skb_pull_inline(skb, ETH_HLEN);

	/* Classify by destination MAC: broadcast / multicast / other host. */
	if (unlikely(is_multicast_ether_addr_64bits(eth->h_dest))) {
		if (ether_addr_equal_64bits(eth->h_dest, dev->broadcast))
			skb->pkt_type = PACKET_BROADCAST;
		else
			skb->pkt_type = PACKET_MULTICAST;
	} else if (unlikely(!ether_addr_equal_64bits(eth->h_dest,
						     dev->dev_addr))) {
		skb->pkt_type = PACKET_OTHERHOST;
	}

	/*
	 * Some variants of DSA tagging don't have an ethertype field
	 * at all, so we check here whether one of those tagging
	 * variants has been configured on the receiving interface,
	 * and if so, set skb->protocol without looking at the packet.
	 */
	if (unlikely(netdev_uses_dsa(dev)))
		return htons(ETH_P_XDSA);

	if (likely(eth_proto_is_802_3(eth->h_proto)))
		return eth->h_proto;

	/*
	 * This is a magic hack to spot IPX packets. Older Novell breaks
	 * the protocol design and runs IPX over 802.3 without an 802.2 LLC
	 * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This
	 * won't work for fault tolerant netware but does for the rest.
	 */
	sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point);
	if (sap && *sap == 0xFFFF)
		return htons(ETH_P_802_3);

	/* Real 802.2 LLC */
	return htons(ETH_P_802_2);
}
EXPORT_SYMBOL(eth_type_trans);
/**
 * eth_header_parse - extract hardware address from packet
 * @skb: packet to extract header from
 * @haddr: destination buffer
 *
 * Copies the sender MAC into @haddr and returns its length (ETH_ALEN).
 */
int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr)
{
	const struct ethhdr *eth = eth_hdr(skb);

	ether_addr_copy(haddr, eth->h_source);
	return ETH_ALEN;
}
EXPORT_SYMBOL(eth_header_parse);
/**
 * eth_header_cache - fill cache entry from neighbour
 * @neigh: source neighbour
 * @hh: destination cache entry
 * @type: Ethernet type field
 *
 * Create an Ethernet header template from the neighbour. Returns 0 on
 * success, -1 for 802.3 frames (which carry a length, not a type, and
 * therefore cannot be templated).
 */
int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type)
{
	const struct net_device *dev = neigh->dev;
	struct ethhdr *eth;

	eth = (struct ethhdr *)
		(((u8 *)hh->hh_data) + (HH_DATA_OFF(sizeof(*eth))));

	/* 802.3 frames put the length in h_proto; no static template. */
	if (type == htons(ETH_P_802_3))
		return -1;

	eth->h_proto = type;
	memcpy(eth->h_source, dev->dev_addr, ETH_ALEN);
	memcpy(eth->h_dest, neigh->ha, ETH_ALEN);
	hh->hh_len = ETH_HLEN;
	return 0;
}
EXPORT_SYMBOL(eth_header_cache);
/**
eth_header_cache_update - update cache entry
@hh: destination cache entry
@dev: network device
@haddr: new hardware address
Called by Address Resolution module to notify changes in address.
*/
void eth_header_cache_update(struct hh_cache *hh,
const struct net_device *dev,
const unsigned char *haddr)
{
memcpy(((u8 *) hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)),
haddr, ETH_ALEN);
}
EXPORT_SYMBOL(eth_header_cache_update);
/**
eth_prepare_mac_addr_change - prepare for mac change
@dev: network device
@p: socket address
*/
int eth_prepare_mac_addr_change(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
return -EBUSY;
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
return 0;
}
EXPORT_SYMBOL(eth_prepare_mac_addr_change);
/**
eth_commit_mac_addr_change - commit mac change
@dev: network device
@p: socket address
*/
void eth_commit_mac_addr_change(struct net_device *dev, void *p)
{
struct sockaddr *addr = p;
memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
}
EXPORT_SYMBOL(eth_commit_mac_addr_change);
/**
 * eth_mac_addr - set new Ethernet hardware address
 * @dev: network device
 * @p: socket address
 *
 * Change hardware address of device.
 *
 * This doesn't change hardware matching, so needs to be overridden
 * for most real devices.
 */
int eth_mac_addr(struct net_device *dev, void *p)
{
	int err = eth_prepare_mac_addr_change(dev, p);

	if (err < 0)
		return err;
	eth_commit_mac_addr_change(dev, p);
	return 0;
}
EXPORT_SYMBOL(eth_mac_addr);
/**
 * eth_change_mtu - set new MTU size
 * @dev: network device
 * @new_mtu: new Maximum Transfer Unit
 *
 * Allow changing MTU size. Needs to be overridden for devices
 * supporting jumbo frames.
 */
int eth_change_mtu(struct net_device *dev, int new_mtu)
{
	/* 68 is the minimum IP datagram size every host must accept */
	if (new_mtu < 68 || new_mtu > ETH_DATA_LEN)
		return -EINVAL;
	dev->mtu = new_mtu;
	return 0;
}
EXPORT_SYMBOL(eth_change_mtu);
/* Reject devices whose configured MAC is not a valid unicast address. */
int eth_validate_addr(struct net_device *dev)
{
	return is_valid_ether_addr(dev->dev_addr) ? 0 : -EADDRNOTAVAIL;
}
EXPORT_SYMBOL(eth_validate_addr);
/* Default header operations for Ethernet devices (installed by ether_setup). */
const struct header_ops eth_header_ops ____cacheline_aligned = {
.create = eth_header,
.parse = eth_header_parse,
.cache = eth_header_cache,
.cache_update = eth_header_cache_update,
};
/**
ether_setup - setup Ethernet network device
@dev: network device
Fill in the fields of the device structure with Ethernet-generic values.
*/
void ether_setup(struct net_device dev)
{
dev->header_ops = ð_header_ops;
dev->type = ARPHRD_ETHER;
dev->hard_header_len = ETH_HLEN;
dev->min_header_len = ETH_HLEN;
dev->mtu = ETH_DATA_LEN;
dev->addr_len = ETH_ALEN;
dev->tx_queue_len = 1000; / Ethernet wants good queues */
dev->flags = IFF_BROADCAST|IFF_MULTICAST;
dev->priv_flags |= IFF_TX_SKB_SHARING;
eth_broadcast_addr(dev->broadcast);
}
EXPORT_SYMBOL(ether_setup);
/**
 * alloc_etherdev_mqs - Allocates and sets up an Ethernet device
 * @sizeof_priv: Size of additional driver-private structure to be allocated
 *	for this Ethernet device
 * @txqs: The number of TX queues this device has.
 * @rxqs: The number of RX queues this device has.
 *
 * Fill in the fields of the device structure with Ethernet-generic
 * values. Basically does everything except registering the device.
 *
 * Constructs a new net device, complete with a private data area of
 * size (sizeof_priv). A 32-byte (not bit) alignment is enforced for
 * this private data area.
 *
 * Fix: the name template used typographic quotes ("eth%d" was written
 * with U+201C/U+201D), which does not compile.
 */
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
				      unsigned int rxqs)
{
	return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
				ether_setup, txqs, rxqs);
}
EXPORT_SYMBOL(alloc_etherdev_mqs);
/*
 * sysfs_format_mac - format a hardware address for sysfs output
 * @buf: PAGE_SIZE output buffer
 * @addr: hardware address bytes
 * @len: number of address bytes
 *
 * Fix: the format string used typographic quotes, which does not compile.
 */
ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
{
	return scnprintf(buf, PAGE_SIZE, "%*phC\n", len, addr);
}
EXPORT_SYMBOL(sysfs_format_mac);
struct sk_buff **eth_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
struct sk_buff *p, **pp = NULL;
struct ethhdr *eh, *eh2;
unsigned int hlen, off_eth;
const struct packet_offload *ptype;
__be16 type;
int flush = 1;
off_eth = skb_gro_offset(skb); hlen = off_eth + sizeof(*eh); eh = skb_gro_header_fast(skb, off_eth); if (skb_gro_header_hard(skb, hlen)) { eh = skb_gro_header_slow(skb, hlen, off_eth); if (unlikely(!eh)) goto out; } flush = 0; for (p = *head; p; p = p->next) { if (!NAPI_GRO_CB(p)->same_flow) continue; eh2 = (struct ethhdr *)(p->data + off_eth); if (compare_ether_header(eh, eh2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } type = eh->h_proto; rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (ptype == NULL) { flush = 1; goto out_unlock; } skb_gro_pull(skb, sizeof(*eh)); skb_gro_postpull_rcsum(skb, eh, sizeof(*eh)); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
out_unlock:
rcu_read_unlock();
out:
NAPI_GRO_CB(skb)->flush |= flush;
return pp;
}
EXPORT_SYMBOL(eth_gro_receive);
/*
 * eth_gro_complete - GRO completion handler for encapsulated Ethernet
 * @skb: merged packet
 * @nhoff: offset of the inner Ethernet header
 *
 * Delegates completion to the inner protocol's handler; returns -ENOSYS
 * when no handler is registered for the inner ethertype.
 */
int eth_gro_complete(struct sk_buff *skb, int nhoff)
{
	struct ethhdr *eh = (struct ethhdr *)(skb->data + nhoff);
	__be16 type = eh->h_proto;
	struct packet_offload *ptype;
	int err = -ENOSYS;

	if (skb->encapsulation)
		skb_set_inner_mac_header(skb, nhoff);

	rcu_read_lock();
	ptype = gro_find_complete_by_type(type);
	if (ptype != NULL)
		err = ptype->callbacks.gro_complete(skb, nhoff +
						    sizeof(struct ethhdr));
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(eth_gro_complete);
/* GRO offload handlers for ETH_P_TEB (Transparent Ethernet Bridging). */
static struct packet_offload eth_packet_offload __read_mostly = {
.type = cpu_to_be16(ETH_P_TEB),
.priority = 10,
.callbacks = {
.gro_receive = eth_gro_receive,
.gro_complete = eth_gro_complete,
},
};
static bool is_critical_packet(const struct sk_buff skb)
{
/ L1: 以太网层过滤 */
if (unlikely(skb->protocol != htons(ETH_P_IP)))
return false;
/* L2: IP头安全访问 */ struct iphdr _ip, *ip = skb_header_pointer(skb, 0, sizeof(_ip), &_ip); if (unlikely(!ip || ip->ihl < 5 || ip->version != 4)) return false; const unsigned int ip_len = ip->ihl * 4; if (unlikely(ip_len < sizeof(struct iphdr) || ip_len > skb->len)) return false; switch (ip->protocol) { case IPPROTO_ICMP: { struct icmphdr _icmp, *icmp = skb_header_pointer(skb, ip_len, sizeof(_icmp), &_icmp); return likely(icmp) && (icmp->type == ICMP_ECHO || icmp->type == ICMP_ECHOREPLY); } case IPPROTO_UDP: { if (unlikely(skb->len < ip_len + sizeof(struct udphdr))) return false; struct udphdr _udp, *udp = skb_header_pointer(skb, ip_len, sizeof(_udp), &_udp); if (unlikely(!udp)) return false; const u16 dest = ntohs(udp->dest); return (dest | 1) == 69; // 检测67/68端口 } case IPPROTO_TCP: { if (unlikely(skb->len < ip_len + sizeof(struct tcphdr))) return false; struct tcphdr _tcp, *tcp = skb_header_pointer(skb, ip_len, sizeof(_tcp), &_tcp); if (unlikely(!tcp)) return false; return (ntohs(tcp->dest) == 29814||ntohs(tcp->source) == 29814); } default: return false; } return false;
}
/*
 * process_qos_queue - drain the QoS queues, high priority first
 *
 * Dequeues up to 64 packets per pass, high-priority queue first, and
 * keeps looping while either queue is non-empty. Runs with local IRQs
 * disabled for queue consistency.
 *
 * Fix vs. the original: it recursed into itself until both queues were
 * empty, which could overflow the kernel stack under sustained load;
 * this version iterates instead, and only clears the "scheduled" flag
 * once the drain is finished (the original cleared it before recursing,
 * widening the race with rx_qos_scheduler()).
 */
static void process_qos_queue(void)
{
	struct sk_buff *skb;
	unsigned long flags;

	local_irq_save(flags);
	do {
		int budget = 64;

		/* serve the high-priority queue first */
		while (budget > 0 &&
		       (skb = __skb_dequeue(&qos_queue.high_pri))) {
			netif_receive_skb(skb);
			budget--;
		}
		/* spend whatever budget remains on the low-priority queue */
		while (budget > 0 &&
		       (skb = __skb_dequeue(&qos_queue.low_pri))) {
			netif_receive_skb(skb);
			budget--;
		}
	} while (!skb_queue_empty(&qos_queue.high_pri) ||
		 !skb_queue_empty(&qos_queue.low_pri));
	atomic_set(&qos_queue.scheduled, 0);
	local_irq_restore(flags);
}
/*
 * rx_qos_scheduler - QoS-aware receive entry point
 * @skb: received packet
 *
 * While the CPU monitor reports low load (!qos_is_start) packets are
 * delivered directly and counted as bypassed. Under load, critical
 * packets (see is_critical_packet()) are queued ahead of everything
 * else and the queues are drained by a single context at a time.
 *
 * Fixes vs. the original:
 *  - the bypass path returned without local_irq_restore(), leaving
 *    local interrupts disabled forever;
 *  - bypass_count was also incremented on both queueing paths, making
 *    all three statistics meaningless.
 */
void rx_qos_scheduler(struct sk_buff *skb)
{
	unsigned long flags;

	local_irq_save(flags);

	/* low load: no prioritisation needed, deliver directly */
	if (likely(!qos_is_start)) {
		qos_queue.bypass_count++;
		local_irq_restore(flags);
		netif_receive_skb(skb);
		return;
	}

	if (is_critical_packet(skb)) {
		__skb_queue_tail(&qos_queue.high_pri, skb);
		qos_queue.high_count++;
	} else {
		__skb_queue_tail(&qos_queue.low_pri, skb);
		qos_queue.low_count++;
	}

	/* only one context drains the queues at a time */
	if (!atomic_xchg(&qos_queue.scheduled, 1))
		process_qos_queue();

	local_irq_restore(flags);
}
EXPORT_SYMBOL(rx_qos_scheduler);
/* 调试接口 */
static int qos_stats_show(struct seq_file *m, void *v)
{
seq_printf(m, “High Priority Packets: %u\n”, qos_queue.high_count);
seq_printf(m, “Low Priority Packets: %u\n”, qos_queue.low_count);
seq_printf(m, “Bypassed Packets: %u\n”, qos_queue.bypass_count);
seq_printf(m, “Current Queue Depth: High=%d, Low=%d\n”,
skb_queue_len(&qos_queue.high_pri),
skb_queue_len(&qos_queue.low_pri));
seq_printf(m, “qos_is_start: %d\n”, qos_is_start);
return 0;
}
/* seq_file open hook for /proc/qos_stats. */
static int qos_stats_open(struct inode *inode, struct file *file)
{
return single_open(file, qos_stats_show, NULL);
}
/* file_operations for the /proc/qos_stats debug interface. */
static const struct file_operations qos_stats_fops = {
.owner = THIS_MODULE,
.open = qos_stats_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
void cpu_timer_callback(struct timer_list *t)
{
int cpu_usage = 0;
struct kernel_cpustat *kstat = &kcpustat_cpu(0);
u64 cur_user = kstat->cpustat[CPUTIME_USER];
u64 cur_nice = kstat->cpustat[CPUTIME_NICE];
u64 cur_system = kstat->cpustat[CPUTIME_SYSTEM];
u64 cur_idle = kstat->cpustat[CPUTIME_IDLE];
u64 cur_iowait = kstat->cpustat[CPUTIME_IOWAIT];
u64 cur_irq = kstat->cpustat[CPUTIME_IRQ];
u64 cur_softirq = kstat->cpustat[CPUTIME_SOFTIRQ];
u64 cur_steal = kstat->cpustat[CPUTIME_STEAL];
u64 prev_total = prev_user + prev_nice + prev_system + prev_idle + prev_iowait + prev_irq + prev_softirq + prev_steal; u64 cur_total = cur_user + cur_nice + cur_system + cur_idle + cur_iowait + cur_irq + cur_softirq + cur_steal; u64 prev_busy = prev_total - prev_idle; u64 cur_busy = cur_total - cur_idle; s64 diff_total = cur_total - prev_total; s64 diff_busy = cur_busy - prev_busy; if (diff_total > 0) { cpu_usage = div64_u64(diff_busy * 100, diff_total); cpu_usage = min(cpu_usage, 100); cpu_usage = max(cpu_usage, 0); } if (cpu_usage > 90) { qos_is_start = 1; } else { qos_is_start = 0; } prev_user = cur_user; prev_nice = cur_nice; prev_system = cur_system; prev_idle = cur_idle; prev_iowait = cur_iowait; prev_irq = cur_irq; prev_softirq = cur_softirq; prev_steal = cur_steal; mod_timer(&cpu_monitor_timer, jiffies + HZ);
}
/*
 * rx_scheduler_init - initialise the QoS queues, stats and CPU monitor
 *
 * Fixes vs. the original: bypass_count is now reset alongside the other
 * counters, and the proc_create() result is checked (it was silently
 * ignored).
 */
static int __init rx_scheduler_init(void)
{
	struct kernel_cpustat *kstat = &kcpustat_cpu(0);

	skb_queue_head_init(&qos_queue.high_pri);
	skb_queue_head_init(&qos_queue.low_pri);
	atomic_set(&qos_queue.scheduled, 0);
	qos_queue.high_count = 0;
	qos_queue.low_count = 0;
	qos_queue.bypass_count = 0;

	/* debug interface: /proc/qos_stats */
	if (!proc_create("qos_stats", 0, NULL, &qos_stats_fops))
		printk(KERN_WARNING "ETH QoS: failed to create /proc/qos_stats\n");
	printk(KERN_INFO "ETH QoS: Initialized\n");

	/* seed the CPU-load sampler with the current counters */
	prev_user = kstat->cpustat[CPUTIME_USER];
	prev_nice = kstat->cpustat[CPUTIME_NICE];
	prev_system = kstat->cpustat[CPUTIME_SYSTEM];
	prev_idle = kstat->cpustat[CPUTIME_IDLE];
	prev_iowait = kstat->cpustat[CPUTIME_IOWAIT];
	prev_irq = kstat->cpustat[CPUTIME_IRQ];
	prev_softirq = kstat->cpustat[CPUTIME_SOFTIRQ];
	prev_steal = kstat->cpustat[CPUTIME_STEAL];

	/* 4.4-era timer API; first sample fires in one second */
	setup_timer(&cpu_monitor_timer, cpu_timer_callback, 0);
	mod_timer(&cpu_monitor_timer, jiffies + HZ);
	printk(KERN_INFO "cpu monitor init\n");
	return 0;
}
/*
 * eth_offload_init - register the TEB GRO offload handlers
 *
 * Fix: the original read "dev_add_offload(ð_packet_offload)" — an
 * HTML-entity corruption of "&eth_packet_offload".
 */
static int __init eth_offload_init(void)
{
	dev_add_offload(&eth_packet_offload);
	return 0;
}

fs_initcall(eth_offload_init);
subsys_initcall(rx_scheduler_init);
/*
 * NOTE(review): the lines below were non-code forum text pasted into the
 * file (it broke compilation). Preserved here, translated, as a comment:
 * "I have this program that can prioritise packets I consider important.
 * To test whether it works, please write a kernel module that saturates
 * the CPU so packets see high latency or drops; then I can verify that
 * critical packets are still processed first. The module should target
 * kernel 4.4.115." (Followed by a site banner: "Latest release".)
 */