linux内核中断 X86

最新推荐文章于 2025-03-03 12:30:57 发布

转载最新推荐文章于 2025-03-03 12:30:57 发布 · 761 阅读

0 ·

CC 4.0 BY-SA版权

原文链接：https://blog.youkuaiyun.com/yin262/article/details/54289502?spm=1001.2014.3001.5502

文章标签：

#java #开发语言

Linux内核驱动专栏收录该内容

65 篇文章

订阅专栏

本篇主要分析IDT的初始化流程。
IDT简介

IDT——Interrupt Descriptor Table（中断描述符表），用来描述中断异常向量，表中的每一个entry对应一个向量。
IDT entry：

这里写图片描述

每个entry为8bytes，有以下关键bit：
16~31：code segment selector
0~15 & 48~63：segment offset （根据以上两项可确定中断处理函数的地址）
Type：区分中断门、陷阱门、任务门等
DPL：Descriptor Privilege Level，访问特权级
P：该描述符是否在内存中
desc_struct ：

kernel中描述IDT entry的数据结构为

87 typedef struct desc_struct gate_desc;

14 /*
15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b accessors, and doing this allow us to do it
18 * incrementally. We keep the signature as a struct, rather than an union,
19 * so we can get rid of it transparently in the future -- glommer
20 */
21 /* 8 byte segment descriptor */
22 struct desc_struct {
23     union {
24         struct {
25             unsigned int a;
26             unsigned int b;
27         };
28         struct {
29             u16 limit0;
30             u16 base0;
31             unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
32             unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
33         };
34     };
35 } __attribute__((packed));

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24

idt_table：

kernel中描述IDT：

77 /* Must be page-aligned because the real IDT is used in a fixmap. */
78 gate_desc idt_table[NR_VECTORS] __page_aligned_bss;

1
2

初始化IDT
setup_once：

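第一次初始化，通过汇编形式完成：前NUM_EXCEPTION_VECTORS个表项依次指向early_idt_handler_array中对应的入口，其余表项统一指向ignore_int；所有表项都设置为中断门（DPL=0，present），段选择子为__KERNEL_CS。这里只建立IDT的内容，并不加载IDT。对汇编就不做太多研究了。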

470 /*
471 * setup_once
472 *
473 * The setup work we only want to run on the BSP.
474 *
475 * Warning: %esi is live across this function.
476 */
477 __INIT
478 setup_once:
479     /*
480      * Set up a idt with 256 interrupt gates that push zero if there
481      * is no error code and then jump to early_idt_handler_common.
482      * It doesn't actually load the idt - that needs to be done on
483      * each CPU. Interrupts are enabled elsewhere, when we can be
484      * relatively sure everything is ok.
485      */
486
487     movl $idt_table,%edi
488     movl $early_idt_handler_array,%eax
489     movl $NUM_EXCEPTION_VECTORS,%ecx
490 1:
491     movl %eax,(%edi)
492     movl %eax,4(%edi)
493     /* interrupt gate, dpl=0, present */
494     movl $(0x8E000000 + __KERNEL_CS),2(%edi)
495     addl $EARLY_IDT_HANDLER_SIZE,%eax
496     addl $8,%edi
497     loop 1b
498
499     movl $256 - NUM_EXCEPTION_VECTORS,%ecx
500     movl $ignore_int,%edx
501     movl $(__KERNEL_CS << 16),%eax
502     movw %dx,%ax        /* selector = 0x0010 = cs */
503     movw $0x8E00,%dx    /* interrupt gate - dpl=0, present */
504 2:
505     movl %eax,(%edi)
506     movl %edx,4(%edi)
507     addl $8,%edi
508     loop 2b

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39

early_trap_init：

第二次初始化，通过early_trap_init和trap_init初始化IDT中kernel保留的vector，比如前32个vector和system call（0x80）等。系统保留的向量和set_intr_gate等分析见附录。

742 /* Set of traps needed for early debugging. */
743 void __init early_trap_init(void)
744 {
745     set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
746     /* int3 can be called from all */
747     set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
748 #ifdef CONFIG_X86_32
749     set_intr_gate(X86_TRAP_PF, page_fault);
750 #endif
751     load_idt(&idt_descr);
752 }

761 void __init trap_init(void)
762 {
763     int i;
764
765 #ifdef CONFIG_EISA
766     void __iomem *p = early_ioremap(0x0FFFD9, 4);
767
768     if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
769         EISA_bus = 1;
770     early_iounmap(p, 4);
771 #endif
772
773     set_intr_gate(X86_TRAP_DE, divide_error);
774     set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);        //中断门
775     /* int4 can be called from all */
776     set_system_intr_gate(X86_TRAP_OF, &overflow);
777     set_intr_gate(X86_TRAP_BR, bounds);
778     set_intr_gate(X86_TRAP_UD, invalid_op);
779     set_intr_gate(X86_TRAP_NM, device_not_available);
780 #ifdef CONFIG_X86_32
781     set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);              //任务门
782 #else
783     set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
784 #endif
785     set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
786     set_intr_gate(X86_TRAP_TS, invalid_TSS);
787     set_intr_gate(X86_TRAP_NP, segment_not_present);
788     set_intr_gate(X86_TRAP_SS, stack_segment);
789     set_intr_gate(X86_TRAP_GP, general_protection);
790     set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
791     set_intr_gate(X86_TRAP_MF, coprocessor_error);
792     set_intr_gate(X86_TRAP_AC, alignment_check);
793 #ifdef CONFIG_X86_MCE
794     set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
795 #endif
796     set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
797
798     /* Reserve all the builtin and the syscall vector: */
799     for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)       //FIRST_EXTERNAL_VECTOR的值为32，即前32个中断/异常为系统保留，并且在used_vectors中设置对应的bit为1
800         set_bit(i, used_vectors);
801
802 #ifdef CONFIG_IA32_EMULATION
803     set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
804     set_bit(IA32_SYSCALL_VECTOR, used_vectors);
805 #endif
806
807 #ifdef CONFIG_X86_32
808     set_system_trap_gate(SYSCALL_VECTOR, &system_call);          //syscall vector 中断号为0x80，初始化为陷阱门
809     set_bit(SYSCALL_VECTOR, used_vectors);
810 #endif
811
812     /*
813      * Set the IDT descriptor to a fixed read-only location, so that the
814      * "sidt" instruction will not leak the location of the kernel, and
815      * to defend the IDT against arbitrary memory write vulnerabilities.
816      * It will be reloaded in cpu_init() */
817     __set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO);         //将中断描述符表（IDT）采用固定映射（fixed map）
818     idt_descr.address = fix_to_virt(FIX_RO_IDT);   //转换成虚拟地址
819
820     /*
821      * Should be a barrier for any external CPU state:
822      */
823     cpu_init();              //会调用load_current_idt
824
825     x86_init.irqs.trap_init();            //x86_init.irqs.trap_init = x86_init_noop;   do nothing
826
827 #ifdef CONFIG_X86_64
828     memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
829     set_nmi_gate(X86_TRAP_DB, &debug);
830     set_nmi_gate(X86_TRAP_BP, &int3);
831 #endif
832 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79
    80
    81
    82
    83
    84

在调用init_IRQ之前还有early_irq_init，但是early_irq_init没有IDT相关的初始化，所以暂不分析，放到irq_desc中再做分析。
init_IRQ

第三次初始化：

85 void __init init_IRQ(void)
86 {
87     int i;
88
89     /*
90      * We probably need a better place for this, but it works for
91      * now ...
92      */
93     x86_add_irq_domains();   //在cht平台上此函数为空
94
95     /*
96      * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
97      * If these IRQ's are handled by legacy interrupt-controllers like PIC,
98      * then this configuration will likely be static after the boot. If
99      * these IRQ's are handled by more mordern controllers like IO-APIC,
100      * then this vector space can be freed and re-used dynamically as the
101      * irq's migrate etc.
102      */
103     for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)        //设置CPU 0的vector_irq数组的48-63为legacy irq
104         per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;       //每个CPU有一个per_cpu变量叫vector_irq,一个数组，来描述irq号和vector号的关联，大小为256，index为vector号，对应值为irq num。
105
106     x86_init.irqs.intr_init();            //intr_init      = native_init_IRQ
107 }
native_init_IRQ对IDT表中剩下的所有表项进行初始化
197 void __init native_init_IRQ(void)
198 {
199     int i;
200
201     /* Execute any quirks before the call gates are initialised: */
202     x86_init.irqs.pre_vector_init();               //调用init_ISA_irqs，设置了radix tree中对应legacy irq的irq_desc的内容，如desc->irq_data.chip、 desc->handle_irq、 desc->name 等
203
204     apic_intr_init();                        //apic和一些系统vector的初始化，设置used_vectors，填充对应的idt table选项和used_vectors
205
206     /*
207      * Cover the whole vector space, no vector can escape
208      * us. (some of these will be overridden and become
209      * 'special' SMP interrupts)
210      */
211     i = FIRST_EXTERNAL_VECTOR;
212     for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {                  //共有256个vector，FIRST_EXTERNAL_VECTOR为0x20，初始化剩余的IDT表项（剩余项中也有一些是保留的，比如系统调用等）
213         /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
214         set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);         //调用set_intr_gate将剩余未初始化的向量全部设置中断门，并且用interrupt数组中的元素作为中断处理函数
215     }                                                                    //interrupt数组初始化见附录
216
217     if (!acpi_ioapic && !of_ioapic)
218         setup_irq(2, &irq2);
219
220 #ifdef CONFIG_X86_32
221     irq_ctx_init(smp_processor_id());
222 #endif
223 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51

到此为止，IDT的初始化已经完成了，当系统发生中断时，会通过IDT找到interrupt数组中对应向量的处理指针，这个指针并不是直接指向该irq的ISR。有点绕人，没关系，分析完irq_desc就知道了。
附录：

x86中前32个中断和异常为系统保留，定义如下：

109 /* Interrupts/Exceptions */
110 enum {
111     X86_TRAP_DE = 0,    /* 0, Divide-by-zero */
112     X86_TRAP_DB,        /* 1, Debug */
113     X86_TRAP_NMI,       /* 2, Non-maskable Interrupt */
114     X86_TRAP_BP,        /* 3, Breakpoint */
115     X86_TRAP_OF,        /* 4, Overflow */
116     X86_TRAP_BR,        /* 5, Bound Range Exceeded */
117     X86_TRAP_UD,        /* 6, Invalid Opcode */
118     X86_TRAP_NM,        /* 7, Device Not Available */
119     X86_TRAP_DF,        /* 8, Double Fault */
120     X86_TRAP_OLD_MF,    /* 9, Coprocessor Segment Overrun */
121     X86_TRAP_TS,        /* 10, Invalid TSS */
122     X86_TRAP_NP,        /* 11, Segment Not Present */
123     X86_TRAP_SS,        /* 12, Stack Segment Fault */
124     X86_TRAP_GP,        /* 13, General Protection Fault */
125     X86_TRAP_PF,        /* 14, Page Fault */
126     X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */
127     X86_TRAP_MF,        /* 16, x87 Floating-Point Exception */
128     X86_TRAP_AC,        /* 17, Alignment Check */
129     X86_TRAP_MC,        /* 18, Machine Check */
130     X86_TRAP_XF,        /* 19, SIMD Floating-Point Exception */
131     X86_TRAP_IRET = 32, /* 32, IRET Exception */
132 };

set_intr_gate_ist

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24

不管是中断还是异常最后调用的接口和流程都是相似的

436 static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)   //X86_TRAP_DB, &debug, 0
437 {
438     BUG_ON((unsigned)n > 0xFF); //中断号不能大于0xff
439     _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
440 }
_set_gate调用pack_gate组装成一个门描述符格式，并调用write_idt_entry写入IDT表中相应的描述符中
359 static inline void _set_gate(int gate, unsigned type, void *addr,             //X86_TRAP_DB, GATE_INTERRUPT, &debug
360                  unsigned dpl, unsigned ist, unsigned seg)        //0, 0, __KERNEL_CS   中断号为X86_TRAP_DB，中断门描述符类型为GATE_INTERRUPT， DPL为0表示只有内核态才能访问，__KERNEL_CS为段选择子
361 {
362     gate_desc s;
363
364     pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
365     /*
366      * does not need to be atomic because it is only done once at
367      * setup time
368      */
369     write_idt_entry(idt_table, gate, &s);
370     write_trace_idt_entry(gate, &s);
371 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19

interrupt数组

740 /*
741 * Build the entry stubs and pointer table with some assembler magic.
742 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
743 * single cache line on all modern x86 implementations.
744 */
745 .section .init.rodata,"a"                     //定义一个段，.init.rodata为init阶段使用的只读数据段（rodata即read-only data），“a”表示该section是可分配的（allocatable）
746 ENTRY(interrupt)                                //定义数据段的入口为interrupt
747 .section .entry.text, "ax"                    //x表示可执行代码段
748     .p2align 5                                //32字节对齐
749     .p2align CONFIG_X86_L1_CACHE_SHIFT
750 ENTRY(irq_entries_start)                        //代码段的入口定义为irq_entries_start
751     RING0_INT_FRAME            //
752 vector=FIRST_EXTERNAL_VECTOR                    //0-31号为内部向量，外部向量从0x20开始
753 .rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7    //(256-32+6) / 7 = 32，循环32次
754     .balign 32                                    //32字节对齐
755   .rept 7                                          // 循环7次,共有 32*7=224 次
756     .if vector < NR_VECTORS
757       .if vector <> FIRST_EXTERNAL_VECTOR
758     CFI_ADJUST_CFA_OFFSET -4
759       .endif
760 1: pushl_cfi $(~vector+0x80)   /* Note: always in signed byte range */            //将~vector+0x80压栈
761       .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6     //每组7个stub中的前6个需要用jmp 2f跳到本组末尾的标号2；第7个（(vector-FIRST_EXTERNAL_VECTOR)%7 == 6，即vector 38 45 52 59 66 73 80...）后面紧跟着标号2，顺序执行即可到达2: jmp common_interrupt，所以不需要生成jmp 2f
762     jmp 2f                                      //jmp to common_interrupt
763       .endif
764       .previous
765     .long 1b                                    //保存1的地址，包含了压栈和跳转到common_interrupt
766       .section .entry.text, "ax"
767 vector=vector+1                                //vector自加1，初始化下一个vector
768     .endif
769   .endr
770 2: jmp common_interrupt
771 .endr
772 END(irq_entries_start)
773
774 .previous
775 END(interrupt)
776 .previous

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37

每个中断的处理函数主要做了三件事：
1. 将~vector+0x80压栈
2. 跳转到common_interrupt
3. 依次保存1的地址（指向了1和2的代码内容）到interrupt数组（即interrupt标志的内存位置）对应位置

初始化interrupt数组之后，对应内存中保存了各个vector的中断处理函数的地址，大概如下：

数据段中的interrupt[0]指向这里（对应vector 0x20，即FIRST_EXTERNAL_VECTOR）
pushl $(~0x20+0x80)
jmp common_interrupt
nop
数据段中的interrupt[1]指向这里（对应vector 0x21）
pushl $(~0x21+0x80)
jmp common_interrupt
nop
……

编译以后产生的代码段数据段布局如下：

简介

irq_desc数据结构用于描述一个irq对应的各种信息，主要有以下方面：
irq_data，描述该irq的irq number，irq chip，irq domain，处理器亲和力等等
handle_irq，highlevel irq-events handler，流处理函数
irq_action，一个链表，每个成员包含该irq中断处理函数等信息
depth，disable-depth，即irq_disable()嵌套调用的深度（注意不是中断嵌套深度）
name，cat /proc/interrupts时显示的名称
等等

每个irq对应一个irq_desc，kernel中管理irq_descs有两种方式：
1、如果定义 CONFIG_SPARSE_IRQ，则所有irq_descs以radix tree的形式管理
2、否则所有irq_descs放在一个全局数组中，并对某些成员进行初始化，如下

    1
    2
    3
    4
    5
    6
    7

系统启动时初始化irq_descs
1、early_irq_init
1.1 Radix tree形式

只对系统的16个legacy中断进行irq_desc的初始化

229 int __init early_irq_init(void)
230 {
231     int i, initcnt, node = first_online_node;
232     struct irq_desc *desc;
233
234     init_irq_default_affinity(); //默认的中断亲和力是所有CPU，如果想绑定irq到某个cpu上该怎么做？http://blog.youkuaiyun.com/kingmax26/article/details/5788732
235
236     /* Let arch update nr_irqs and return the nr of preallocated irqs */
237     initcnt = arch_probe_nr_irqs();       //不同架构的preallocated irq数目不同，x86是16
~ 238     printk(KERN_ERR "yin_test NR_IRQS:%d nr_irqs:%d initcnt %d\n",
+ 239             NR_IRQS, nr_irqs, initcnt);
240
241     if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS))
242         nr_irqs = IRQ_BITMAP_BITS;
243
244     if (WARN_ON(initcnt > IRQ_BITMAP_BITS))
245         initcnt = IRQ_BITMAP_BITS;
246
247     if (initcnt > nr_irqs)
248         nr_irqs = initcnt;
249
250     for (i = 0; i < initcnt; i++) {             //对以上的16个irq进行irq_desc的初始化
251         desc = alloc_desc(i, node, NULL);   //分配irq_desc并对其中某些成员进行初始化
252         set_bit(i, allocated_irqs);            //set bit in allocated_irqs
253         irq_insert_desc(i, desc);                          //插入到radix tree中
254     }
255     return arch_early_irq_init();     //设置以上16个legacy irq的chip_data，void类型
256 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28

1.2 全局数组

260 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
261     [0 ... NR_IRQS-1] = {
262         .handle_irq = handle_bad_irq,
263         .depth      = 1,
264         .lock       = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
265     }
266 };
267
268 int __init early_irq_init(void)
269 {
270     int count, i, node = first_online_node;
271     struct irq_desc *desc;
272
273     init_irq_default_affinity();
274
275     printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
276
277
278     desc = irq_desc;
279     count = ARRAY_SIZE(irq_desc);
280
281     for (i = 0; i < count; i++) { //遍历数组，对成员进行初始化
282         desc[i].kstat_irqs = alloc_percpu(unsigned int);
283         alloc_masks(&desc[i], GFP_KERNEL, node);
284         raw_spin_lock_init(&desc[i].lock);
285         lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
286         desc_set_defaults(i, &desc[i], node, NULL);
287     }
288     return arch_early_irq_init();
289 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30

可以看到，这种方法就比较简单粗暴了，直接静态定义，然后依次对数组元素初始化。
2.init_IRQ

init_IRQ不仅对IDT进行初始化，也对irq_desc进行了初始化

85 void __init init_IRQ(void)
86 {
87     int i;
88
89     /*
90      * We probably need a better place for this, but it works for
91      * now ...
92      */
93     x86_add_irq_domains();   //在x86平台上此函数为空
94
95     /* //默认使用legacy irq PIC（8259），如果使用APIC，那么后续将被释放和动态reused
96      * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
97      * If these IRQ's are handled by legacy interrupt-controllers like PIC,
98      * then this configuration will likely be static after the boot. If
99      * these IRQ's are handled by more mordern controllers like IO-APIC,
100      * then this vector space can be freed and re-used dynamically as the
101      * irq's migrate etc.
102      */
103     for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) //设置CPU 0的vector_irq数组的48-63为legacy irq 0~15
104         per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;       //每个CPU有一个per_cpu变量叫vector_irq,一个数组，来描述irq号和vector号的关联，大小为256，index为vector号，对应值为irq num。
105
106     x86_init.irqs.intr_init();            //intr_init      = native_init_IRQ
107 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23

2.1 native_init_IRQ

197 void __init native_init_IRQ(void)
198 {
199     int i;
200
201     /* Execute any quirks before the call gates are initialised: */
202     x86_init.irqs.pre_vector_init();      //调用init_ISA_irqs
203
204     apic_intr_init();
205
206     /*
207      * Cover the whole vector space, no vector can escape
208      * us. (some of these will be overridden and become
209      * 'special' SMP interrupts)
210      */
211     i = FIRST_EXTERNAL_VECTOR;
212     for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
213         /* IA32_SYSCALL_VECTOR could be used in trap_init already. */
214         set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
215     }
216
217     if (!acpi_ioapic && !of_ioapic)
218         setup_irq(2, &irq2);
219
220 #ifdef CONFIG_X86_32
221     irq_ctx_init(smp_processor_id());
222 #endif
223 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27

   70 void __init init_ISA_irqs(void)
   71 {
   72     struct irq_chip *chip = legacy_pic->chip;
   73     const char *name = chip->name;
   74     int i;
   75
+ 76     pr_err("yin_test, %s, %d, chip->name %s\n",
+ 77             __func__, __LINE__, chip->name);
   78 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
   79     init_bsp_APIC();
   80 #endif
   81     legacy_pic->init(0);
   82
+ 83     pr_err("yin_test, %s, %d\n",
+ 84             __func__, __LINE__);
   85     for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
   86         irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);    //设置irq_desc的chip、handle、name，x86默认使用legacy irq chip 8259，当然我们用的是APIC。这个函数后面会重点分析。
   87 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18

2.2 irq_set_chip_and_handler_name

696 void
697 irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
698                   irq_flow_handler_t handle, const char *name)
699 {
700     irq_set_chip(irq, chip);
701     __irq_set_handler(irq, handle, 0, name);
702 }
703 EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);

    1
    2
    3
    4
    5
    6
    7
    8

23 /**
24 * irq_set_chip - set the irq chip for an irq
25 * @irq:   irq number
26 * @chip: pointer to irq chip description structure
27 */
28 int irq_set_chip(unsigned int irq, struct irq_chip *chip)
29 {
30     unsigned long flags;
31     struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
32
33     if (!desc)
34         return -EINVAL;
35
36     if (!chip)
37         chip = &no_irq_chip;
38
39     desc->irq_data.chip = chip; //设置该irq对应的pic chip
40     irq_put_desc_unlock(desc, flags);
41     /*
42      * For !CONFIG_SPARSE_IRQ make the irq show up in
43      * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
44      * already marked, and this call is harmless.
45      */
46     irq_reserve_irq(irq);
47     return 0;
48 }
49 EXPORT_SYMBOL(irq_set_chip);
50

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28

657 void
658 __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
659           const char *name)
660 {
661     unsigned long flags;
662     struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); //在table中查找irq得到irq_desc
663
664     if (!desc)
665         return;
666
667     if (!handle) {
668         handle = handle_bad_irq;
669     } else {
670         if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
671             goto out;
672     }
673
674     /* Uninstall? */
675     if (handle == handle_bad_irq) {
676         if (desc->irq_data.chip != &no_irq_chip)
677             mask_ack_irq(desc);
678         irq_state_set_disabled(desc);
679         desc->depth = 1;
680     }
681     desc->handle_irq = handle; //设置handle_irq
682     desc->name = name;
683
684     if (handle != handle_bad_irq && is_chained) {
685         irq_settings_set_noprobe(desc);
686         irq_settings_set_norequest(desc);
687         irq_settings_set_nothread(desc);
688         irq_settings_set_chained(desc);
689         irq_startup(desc, true);
690     }
691 out:
692     irq_put_desc_busunlock(desc, flags);
693 }
694 EXPORT_SYMBOL_GPL(__irq_set_handler);

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38

代码逻辑比较简单，我比较感兴趣的是这个desc->handle_irq的作用是什么？和irqaction中的handler、thread_fn有何区别？下一章说明
附录：

irq_desc

16 /**
17 * struct irq_desc - interrupt descriptor
18 * @irq_data:       per irq and chip data passed down to chip functions
19 * @kstat_irqs:     irq stats per cpu
20 * @handle_irq:     highlevel irq-events handler
21 * @preflow_handler:    handler called before the flow handler (currently used by sparc)
22 * @action:     the irq action chain
23 * @status:     status information
24 * @core_internal_state__do_not_mess_with_it: core internal status information
25 * @depth:      disable-depth, for nested irq_disable() calls
26 * @wake_depth:     enable depth, for multiple irq_set_irq_wake() callers
27 * @irq_count:      stats field to detect stalled irqs
28 * @last_unhandled: aging timer for unhandled count
29 * @irqs_unhandled: stats field for spurious unhandled interrupts
30 * @threads_handled:    stats field for deferred spurious detection of threaded handlers
31 * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
32 * @lock:       locking for SMP
33 * @affinity_hint: hint to user space for preferred irq affinity
34 * @affinity_notify:    context for notification of affinity changes
35 * @pending_mask:   pending rebalanced interrupts
36 * @threads_oneshot:    bitfield to handle shared oneshot threads
37 * @threads_active: number of irqaction threads currently running
38 * @wait_for_threads:   wait queue for sync_irq to wait for threaded handlers
39 * @dir:        /proc/irq/ procfs entry
40 * @name:       flow handler name for /proc/interrupts output
41 */
42 struct irq_desc {
43     struct irq_data     irq_data;
44     unsigned int __percpu   *kstat_irqs;
45     irq_flow_handler_t handle_irq;
46 #ifdef CONFIG_IRQ_PREFLOW_FASTEOI
47     irq_preflow_handler_t   preflow_handler;
48 #endif
49     struct irqaction    *action;    /* IRQ action list */
50     unsigned int        status_use_accessors;
51     unsigned int        core_internal_state__do_not_mess_with_it;
52     unsigned int        depth;      /* nested irq disables */
53     unsigned int        wake_depth; /* nested wake enables */
54     unsigned int        irq_count; /* For detecting broken IRQs */
55     unsigned long       last_unhandled; /* Aging timer for unhandled count */
56     unsigned int        irqs_unhandled;
57     atomic_t        threads_handled;
58     int         threads_handled_last;
59     raw_spinlock_t      lock;
60     struct cpumask      *percpu_enabled;
61 #ifdef CONFIG_SMP
62     const struct cpumask    *affinity_hint;
63     struct irq_affinity_notify *affinity_notify;
64 #ifdef CONFIG_GENERIC_PENDING_IRQ
65     cpumask_var_t       pending_mask;
66 #endif
67 #endif
68     unsigned long       threads_oneshot;
69     atomic_t        threads_active;
70     wait_queue_head_t       wait_for_threads;
71 #ifdef CONFIG_PROC_FS
72     struct proc_dir_entry   *dir;
73 #endif
74     int         parent_irq;
75     struct module       *owner;
76     const char      *name;
77 } ____cacheline_internodealigned_in_smp;

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62

irq_data

126 /**
127 * struct irq_data - per irq and irq chip data passed down to chip functions
128 * @mask:       precomputed bitmask for accessing the chip registers
129 * @irq:        interrupt number
130 * @hwirq:      hardware interrupt number, local to the interrupt domain
131 * @node:       node index useful for balancing
132 * @state_use_accessors: status information for irq chip functions.
133 *          Use accessor functions to deal with it
134 * @chip:       low level interrupt hardware access
135 * @domain:     Interrupt translation domain; responsible for mapping
136 *          between hwirq number and linux irq number.
137 * @handler_data:   per-IRQ data for the irq_chip methods
138 * @chip_data:      platform-specific per-chip private data for the chip
139 *          methods, to allow shared chip implementations
140 * @msi_desc:       MSI descriptor
141 * @affinity:       IRQ affinity on SMP
142 *
143 * The fields here need to overlay the ones in irq_desc until we
144 * cleaned up the direct references and switched everything over to
145 * irq_data.
146 */
147 struct irq_data {
148     u32         mask;
149     unsigned int        irq;
150     unsigned long       hwirq;
151     unsigned int        node;
152     unsigned int        state_use_accessors;
153     struct irq_chip     *chip;
154     struct irq_domain   *domain;
155     void            *handler_data;
156     void            *chip_data;
157     struct msi_desc     *msi_desc;
158     cpumask_var_t       affinity;
159 };

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34

irq_chip

281 /**
282 * struct irq_chip - hardware interrupt chip descriptor
283 *
284 * @name:       name for /proc/interrupts
285 * @irq_startup:    start up the interrupt (defaults to ->enable if NULL)
286 * @irq_shutdown:   shut down the interrupt (defaults to ->disable if NULL)
287 * @irq_enable:     enable the interrupt (defaults to chip->unmask if NULL)
288 * @irq_disable:    disable the interrupt
289 * @irq_ack:        start of a new interrupt
290 * @irq_mask:       mask an interrupt source
291 * @irq_mask_ack:   ack and mask an interrupt source
292 * @irq_unmask:     unmask an interrupt source
293 * @irq_eoi:        end of interrupt
294 * @irq_set_affinity:   set the CPU affinity on SMP machines
295 * @irq_retrigger: resend an IRQ to the CPU
296 * @irq_set_type:   set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
297 * @irq_set_wake:   enable/disable power-management wake-on of an IRQ
298 * @irq_bus_lock:   function to lock access to slow bus (i2c) chips
299 * @irq_bus_sync_unlock:function to sync and unlock slow bus (i2c) chips
300 * @irq_cpu_online: configure an interrupt source for a secondary CPU
301 * @irq_cpu_offline:    un-configure an interrupt source for a secondary CPU
302 * @irq_suspend:    function called from core code on suspend once per chip
303 * @irq_resume:     function called from core code on resume once per chip
304 * @irq_pm_shutdown:    function called from core code on shutdown once per chip
305 * @irq_calc_mask: Optional function to set irq_data.mask for special cases
306 * @irq_print_chip: optional to print special chip info in show_interrupts
307 * @flags:      chip specific flags
308 */
309 struct irq_chip {
310     const char *name;
311     unsigned int    (*irq_startup)(struct irq_data *data);
312     void        (*irq_shutdown)(struct irq_data *data);
313     void        (*irq_enable)(struct irq_data *data);
314     void        (*irq_disable)(struct irq_data *data);
315
316     void        (*irq_ack)(struct irq_data *data);
317     void        (*irq_mask)(struct irq_data *data);
318     void        (*irq_mask_ack)(struct irq_data *data);
319     void        (*irq_unmask)(struct irq_data *data);
320     void        (*irq_eoi)(struct irq_data *data);
321
322     int     (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force);
323     int     (*irq_retrigger)(struct irq_data *data);
324     int     (*irq_set_type)(struct irq_data *data, unsigned int flow_type);
325     int     (*irq_set_wake)(struct irq_data *data, unsigned int on);
326
327     void        (*irq_bus_lock)(struct irq_data *data);
328     void        (*irq_bus_sync_unlock)(struct irq_data *data);
329
330     void        (*irq_cpu_online)(struct irq_data *data);
331     void        (*irq_cpu_offline)(struct irq_data *data);
332
333     void        (*irq_suspend)(struct irq_data *data);
334     void        (*irq_resume)(struct irq_data *data);
335     void        (*irq_pm_shutdown)(struct irq_data *data);
336
337     void        (*irq_calc_mask)(struct irq_data *data);
338
339     void        (*irq_print_chip)(struct irq_data *data, struct seq_file *p);
340
341     unsigned long   flags;
342 };

x86 kernel 中断分析三——中断处理流程

CPU检测中断

CPU在每条指令执行完毕、开始执行下一条指令之前，会检测是否有中断到达，即中断控制器是否发来了中断信号

查找IDT

CPU根据中断向量到IDT中读取对应的中断描述符表项，根据段选择符和偏移确定中断服务程序的地址，见附录1

interrupt数组

在分析一中，我们看到，填充IDT中断服务程序的是interrupt数组的内容，所以第2步跳转到interrupt数组对应的表项，表项的内容之前也已分析过

push vector num and jmp to common_interrupt

 778 /*
 779  * the CPU automatically disables interrupts when executing an IRQ vector,
 780  * so IRQ-flags tracing has to follow that:
 781  */
 782     .p2align CONFIG_X86_L1_CACHE_SHIFT
 783 common_interrupt:
 784     ASM_CLAC
 785     addl $-0x80,(%esp)  /* Adjust vector into the [-256,-1] range */ 
 786     SAVE_ALL
 787     TRACE_IRQS_OFF
 788     movl %esp,%eax
 789     call do_IRQ
 790     jmp ret_from_intr
 791 ENDPROC(common_interrupt)
 792     CFI_ENDPROC

addl $-0x80,(%esp)

根据第一篇分析，此时栈顶是(~vector + 0x80)，这里减去0x80，所以值为vector num取反，范围在[-256, -1]。这么做是为了和系统调用区分，正值为系统调用号，负值为中断向量。

SAVE_ALL

保存现场，将所有寄存器的值压栈(cs eip ss esp由系统自动保存)

186 .macro SAVE_ALL
 187     cld
 188     PUSH_GS
 189     pushl_cfi %fs
 190     /*CFI_REL_OFFSET fs, 0;*/
 191     pushl_cfi %es
 192     /*CFI_REL_OFFSET es, 0;*/
 193     pushl_cfi %ds
 194     /*CFI_REL_OFFSET ds, 0;*/
 195     pushl_cfi %eax
 196     CFI_REL_OFFSET eax, 0
 197     pushl_cfi %ebp
 198     CFI_REL_OFFSET ebp, 0
 199     pushl_cfi %edi
 200     CFI_REL_OFFSET edi, 0
 201     pushl_cfi %esi
 202     CFI_REL_OFFSET esi, 0
 203     pushl_cfi %edx
 204     CFI_REL_OFFSET edx, 0
 205     pushl_cfi %ecx
 206     CFI_REL_OFFSET ecx, 0
 207     pushl_cfi %ebx
 208     CFI_REL_OFFSET ebx, 0
 209     movl $(__USER_DS), %edx
 210     movl %edx, %ds
 211     movl %edx, %es
 212     movl $(__KERNEL_PERCPU), %edx
 213     movl %edx, %fs
 214     SET_KERNEL_GS %edx
 215 .endm

movl %esp,%eax

将esp的值赋给eax，eax作为do_IRQ的第一个参数。此时esp指向以上压栈的寄存器内容，所以是以struct pt_regs指针的形式传过去的。

call do_IRQ

175 /*
176  * do_IRQ handles all normal device IRQ's (the special
177  * SMP cross-CPU interrupts have their own specific
178  * handlers).
179  */
180 __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
181 {
182     struct pt_regs *old_regs = set_irq_regs(regs);
183
184     /* high bit used in ret_from_ code  */
185     unsigned vector = ~regs->orig_ax;       //获取向量号，这里有一个取反的操作，与之前的取反相对应得到正的向量号
186     unsigned irq;
187
188     irq_enter();
189     exit_idle();
190
191     irq = __this_cpu_read(vector_irq[vector]);       //通过向量号得到中断号
192
193     if (!handle_irq(irq, regs)) {
194         ack_APIC_irq();
195
196         if (irq != VECTOR_RETRIGGERED) {
197             pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
198                          __func__, smp_processor_id(),
199                          vector, irq);
200         } else {
201             __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
202         }
203     }
204
205     irq_exit();
206
207     set_irq_regs(old_regs);
208     return 1;
209 }

irq_enter

319 /*
  320  * Enter an interrupt context.  //进入中断上下文，因为首先处理的是硬中断，所以我们可以把irq_enter认为是硬中断的开始
  321  */
  322 void irq_enter(void)
  323 {
  324     rcu_irq_enter();                                    //inform RCU that current CPU is entering irq away from idle
  325     if (is_idle_task(current) && !in_interrupt()) {   //如果当前是pid==0的idle task并且不处于中断上下文中
  326         /*
  327          * Prevent raise_softirq from needlessly waking up ksoftirqd
  328          * here, as softirq will be serviced on return from interrupt.
  329          */
  330         local_bh_disable();
  331         tick_irq_enter();     //idle进程会被中断或者其他进程抢占，在系统中断过程中用irq_enter->tick_irq_enter()恢复周期性tick以得到正确的jiffies值（这段注释摘录自http://blog.chinaunix.net/uid-29675110-id-4365095.html）
  332         _local_bh_enable();
  333     }
  334
  335     __irq_enter();
  336 }

__irq_enter

28 /*
 29  * It is safe to do non-atomic ops on ->hardirq_context,
 30  * because NMI handlers may not preempt and the ops are
 31  * always balanced, so the interrupted value of ->hardirq_context
 32  * will always be restored.
 33  */
 34 #define __irq_enter()                   \
 35     do {                        \
 36         account_irq_enter_time(current);    \
 37         preempt_count_add(HARDIRQ_OFFSET);  \             //HARDIRQ_OFFSET等于1左移16位，即将preempt_count第16 bit加1，preempt_count的格式见附录
 38         trace_hardirq_enter();          \
 39     } while (0)

exit_idle

如果系统正处在idle状态，那么退出IDLE

258 /* Called from interrupts to signify idle end */
259 void exit_idle(void)
260 {
261     /* idle loop has pid 0 */          //如果当前进程不为0，直接退出，不需要退出 idle
262     if (current->pid)
263         return;
264     __exit_idle();            //如果是idle进程，那么通过__exit_idle调用一系列notification
265 }

handle_irq

165 bool handle_irq(unsigned irq, struct pt_regs *regs)
166 {
167     struct irq_desc *desc;
168     int overflow;
169
170     overflow = check_stack_overflow();  //x86架构下如果sp指针距离栈底的位置小于1KB，则认为有stack overflow的风险
171
172     desc = irq_to_desc(irq);                        //获取desc，从刚开始的vector num-->irq num--> desc
173     if (unlikely(!desc))
174         return false;
175  //如果发生中断时，CPU正在执行用户空间的代码，处理中断需切换到内核栈，但此时内核栈是空的，所以无需再切换到中断栈
176     if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {     // 在CPU的irq stack执行，否则在当前进程的栈执行，调用下面的desc->handle_irq  
177         if (unlikely(overflow))
178             print_stack_overflow();
179         desc->handle_irq(irq, desc);
180     }
181
182     return true;
183 }

中断栈的定义及初始化

按照目前的内核设计，中断有自己的栈，用来执行中断服务程序，这样是为了防止中断嵌套破坏与之共享的进程内核栈。
中断栈的定义，可以看到与进程上下文的布局相同，thread info + stack

 58 /*
 59  * per-CPU IRQ handling contexts (thread information and stack)
 60  */
 61 union irq_ctx {
 62     struct thread_info      tinfo;
 63     u32                     stack[THREAD_SIZE/sizeof(u32)];
 64 } __attribute__((aligned(THREAD_SIZE)));

中断栈的初始化：

创建percpu变量hardirq_ctx和softirq_ctx，类型为指向union irq_ctx的指针，所以每个cpu的软硬中断有各自的stack

 66 static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
 67 static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);

native_init_IRQ->irq_ctx_init
hardirq_ctx和softirq_ctx的初始化方式相同，如下

116 /*
117  * allocate per-cpu stacks for hardirq and for softirq processing
118  */
119 void irq_ctx_init(int cpu)
120 {
121     union irq_ctx *irqctx;
122
123     if (per_cpu(hardirq_ctx, cpu))
124         return;
125
126     irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),      //分配2个page      
127                            THREADINFO_GFP,
128                            THREAD_SIZE_ORDER));
129     memset(&irqctx->tinfo, 0, sizeof(struct thread_info));       //初始化其中的部分成员
130     irqctx->tinfo.cpu       = cpu;
131     irqctx->tinfo.addr_limit    = MAKE_MM_SEG(0);
132
133     per_cpu(hardirq_ctx, cpu) = irqctx;                             //赋值给hardirq_ctx
134
135     irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
136                            THREADINFO_GFP,
137                            THREAD_SIZE_ORDER));
138     memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
139     irqctx->tinfo.cpu       = cpu;
140     irqctx->tinfo.addr_limit    = MAKE_MM_SEG(0);
141
142     per_cpu(softirq_ctx, cpu) = irqctx;
143
144     printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
145            cpu, per_cpu(hardirq_ctx, cpu),  per_cpu(softirq_ctx, cpu));
146 }

网上找的一张图，如下
这里写图片描述

中断栈的切换

发生中断时需要从当前进程栈切换到中断栈

 80 static inline int
 81 execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
 82 {
 83     union irq_ctx *curctx, *irqctx;
 84     u32 *isp, arg1, arg2;
 85
 86     curctx = (union irq_ctx *) current_thread_info();       //获取当前进程的process context，即栈的起始地址
 87     irqctx = __this_cpu_read(hardirq_ctx);                  //获取硬中断的hardirq context，即栈的起始地址
 88
 89     /*
 90      * this is where we switch to the IRQ stack. However, if we are
 91      * already using the IRQ stack (because we interrupted a hardirq
 92      * handler) we can't do that and just have to keep using the
 93      * current stack (which is the irq stack already after all)
 94      */
 95     if (unlikely(curctx == irqctx))                //如果当前进程的栈和中断栈相同，说明发生了中断嵌套，此时当前进程就是一个中断的服务例程
 96         return 0;                                   //这种情况下不能进行栈的切换，还是在当前栈中运行，只要返回0即可
 97
 98     /* build the stack frame on the IRQ stack */
 99     isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));           //获取中断栈的isp
100     irqctx->tinfo.task = curctx->tinfo.task;                    //获取当前进程的task和stack point
101     irqctx->tinfo.previous_esp = current_stack_pointer;
102
103     if (unlikely(overflow))
104         call_on_stack(print_stack_overflow, isp);
105
106     asm volatile("xchgl %%ebx,%%esp \n"                    //具体的栈切换发生在以下汇编中，基本上就是保存现场，进行切换，不深入研究汇编了...
107              "call  *%%edi      \n"
108              "movl  %%ebx,%%esp \n"
109              : "=a" (arg1), "=d" (arg2), "=b" (isp)
110              :  "0" (irq),   "1" (desc),  "2" (isp),
111             "D" (desc->handle_irq)                                    //不管是共享栈还是独立栈，最后都会调用到irq desc对应的handle_irq
112              : "memory", "cc", "ecx");
113     return 1;
114 }

handle_level_irq

kernel中对于中断有一系列的中断流处理函数

handle_simple_irq  用于简易流控处理；
handle_level_irq   用于电平触发中断的流控处理；
handle_edge_irq    用于边沿触发中断的流控处理；
handle_fasteoi_irq  用于需要响应eoi的中断控制器；
handle_percpu_irq   用于只在单一cpu响应的中断；
handle_nested_irq   用于处理使用线程的嵌套中断；

我们在第二篇分析中，init_ISA_irqs把legacy irq的中断流处理函数都设置为handle_level_irq，以此为例做分析：

//level type中断，当硬件中断line的电平处于active level时就一直保持有中断请求，这就要求处理中断过程中屏蔽中断，响应硬件后打开中断
387 /**
388  *  handle_level_irq - Level type irq handler          //电平触发的中断处理函数
389  *  @irq:   the interrupt number
390  *  @desc:  the interrupt description structure for this irq
391  *
392  *  Level type interrupts are active as long as the hardware line has          
393  *  the active level. This may require to mask the interrupt and unmask        
394  *  it after the associated handler has acknowledged the device, so the
395  *  interrupt line is back to inactive.
396  */
397 void
398 handle_level_irq(unsigned int irq, struct irq_desc *desc)
399 {
400     raw_spin_lock(&desc->lock);                         //上锁
401     mask_ack_irq(desc);       //mask对应的中断，否则一直接收来自interrupt line的中断信号
402
403     if (unlikely(irqd_irq_inprogress(&desc->irq_data)))  //如果该中断正在其他cpu上被处理
404         if (!irq_check_poll(desc))  //这边不是很理解，irq的IRQS_POLL_INPROGRESS(polling in a progress)是什么意思？只能等后续代码遇到这个宏的时候再说。如果是在该状态，cpu relax，等待完成
405             goto out_unlock;                        //直接解锁退出
406     //清除IRQS_REPLAY和IRQS_WAITING标志位
407     desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
408     kstat_incr_irqs_this_cpu(irq, desc);   //该CPU上该irq触发次数加1，总的中断触发次数加1
409
410     /*
411      * If its disabled or no action available
412      * keep it masked and get out of here
413      */
414     if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
415         desc->istate |= IRQS_PENDING;                   //设置为pending
416         goto out_unlock;
417     }
418
419     handle_irq_event(desc);                        //核心函数
420
421     cond_unmask_irq(desc);                           //使能中断线
422
423 out_unlock:
424     raw_spin_unlock(&desc->lock);
425 }
426 EXPORT_SYMBOL_GPL(handle_level_irq);

handle irq event

182 irqreturn_t handle_irq_event(struct irq_desc *desc)
183 {
184     struct irqaction *action = desc->action;         //获取irqaction链表
185     irqreturn_t ret;
186
187     desc->istate &= ~IRQS_PENDING;        //正式进入处理流程，清除irq desc的pending标志位
188     irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);        //处理中断前设置IRQD_IRQ_INPROGRESS标志
189     raw_spin_unlock(&desc->lock);
190
191     ret = handle_irq_event_percpu(desc, action);            
192
193     raw_spin_lock(&desc->lock);
194     irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);   //处理中断后清除IRQD_IRQ_INPROGRESS标志
195     return ret;
196 }

handle_irq_event_percpu

132 irqreturn_t
133 handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
134 {
135     irqreturn_t retval = IRQ_NONE;
136     unsigned int flags = 0, irq = desc->irq_data.irq;
137
138     do {
139         irqreturn_t res;
140
141         trace_irq_handler_entry(irq, action);
142         res = action->handler(irq, action->dev_id); //调用硬中断处理函数
143         trace_irq_handler_exit(irq, action, res);
144
145         if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
146                   irq, action->handler))
147             local_irq_disable();
148
149         switch (res) {
150         case IRQ_WAKE_THREAD:      //线程化中断的硬中断，通常只是响应一下硬件ack，就返回IRQ_WAKE_THREAD，唤醒中断处理线程
151             /*
152              * Catch drivers which return WAKE_THREAD but
153              * did not set up a thread function
154              */
155             if (unlikely(!action->thread_fn)) {
156                 warn_no_thread(irq, action);
157                 break;
158             }
159
160             irq_wake_thread(desc, action);            //唤醒中断处理线程
161
162             /* Fall through to add to randomness */
163         case IRQ_HANDLED:                        //表示已经在硬中断中处理完毕
164             flags |= action->flags;
165             break;
166
167         default:
168             break;
169         }
170
171         retval |= res;
172         action = action->next;            //对于共享中断，所有irqaction挂在同一desc下
173     } while (action);
174
175     add_interrupt_randomness(irq, flags);    //这块代码其实和中断流程的关系不大，利用用户和外设作为噪声源，为内核随机熵池做贡献....（http://jingpin.jikexueyuan.com/article/23923.html）
176
177     if (!noirqdebug)
178         note_interrupt(irq, desc, retval);
179     return retval;
180 }

以上就是中断处理流程的简要分析。还有个问题：其中irqaction的handler以及线程化的中断处理函数（thread_fn）从何而来？下篇分析见。

附录1：

CPU使用IDT查到的中断服务程序的段选择符从GDT中取得相应的段描述符，段描述符里保存了中断服务程序的段基址和属性信息，此时CPU就得到了中断服务程序的起始地址。这里，CPU会比较当前cs寄存器里的CPL和GDT中段描述符的DPL，以确保中断服务程序的特权级不低于当前程序；如果这次中断是编程异常（如：int 80h系统调用），那么还要检查CPL和IDT表中中断描述符的DPL，以保证当前程序有权限使用中断服务程序，这可以避免用户应用程序访问特殊的陷阱门和中断门[3]。
如下图显示了从中断向量到GDT中相应中断服务程序起始位置的定位方式:

这里写图片描述

附录2. preempt_count：

 44 #define HARDIRQ_OFFSET  (1UL << HARDIRQ_SHIFT)         // 1左移16位
 32 #define HARDIRQ_SHIFT   (SOFTIRQ_SHIFT + SOFTIRQ_BITS)  // 8 + 8 = 16
 31 #define SOFTIRQ_SHIFT   (PREEMPT_SHIFT + PREEMPT_BITS)  // 0 + 8 = 8
 30 #define PREEMPT_SHIFT   0
 25 #define PREEMPT_BITS    8
 26 #define SOFTIRQ_BITS    8

2500 void __kprobes preempt_count_add(int val)
2501 {
2502 #ifdef CONFIG_DEBUG_PREEMPT
2503     /*
2504      * Underflow?
2505      */
2506     if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2507         return;
2508 #endif
2509     __preempt_count_add(val);    //除去debug相关的内容，只有这一行关键代码，将preempt_count中第16 bit加1
2510 #ifdef CONFIG_DEBUG_PREEMPT
2511     /*
2512      * Spinlock count overflowing soon?
2513      */
2514     DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
2515                 PREEMPT_MASK - 10);
2516 #endif
2517     if (preempt_count() == val)
2518         trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2519 }
2520 EXPORT_SYMBOL(preempt_count_add);

preempt_count的布局如下：
这里写图片描述

kernel 中断分析之四——中断申请[上]
showstopper_x
于 2017-01-04 22:43:13 发布 2729
收藏 1
分类专栏： interrupt 文章标签： kernel interrupt
版权
interrupt 专栏收录该内容
10 篇文章 1 订阅
订阅专栏
前言

从分析三可知，中断处理最终调用了irqaction的handler（interrupt context），在必要的情况下唤醒中断处理线程调用thread_fn（process context）。
对应的中断服务例程是在驱动初始化阶段申请的：普通中断通过setup_irq、request_irq或者request_threaded_irq进行申请，percpu中断通过request_percpu_irq、setup_percpu_irq进行申请。本篇主要对以上接口进行分析，__setup_irq放在kernel 中断分析之四——中断申请[下]中分析。
request_threaded_irq

关于中断线程化的目的和对系统performance的提升，可以看这篇文章Moving interrupts to threads(翻译)。

1349 /**
1350 * request_threaded_irq - allocate an interrupt line
1351 * @irq: Interrupt line to allocate
1352 * @handler: Function to be called when the IRQ occurs.
1353 *        Primary handler for threaded interrupts
1354 *        If NULL and thread_fn != NULL the default
1355 *        primary handler is installed
1356 * @thread_fn: Function called from the irq handler thread
1357 *          If NULL, no irq thread is created
1358 * @irqflags: Interrupt type flags
1359 * @devname: An ascii name for the claiming device
1360 * @dev_id: A cookie passed back to the handler function
1361 *
1362 * This call allocates interrupt resources and enables the
1363 * interrupt line and IRQ handling. From the point this
1364 * call is made your handler function may be invoked. Since
1365 * your handler function must clear any interrupt the board
1366 * raises, you must take care both to initialise your hardware
1367 * and to set up the interrupt handler in the right order.
1368 *
1369 * If you want to set up a threaded irq handler for your device
1370 * then you need to supply @handler and @thread_fn. @handler is
1371 * still called in hard interrupt context and has to check
1372 * whether the interrupt originates from the device. If yes it
1373 * needs to disable the interrupt on the device and return
1374 * IRQ_WAKE_THREAD which will wake up the handler thread and run
1375 * @thread_fn. This split handler design is necessary to support
1376 * shared interrupts.
1377 *
1378 * Dev_id must be globally unique. Normally the address of the
1379 * device data structure is used as the cookie. Since the handler
1380 * receives this value it makes sense to use it.
1381 *
1382 * If your interrupt is shared you must pass a non NULL dev_id
1383 * as this is required when freeing the interrupt.
1384 *
1385 * Flags:
1386 *
1387 * IRQF_SHARED     Interrupt is shared
1388 * IRQF_TRIGGER_*      Specify active edge(s) or level
1389 *
1390 */

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43

内核中很少有如此大篇幅的注释。该API的目的是分配一条中断线，参数如下：
irq：中断号，代表一条interrupt line；
handler：注释中叫做primary handler，工作在interrupt context下。如果handler为NULL，thread_fn不为NULL，那么内核会为handler赋值一个default handler指针，具体作用后面会讲；
thread_fn：内核将会创建一个线程——irq handler thread（对应irqaction的thread成员），该线程会调用thread_fn，以实现bottom half的线程化；
irqflags：描述中断类型；
devname：该irqaction的名称；
dev_id：handler中会用到，作为参数传入；如果中断共享的话，该参数不能为NULL。

注释1370-1376行:
@handler 仍然运行在硬中断上下文，必须检测中断是否来自对应的device，如果是，那么，硬中断中disable该设备上的该中断，然后返回IRQ_WAKE_THREAD。系统将会根据该返回值（见分析三）唤醒中断线程并调用@thread_fn。

request_threaded_irq的源码如下：

1391 int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1392              irq_handler_t thread_fn, unsigned long irqflags,
1393              const char *devname, void *dev_id)
1394 {
1395     struct irqaction *action;
1396     struct irq_desc *desc;
1397     int retval;
1398
1399     /*
1400      * Sanity-check: shared interrupts must pass in a real dev-ID,
1401      * otherwise we'll have trouble later trying to figure out
1402      * which interrupt is which (messes up the interrupt freeing
1403      * logic etc).
1404      */
1405     if ((irqflags & IRQF_SHARED) && !dev_id)        //共享中断的情况下必须声明dev_id
1406         return -EINVAL;
1407
1408     desc = irq_to_desc(irq);        //获取irqdesc
1409     if (!desc)
1410         return -EINVAL;
1411
1412     if (!irq_settings_can_request(desc) ||      //某些中断，比如系统保留的中断，不允许申请，会设置IRQ_NOREQUEST标志
1413         WARN_ON(irq_settings_is_per_cpu_devid(desc))) //如果申请的中断时percpu中断，比如timer，也会返回失败
1414         return -EINVAL;
1415
1416     if (!handler) {          //如果handler为NULL但thread_fn不为NULL，赋值handler为irq_default_primary_handler
1417         if (!thread_fn)      // handler和thread_fn不能同时为NULL
1418             return -EINVAL;
1419         handler = irq_default_primary_handler; //直接返回IRQ_WAKE_THREAD
1420     }
1421     // 分配并初始化新的irqaction
1422     action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1423     if (!action)
1424         return -ENOMEM;
1425
1426     action->handler = handler;
1427     action->thread_fn = thread_fn;
1428     action->flags = irqflags;
1429     action->name = devname;
1430     action->dev_id = dev_id;
1431
1432     chip_bus_lock(desc);
1433     retval = __setup_irq(irq, desc, action); //大部分工作都放到了__setup_irq中
1434     chip_bus_sync_unlock(desc);
1435
1436     if (retval)
1437         kfree(action);
1438
1439 #ifdef CONFIG_DEBUG_SHIRQ_FIXME
1440     if (!retval && (irqflags & IRQF_SHARED)) {
1441         /*
1442          * It's a shared IRQ -- the driver ought to be prepared for it
1443          * to happen immediately, so let's make sure....
1444          * We disable the irq to make sure that a 'real' IRQ doesn't
1445          * run in parallel with our fake.
1446          */
1447         unsigned long flags;
1448
1449         disable_irq(irq);
1450         local_irq_save(flags);
1451
1452         handler(irq, dev_id);
1453
1454         local_irq_restore(flags);
1455         enable_irq(irq);
1456     }
1457 #endif
1458     return retval;
1459 }
EXPORT_SYMBOL(request_threaded_irq);

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70

request_irq

127 static inline int __must_check
128 request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
129 const char *name, void *dev)
130 {
131 return request_threaded_irq(irq, handler, NULL, flags, name, dev);
132 }

    1
    2
    3
    4
    5
    6

request_irq是对request_threaded_irq的封装，thread_fn为NULL，不涉及中断线程化。
setup_irq

1196 /**
1197 *      setup_irq - setup an interrupt
1198 *      @irq: Interrupt line to setup
1199 *      @act: irqaction for the interrupt
1200 *
1201 * Used to statically setup interrupts in the early boot process.
1202 */
1203 int setup_irq(unsigned int irq, struct irqaction *act)
1204 {
1205         int retval;
1206         struct irq_desc *desc = irq_to_desc(irq);
1207
1208         if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1209                 return -EINVAL;
1210         chip_bus_lock(desc);
1211         retval = __setup_irq(irq, desc, act);
1212         chip_bus_sync_unlock(desc);
1213
1214         return retval;
1215 }
1216 EXPORT_SYMBOL_GPL(setup_irq);

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21

setup_irq一般在系统早期启动的时候被调用（比如系统时钟），用静态定义的irqaction作为参数，这么做的原因是系统启动初期可能slab还没有初始化好，不能像request_threaded_irq那样动态分配irqaction。并且setup_irq的参数irqaction都是没有thread_fn的（我看到的都没有），说明不涉及线程化。流程比较简单，上锁后直接调用__setup_irq。
request_percpu_irq

1642 /**
1643 *      request_percpu_irq - allocate a percpu interrupt line
1644 *      @irq: Interrupt line to allocate
1645 *      @handler: Function to be called when the IRQ occurs.
1646 *      @devname: An ascii name for the claiming device
1647 *      @dev_id: A percpu cookie passed back to the handler function
1648 *
1649 *      This call allocates interrupt resources, but doesn't
1650 *      automatically enable the interrupt. It has to be done on each
1651 *      CPU using enable_percpu_irq().
1652 *
1653 *      Dev_id must be globally unique. It is a per-cpu variable, and
1654 *      the handler gets called with the interrupted CPU's instance of
1655 *      that variable.
1656 */
1657 int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1658                        const char *devname, void __percpu *dev_id)
1659 {
1660         struct irqaction *action;
1661         struct irq_desc *desc;
1662         int retval;
1663
1664         if (!dev_id)
1665                 return -EINVAL;
1666
1667         desc = irq_to_desc(irq);
1668         if (!desc || !irq_settings_can_request(desc) ||
1669             !irq_settings_is_per_cpu_devid(desc))
1670                 return -EINVAL;
1671
1672         action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1673         if (!action)
1674                 return -ENOMEM;
1675
1676         action->handler = handler;
1677         action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1678         action->name = devname;
1679         action->percpu_dev_id = dev_id;
1680
1681         chip_bus_lock(desc);
1682         retval = __setup_irq(irq, desc, action);
1683         chip_bus_sync_unlock(desc);
1684
1685         if (retval)
1686                 kfree(action);
1687
1688         return retval;
1689 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48

申请percpu irq，动态分配irqaction，并没有初始化thread_fn，所以不涉及线程化。
setup_percpu_irq

1621 /**
1622 *      setup_percpu_irq - setup a per-cpu interrupt
1623 *      @irq: Interrupt line to setup
1624 *      @act: irqaction for the interrupt
1625 *
1626 * Used to statically setup per-cpu interrupts in the early boot process.
1627 */
1628 int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1629 {
1630         struct irq_desc *desc = irq_to_desc(irq);
1631         int retval;
1632
1633         if (!desc || !irq_settings_is_per_cpu_devid(desc))
1634                 return -EINVAL;
1635         chip_bus_lock(desc);
1636         retval = __setup_irq(irq, desc, act);
1637         chip_bus_sync_unlock(desc);
1638
1639         return retval;
1640 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21

申请percpu irq，系统启动早期调用，不涉及中断线程化。

前言

在kernel 中断分析之四——中断申请 [上]中，request_irq、request_threaded_irq、setup_irq、setup_percpu_irq、request_percpu_irq最终都调用了__setup_irq，本篇对该API进行分析，由于代码比较长，分段分析。

请注意，在分析过程中，遇到一些拿捏不定的地方，已用粗体表示，如果有理解错误，欢迎指正。
__setup_irq——线程化处理

   895 /*
   896 * Internal function to register an irqaction - typically used to
   897 * allocate special interrupts that are part of the architecture.
   898 */
   899 static int
   900 __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
   901 {
   902     struct irqaction *old, **old_ptr;
   903     unsigned long flags, thread_mask = 0;
   904     int ret, nested, shared = 0;
   905     cpumask_var_t mask;
   906
   907     if (!desc)
   908         return -EINVAL;
   909
   910     if (desc->irq_data.chip == &no_irq_chip)
   911         return -ENOSYS;
   912     if (!try_module_get(desc->owner))
   913         return -ENODEV;
   914
   915     /*
   916      * Check whether the interrupt nests into another interrupt
   917      * thread.
   918      */
   919     nested = irq_settings_is_nested_thread(desc);    //判断是否是嵌套中断线程，关于中断嵌套的处理，在后续有分析
   920     if (nested) {
   921         if (!new->thread_fn) {    //嵌套中断不需要有handler，但是thread_fn要有的
   922             ret = -EINVAL;
   923             goto out_mput;
   924         }
   925         /*
   926          * Replace the primary handler which was provided from
   927          * the driver for non nested interrupt handling by the
   928          * dummy function which warns when called.
   929          */
   930         new->handler = irq_nested_primary_handler; //抛出一个警告，nested irq的调用时父中断的handler中处理的，而不是在这里
   931     } else {
   932         if (irq_settings_can_thread(desc)) //有的中断不允许线程化，设置了IRQ_NOTHREAD标志
   933             irq_setup_forced_threading(new); //强制线程化
   934     }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40

irq_setup_forced_threading

   879 static void irq_setup_forced_threading(struct irqaction *new)
   880 {
   881     if (!force_irqthreads) //该全局变量用于表示系统是否允许中断线程化
   882         return;
   883     if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) //IRQF_NO_THREAD 或者percpu中断或者已经线程化的中断，直接返回
   884         return;
   885
   886     new->flags |= IRQF_ONESHOT; //线程化中断需要设置IRQF_ONESHOT标志
   887//如果thread为NULL（那么handler必不为NULL），此时为了线程化，强制将handler赋给thread_fn,handler设置为irq_default_primary_handler
   888     if (!new->thread_fn) {
   889         set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); //thread_flags设置IRQTF_FORCED_THREAD，表示经过强制线程化
   890         new->thread_fn = new->handler;
   891         new->handler = irq_default_primary_handler;
   892     }
   893 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15

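我对上述代码的理解，不一定正确：对于非nested irq且未设置IRQ_NOTHREAD的中断，内核会尝试将其强制线程化；但从irq_setup_forced_threading的实现可以看到，只有在全局变量force_irqthreads为真、且irqaction的flags中没有IRQF_NO_THREAD、IRQF_PERCPU、IRQF_ONESHOT时才会真正执行强制线程化，否则直接返回。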
以下是IRQF_ONESHOT 的注释：

52 * IRQF_ONESHOT - Interrupt is not reenabled after the hardirq handler finished.
53 * Used by threaded interrupts which need to keep the
54 * irq line disabled until the threaded handler has been run.

    1
    2
    3

另外摘录upstream上IRQF_ONESHOT的commit message如下：

For threaded interrupt handlers we expect the hard interrupt handler
part to mask the interrupt on the originating device. The interrupt
line itself is reenabled after the hard interrupt handler has
executed.

This requires access to the originating device from hard interrupt
context which is not always possible. There are devices which can only
be accessed via a bus (i2c, spi, ...). The bus access requires thread
context. For such devices we need to keep the interrupt line masked
until the threaded handler has executed.

Add a new flag IRQF_ONESHOT which allows drivers to request that the
interrupt is not unmasked after the hard interrupt context handler has
been executed and the thread has been woken. The interrupt line is
unmasked after the thread handler function has been executed.

Note that for now IRQF_ONESHOT cannot be used with IRQF_SHARED to
avoid complex accounting mechanisms.

For oneshot interrupts the primary handler simply returns
IRQ_WAKE_THREAD and does nothing else. A generic implementation
irq_oneshot_primary_handler() is provided to avoid useless copies all
over the place.

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23

所以有：
1. 对于线程化的中断处理函数，通常期望硬中断handler在interrupt context中把产生中断的外设上的中断mask掉，硬中断handler执行完毕后中断线本身被reenable；
2. 然而中断上下文中并不是能够access所有的外设，比如某些设备必须要通过i2c、spi等bus才能access，而bus access需要在进程上下文中才能实现（我的理解：原因应是总线传输过程可能睡眠，而中断上下文中不允许睡眠），也就是说，需要将处理中断的代码放到进程上下文中（我的理解），所以对于这些设备，需要让中断线一直保持mask状态，直到线程化的handler执行完毕；
3. IRQF_ONESHOT 应运而生，该flag允许中断线程运行期间对应的interrupt line一直是mask的。运行结束后unmask。
设置IRQF_ONESHOT 的情况下，在硬中断处理完毕后，仍然不能打开对应的中断（the irq line disabled），直到线程化handler处理完毕。
__setup_irq——创建irq handler thread

   936     /*
   937      * Create a handler thread when a thread function is supplied
   938      * and the interrupt does not nest into another interrupt
   939      * thread.
   940      */
   941     if (new->thread_fn && !nested) {
   942         struct task_struct *t;
   943         static const struct sched_param param = {
   944             .sched_priority = MAX_USER_RT_PRIO/2, //线程的优先级
   945         };
   946 //创建一个名为irq/irq-name的线程，该线程调用irq_thread，参数为新的irqaction，只是创建，并没有唤醒
   947         t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
   948                    new->name);
   949         if (IS_ERR(t)) {
   950             ret = PTR_ERR(t);
   951             goto out_mput;
   952         }
   953
   954         sched_setscheduler_nocheck(t, SCHED_FIFO, &param); //设置调度策略和优先级
   955
   956         /*
   957          * We keep the reference to the task struct even if
   958          * the thread dies to avoid that the interrupt code
   959          * references an already freed task_struct.
   960          */
   961         get_task_struct(t); //将该线程的task_struct的reference加1，防止线程die以后task_struct被释放？？
   962         new->thread = t;
   963         /* //这边的注释我没有看懂，为什么设置affinity对共享中断很重要，在irq_thread中会check affinity,到那个时候再看做了什么。
   964          * Tell the thread to set its affinity. This is
   965          * important for shared interrupt handlers as we do
   966          * not invoke setup_affinity() for the secondary
   967          * handlers as everything is already set up. Even for
   968          * interrupts marked with IRQF_NO_BALANCE this is
   969          * correct as we want the thread to move to the cpu(s)
   970          * on which the requesting code placed the interrupt.
   971          */
   972         set_bit(IRQTF_AFFINITY, &new->thread_flags);
   973     }

__setup_irq——添加new irqaction

   974
   975     if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
   976         ret = -ENOMEM;
   977         goto out_thread;
   978     }
   979//一些interrupt controller注册的时候就已经设置IRQCHIP_ONESHOT_SAFE，这种情况下驱动工程师不需要使用IRQF_ONESHOT
   980     /*
   981      * Drivers are often written to work w/o knowledge about the
   982      * underlying irq chip implementation, so a request for a
   983      * threaded irq without a primary hard irq context handler
   984      * requires the ONESHOT flag to be set. Some irq chips like
   985      * MSI based interrupts are per se one shot safe. Check the
   986      * chip flags, so we can avoid the unmask dance at the end of
   987      * the threaded handler for those.
   988      */
   989     if (desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)
   990         new->flags &= ~IRQF_ONESHOT;
   991
   992     /*
   993      * The following block of code has to be executed atomically
   994      */
   995     raw_spin_lock_irqsave(&desc->lock, flags); //操作irqaction链表的时候要上锁
   996     old_ptr = &desc->action;
   997     old = *old_ptr;
   998     if (old) {   //irqaction链表不为空，说明存在共享中断的情况
   999         /*
1000          * Can't share interrupts unless both agree to and are
1001          * the same type (level, edge, polarity). So both flag
1002          * fields must have IRQF_SHARED set and the bits which
1003          * set the trigger type must match. Also all must
1004          * agree on ONESHOT.
1005          */
1006         if (!((old->flags & new->flags) & IRQF_SHARED) ||
1007             ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
1008             ((old->flags ^ new->flags) & IRQF_ONESHOT))
1009             goto mismatch;
1010//共享中断必须有相同的属性（触发方式、oneshot、percpu等）
1011         /* All handlers must agree on per-cpuness */
1012         if ((old->flags & IRQF_PERCPU) !=
1013             (new->flags & IRQF_PERCPU))
1014             goto mismatch;
1015//只是遍历irqaction链表得到thread_mask，每个bit代表一个irqaction，添加的操作在后面
1016         /* add new interrupt at end of irq queue */
1017         do {
1018             /*
1019              * Or all existing action->thread_mask bits,
1020              * so we can find the next zero bit for this
1021              * new action.
1022              */
1023             thread_mask |= old->thread_mask; //循环结束之后系统thread_mask中置1的bit位表示所有之前挂载过得共享中断的irqaction
1024             old_ptr = &old->next;
1025             old = *old_ptr;
1026         } while (old);
1027         shared = 1; //表示中断共享
1028     }
1029
1030     /*
1031      * Setup the thread mask for this irqaction for ONESHOT. For
1032      * !ONESHOT irqs the thread mask is 0 so we can avoid a
1033      * conditional in irq_wake_thread().
1034      */
1035     if (new->flags & IRQF_ONESHOT) { //线程化处理函数
1036         /*
1037          * Unlikely to have 32 resp 64 irqs sharing one line,
1038          * but who knows.
1039          */
1040         if (thread_mask == ~0UL) { //已经超过了所能挂载的最大共享中断数目，退出。这里我觉得应该加一个警告，否则开发人员可能不知道发生了什么。
1041             ret = -EBUSY;
1042             goto out_mask;
1043         }
1044         /*
1045          * The thread_mask for the action is or'ed to
1046          * desc->thread_active to indicate that the
1047          * IRQF_ONESHOT thread handler has been woken, but not
1048          * yet finished. The bit is cleared when a thread
1049          * completes. When all threads of a shared interrupt
1050          * line have completed desc->threads_active becomes
1051          * zero and the interrupt line is unmasked. See
1052          * handle.c:irq_wake_thread() for further information.
1053          *
1054          * If no thread is woken by primary (hard irq context)
1055          * interrupt handlers, then desc->threads_active is
1056          * also checked for zero to unmask the irq line in the
1057          * affected hard irq flow handlers
1058          * (handle_[fasteoi|level]_irq).
1059          *
1060          * The new action gets the first zero bit of
1061          * thread_mask assigned. See the loop above which or's
1062          * all existing action->thread_mask bits.
1063          */
1064         new->thread_mask = 1 << ffz(thread_mask);   //找到第一个为0的bit作为新irqaction的bit标志位
1065
1066     } else if (new->handler == irq_default_primary_handler &&
1067            !(desc->irq_data.chip->flags & IRQCHIP_ONESHOT_SAFE)) {
1068         /*
1069          * The interrupt was requested with handler = NULL, so
1070          * we use the default primary handler for it. But it
1071          * does not have the oneshot flag set. In combination
1072          * with level interrupts this is deadly, because the
1073          * default primary handler just wakes the thread, then
1074          * the irq lines is reenabled, but the device still
1075          * has the level irq asserted. Rinse and repeat....
1076 //存在这样一种情况，handler为NULL，但是并没有使用ONESHOT flag注册，那么在唤醒线程执行后就不会disable对应的中断线，可能造成不停的重入.
这也说明，在handler为NULL的情况下，驱动工程师应该指明ONESHOT，否则将不work
1077          * While this works for edge type interrupts, we play
1078          * it safe and reject unconditionally because we can't
1079          * say for sure which type this interrupt really
1080          * has. The type flags are unreliable as the
1081          * underlying chip implementation can override them.
1082          */
1083         pr_err("Threaded irq requested with handler=NULL and !ONESHOT for irq %d\n",
1084                irq);
1085         ret = -EINVAL;
1086         goto out_mask;
1087     }

__setup_irq——收尾工作

1088         if (!shared) { //非共享中断的情况下需要设置触发方式，共享中断的情况下，所有共享中断的irq的触发方式等相同
1089                 init_waitqueue_head(&desc->wait_for_threads); //初始化irqdesc的等待队列，作用后面会讲到
1090                //设置中断的触发方式，主要有以下几种IRQ_TYPE_EDGE_RISING、IRQ_TYPE_EDGE_FALLING、IRQ_TYPE_LEVEL_HIGH、IRQ_TYPE_LEVEL_LOW
1091                 /* Setup the type (level, edge polarity) if configured: */
1092                 if (new->flags & IRQF_TRIGGER_MASK) { //调用chip->irq_set_type进行设置
1093                         ret = __irq_set_trigger(desc, irq,
1094                                         new->flags & IRQF_TRIGGER_MASK);
1095
1096                         if (ret)
1097                                 goto out_mask;
1098                 }
1099
1100                 desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
1101                                   IRQS_ONESHOT | IRQS_WAITING);
1102                 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); //清除IRQD_IRQ_INPROGRESS标志位
1103
1104                 if (new->flags & IRQF_PERCPU) {    //percpu中断对应标志位的设置
1105                         irqd_set(&desc->irq_data, IRQD_PER_CPU);
1106                         irq_settings_set_per_cpu(desc);
1107                 }
1108
1109                 if (new->flags & IRQF_ONESHOT) //oneshot类型的中断标志位设置
1110                         desc->istate |= IRQS_ONESHOT;
1111
1112                 if (irq_settings_can_autoenable(desc))
1113                         irq_startup(desc, true); //desc->irq_data.chip->irq_startup
1114                 else
1115                         /* Undo nested disables: */
1116                         desc->depth = 1;
1117
1118                 /* Exclude IRQ from balancing if requested */
1119                 if (new->flags & IRQF_NOBALANCING) {
1120                         irq_settings_set_no_balancing(desc);
1121                         irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
1122                 }
1123                    //设置默认亲和力
1124                 /* Set default affinity mask once everything is setup */
1125                 setup_affinity(irq, desc, mask);
1126
1127         } else if (new->flags & IRQF_TRIGGER_MASK) {
1128                 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
1129                 unsigned int omsk = irq_settings_get_trigger_mask(desc);
1130
1131                 if (nmsk != omsk)
1132                         /* hope the handler works with current trigger mode */
1133                         pr_warning("irq %d uses trigger mode %u; requested %u\n",
1134                                    irq, nmsk, omsk);
1135         }
1137         new->irq = irq;     //设置new irqaction的中断号
1138         *old_ptr = new;    //将new irqaction挂在irqaction链表最后
1139
1140         /* Reset broken irq detection when installing new handler */
1141         desc->irq_count = 0;   //新的irqaction插入后，这两个变量都要清0？
1142         desc->irqs_unhandled = 0;
1143
1144         /*
1145          * Check whether we disabled the irq via the spurious handler
1146          * before. Reenable it and give it another chance.
1147          */
1148         if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1149                 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1150                 __enable_irq(desc, irq, false);
1151         }
1152
1153         raw_spin_unlock_irqrestore(&desc->lock, flags);
1154//唤醒new->thread，严格意义上来说，并不需要唤醒，因为此时没有发生中断，但是hung task会检测sleep超过120s的task，然后报错。实际上，很多平台上都把hung task功能disable了......
1155         /*
1156          * Strictly no need to wake it up, but hung_task complains
1157          * when no hard interrupt wakes the thread up.
1158          */
1159         if (new->thread)
1160                 wake_up_process(new->thread);
1161
1162         register_irq_proc(irq, desc); //创建proc下的目录节点
1163         new->dir = NULL;
1164         register_handler_proc(irq, new);
1165         free_cpumask_var(mask);
1166
1167         return 0;

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    78
    79

nested irq

关于前面提到的nested irq，目前知道的信息如下：
1. 多个中断共享一个中断线，该中断线被认为是父中断，共享的中断被认为是子中断；
2. 中断在执行过程中被打断，发生了嵌套，打断的环境是进程上下文，也就是在threaded irq中（handle_nested_irq - Handle a nested irq from a irq thread ）；
3. handle_nested_irq 处理的irq的类型是IRQ_NESTED_THREAD，父中断在初始化中断时调用irq_set_nested_thread设置；
4. 对于IRQ_NESTED_THREAD类型的threaded handler，__setup_irq中不会为其创建单独的线程，子中断在父中断的线程上下文中运行。
5. 多用在GPIO driver中

在内核中找了一个例子，代码如下：

276 static int adp5588_irq_setup(struct adp5588_gpio *dev)
277 {
...
291     for (gpio = 0; gpio < dev->gpio_chip.ngpio; gpio++) {
292         int irq = gpio + dev->irq_base;
293         irq_set_chip_data(irq, dev);
294         irq_set_chip_and_handler(irq, &adp5588_irq_chip,
295                      handle_level_irq);
296         irq_set_nested_thread(irq, 1); //初始化的时候父中断设置子中断的类型
....
306     }
307     //申请中断
308     ret = request_threaded_irq(client->irq,
309                    NULL,
310                    adp5588_irq_handler,
311                    IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
312                    dev_name(&client->dev), dev);
.....
328 }
//在中断线程中调用handle_nested_irq，然后调用子中断的action->thread_fn
243 static irqreturn_t adp5588_irq_handler(int irq, void *devid)
244 {
...
259             while (pending) {
260                 if (pending & (1 << bit)) {
261                     handle_nested_irq(dev->irq_base +
262                               (bank << 3) + bit);
263                     pending &= ~(1 << bit);
264
265                 }
266                 bit++;
267             }
.....
274 }

前言

在x86 kernel 中断分析三——中断处理流程中，对于线程化中断处理函数，handle_irq_event_percpu调用了irq_wake_thread唤醒action->thread，此处唤醒的thread创建于__setup_irq，代码如下：

   947         t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
   948                    new->name);
...
   962         new->thread = t;

    1
    2
    3
    4

本篇分析irq_thread。
irq_thread

参数data是新注册的irqaction

   832 /*
   833 * Interrupt handler thread
   834 */
   835 static int irq_thread(void *data)
   836 {
   837     struct callback_head on_exit_work;
   838     struct irqaction *action = data;
   839     struct irq_desc *desc = irq_to_desc(action->irq);
   840     irqreturn_t (*handler_fn)(struct irq_desc *desc,
   841             struct irqaction *action);
   842
   843     if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,---------1
   844                     &action->thread_flags))
   845         handler_fn = irq_forced_thread_fn;-----------------------1.1
   846     else
   847         handler_fn = irq_thread_fn;------------------------------1.2
   848
   849     init_task_work(&on_exit_work, irq_thread_dtor); ---------------2
   850     task_work_add(current, &on_exit_work, false);
   851
   852     irq_thread_check_affinity(desc, action);-----------------------3
   853
   854     while (!irq_wait_for_interrupt(action)) {----------------------4
   855         irqreturn_t action_ret;
   856
   857         irq_thread_check_affinity(desc, action);
   858
   859         action_ret = handler_fn(desc, action);
   860         if (action_ret == IRQ_HANDLED)
   861             atomic_inc(&desc->threads_handled);
   862
   863         wake_threads_waitq(desc);--------------------------------5
   864     }
   865
   866     /*
   867      * This is the regular exit path. __free_irq() is stopping the
   868      * thread via kthread_stop() after calling
   869      * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the
   870      * oneshot mask bit can be set. We cannot verify that as we
   871      * cannot touch the oneshot mask at this point anymore as
   872      * __setup_irq() might have given out currents thread_mask
   873      * again.
   874      */
   875     task_work_cancel(current, irq_thread_dtor);------------------6
   876     return 0;
   877 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46

1 . force_irqthreads表示系统支持强制线程化。IRQTF_FORCED_THREAD在kernel 中断分析之四——中断申请 [下]中已有分析，表示申请中断时，thread_fn为NULL（那么handler必不为NULL），此时为了线程化，强制将handler赋给thread_fn,handler设置为irq_default_primary_handler，并设置IRQTF_FORCED_THREAD标志位。若满足以上条件，handler_fn 的值为irq_forced_thread_fn，否则赋值为irq_thread_fn。

1.1 irq_forced_thread_fn

765 /*
766 * Interrupts which are not explicitely requested as threaded
767 * interrupts rely on the implicit bh/preempt disable of the hard irq
768 * context. So we need to disable bh here to avoid deadlocks and other
769 * side effects.
770 */--------------------------1.1.1
771 static irqreturn_t
772 irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
773 {
774         irqreturn_t ret;
775
776         local_bh_disable();
777         ret = action->thread_fn(action->irq, action->dev_id);
778         irq_finalize_oneshot(desc, action);
779         local_bh_enable();
780         return ret;
781 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18

这段代码比较直观，唯一让人疑惑的是，在调用action->thread_fn前后进行了开关bh操作，为什么要这么做？
1.1.1注释中描述: 没有明确申请为thread interrupt的中断，其处理函数依赖于硬中断上下文中隐含的bh/抢占关闭，所以这里需要关闭bh，以避免死锁等副作用。
如前面所分析，在__setup_irq—>irq_setup_forced_threading中，如果请求中断时没有提供thread_fn（但是提供了handler），
那么进行如下处理

   888     if (!new->thread_fn) {
   889         set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); //设置IRQTF_FORCED_THREAD，表示经过强制线程化
   890         new->thread_fn = new->handler;          //之前的handler变成了thread_fn
   891         new->handler = irq_default_primary_handler;
   892     }
   893 }

    1
    2
    3
    4
    5
    6

以下是我的个人理解，根据上面的代码逻辑不难发现，内核系统开发者和驱动开发者之间存在着一个”冲突”，内核开发者希望将满足条件的handler线程化使其运行在进程上下文，而驱动开发者并不一定知情，并且驱动开发者的本意是希望注册的handler运行在中断上下文，以便快速完成自己需要的功能。我们知道，bh是有可能运行在中断上下文（softirq、tasklet），也有可能运行在进程上下文（work queue）。这种情况下，内核开发者一方面要线程化，一方面又要满足驱动开发者快速运行(即不被抢占和轻易打断)的意愿，所以加上了local_bh_disable，即所有中断下半部都不会对其产生影响。

1.2 irq_thread_fn

783 /*
784 * Interrupts explicitly requested as threaded interrupts want to be
785 * preemtible - many of them need to sleep and wait for slow busses to
786 * complete.
787 */
788 static irqreturn_t irq_thread_fn(struct irq_desc *desc,
789                 struct irqaction *action)
790 {
791         irqreturn_t ret;
792
793         ret = action->thread_fn(action->irq, action->dev_id);
794         irq_finalize_oneshot(desc, action);
795         return ret;
796 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14

显式申请的线程化中断处理函数希望是可抢占的，其中很多需要睡眠以等待慢速总线上的操作完成（这是该设计的本意），所以不需要disable bh。

2 . on_exit_work
从名称可以看出，该callback被调用时，处理的是中断线程退出方面的工作。
关于task work，专门写了一篇分析介绍：链接（TODO）
初始化并添加一个callback_head，指定其func为irq_thread_dtor，该func的调用时机为进程由内核态返回用户态或者进程退出（do_exit）之前。
然后看一下irq_thread_dtor做了什么？

804 static void irq_thread_dtor(struct callback_head *unused)
805 {
806         struct task_struct *tsk = current;
807         struct irq_desc *desc;
808         struct irqaction *action;
809
810         if (WARN_ON_ONCE(!(current->flags & PF_EXITING)))-----------2.1
811                 return;
812
813         action = kthread_data(tsk);---------------------------------2.2
814
815         pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
816                tsk->comm, tsk->pid, action->irq);
817
818
819         desc = irq_to_desc(action->irq);-----------------------------2.3
820         /*
821          * If IRQTF_RUNTHREAD is set, we need to decrement
822          * desc->threads_active and wake possible waiters.
823          */
824         if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))-----2.4
825                 wake_threads_waitq(desc);
826
827         /* Prevent a stale desc->threads_oneshot */
828         irq_finalize_oneshot(desc, action);---------------------------2.5
829 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26

2.1 如果进程不是处于退出状态，直接返回
2.2 获取irqaction
2.3 获取irqdesc
2.4 thread irq的处理过程中，唤醒中断处理线程action->thread，并设置IRQTF_RUNTHREAD，表示该线程应该运行，handle_irq_event_percpu—>irq_wake_thread—>test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)。
这里如果检测到IRQTF_RUNTHREAD，对应bit清0，调用wake_threads_waitq唤醒wait_for_threads下挂载的处于等待状态的thread（我没有找到向wait_for_threads等待队列上添加等待队列项wait_queue_t 的相关代码；按我的理解，等待者应该是synchronize_irq()，它通过wait_event(desc->wait_for_threads, ...)等待threads_active变为0，有待确认）。

798 static void wake_threads_waitq(struct irq_desc *desc)
799 {
800 if (atomic_dec_and_test(&desc->threads_active))
801 wake_up(&desc->wait_for_threads);
802 }

    1
    2
    3
    4
    5
    6

2.5 调用irq_finalize_oneshot

669 /*
670 * Oneshot interrupts keep the irq line masked until the threaded
671 * handler finished. unmask if the interrupt has not been disabled and
672 * is marked MASKED.--------------------------2.5.1
673 */
674 static void irq_finalize_oneshot(struct irq_desc *desc,
675                                  struct irqaction *action)
676 {
677         if (!(desc->istate & IRQS_ONESHOT))-----------2.5.2
678                 return;
679 again:
680         chip_bus_lock(desc);
681         raw_spin_lock_irq(&desc->lock);
682
683         /*
684          * Implausible though it may be we need to protect us against
685          * the following scenario:
686          *
687          * The thread is faster done than the hard interrupt handler
688          * on the other CPU. If we unmask the irq line then the
689          * interrupt can come in again and masks the line, leaves due
690          * to IRQS_INPROGRESS and the irq line is masked forever.
691          *
692          * This also serializes the state of shared oneshot handlers
693          * versus "desc->threads_onehsot |= action->thread_mask;" in
694          * irq_wake_thread(). See the comment there which explains the
695          * serialization.
696          */-----------------------2.5.3
697         if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
698                 raw_spin_unlock_irq(&desc->lock);
699                 chip_bus_sync_unlock(desc);
700                 cpu_relax();
701                 goto again;
702         }
703
704         /*
705          * Now check again, whether the thread should run. Otherwise
706          * we would clear the threads_oneshot bit of this thread which
707          * was just set.
708          */----------------2.5.4
709         if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
710                 goto out_unlock;
711
712         desc->threads_oneshot &= ~action->thread_mask;------2.5.5
713
714         if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
715             irqd_irq_masked(&desc->irq_data))
716                 unmask_irq(desc);---------2.5.6
717
718 out_unlock:
719         raw_spin_unlock_irq(&desc->lock);
720         chip_bus_sync_unlock(desc);
721 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53

2.5.1 在kernel 中断分析之四——中断申请 [下]
中曾有过对ONESHOT类型中断线程的分析，该标志的中断线程在线程运行完毕后才会unmask对应的中断线。
irq_finalize_oneshot就是干这个活儿的。
2.5.2 非ONESHOT类型的直接退出。
2.5.3 正常流程下，handle_irq_event—>irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);—>handle_irq_event_percpu(desc, action);
—>硬中断—>线程化中断（需要的话）—>irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
有一种场景，硬中断唤醒中断线程后，它们分别在不同CPU上运行，线程运行的比硬中断还要快（这种情况比较少见）。这样的后果是，中断线程先
unmask了对应的中断线，而此时desc->irq_data仍然保持IRQD_IRQ_INPROGRESS置1，硬中断还在执行，而中断线已经reenable了。
所以这里做了一个额外的检查，如果此时还在IRQD_IRQ_INPROGRESS状态，那么cpu_relax等待。
2.5.4 经过2.4的check和clear，如果再次发现IRQTF_RUNTHREAD置1，那么说明在这期间irq_wake_thread再次唤醒了中断处理线程（？），直接返回。
2.5.5 清除threads_oneshot中的bit位
2.5.6 unmask irq

3 . irq_thread_check_affinity
线程化(或被强制线程化)的非nested中断在__setup_irq中都会被设置IRQTF_AFFINITY标志位，在中断线程调用irq_thread时通过irq_thread_check_affinity处理，判断是否要改变中断线程的affinity。
最后调用set_cpus_allowed_ptr(current, mask);设置当前进程（即irq thread）的cpu affinity，并迁移到合适的CPU上。

4 . irq_wait_for_interrupt

651 static int irq_wait_for_interrupt(struct irqaction *action)
652 {
653     set_current_state(TASK_INTERRUPTIBLE);------------------4.1
654
655     while (!kthread_should_stop()) {
656
657         if (test_and_clear_bit(IRQTF_RUNTHREAD, ------------4.2
658                        &action->thread_flags)) {
659             __set_current_state(TASK_RUNNING);
660             return 0;
661         }
662         schedule();----------------------------------------4.3
663         set_current_state(TASK_INTERRUPTIBLE);
664     }
665     __set_current_state(TASK_RUNNING);
666     return -1;
667 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17

4.1 设置当前进程的状态为TASK_INTERRUPTIBLE
4.2 IRQTF_RUNTHREAD被置位说明中断线程被唤醒，对应的中断已经触发并且经过top half的处理。如果检测到IRQTF_RUNTHREAD置位，则将该bit位清除，并将当前进程状态设置为TASK_RUNNING，然后退出irq_wait_for_interrupt，回到irq_thread的循环中，再次检测cpu affinity，然后调用handler_fn。
4.3 IRQTF_RUNTHREAD没有置位则调用schedule让出CPU控制权。

5 . wake_threads_waitq
唤醒该irqdesc->wait_for_threads上挂载的等待事件。

6 . 注释表示是正常的退出流程，但是这个时候call back有被调用到吗？有待验证。

Abstract

目前kernel中的中断机制主要有top half、bottom half（softirq、tasklet、workqueue）、threaded irq handler。top half不用赘述，这里把threaded irq handler与bottom half区分开，是因为他们有以下区别：
1、调度方式
threaded irq handler被系统调度器调度，bottom half被top half调度（也可通过ksoftirqd被schedule）。

2、运行上下文
threaded irq handler运行在process context中
bottom half：softirq和tasklet运行在interrupt context，workqueue运行在process context。

当然，他们有共同之处，将中断的部分或者大部分工作延迟执行。
啰嗦一句，系统调用属于异常处理程序，通过软件中断（software interrupt，不是softirq）实现，运行在process context。

本篇主要分析bottom half中的softirq。
softirq
软中断类型

379 /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
380    frequency threaded job scheduling. For almost all the purposes
381    tasklets are more than enough. F.e. all serial device BHs et
382    al. should be converted to tasklets, not to softirqs.
383 */
384
385 enum
386 {
387     HI_SOFTIRQ=0,
388     TIMER_SOFTIRQ,
389     NET_TX_SOFTIRQ,
390     NET_RX_SOFTIRQ,
391     BLOCK_SOFTIRQ,
392     BLOCK_IOPOLL_SOFTIRQ,
393     TASKLET_SOFTIRQ,
394     SCHED_SOFTIRQ,
395     HRTIMER_SOFTIRQ,
396     RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
397
398     NR_SOFTIRQS
399 };

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21

系统已经定义好10种软中断类型，优先级依次降低，并且提示开发人员avoid to allocate new softirqs，开发最常使用的是HI_SOFTIRQ和TASKLET_SOFTIRQ。
软中断向量表

和硬中断的静态线性向量表类似，软中断也有个向量表，数组大小当然是10.
每个数据元素即一个软中断处理函数。

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

408 /* softirq mask and active fields moved to irq_cpustat_t in
409 * asm/hardirq.h to get better cache usage. KAO
410 */
411
412 struct softirq_action
413 {
414 void (*action)(struct softirq_action *);
415 };

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11

软中断初始化

每种软中断在初始化时都会调用open_softirq，来初始化软中断处理函数，以TASKLET_SOFTIRQ为例：

631 void __init softirq_init(void)
632 {
633     int cpu;
634
635     for_each_possible_cpu(cpu) {
636         per_cpu(tasklet_vec, cpu).tail =
637             &per_cpu(tasklet_vec, cpu).head;
638         per_cpu(tasklet_hi_vec, cpu).tail =
639             &per_cpu(tasklet_hi_vec, cpu).head;
640     }
641
642     open_softirq(TASKLET_SOFTIRQ, tasklet_action);
643     open_softirq(HI_SOFTIRQ, tasklet_hi_action);
644 }

430 void open_softirq(int nr, void (*action)(struct softirq_action *))
431 {
432 softirq_vec[nr].action = action;
433 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20

软中断触发
API

触发软中断的API为raise_softirq_irqoff

446 void __tasklet_schedule(struct tasklet_struct *t)
447 {
448     unsigned long flags;
449
450     local_irq_save(flags);
451     t->next = NULL;
452     *__this_cpu_read(tasklet_vec.tail) = t;
453     __this_cpu_write(tasklet_vec.tail, &(t->next));
454     raise_softirq_irqoff(TASKLET_SOFTIRQ);
455     local_irq_restore(flags);
456 }
457 EXPORT_SYMBOL(__tasklet_schedule);

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12

有一个细节，调用raise_softirq_irqoff之前需要禁用本地cpu中断，那么是不是说执行softirq的过程中中断一直是禁止的呢？当然不是，我们看后面的分析。
raise_softirq_irqoff

395 /*
396 * This function must run with irqs disabled!
397 */
398 inline void raise_softirq_irqoff(unsigned int nr)
399 {
400     __raise_softirq_irqoff(nr);------------------------1
401
402     /*
403      * If we're in an interrupt or softirq, we're done
404      * (this also catches softirq-disabled code). We will
405      * actually run the softirq once we return from
406      * the irq or softirq.
407      *
408      * Otherwise we wake up ksoftirqd to make sure we
409      * schedule the softirq soon.
410      */
411     if (!in_interrupt())-------------------------------2
412         wakeup_softirqd();
413 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20

1 . __raise_softirq_irqoff
调用or_softirq_pending，将irq_stat.__softirq_pending的对应软中断bit位置1，关于irq_stat.__softirq_pending，见appendix–__softirq_pending。

#define or_softirq_pending(x) this_cpu_or(irq_stat.__softirq_pending, (x))

2 . in_interrupt判断此时是否处于中断上下文，具体判断的逻辑见appendix–in_interrupt。
如果处于中断上下文，那么不做任何处理，因为当前中断结束之后会处理pending的softirq。
否则，唤醒ksoftirqd线程，处理软中断。

可以看到，这里并没有真正触发软中断，而是pending软中断，等待后续处理（或者中断退出时处理，或者主动唤醒ksoftirqd处理）。所以之前的怀疑（软中断执行过程中禁用本地CPU中断）不成立。
触发时机

这里写图片描述

如图，kernel中触发软中断主要有以下几种情况

    irq_exit，退出中断时会检测有没有pending的软中断
    local_bh_enable
    netif_rx_ni

处理软中断的流程最终都会调用到__do_softirq，不同情况下有直接调用__do_softirq，也有通过唤醒softirqd来调用。

除了以上三种情况，在别的资料和blog上还有：
4. SMP中，处理完处理器间中断时
5. 在使用APIC的系统处理完本地中断时

本篇主要分析前两种情况。
irq_exit

最常见的调用流程是do_IRQ在退出中断时调用irq_exit。

374 /*
375 * Exit an interrupt context. Process softirqs if needed and possible:
376 */
377 void irq_exit(void)
378 {
379 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
380         local_irq_disable();
381 #else
382         WARN_ON_ONCE(!irqs_disabled());
383 #endif
384
385         account_irq_exit_time(current);
386         preempt_count_sub(HARDIRQ_OFFSET);-----------------------1
387         if (!in_interrupt() && local_softirq_pending())----------2
388                 invoke_softirq();
389
390         tick_irq_exit();
391         rcu_irq_exit();
392         trace_hardirq_exit(); /* must be last! */
393 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21

1. 将硬件中断在preempt count中的计数减少，前面已经处理完硬件中断。
2. 若当前不在中断上下文且有软中断pending，那么调用invoke_softirq

338 static inline void invoke_softirq(void)
339 {
340         if (!force_irqthreads) {
341 #ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
342                 /*
343                  * We can safely execute softirq on the current stack if
344                  * it is the irq stack, because it should be near empty
345                  * at this stage.
346                  */
347                 __do_softirq();
348 #else
349                 /*
350                  * Otherwise, irq_exit() is called on the task stack that can
351                  * be potentially deep already. So call softirq in its own stack
352                  * to prevent from any overrun.
353                  */
354                 do_softirq_own_stack();
355 #endif
356         } else {
357                 wakeup_softirqd();
358         }
359 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22

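force_irqthreads默认为假，只有在内核开启CONFIG_IRQ_FORCED_THREADING并且启动参数中带有threadirqs时才为真，即使用强制线程化。使用强制线程化时会唤醒ksoftirqd进程；否则直接调用__do_softirq（当前已在irq栈上）或do_softirq_own_stack（切换到softirq专用栈）来处理软中断。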
ksoftirqd

kernel中每个cpu都定义了一个ksoftirqd进程，用来处理软中断。

57 DEFINE_PER_CPU(struct task_struct *, ksoftirqd);

749 static struct smp_hotplug_thread softirq_threads = {
750         .store                  = &ksoftirqd,
751         .thread_should_run      = ksoftirqd_should_run,
752         .thread_fn              = run_ksoftirqd,
753         .thread_comm            = "ksoftirqd/%u",
754 };

    1
    2
    3
    4
    5
    6
    7

该线程被唤醒后会调用run_ksoftirqd
run_ksoftirqd

651 static void run_ksoftirqd(unsigned int cpu)
652 {
653         local_irq_disable();-------------------------1
654         if (local_softirq_pending()) {---------------2
655                 /*
656                  * We can safely run softirq on inline stack, as we are not deep
657                  * in the task stack here.
658                  */
659                 __do_softirq();
660                 local_irq_enable();-----------------3
661                 cond_resched();
662
663                 preempt_disable();
664                 rcu_note_context_switch(cpu);
665                 preempt_enable();
666
667                 return;
668         }
669         local_irq_enable();
670 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20

1. 进入run_ksoftirqd时先关闭中断，并在标记3处重新打开。这很容易让人猜测：软中断处理过程中，对应CPU的中断是不是全部被mask的？显然不是，__do_softirq内部在清除pending位图之后会重新打开中断（local_irq_enable），执行完各个软中断处理函数之后才再次关闭中断；
2. 如果当前有软中断pending，那么调用__do_softirq

__do_softirq

225 asmlinkage void __do_softirq(void)
226 {
227         unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
228         unsigned long old_flags = current->flags;
229         int max_restart = MAX_SOFTIRQ_RESTART;
230         struct softirq_action *h;
231         bool in_hardirq;
232         __u32 pending;
233         int softirq_bit;
234         int cpu;
235
236         /*
237          * Mask out PF_MEMALLOC s current task context is borrowed for the
238          * softirq. A softirq handled such as network RX might set PF_MEMALLOC
239          * again if the socket is related to swap
240          */
241         current->flags &= ~PF_MEMALLOC;----------------------------1
242
243         pending = local_softirq_pending();-------------------------2
244         account_irq_enter_time(current);---------------------------3
245
246         __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);------------4
247         in_hardirq = lockdep_softirq_start();
248
249         cpu = smp_processor_id();----------------------------------5
250 restart:
251         /* Reset the pending bitmask before enabling irqs */
252         set_softirq_pending(0);-----------------------------------6
253
254         local_irq_enable();----------------------------------------7
255
256         h = softirq_vec;-------------------------------------------8
257
258         while ((softirq_bit = ffs(pending))) {----------------------9
259                 unsigned int vec_nr;
260                 int prev_count;
261
262                 h += softirq_bit - 1;------------------------------9.1
263
264                 vec_nr = h - softirq_vec;
265                 prev_count = preempt_count();---------------------9.2
266
267                 kstat_incr_softirqs_this_cpu(vec_nr);
268
269                 trace_softirq_entry(vec_nr);
270                 h->action(h);--------------------------------------9.3
271                 trace_softirq_exit(vec_nr);
272                 if (unlikely(prev_count != preempt_count())) {-----9.4
273                         pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
274                                vec_nr, softirq_to_name[vec_nr], h->action,
275                                prev_count, preempt_count());
276                         preempt_count_set(prev_count);
277                 }
278                 rcu_bh_qs(cpu);
279                 h++;-----------------------------------------------9.5
280                 pending >>= softirq_bit;----------------------------9.6
281         }
282
283         local_irq_disable();----------------------------------------10
284
285         pending = local_softirq_pending();-------------------------11
286         if (pending) {
287                 if (time_before(jiffies, end) && !need_resched() &&----11.1
288                     --max_restart)
289                         goto restart;
290
291                 wakeup_softirqd();--------------------------------11.2
292         }
293
294         lockdep_softirq_end(in_hardirq);
295         account_irq_exit_time(current);
296         __local_bh_enable(SOFTIRQ_OFFSET);-----------------------12
297         WARN_ON_ONCE(in_interrupt());
298         tsk_restore_flags(current, old_flags, PF_MEMALLOC);
299 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75

    这里为什么要清除PF_MEMALLOC？这样分配内存时就会受到buddy system的限制。
    获取当前pending的所有软中断
    计算进入软中断的时间，便于出错时统计。
    disable当前CPU的软中断，所以__do_softirq时不响应新来的软中断（个人理解）
    再来看所谓的disable软中断__local_bh_disable_ip，就是将preempt count中SOFTIRQ_OFFSET对应的bit加1，不用说，此时是禁止schedule的。还记得前面判断的in_interrupt吗？所以如果in_interrupt为真，那么此时可能其他软中断正在被处理。
    获取当前CPU编号
    清除irq_stat.__softirq_pending中所有pending的软中断
    enable当前cpu中断，所以软中断只是在前期关闭硬件中断，后期主要流程中可以被硬中断打断
    获取softirq_vec，软中断向量表

    从pending软中断中找第一个软中断，所以软中断的执行是有优先级的。但是从第四步可以看出，不能被新的软中断打断，即软中断不能打断软中断和硬中断
    9.1 获取对应的软中断向量和向量编号vec_nr
    9.2 获取preempt count
    9.3 调用软中断向量的中断处理函数
    9.4 执行软中断处理函数前后preempt count发生了改变，
    我们知道preempt count是percpu变量，
    DECLARE_PER_CPU(int, __preempt_count);

    说明该cpu上发生了软中断或者硬中断或者NMI或者在软中断处理函数中发生了schedule，而我们在之前已经禁用了软中断，并且如果发生了硬中断，该CPU应该会被硬中断打断并进入硬中断的处理流程，所以很大可能是在软中断处理函数中进行了调度，比如睡眠引发调度（以上个人理解）。这时kernel抛出一个错误信息提示用户。
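    9.4（续）如果是发生了硬中断，我们知道硬中断的优先级比软中断高，CPU会先去处理硬中断；硬中断退出时irq_exit会把HARDIRQ_OFFSET从preempt count中减掉，所以返回到这里时preempt count已经恢复原值，不会造成前后不一致。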
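    9.5 h指向下一个软中断向量。乍一看下一次循环会重新计算h，这里的自加似乎没有意义；但注意9.1中的 h += softirq_bit - 1 是在h当前位置上做相对偏移，而不是从softirq_vec重新计算，同时9.6把pending右移了softirq_bit位，移位后pending的bit 0对应的正是刚处理过的向量的下一个向量，所以这里必须自加1，使h与移位后的pending重新对齐。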
    9.6 便于下一次循环获取下一个软中断bit位

    这里禁用当前CPU的中断是为了下面可能要操作 wakeup_softirqd
    在step 9中，依次遍历并执行了__softirq_pending中所有的软中断，借用网上的一张图，布局如下
    这里写图片描述
    在执行软中断处理函数的过程中，可能会有新的软中断触发，虽然不会打断当前正在执行的软中断处理流程，但是其对应的标志位会被set，比如我们之前分析的__raise_softirq_irqoff—>or_softirq_pending。
    此时kernel会处理这些新来的软中断，如果此时处理软中断的时间没有超时且没有其他进程等待调度且step 6-9没有连续处理10次，那么继续restart重复处理；否则，kernel认为这段时间的软中断触发过于频繁，继续处理会给系统workload带来较大负担，其他进程得不到调度执行，影响了整体的performance，所以将后续到达的软中断延迟执行，即放到下一次ksoftirqd被调度的时候执行。

local_bh_enable

顾名思义，enable bottom half。看了一下，local_bh_enable的调用大都出现在net相关的驱动，由于对net缺乏了解，此处只分析local_bh_enable与softirq的相关流程。

30 static inline void local_bh_enable(void)
31 {
32 __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET);
33 }

143 void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
144 {
145         WARN_ON_ONCE(in_irq() || irqs_disabled());----------------------1
146 #ifdef CONFIG_TRACE_IRQFLAGS
147         local_irq_disable();
148 #endif
149         /*
150          * Are softirqs going to be turned on now:
151          */
152         if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
153                 trace_softirqs_on(ip);
154         /*
155          * Keep preemption disabled until we are done with
156          * softirq processing:
157          */
158         preempt_count_sub(cnt - 1);------------------------------------2
159
160         if (unlikely(!in_interrupt() && local_softirq_pending())) {
161                 /*
162                  * Run softirq if any pending. And do it in its own stack
163                  * as we may be calling this deep in a task call stack already.
164                  */
165                 do_softirq();
166         }
167
168         preempt_count_dec();--------------------------------------------3
169 #ifdef CONFIG_TRACE_IRQFLAGS
170         local_irq_enable();
171 #endif
172         preempt_check_resched();
173 }
174 EXPORT_SYMBOL(__local_bh_enable_ip);

开头先check一下，in_irq其实是判断当前是否处于hard irq的context中，显然如果有硬中断正在处理，应该硬中断优先。另外还会通过irqs_disabled判断本地中断是否被禁止。
如果当前不在某个中断的中断上下文且有软中断pending，那么调用do_softirq。

do_softirq

301 asmlinkage void do_softirq(void)
302 {
303         __u32 pending;
304         unsigned long flags;
305
306         if (in_interrupt())----------------------1
307                 return;
308
309         local_irq_save(flags);-------------------2
310
311         pending = local_softirq_pending();------3
312
313         if (pending)
314                 do_softirq_own_stack();----------4
315
316         local_irq_restore(flags);---------------5
317 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17

    再check一下~
    关闭本地CPU中断
    获得pending的软中断bit位
    调用do_softirq_own_stack
    开本地CPU中断

do_softirq_own_stack

148 void do_softirq_own_stack(void)
149 {
150         struct thread_info *curctx;
151         union irq_ctx *irqctx;
152         u32 *isp;
153
154         curctx = current_thread_info();
155         irqctx = __this_cpu_read(softirq_ctx);
156         irqctx->tinfo.task = curctx->task;
157         irqctx->tinfo.previous_esp = current_stack_pointer;
158
159         /* build the stack frame on the softirq stack */
160         isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
161
162         call_on_stack(__do_softirq, isp);
163 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16

这边其实就是保存当前进程的上下文，切换到softirq的栈，前面已经分析过hardirq和softirq在每个CPU各有自己的栈。

比较奇怪的是，运行run_ksoftirqd的时候没有看到栈的切换，希望能有大神解惑。
Appendix
__softirq_pending

每个cpu定义了一个irq_stat变量

24 DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);

类型为irq_cpustat_t，其中__softirq_pending表示有哪些softirq已经pending等待处理。

7 typedef struct {
8     unsigned int __softirq_pending;
9     unsigned int __nmi_count;   /* arch dependent */
10 #ifdef CONFIG_X86_LOCAL_APIC
11     unsigned int apic_timer_irqs;   /* arch dependent */
12     unsigned int irq_spurious_count;
13     unsigned int icr_read_retry_count;
14 #endif
15 #ifdef CONFIG_HAVE_KVM
16     unsigned int kvm_posted_intr_ipis;
17 #endif
18     unsigned int x86_platform_ipis; /* arch dependent */
19     unsigned int apic_perf_irqs;
20     unsigned int apic_irq_work_irqs;
21 #ifdef CONFIG_SMP
22     unsigned int irq_resched_count;
23     unsigned int irq_call_count;
24     /*
25      * irq_tlb_count is double-counted in irq_call_count, so it must be
26      * subtracted from irq_call_count when displaying irq_call_count
27      */
28     unsigned int irq_tlb_count;
29 #endif
30 #ifdef CONFIG_X86_THERMAL_VECTOR
31     unsigned int irq_thermal_count;
32 #endif
33 #ifdef CONFIG_X86_MCE_THRESHOLD
34     unsigned int irq_threshold_count;
35 #endif
36 } ____cacheline_aligned irq_cpustat_t;
37

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31

in_interrupt

所谓中断上下文，必然是包含了各种中断情况，如HARDIRQ、SOFTIRQ、NMI（x86上才有NMI）。
在x86 kernel 中断分析三——中断处理流程中，我们分析了preempt_count的组成结构。判断当前是否处于中断上下文，就是看preempt_count中对应的bit是否有效。

66 #define in_interrupt() (irq_count())

55 #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
56 | NMI_MASK))

    1
    2
    3
    4

25 EXPORT_PER_CPU_SYMBOL(irq_stat);

424 void __raise_softirq_irqoff(unsigned int nr)
425 {
426 trace_softirq_raise(nr);
427 or_softirq_pending(1UL << nr);
428 }

Abstract

在Kernel 中断分析六——softirq中，分析了软中断的处理流程，那么bottom half还剩下tasklet与workqueue。tasklet是在软中断基础上实现的一种延迟机制，当然同样运行在中断上下文，而workqueue运行在进程上下文，允许睡眠。
Tasklet

kernel中定义了十种软中断类型，其中HI_SOFTIRQ、TASKLET_SOFTIRQ用于实现tasklet，区别在于HI_SOFTIRQ的优先级较高。
tasklet_struct

kernel中用tasklet_struct来描述一个tasklet，用DECLARE_TASKLET来声明一个tasklet。

443 /* Tasklets --- multithreaded analogue of BHs.
444
445    Main feature differing them of generic softirqs: tasklet-----1
446    is running only on one CPU simultaneously.
447
448    Main feature differing them of BHs: different tasklets-------2
449    may be run simultaneously on different CPUs.
450
451    Properties:
452    * If tasklet_schedule() is called, then tasklet is guaranteed
453      to be executed on some cpu at least once after this.
454    * If the tasklet is already scheduled, but its execution is still not
455      started, it will be executed only once.
456    * If this tasklet is already running on another CPU (or schedule is called
457      from tasklet itself), it is rescheduled for later.
458    * Tasklet is strictly serialized wrt itself, but not
459      wrt another tasklets. If client needs some intertask synchronization,
460      he makes it with spinlocks.
461 */
462
463 struct tasklet_struct
464 {
465     struct tasklet_struct *next;
466     unsigned long state;
467     atomic_t count;
468     void (*func)(unsigned long);
469     unsigned long data;
470 };
471
472 #define DECLARE_TASKLET(name, func, data) \
473 struct tasklet_struct name = { NULL, 0, ATOMIC_INIT(0), func, data }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32

在分析代码之前先说明tasklet的特点，以便带着问题分析。
以上的注释概括了tasklet的核心思想：
1. 与普通软中断的区别在于：相同的tasklet同一时间只能在一个CPU上执行，即相同tasklet不具有并发性，相同的tasklet串行执行。
2. 与其他BHs（其实就是softirq和workqueue）的区别在于：不同的tasklet可以同时运行在不同的CPU上。不同的tasklet并行执行。

总结以上两条，可以发现软中断和tasklet的区别：
softirq：软中断可以并发运行在不同CPU上，即多个CPU可能同时调用同一块软中断处理的代码，所以软中断处理是可重入的，这就要求软中断处理函数中访问数据时做好保护工作。
tasklet：相同的tasklet在不同处理器上串行执行，不同的tasklet可在不同处理器上并行执行。即tasklet不需要考虑重入，比如，对于某个driver的tasklet，任何时刻，只有一个CPU会执行相关的代码片。
tasklet_schedule

509 static inline void tasklet_schedule(struct tasklet_struct *t)
510 {
511 if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) ------1
512 __tasklet_schedule(t);
513 }

    1
    2
    3
    4
    5

446 void __tasklet_schedule(struct tasklet_struct *t)
447 {
448     unsigned long flags;
449
450     local_irq_save(flags);                                    -------2
451     t->next = NULL;                                           -------3
452     *__this_cpu_read(tasklet_vec.tail) = t;                   -------4
453     __this_cpu_write(tasklet_vec.tail, &(t->next));
454     raise_softirq_irqoff(TASKLET_SOFTIRQ);                    -------5
455     local_irq_restore(flags);                                 -------6
456 }
457 EXPORT_SYMBOL(__tasklet_schedule);

443 static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);      -------4
435 /*
436 * Tasklets
437 */
438 struct tasklet_head {
439     struct tasklet_struct *head;
440     struct tasklet_struct **tail;
441 };

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22

    tasklet_schedule表示将要schedule传入的tasklet，如果该tasklet已经被设置了TASKLET_STATE_SCHED，则退出；否则调用__tasklet_schedule。这说明，同一个tasklet，在处于schedule状态下，再尝试进行schedule，实际上只处理一次。
    禁用本地CPU中断
    只schedule一个tasklet，将其next指针设置成NULL
    将该tasklet加入到tasklet_vec链表中尾部。kernel为每个CPU定义了一个tasklet_vec链表，保存需要schedule的tasklet。
    触发TASKLET_SOFTIRQ对应的软中断，前几篇分析的很详细了。
    恢复本地CPU的中断状态（local_irq_restore恢复到第2步保存的flags，不一定是打开中断）

tasklet_action

TASKLET_SOFTIRQ软中断触发以后会调用对应的中断处理函数，即tasklet_action

482 static void tasklet_action(struct softirq_action *a)
483 {
484     struct tasklet_struct *list;
485
486     local_irq_disable();                 ------------1
487     list = __this_cpu_read(tasklet_vec.head);    ----2
488     __this_cpu_write(tasklet_vec.head, NULL);   -----3
489     __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head);
490     local_irq_enable();                      --------4
491
492     while (list) {          ---------------------5
493         struct tasklet_struct *t = list;    -----5.1
494
495         list = list->next;                  -----5.2
496
497         if (tasklet_trylock(t)) {           -----5.3
498             if (!atomic_read(&t->count)) { -----5.4
499                 if (!test_and_clear_bit(TASKLET_STATE_SCHED,-----5.5
500                             &t->state))
501                     BUG();
502                 t->func(t->data);           -------5.6
503                 tasklet_unlock(t);          -------5.7
504                 continue;
505             }
506             tasklet_unlock(t);
507         }
508
509         local_irq_disable();                --------6
510         t->next = NULL;
511         *__this_cpu_read(tasklet_vec.tail) = t;
512         __this_cpu_write(tasklet_vec.tail, &(t->next));
513         __raise_softirq_irqoff(TASKLET_SOFTIRQ);
514         local_irq_enable();
515     }
516 }

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35

    关闭本地CPU中断
    获取tasklet_vec链表
    清空tasklet_vec链表
    打开本地CPU中断
    判断list是否为空
    5.1 获取第一个tasklet
    5.2 获取下一个tasklet，方便下一次遍历
    5.3 检测TASKLET_STATE_RUN有没有被设置，如果有，说明该tasklet正在其他CPU上处理。否则设置该状态位为1，表示该tasklet正在本地CPU上被处理。此时其他CPU如果调用tasklet_action，会tasklet_trylock失败，进入step 6。
    5.4 tasklet的count表示该tasklet是否被disable，0表示enable，非0表示disable。如果被disable，那么清除TASKLET_STATE_RUN，进入step6。
    5.5 检测TASKLET_STATE_SCHED标志位是否为0，如果为0，那么此时抛一个bug出来，因为正常情况下，前面的处理流程中设置了TASKLET_STATE_SCHED为1，并且其他CPU是没有机会再设置TASKLET_STATE_SCHED的。如果为1，那么清0，此时TASKLET_STATE_SCHED为0，TASKLET_STATE_RUN为1，tasklet从schedule状态变成了run状态。
    5.6 执行tasklet的处理函数
    5.7 清除该tasklet的TASKLET_STATE_RUN标志位，并且调用continue进入下一次循环。
    走到这一步，有两种可能：
    a. tasklet_trylock失败，该tasklet正在被其他CPU处理
    b. 该tasklet被disable了
    情况b中本CPU已经通过tasklet_trylock设置了TASKLET_STATE_RUN，所以会先调用tasklet_unlock将其清0；情况a中本CPU并没有拿到该标志位，无需清除。两种情况都会将该tasklet重新添加到tasklet_vec的尾部，重新触发一次TASKLET_SOFTIRQ软中断，等待下一次处理。所以，无论tasklet能否顺利执行，该tasklet始终运行在schedule它的CPU上

可以想象一下，tasklet_vec中有N个tasklet，满足条件的tasklet会调用它的处理函数。如果有tasklet正在其他CPU上运行，那么本CPU上与之相同的tasklet放到tasklet_vec尾部，下一次再处理。如此就实现了相同tasklet串行执行。至于不同的tasklet，在多个CPU上并发执行是没问题的。