sys_nice源码分析

sys_nice系统调用用于改变进程的nice值(即普通进程的优先级),下面来看其实现。

sys_nice
kernel/sched/core.c

SYSCALL_DEFINE1(nice, int, increment)
{
    long nice, retval;

    increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
    nice = task_nice(current) + increment;

    nice = clamp_val(nice, MIN_NICE, MAX_NICE);
    if (increment < 0 && !can_nice(current, nice))
        return -EPERM;

    set_user_nice(current, nice);
    return 0;
}

clamp宏将increment限制在[-NICE_WIDTH, NICE_WIDTH]闭区间内。NICE_WIDTH的值为40,即用户通过nice系统调用传入的增量只能在-40到40之间。

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)

接下来通过task_nice函数获取进程当前优先级对应的nice值,并与increment相加获得新的nice值。
clamp_val宏和clamp类似,将nice值限制在[MIN_NICE, MAX_NICE]范围内。MIN_NICE为-20,MAX_NICE为19。
再往下调用can_nice函数检查新的nice值是否会超过系统的限制值。
最后通过set_user_nice函数将新的nice值设置到task_struct中。

sys_nice->task_nice
include/linux/sched.h

static inline int task_nice(const struct task_struct *p)
{
    return PRIO_TO_NICE((p)->static_prio);
}
#define PRIO_TO_NICE(prio)  ((prio) - DEFAULT_PRIO)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
#define MAX_USER_RT_PRIO    100
#define MAX_RT_PRIO     MAX_USER_RT_PRIO

task_nice首先从进程task_struct结构中获得静态优先级static_prio,然后通过PRIO_TO_NICE宏将其转化成nice值。PRIO_TO_NICE宏其实就是(prio - 120)。相反,NICE_TO_PRIO宏其实就是(nice + 120)。

sys_nice->can_nice
kernel/sched/core.c

int can_nice(const struct task_struct *p, const int nice)
{
    int nice_rlim = nice_to_rlimit(nice);

    return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
}

static inline long nice_to_rlimit(long nice)
{
    return (MAX_NICE - nice + 1);
}

nice_to_rlimit将nice值从-20到19反向映射到40到1,即-20对应40,19对应1。
转换后检查该值是否不超过RLIMIT_NICE资源限制。task_rlimit直接从进程signal结构的rlim数组中读取该限制值,它可以通过setrlimit系统调用修改。

sys_nice->set_user_nice
kernel/sched/core.c

void set_user_nice(struct task_struct *p, long nice)
{
    int old_prio, delta, queued;
    unsigned long flags;
    struct rq *rq;

    rq = task_rq_lock(p, &flags);
    if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
        p->static_prio = NICE_TO_PRIO(nice);
        goto out_unlock;
    }
    queued = task_on_rq_queued(p);
    if (queued)
        dequeue_task(rq, p, 0);

    p->static_prio = NICE_TO_PRIO(nice);
    set_load_weight(p);
    old_prio = p->prio;
    p->prio = effective_prio(p);
    delta = p->prio - old_prio;

    if (queued) {
        enqueue_task(rq, p, 0);
        if (delta < 0 || (delta > 0 && task_running(rq, p)))
            resched_curr(rq);
    }
out_unlock:
    task_rq_unlock(rq, p, &flags);
}

task_rq_lock获得进程p所属的运行队列rq。

如果当前进程的调度策略是SCHED_DEADLINE、SCHED_FIFO和SCHED_RR的一种,则直接通过NICE_TO_PRIO宏将nice值转化为优先级值并设置到static_prio中即可。
SCHED_DEADLINE采用EDF(最早截止期限优先)调度算法,主要面向对完成期限敏感的进程;SCHED_FIFO和SCHED_RR则是针对实时进程的调度策略。

如果是其他的调度策略,最典型的是SCHED_NORMAL,则首先通过task_on_rq_queued检查当前进程是否在运行队列上,在则返回1,否则返回0。如果进程在运行队列上,就要先通过dequeue_task函数将其移出运行队列,等重新设置进程权重后再放回。dequeue_task和后面的enqueue_task函数已在《enqueue_task和dequeue_task源码分析》一文中分析过。

接下来也要设置static_prio。根据前面的分析,该值的范围在100到139之间。

set_load_weight函数根据当前进程的静态优先级设置其对应的调度实体sched_entity的权重,内核在调度进程时,最终会通过计算该权重将进程对应的调度实体插入到一个红黑树中,然后再从该树中找到一个最合适的进程运行。

下面以CFS调度策略为例,effective_prio对于CFS调度的普通进程而言其实就是返回static_prio。delta表示新旧两个prio的差(对普通进程即static_prio之差)。如果前面将进程从运行队列中出队,这里就要通过enqueue_task函数将其重新入队。prio数值越小优先级越高,因此delta小于0表示进程的优先级提高;delta大于0表示优先级降低,若此时进程又正在运行,这两种情况都要通过resched_curr函数设置运行队列上当前进程的TIF_NEED_RESCHED标志位,使其尽快被重新调度。task_running宏检查进程是否正在cpu上运行。

static inline int task_running(struct rq *rq, struct task_struct *p)
{
    return p->on_cpu;
}

sys_nice->set_user_nice->task_has_dl_policy
kernel/sched/sched.h

static inline int task_has_dl_policy(struct task_struct *p)
{
    return dl_policy(p->policy);
}
static inline int dl_policy(int policy)
{
    return policy == SCHED_DEADLINE;
}
static inline int task_has_rt_policy(struct task_struct *p)
{
    return rt_policy(p->policy);
}
static inline int rt_policy(int policy)
{
    return policy == SCHED_FIFO || policy == SCHED_RR;
}

task_has_dl_policy函数通过dl_policy函数检查进程的调度策略是否是SCHED_DEADLINE;task_has_rt_policy函数通过rt_policy函数检查进程的调度策略是否是SCHED_FIFO或SCHED_RR。

sys_nice->set_user_nice->set_load_weight
kernel/sched/core.c

static void set_load_weight(struct task_struct *p)
{
    int prio = p->static_prio - MAX_RT_PRIO;
    struct load_weight *load = &p->se.load;

    if (p->policy == SCHED_IDLE) {
        ...
        return;
    }

    load->weight = prio_to_weight[prio];
    load->inv_weight = prio_to_wmult[prio];
}

根据前面的分析可知这里的静态优先级static_prio的范围在100到139之间,减去MAX_RT_PRIO即100后,prio的范围限制在0到39之间,正好作为下面两个40元素数组的下标。
接下来获得进程对应的调度实体sched_entity的权重load_weight,这里只考虑使用CFS策略调度的普通进程。最后以刚刚计算的prio值作为数组下标,在prio_to_weight和prio_to_wmult数组中查出对应的权重及其定点倒数(2^32/weight),分别存入load_weight的weight和inv_weight字段。

static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

static const u32 prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

预先算好prio_to_wmult数组是为了提速:CFS在计算进程的虚拟运行时间时需要除以进程权重,内核用乘以inv_weight(即2^32/weight)再右移32位来代替除法,这里提前把倒数算好。

sys_nice->set_user_nice->effective_prio
kernel/sched/core.c

static int effective_prio(struct task_struct *p)
{
    p->normal_prio = normal_prio(p);
    if (!rt_prio(p->prio))
        return p->normal_prio;
    return p->prio;
}
static inline int normal_prio(struct task_struct *p)
{
    int prio;

    if (task_has_dl_policy(p))
        prio = MAX_DL_PRIO-1;
    else if (task_has_rt_policy(p))
        prio = MAX_RT_PRIO-1 - p->rt_priority;
    else
        prio = __normal_prio(p);
    return prio;
}
static inline int __normal_prio(struct task_struct *p)
{
    return p->static_prio;
}

effective_prio对于采用CFS调度策略的普通进程而言,最终返回的就是进程的static_prio。

sys_nice->set_user_nice->resched_curr
kernel/sched/core.c

void resched_curr(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    int cpu;

    if (test_tsk_need_resched(curr))
        return;

    cpu = cpu_of(rq);

    if (cpu == smp_processor_id()) {
        set_tsk_need_resched(curr);
        set_preempt_need_resched();
        return;
    }

    ...
}

static inline int test_tsk_need_resched(struct task_struct *tsk)
{
    return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}

static inline void set_tsk_need_resched(struct task_struct *tsk)
{
    set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}

static __always_inline void set_preempt_need_resched(void)
{
    raw_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
}

test_tsk_need_resched检查thread_info结构的标志位中是否已经设置了TIF_NEED_RESCHED。
cpu_of宏获得运行队列rq对应的cpu,smp_processor_id宏则获得当前正在执行的cpu的id。下面只考虑两者相等的情况:此时通过set_tsk_need_resched在进程thread_info的标志位中设置TIF_NEED_RESCHED,再通过set_preempt_need_resched清除per-cpu变量__preempt_count中的PREEMPT_NEED_RESCHED位(该位采用反转逻辑,清零即表示需要重新调度)。
