
linux v2.6.11.5 kernel scheduler algorithm.

by 회색뿔 2010. 6. 23.


Below is the schedule() function from kernel/sched.c in Linux v2.6.11.5, the core of the 2.6 O(1) scheduler.

/*
 * schedule() is the main scheduler function.
 */
asmlinkage void __sched schedule(void)
{
        long *switch_count;
        task_t *prev, *next;
        runqueue_t *rq;
        prio_array_t *array;
        struct list_head *queue;
        unsigned long long now;
        unsigned long run_time;
        int cpu, idx;

        /*
         * Test if we are atomic.  Since do_exit() needs to call into
         * schedule() atomically, we ignore that path for now.
         * Otherwise, whine if we are scheduling when we should not be.
         */
        if (likely(!current->exit_state)) {
                if (unlikely(in_atomic())) {
                        printk(KERN_ERR "scheduling while atomic: "
                                "%s/0x%08x/%d\n",
                                current->comm, preempt_count(), current->pid);
                        dump_stack();
                }
        }
        profile_hit(SCHED_PROFILING, __builtin_return_address(0));

need_resched:
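        /*
         * No preemption from here until the switch is complete.
         * release_kernel_lock() drops the Big Kernel Lock if this task
         * holds it; it is retaken near the bottom of the function.
         */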
        preempt_disable();
        prev = current;
        release_kernel_lock(prev);
need_resched_nonpreemptible:
        rq = this_rq();

        /*
         * The idle thread is not allowed to schedule!
         * Remove this check after it has been exercised a bit.
         */
        if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
                printk(KERN_ERR "bad: scheduling from the idle thread!\n");
                dump_stack();
        }

        schedstat_inc(rq, sched_cnt);
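        /*
         * Charge prev for the time it has been running since it was last
         * scheduled in, capped at NS_MAX_SLEEP_AVG (the most sleep credit
         * a task can hold anyway).
         */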
        now = sched_clock();
        if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG))
                run_time = now - prev->timestamp;
        else
                run_time = NS_MAX_SLEEP_AVG;

        /*
         * Tasks charged proportionately less run_time at high sleep_avg to
         * delay them losing their interactive status
         */
        run_time /= (CURRENT_BONUS(prev) ? : 1);
        spin_lock_irq(&rq->lock);

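        /*
         * do_exit() calls its final schedule() with PF_DEAD set; move the
         * task to EXIT_DEAD so it is taken off the runqueue below and
         * released in finish_task_switch() once we have switched away.
         */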
        if (unlikely(prev->flags & PF_DEAD))
                prev->state = EXIT_DEAD;

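        /*
         * Assume an involuntary context switch by default.  If prev is no
         * longer runnable and we are not merely preempting it, count the
         * switch as voluntary and take prev off the runqueue -- unless a
         * pending signal turns an interruptible sleeper back into
         * TASK_RUNNING.
         */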
        switch_count = &prev->nivcsw;
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                switch_count = &prev->nvcsw;
                if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                                unlikely(signal_pending(prev))))
                        prev->state = TASK_RUNNING;
                else {
                        if (prev->state == TASK_UNINTERRUPTIBLE)
                                rq->nr_uninterruptible++;
                        deactivate_task(prev, rq);
                }
        }

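        /*
         * If nothing is runnable here, try to pull tasks over from other
         * CPUs via idle_balance() before resorting to the idle thread.
         * wake_sleeping_dependent()/dependent_sleeper() implement the SMT
         * sibling-CPU policy and may drop the runqueue lock.
         */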
        cpu = smp_processor_id();
        if (unlikely(!rq->nr_running)) {
go_idle:
                idle_balance(cpu, rq);
                if (!rq->nr_running) {
                        next = rq->idle;
                        rq->expired_timestamp = 0;
                        wake_sleeping_dependent(cpu, rq);
                        /*
                         * wake_sleeping_dependent() might have released
                         * the runqueue, so break out if we got new
                         * tasks meanwhile:
                         */
                        if (!rq->nr_running)
                                goto switch_tasks;
                }
        } else {
                if (dependent_sleeper(cpu, rq)) {
                        next = rq->idle;
                        goto switch_tasks;
                }
                /*
                 * dependent_sleeper() releases and reacquires the runqueue
                 * lock, hence go into the idle loop if the rq went
                 * empty meanwhile:
                 */
                if (unlikely(!rq->nr_running))
                        goto go_idle;
        }

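        /*
         * If the active array has emptied, every runnable task has used up
         * its timeslice; swapping the two array pointers starts a new epoch
         * in constant time.
         */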
        array = rq->active;
        if (unlikely(!array->nr_active)) {
                /*
                 * Switch the active and expired arrays.
                 */
                schedstat_inc(rq, sched_switch);
                rq->active = rq->expired;
                rq->expired = array;
                array = rq->active;
                rq->expired_timestamp = 0;
                rq->best_expired_prio = MAX_PRIO;
        } else
                schedstat_inc(rq, sched_noswitch);

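        /*
         * Constant-time selection: one bit per priority level marks which
         * queues are non-empty, so the highest-priority runnable task is
         * simply the head of the queue found by sched_find_first_bit().
         */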
        idx = sched_find_first_bit(array->bitmap);
        queue = array->queue + idx;
        next = list_entry(queue->next, task_t, run_list);

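        /*
         * next was recently woken (activated > 0): credit the time it spent
         * waiting on the runqueue as extra sleep -- at a reduced weight for
         * ordinary wakeups -- and recompute its dynamic priority before it
         * runs.
         */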
        if (!rt_task(next) && next->activated > 0) {
                unsigned long long delta = now - next->timestamp;

                if (next->activated == 1)
                        delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;

                array = next->array;
                dequeue_task(next, array);
                recalc_task_prio(next, next->timestamp + delta);
                enqueue_task(next, array);
        }
        next->activated = 0;
switch_tasks:
        if (next == rq->idle)
                schedstat_inc(rq, sched_goidle);
        prefetch(next);
        clear_tsk_need_resched(prev);
        rcu_qsctr_inc(task_cpu(prev));

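        /*
         * Pay for the CPU time just consumed out of prev's sleep_avg (its
         * interactivity credit) and remember when it last ran.
         */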
        prev->sleep_avg -= run_time;
        if ((long)prev->sleep_avg <= 0)
                prev->sleep_avg = 0;
        prev->timestamp = prev->last_ran = now;

        sched_info_switch(prev, next);
        if (likely(prev != next)) {
                next->timestamp = now;
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

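                /*
                 * Do the low-level switch; once it completes we are running
                 * in next's context.  finish_task_switch() then drops the
                 * runqueue lock and does the final cleanup of the task we
                 * switched away from (mm drop, final release of a dead
                 * task).
                 */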
                prepare_arch_switch(rq, next);
                prev = context_switch(rq, prev, next);
                barrier();

                finish_task_switch(prev);
        } else
                spin_unlock_irq(&rq->lock);

        prev = current;
        if (unlikely(reacquire_kernel_lock(prev) < 0))
                goto need_resched_nonpreemptible;
        preempt_enable_no_resched();
        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                goto need_resched;
}
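
The part of schedule() that gives the 2.6 scheduler its "O(1)" name is the pick-next path above: a bitmap with one bit per priority level, an array of run queues indexed by priority, and a pointer swap between the active and expired arrays once every task has used its timeslice. The stand-alone program below is my own toy user-space sketch of just that idea; every toy_* name is invented for illustration and none of it is kernel code.

#include <stdio.h>
#include <string.h>

#define TOY_MAX_PRIO 140        /* same number of levels as the 2.6 scheduler */

struct toy_task {
        const char *name;
        int prio;               /* 0 = highest priority */
        struct toy_task *next;  /* singly linked run list, enough for a demo */
};

struct toy_prio_array {
        unsigned int bitmap[(TOY_MAX_PRIO + 31) / 32];
        struct toy_task *queue[TOY_MAX_PRIO];
        int nr_active;
};

struct toy_rq {
        struct toy_prio_array arrays[2];
        struct toy_prio_array *active, *expired;
};

static void toy_enqueue(struct toy_prio_array *a, struct toy_task *t)
{
        t->next = a->queue[t->prio];
        a->queue[t->prio] = t;
        a->bitmap[t->prio / 32] |= 1u << (t->prio % 32);
        a->nr_active++;
}

/* The kernel's sched_find_first_bit() does this with a few word-sized
 * instructions; a plain loop keeps the sketch simple. */
static int toy_find_first_bit(const unsigned int *bitmap, int nbits)
{
        for (int i = 0; i < nbits; i++)
                if (bitmap[i / 32] & (1u << (i % 32)))
                        return i;
        return nbits;
}

static struct toy_task *toy_pick_next(struct toy_rq *rq)
{
        struct toy_prio_array *a = rq->active;
        struct toy_task *t;
        int idx;

        /* Active array empty: swap active and expired, as schedule() does. */
        if (!a->nr_active) {
                rq->active = rq->expired;
                rq->expired = a;
                a = rq->active;
                if (!a->nr_active)
                        return NULL;    /* nothing runnable at all */
        }

        /* Highest-priority non-empty queue, then pop its first task. */
        idx = toy_find_first_bit(a->bitmap, TOY_MAX_PRIO);
        t = a->queue[idx];
        a->queue[idx] = t->next;
        if (!a->queue[idx])
                a->bitmap[idx / 32] &= ~(1u << (idx % 32));
        a->nr_active--;
        return t;
}

int main(void)
{
        static struct toy_task web = { "web", 15 }, batch = { "batch", 30 };
        struct toy_rq rq;
        struct toy_task *t;

        memset(&rq, 0, sizeof(rq));
        rq.active = &rq.arrays[0];
        rq.expired = &rq.arrays[1];

        toy_enqueue(rq.active, &batch);
        toy_enqueue(rq.active, &web);

        /* Prints "web" (prio 15) before "batch" (prio 30). */
        while ((t = toy_pick_next(&rq)) != NULL)
                printf("next: %s (prio %d)\n", t->name, t->prio);
        return 0;
}

Built with any C99 compiler, the sketch prints the two tasks in priority order. The real scheduler layers per-task timeslices, the sleep_avg interactivity bonus and SMP/SMT balancing on top of this core, as the listing above shows.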