[LinuxKernel] WALT - Window Assisted Load Tracking

Codebase reference: https://android.googlesource.com/kernel/common/+/android-4.9

On kernel 4.14, EAS defaults to PELT; WALT is what the 4.9-era kernels used, back when PELT was not yet considered mature enough.

A key piece of EAS is load tracking across the whole system. So how does WALT compute the load of a CPU, or, more fundamentally, the load (utilization) of a single task (meaning a thread or process)?

First, let's look at how task_util() reports a task's load.

static inline unsigned long task_util(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
    if (!walt_disabled && sysctl_sched_use_walt_task_util) {
        unsigned long demand = p->ravg.demand;
        return (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window;
    }
#endif
    return p->se.avg.util_avg;
}

 

This is guarded by a config option: under WALT, a task's load comes from the ravg.demand field, scaled by the window length.

se.avg.util_avg, by contrast, is what PELT uses.
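
To make that conversion concrete, here is a minimal standalone sketch (not kernel code; the 20 ms window and SCHED_CAPACITY_SHIFT = 10 are just the common defaults assumed here):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10         /* SCHED_CAPACITY_SCALE == 1024 */
#define WALT_RAVG_WINDOW 20000000ULL    /* assume a 20 ms window, in ns */

/* Mirrors the WALT branch of task_util(): scale the windowed demand
 * (busy nanoseconds per window) to the 0..1024 capacity range. */
static unsigned long demand_to_util(unsigned long long demand_ns)
{
    return (unsigned long)((demand_ns << SCHED_CAPACITY_SHIFT) / WALT_RAVG_WINDOW);
}

int main(void)
{
    /* A task busy for 10 ms of a 20 ms window reports a utilization of 512. */
    printf("util = %lu\n", demand_to_util(10000000ULL));
    return 0;
}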

Next, let's see how ravg.demand gets its value when a task is initialized.

static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
    p->on_rq         = 0;
    p->se.on_rq          = 0;
    p->se.exec_start     = 0;
    p->se.sum_exec_runtime       = 0;
    p->se.prev_sum_exec_runtime  = 0;
    p->se.nr_migrations      = 0;
    p->se.vruntime           = 0;
#ifdef CONFIG_SCHED_WALT
    p->last_sleep_ts     = 0;
#endif
    INIT_LIST_HEAD(&p->se.group_node);
    walt_init_new_task_load(p);
#ifdef CONFIG_FAIR_GROUP_SCHED
    p->se.cfs_rq         = NULL;
#endif
#ifdef CONFIG_SCHEDSTATS
    /* Even if schedstat is disabled, there should not be garbage */
    memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
    RB_CLEAR_NODE(&p->dl.rb_node);
    init_dl_task_timer(&p->dl);
    __dl_clear_params(p);
    INIT_LIST_HEAD(&p->rt.run_list);
    p->rt.timeout        = 0;
    p->rt.time_slice = sched_rr_timeslice;
    p->rt.on_rq      = 0;
    p->rt.on_list        = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
    INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
#ifdef CONFIG_NUMA_BALANCING
    if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
        p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
        p->mm->numa_scan_seq = 0;
    }
    if (clone_flags & CLONE_VM)
        p->numa_preferred_nid = current->numa_preferred_nid;
    else
        p->numa_preferred_nid = -1;
    p->node_stamp = 0ULL;
    p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
    p->numa_scan_period = sysctl_numa_balancing_scan_delay;
    p->numa_work.next = &p->numa_work;
    p->numa_faults = NULL;
    p->last_task_numa_placement = 0;
    p->last_sum_exec_runtime = 0;
    p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}

 

At fork time, __sched_fork() calls walt_init_new_task_load() to initialize WALT's estimate of the task's load.

It is actually done once more in wake_up_new_task(); the reason for the repetition is not obvious…

void walt_init_new_task_load(struct task_struct *p)
{
    int i;
    u32 init_load_windows =
            div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
                          (u64)walt_ravg_window, 100);
    u32 init_load_pct = current->init_load_pct;
    p->init_load_pct = 0;
    memset(&p->ravg, 0, sizeof(struct ravg));
    if (init_load_pct) {
        init_load_windows = div64_u64((u64)init_load_pct *
              (u64)walt_ravg_window, 100);
    }
    p->ravg.demand = init_load_windows;
    for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
        p->ravg.sum_history[i] = init_load_windows;
}

 

Here we can see that p->ravg.demand, WALT's load estimate, gets assigned sysctl_sched_walt_init_task_load_pct * walt_ravg_window / 100 (or, if the parent task has a non-zero init_load_pct, that percentage is used instead).

The default can be found in kernel/sched/walt.c:

unsigned int sysctl_sched_walt_init_task_load_pct = 15;

sysctl_sched_walt_init_task_load_pct can be tuned at runtime through /proc/sys/kernel/sched_walt_init_task_load_pct.

So on a stock kernel, a new task's initial load as reported by task_util() is sysctl_sched_walt_init_task_load_pct * 1024 / 100 = 15 * 1024 / 100 ≈ 153. The window length cancels out: the demand is pct * walt_ravg_window / 100, and task_util() divides by walt_ravg_window again.
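
A quick sanity check of that arithmetic (a hypothetical standalone snippet; 1024 is SCHED_CAPACITY_SCALE):

#include <stdio.h>

/* Initial task_util() implied by a given sched_walt_init_task_load_pct. */
static unsigned int init_util(unsigned int pct)
{
    return pct * 1024 / 100;
}

int main(void)
{
    printf("pct=15  -> util=%u\n", init_util(15));   /* the default: 153 */
    printf("pct=100 -> util=%u\n", init_util(100));  /* a fully busy window: 1024 */
    return 0;
}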


With initialization covered, let's look at how ravg.demand gets updated afterwards.

/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
         int event, u64 wallclock, u64 irqtime)
{
    if (walt_disabled || !rq->window_start)
        return;
    lockdep_assert_held(&rq->lock);
    update_window_start(rq, wallclock);
    if (!p->ravg.mark_start)
        goto done;
    update_task_demand(p, rq, event, wallclock);
    update_cpu_busy_time(p, rq, event, wallclock, irqtime);
done:
    trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
    p->ravg.mark_start = wallclock;
}

 

Grepping for "walt_update_task_ravg" turns up several call sites in kernel/sched/core.c:

  1. scheduler_tick (periodic update)
  2. __schedule (the task load is updated right before a context switch)
  3. try_to_wake_up (updated before a task is woken up)

Inside walt_update_task_ravg(), the per-task accounting is done by update_task_demand():

static void update_task_demand(struct task_struct *p, struct rq *rq,
         int event, u64 wallclock)
{
    u64 mark_start = p->ravg.mark_start;
    u64 delta, window_start = rq->window_start;
    int new_window, nr_full_windows;
    u32 window_size = walt_ravg_window;
    new_window = mark_start < window_start;
    if (!account_busy_for_task_demand(p, event)) {
        if (new_window)
            /* If the time accounted isn't being accounted as
             * busy time, and a new window started, only the
             * previous window need be closed out with the
             * pre-existing demand. Multiple windows may have
             * elapsed, but since empty windows are dropped,
             * it is not necessary to account those. */
            update_history(rq, p, p->ravg.sum, 1, event);
        return;
    }
    if (!new_window) {
        /* The simple case - busy time contained within the existing
         * window. */
        add_to_task_demand(rq, p, wallclock - mark_start);
        return;
    }
    /* Busy time spans at least two windows. Temporarily rewind
     * window_start to first window boundary after mark_start. */
    delta = window_start - mark_start;
    nr_full_windows = div64_u64(delta, window_size);
    window_start -= (u64)nr_full_windows * (u64)window_size;
    /* Process (window_start - mark_start) first */
    add_to_task_demand(rq, p, window_start - mark_start);
    /* Push new sample(s) into task's demand history */
    update_history(rq, p, p->ravg.sum, 1, event);
    if (nr_full_windows)
        update_history(rq, p, scale_exec_time(window_size, rq),
                   nr_full_windows, event);
    /* Roll window_start back to current to process any remainder
     * in current window. */
    window_start += (u64)nr_full_windows * (u64)window_size;
    /* Process (wallclock - window_start) next */
    mark_start = window_start;
    add_to_task_demand(rq, p, wallclock - mark_start);
}
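
To make the window splitting concrete, here is a minimal userspace model of that logic (a hypothetical sketch, not the kernel code: it ignores scale_exec_time() and frequency/capacity scaling, and assumes the default 20 ms window):

#include <stdio.h>

#define WINDOW_NS 20000000ULL   /* assume the default 20 ms window */

/* Simplified model of update_task_demand(): split the busy interval
 * [mark_start, wallclock) at window boundaries. */
static void account_busy(unsigned long long mark_start,
                         unsigned long long wallclock,
                         unsigned long long window_start)
{
    unsigned long long nr_full, boundary;

    if (mark_start >= window_start) {
        /* Simple case: the busy time fits in the current window. */
        printf("current window sum += %llu ns\n", wallclock - mark_start);
        return;
    }

    /* Close out the window mark_start fell into ... */
    nr_full = (window_start - mark_start) / WINDOW_NS;
    boundary = window_start - nr_full * WINDOW_NS;
    printf("old window: add %llu ns, push its total to history\n",
           boundary - mark_start);

    /* ... every fully elapsed window becomes a full-window history sample ... */
    if (nr_full)
        printf("%llu full-window sample(s) of %llu ns\n", nr_full, WINDOW_NS);

    /* ... and the remainder starts the current window's running sum. */
    printf("current window sum = %llu ns\n", wallclock - window_start);
}

int main(void)
{
    /* Task ran from t = 15 ms to t = 65 ms; the current window began at t = 60 ms. */
    account_busy(15000000ULL, 65000000ULL, 60000000ULL);
    return 0;
}

For that example interval, 5 ms closes out the old window, two full 20 ms windows become history samples, and the remaining 5 ms is carried into the current window's sum.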

     

So far this is window_start and delta bookkeeping; we still have not seen where ravg.demand itself gets updated.

/* Push new sample(s) into task's demand history */
update_history(rq, p, p->ravg.sum, 1, event);
if (nr_full_windows)
    update_history(rq, p, scale_exec_time(window_size, rq),
               nr_full_windows, event);

 

The comments talk about demand, so let's keep following it into update_history().

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
             u32 runtime, int samples, int event)
{
    u32 *hist = &p->ravg.sum_history[0];
    int ridx, widx;
    u32 max = 0, avg, demand;
    u64 sum = 0;
    /* Ignore windows where task had no activity */
    if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
            goto done;
    /* Push new 'runtime' value onto stack */
    widx = walt_ravg_hist_size - 1;
    ridx = widx - samples;
    for (; ridx >= 0; --widx, --ridx) {
        hist[widx] = hist[ridx];
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }
    for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
        hist[widx] = runtime;
        sum += hist[widx];
        if (hist[widx] > max)
            max = hist[widx];
    }
    p->ravg.sum = 0;
    if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
        demand = runtime;
    } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
        demand = max;
    } else {
        avg = div64_u64(sum, walt_ravg_hist_size);
        if (walt_window_stats_policy == WINDOW_STATS_AVG)
            demand = avg;
        else
            demand = max(avg, runtime);
    }
    /*
     * A throttled deadline sched class task gets dequeued without
     * changing p->on_rq. Since the dequeue decrements hmp stats
     * avoid decrementing it here again.
     *
     * When window is rolled over, the cumulative window demand
     * is reset to the cumulative runnable average (contribution from
     * the tasks on the runqueue). If the current task is dequeued
     * already, it's demand is not included in the cumulative runnable
     * average. So add the task demand separately to cumulative window
     * demand.
     */
    if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
        if (task_on_rq_queued(p))
            fixup_cumulative_runnable_avg(rq, p, demand);
        else if (rq->curr == p)
            fixup_cum_window_demand(rq, demand);
    }
    p->ravg.demand = demand;
done:
    trace_walt_update_history(rq, p, runtime, samples, event);
    return;
}

 

Following the code all the way down, we finally reach:

p->ravg.demand = demand;

Reading from the top, the first loop shifts sum_history by samples slots (hist[widx] = hist[ridx] with ridx = widx - samples), pushing older windows toward the end; the second loop writes the new runtime into the newest slots, accumulating sum and max along the way.

Further down, walt_window_stats_policy decides whether demand takes the most recent window, the maximum over the history, the average, or max(average, recent); this policy is tunable as well.
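
Here is a small standalone sketch of that decision (hypothetical; the policy names mirror those in kernel/sched/walt.c, but the enum values and the five-entry history are local assumptions of this sketch):

#include <stdio.h>

#define HIST_SIZE 5

/* Policy names as in kernel/sched/walt.c; values here are local to this sketch. */
enum {
    WINDOW_STATS_RECENT,
    WINDOW_STATS_MAX,
    WINDOW_STATS_AVG,
    WINDOW_STATS_MAX_RECENT_AVG,
};

/* Simplified model of the demand selection at the end of update_history(). */
static unsigned int pick_demand(const unsigned int hist[HIST_SIZE],
                                unsigned int runtime, int policy)
{
    unsigned long long sum = 0;
    unsigned int max = 0, avg;
    int i;

    for (i = 0; i < HIST_SIZE; i++) {
        sum += hist[i];
        if (hist[i] > max)
            max = hist[i];
    }
    avg = (unsigned int)(sum / HIST_SIZE);

    switch (policy) {
    case WINDOW_STATS_RECENT:
        return runtime;                        /* only the newest window */
    case WINDOW_STATS_MAX:
        return max;                            /* the busiest recent window */
    case WINDOW_STATS_AVG:
        return avg;                            /* plain average over the history */
    default:
        return avg > runtime ? avg : runtime;  /* max(avg, recent) */
    }
}

int main(void)
{
    /* hist[0] is the newest window's busy time, in ns. */
    unsigned int hist[HIST_SIZE] = { 4000000, 12000000, 6000000, 2000000, 8000000 };

    printf("recent=%u max=%u avg=%u max(avg,recent)=%u\n",
           pick_demand(hist, hist[0], WINDOW_STATS_RECENT),
           pick_demand(hist, hist[0], WINDOW_STATS_MAX),
           pick_demand(hist, hist[0], WINDOW_STATS_AVG),
           pick_demand(hist, hist[0], WINDOW_STATS_MAX_RECENT_AVG));
    return 0;
}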

That is roughly it.

A follow-up post will go further into the DVFS side of this.

For more material and visualizations, see this post:

https://blog.csdn.net/wukongmingjing/article/details/81633225