Codebase reference: https://android.googlesource.com/kernel/common/+/android-4.9
In kernel 4.14 EAS defaults to PELT; back on 4.9 and earlier, when PELT was not yet that mature, WALT was used instead.
A key piece of EAS is load tracking across the whole system. So how does WALT compute a CPU's load, or, more fundamentally, the load (utilization) of a single task (here meaning a thread/process)?
First, let's see how task_util reports a task's load.
static inline unsigned long task_util(struct task_struct *p)
{
#ifdef CONFIG_SCHED_WALT
	if (!walt_disabled && sysctl_sched_use_walt_task_util) {
		unsigned long demand = p->ravg.demand;

		return (demand << SCHED_CAPACITY_SHIFT) / walt_ravg_window;
	}
#endif
	return p->se.avg.util_avg;
}
This is wrapped in a config check: with WALT enabled, a task's load comes from the ravg.demand field, while se.avg.util_avg is the value PELT uses.
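To make the scaling concrete, here is a minimal userspace sketch of the WALT branch of task_util(): demand is busy time in the same unit as walt_ravg_window (nanoseconds), and shifting by SCHED_CAPACITY_SHIFT (10) rescales it into the 0..1024 capacity range. The 20 ms window below is an assumed value for illustration only.

#include <stdio.h>

/* Illustrative constants; the real values live in the kernel sources. */
#define SCHED_CAPACITY_SHIFT	10		/* utilization scale is 0..1024 */
#define WALT_RAVG_WINDOW	20000000ULL	/* assumed 20 ms window, in ns  */

/* Same arithmetic as the WALT branch of task_util() above. */
static unsigned long walt_task_util(unsigned long long demand)
{
	return (unsigned long)((demand << SCHED_CAPACITY_SHIFT) /
			       WALT_RAVG_WINDOW);
}

int main(void)
{
	/* 5 ms of busy time in a 20 ms window -> 25% of 1024 = 256 */
	printf("util = %lu\n", walt_task_util(5000000ULL));
	/* a fully busy window -> 1024 */
	printf("util = %lu\n", walt_task_util(WALT_RAVG_WINDOW));
	return 0;
}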
Next, let's see how ravg.demand is set up when a task is first created.
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
	p->on_rq			= 0;

	p->se.on_rq			= 0;
	p->se.exec_start		= 0;
	p->se.sum_exec_runtime		= 0;
	p->se.prev_sum_exec_runtime	= 0;
	p->se.nr_migrations		= 0;
	p->se.vruntime			= 0;
#ifdef CONFIG_SCHED_WALT
	p->last_sleep_ts		= 0;
#endif
	INIT_LIST_HEAD(&p->se.group_node);
	walt_init_new_task_load(p);

#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq			= NULL;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* Even if schedstat is disabled, there should not be garbage */
	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif

	RB_CLEAR_NODE(&p->dl.rb_node);
	init_dl_task_timer(&p->dl);
	__dl_clear_params(p);

	INIT_LIST_HEAD(&p->rt.run_list);
	p->rt.timeout		= 0;
	p->rt.time_slice	= sched_rr_timeslice;
	p->rt.on_rq		= 0;
	p->rt.on_list		= 0;

#ifdef CONFIG_PREEMPT_NOTIFIERS
	INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif

#ifdef CONFIG_NUMA_BALANCING
	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
		p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
		p->mm->numa_scan_seq = 0;
	}

	if (clone_flags & CLONE_VM)
		p->numa_preferred_nid = current->numa_preferred_nid;
	else
		p->numa_preferred_nid = -1;

	p->node_stamp = 0ULL;
	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
So WALT's estimate of the task's load is initialized at fork time.
It is actually done again in wake_up_new_task; why it is repeated there is not obvious…
void walt_init_new_task_load(struct task_struct *p)
{
	int i;
	u32 init_load_windows =
			div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
				  (u64)walt_ravg_window, 100);
	u32 init_load_pct = current->init_load_pct;

	p->init_load_pct = 0;
	memset(&p->ravg, 0, sizeof(struct ravg));

	if (init_load_pct) {
		init_load_windows = div64_u64((u64)init_load_pct *
			  (u64)walt_ravg_window, 100);
	}

	p->ravg.demand = init_load_windows;
	for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
		p->ravg.sum_history[i] = init_load_windows;
}
Here p->ravg.demand, WALT's load estimate, is assigned the value sysctl_sched_walt_init_task_load_pct * walt_ravg_window / 100.
The default can be found in kernel/sched/walt.c:
unsigned int sysctl_sched_walt_init_task_load_pct = 15;
sysctl_sched_walt_init_task_load_pct can be tuned at runtime via /proc/sys/kernel/sched_walt_init_task_load_pct.
So on a stock kernel a new task's initial load, once task_util() scales it to the 0..1024 utilization range, is sysctl_sched_walt_init_task_load_pct * 1024 / 100 = 15 * 1024 / 100 ≈ 153.
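A quick sanity check of those defaults, as a standalone sketch (assuming the common 20 ms window; the actual window length is platform-dependent):

#include <stdio.h>

int main(void)
{
	/* Assumed defaults: 15% initial load, 20 ms WALT window (in ns). */
	unsigned long long init_task_load_pct = 15;
	unsigned long long walt_ravg_window = 20000000ULL;

	/* Same formula as walt_init_new_task_load(). */
	unsigned long long init_demand =
		init_task_load_pct * walt_ravg_window / 100;

	/* Normalized the same way task_util() does: 15% of 1024 ~= 153. */
	unsigned long long init_util = (init_demand << 10) / walt_ravg_window;

	printf("init demand = %llu ns, init util = %llu\n",
	       init_demand, init_util);
	return 0;
}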
With initialization covered, let's look at how ravg.demand gets updated afterwards.
/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock, u64 irqtime)
{
	if (walt_disabled || !rq->window_start)
		return;

	lockdep_assert_held(&rq->lock);

	update_window_start(rq, wallclock);

	if (!p->ravg.mark_start)
		goto done;

	update_task_demand(p, rq, event, wallclock);
	update_cpu_busy_time(p, rq, event, wallclock, irqtime);

done:
	trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);

	p->ravg.mark_start = wallclock;
}
Grepping for "walt_update_task_ravg" turns up several call sites in kernel/sched/core.c:
- scheduler_tick (periodic update; see the sketch after this list)
- __schedule (the task's load is updated before a context switch)
- try_to_wake_up (updated before the task is woken)
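As a rough idea of what such a call site looks like, below is a heavily trimmed sketch of the scheduler_tick() hook. It is reconstructed from memory of the android-4.9 sources, so treat the surrounding details (walt_set_window_start(), walt_ktime_clock(), the TASK_UPDATE event) as assumptions rather than a verbatim quote.

/*
 * Sketch of the periodic update path (simplified; not a verbatim copy of
 * kernel/sched/core.c). Every tick, the currently running task's window
 * statistics are refreshed with a TASK_UPDATE event while rq->lock is held,
 * before the rest of the tick work runs.
 */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *curr = rq->curr;

	raw_spin_lock(&rq->lock);
	walt_set_window_start(rq);
	walt_update_task_ravg(curr, rq, TASK_UPDATE, walt_ktime_clock(), 0);
	/* ... update_rq_clock(rq), curr->sched_class->task_tick(), ... */
	raw_spin_unlock(&rq->lock);
}

Inside walt_update_task_ravg(), the demand half of the work is done by update_task_demand():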
static void update_task_demand(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock)
{
	u64 mark_start = p->ravg.mark_start;
	u64 delta, window_start = rq->window_start;
	int new_window, nr_full_windows;
	u32 window_size = walt_ravg_window;

	new_window = mark_start < window_start;
	if (!account_busy_for_task_demand(p, event)) {
		if (new_window)
			/* If the time accounted isn't being accounted as
			 * busy time, and a new window started, only the
			 * previous window need be closed out with the
			 * pre-existing demand. Multiple windows may have
			 * elapsed, but since empty windows are dropped,
			 * it is not necessary to account those.
			 */
			update_history(rq, p, p->ravg.sum, 1, event);
		return;
	}

	if (!new_window) {
		/* The simple case - busy time contained within the existing
		 * window. */
		add_to_task_demand(rq, p, wallclock - mark_start);
		return;
	}

	/* Busy time spans at least two windows. Temporarily rewind
	 * window_start to first window boundary after mark_start. */
	delta = window_start - mark_start;
	nr_full_windows = div64_u64(delta, window_size);
	window_start -= (u64)nr_full_windows * (u64)window_size;

	/* Process (window_start - mark_start) first */
	add_to_task_demand(rq, p, window_start - mark_start);

	/* Push new sample(s) into task's demand history */
	update_history(rq, p, p->ravg.sum, 1, event);
	if (nr_full_windows)
		update_history(rq, p, scale_exec_time(window_size, rq),
			       nr_full_windows, event);

	/* Roll window_start back to current to process any remainder
	 * in current window. */
	window_start += (u64)nr_full_windows * (u64)window_size;

	/* Process (wallclock - window_start) next */
	mark_start = window_start;
	add_to_task_demand(rq, p, wallclock - mark_start);
}
Here we can see the window_start and delta bookkeeping, but still not where ravg.demand itself gets updated.
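That window bookkeeping is easier to follow with concrete numbers. The sketch below re-implements only the splitting logic of update_task_demand() in userspace; it ignores frequency/capacity scaling (scale_exec_time() is treated as identity) and assumes a 20 ms window, both simplifications of my own.

#include <stdio.h>

#define WINDOW_SIZE 20000000ULL		/* assumed 20 ms WALT window, in ns */

/*
 * Split the busy interval [mark_start, wallclock) the same way
 * update_task_demand() does: a head piece that closes the window
 * mark_start falls in, nr_full_windows fully busy windows, and a tail
 * piece in the current window. Frequency scaling is ignored here.
 */
static void split_busy_time(unsigned long long mark_start,
			    unsigned long long window_start,
			    unsigned long long wallclock)
{
	unsigned long long delta, head, tail, nr_full_windows;

	if (mark_start >= window_start) {
		/* Simple case: the busy time fits in the current window. */
		printf("current window += %llu ns\n", wallclock - mark_start);
		return;
	}

	/* Rewind window_start to the first boundary after mark_start. */
	delta = window_start - mark_start;
	nr_full_windows = delta / WINDOW_SIZE;
	window_start -= nr_full_windows * WINDOW_SIZE;

	head = window_start - mark_start;
	tail = wallclock - (window_start + nr_full_windows * WINDOW_SIZE);

	printf("head = %llu ns, full windows = %llu, tail = %llu ns\n",
	       head, nr_full_windows, tail);
}

int main(void)
{
	/* Task ran from t = 15 ms to t = 65 ms; current window starts at 60 ms. */
	split_busy_time(15000000ULL, 60000000ULL, 65000000ULL);
	return 0;
}

With those inputs the 50 ms of busy time splits into a 5 ms head that closes the old window, two full 20 ms windows, and a 5 ms tail in the current window, which is exactly the shape the kernel code above produces.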
	/* Push new sample(s) into task's demand history */
	update_history(rq, p, p->ravg.sum, 1, event);
	if (nr_full_windows)
		update_history(rq, p, scale_exec_time(window_size, rq),
			       nr_full_windows, event);
The comment mentions the demand history, so let's keep following update_history().
/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
			 u32 runtime, int samples, int event)
{
	u32 *hist = &p->ravg.sum_history[0];
	int ridx, widx;
	u32 max = 0, avg, demand;
	u64 sum = 0;

	/* Ignore windows where task had no activity */
	if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
		goto done;

	/* Push new 'runtime' value onto stack */
	widx = walt_ravg_hist_size - 1;
	ridx = widx - samples;
	for (; ridx >= 0; --widx, --ridx) {
		hist[widx] = hist[ridx];
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
		hist[widx] = runtime;
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	p->ravg.sum = 0;

	if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
		demand = runtime;
	} else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
		demand = max;
	} else {
		avg = div64_u64(sum, walt_ravg_hist_size);
		if (walt_window_stats_policy == WINDOW_STATS_AVG)
			demand = avg;
		else
			demand = max(avg, runtime);
	}

	/*
	 * A throttled deadline sched class task gets dequeued without
	 * changing p->on_rq. Since the dequeue decrements hmp stats
	 * avoid decrementing it here again.
	 *
	 * When window is rolled over, the cumulative window demand
	 * is reset to the cumulative runnable average (contribution from
	 * the tasks on the runqueue). If the current task is dequeued
	 * already, it's demand is not included in the cumulative runnable
	 * average. So add the task demand separately to cumulative window
	 * demand.
	 */
	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
		if (task_on_rq_queued(p))
			fixup_cumulative_runnable_avg(rq, p, demand);
		else if (rq->curr == p)
			fixup_cum_window_demand(rq, demand);
	}

	p->ravg.demand = demand;

done:
	trace_walt_update_history(rq, p, runtime, samples, event);
	return;
}
Following the function down, we finally reach the assignment we were looking for:
p->ravg.demand = demand;
Reading from the top down, the existing samples are first shifted and collected over the window history (the ridx = widx - samples indexing of sum_history[]).
Further down, walt_window_stats_policy decides whether demand takes the most recent window, the maximum, the average, or max(average, recent); this policy is tunable as well.
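To see how the policy changes the result, here is a small userspace sketch of just that selection step; the history values are made up and the WINDOW_STATS_* constants are modelled as a local enum rather than the kernel's definitions.

#include <stdio.h>

/* Mirrors the four window statistics policies used by update_history(). */
enum window_stats_policy {
	WINDOW_STATS_RECENT,		/* demand = latest window      */
	WINDOW_STATS_MAX,		/* demand = max of history     */
	WINDOW_STATS_AVG,		/* demand = average of history */
	WINDOW_STATS_MAX_RECENT_AVG,	/* demand = max(avg, latest)   */
};

#define HIST_SIZE 5

static unsigned int pick_demand(const unsigned int *hist,
				unsigned int runtime,
				enum window_stats_policy policy)
{
	unsigned int i, max = 0, avg;
	unsigned long long sum = 0;

	for (i = 0; i < HIST_SIZE; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = (unsigned int)(sum / HIST_SIZE);

	switch (policy) {
	case WINDOW_STATS_RECENT:
		return runtime;
	case WINDOW_STATS_MAX:
		return max;
	case WINDOW_STATS_AVG:
		return avg;
	default:
		return avg > runtime ? avg : runtime;
	}
}

int main(void)
{
	/* Made-up history (ns of busy time); the newest sample is 'runtime'. */
	unsigned int hist[HIST_SIZE] = { 4000000, 8000000, 2000000,
					 6000000, 4000000 };
	unsigned int runtime = 4000000;

	printf("recent=%u max=%u avg=%u max(avg,recent)=%u\n",
	       pick_demand(hist, runtime, WINDOW_STATS_RECENT),
	       pick_demand(hist, runtime, WINDOW_STATS_MAX),
	       pick_demand(hist, runtime, WINDOW_STATS_AVG),
	       pick_demand(hist, runtime, WINDOW_STATS_MAX_RECENT_AVG));
	return 0;
}

With this history the max policy reports 8 ms of demand while max(avg, recent) reports 4.8 ms, so the policy choice directly affects how aggressively the reported load (and ultimately frequency) reacts to bursts.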
That's about it.
A follow-up post will go further into the DVFS side.
For more details and diagrams, see this post:
https://blog.csdn.net/wukongmingjing/article/details/81633225