From 1e9ffadc70a89ede023b79139debe4f6a17c5c7f Mon Sep 17 00:00:00 2001
From: Morten Rasmussen <morten.rasmussen@arm.com>
Date: Thu, 18 Dec 2014 14:47:18 +0000
Subject: [PATCH 53/92] sched: Calculate energy consumption of sched_group

For energy-aware load-balancing decisions it is necessary to know the
energy consumption estimates of groups of cpus. This patch introduces a
basic function, sched_group_energy(), which estimates the energy
consumption of the cpus in the group and any resources shared by the
members of the group.

NOTE: The function has five levels of indentation and breaks the 80
character limit. Refactoring is necessary.
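
Concretely, for each sched_group visited, the estimate below reduces to
(all utilization values being fixed-point in [0..SCHED_CAPACITY_SCALE],
with busy_power = cap_states[cap_idx].power and
idle_power = idle_states[0].power):

  energy ~ (group_util * busy_power
            + (SCHED_CAPACITY_SCALE - group_util) * idle_power)
                >> SCHED_CAPACITY_SHIFT

summed over every group at every topology level spanned by the cpus in
question.
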
cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
(cherry picked from commit 1e09b216da57376d8321459eda52699cef8f7e1a)
Signed-off-by: Gaku Inami <gaku.inami.xw@bp.renesas.com>
---
kernel/sched/core.c | 4 ++
kernel/sched/fair.c | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 1 +
3 files changed, 161 insertions(+)
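
The standalone sketch below is illustration only, not part of the patch
(git am ignores text in this region below the diffstat). All capacity
and power numbers are invented; real values come from the platform's
energy model (struct sched_group_energy). It reproduces, for a single
group, the arithmetic sched_group_energy() performs: pick the lowest
capacity state that accommodates the group's max util, then weight busy
and idle power by the normalized group util.

  #include <stdio.h>

  #define SCHED_CAPACITY_SHIFT	10
  #define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

  /* Stand-in for the energy model's capacity states (values invented). */
  struct cap_state { unsigned long cap, power; };

  static const struct cap_state cap_states[] = {
  	{  441, 130 },	/* lowest P-state */
  	{  740, 300 },
  	{ 1024, 616 },	/* highest P-state */
  };
  static const int nr_cap_states = 3;
  static const unsigned long idle_power = 10;	/* idle_states[0].power */

  int main(void)
  {
  	unsigned long max_util = 600;	/* as found by group_max_util() */
  	unsigned long group_util = 700;	/* as found by group_norm_util() */
  	unsigned long busy_energy, idle_energy;
  	int idx;

  	/*
  	 * find_new_capacity(): first state whose cap covers max_util,
  	 * clamped to the top state here to keep the sketch in bounds.
  	 */
  	for (idx = 0; idx < nr_cap_states - 1; idx++)
  		if (cap_states[idx].cap >= max_util)
  			break;

  	busy_energy = (group_util * cap_states[idx].power)
  			>> SCHED_CAPACITY_SHIFT;
  	idle_energy = ((SCHED_CAPACITY_SCALE - group_util) * idle_power)
  			>> SCHED_CAPACITY_SHIFT;

  	/* 700*300/1024 = 205 busy + 324*10/1024 = 3 idle -> 208 units */
  	printf("group energy estimate: %lu\n", busy_energy + idle_energy);
  	return 0;
  }
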
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f2642d4..431c7e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6002,6 +6002,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -6036,6 +6037,9 @@ static void update_top_cache_domain(int cpu)
 			break;
 	}
 	rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+	sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+	rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0085d4f..1b8dca7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5331,6 +5331,162 @@ static inline bool energy_aware(void)
 }
 
 /*
+ * cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_CAPACITY_SCALE] which is useful
+ * for energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ * capacity = capacity_orig * curr_freq/max_freq
+ *
+ * norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long cpu_norm_util(int cpu, unsigned long capacity)
+{
+	int util = cpu_util(cpu);
+
+	if (util >= capacity)
+		return SCHED_CAPACITY_SCALE;
+
+	return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static unsigned long group_max_util(struct sched_group *sg)
+{
+	int i;
+	unsigned long max_util = 0;
+
+	for_each_cpu(i, sched_group_cpus(sg))
+		max_util = max(max_util, cpu_util(i));
+
+	return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to its
+ * current capacity (busy ratio) in the range [0..SCHED_CAPACITY_SCALE] for use
+ * in energy calculations. Since task executions may or may not overlap in time
+ * in the group, the true normalized util lies between max(cpu_norm_util(i))
+ * and sum(cpu_norm_util(i)), taken over all cpus i in the group. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned long group_norm_util(struct sched_group *sg, int cap_idx)
+{
+	int i;
+	unsigned long util_sum = 0;
+	unsigned long capacity = sg->sge->cap_states[cap_idx].cap;
+
+	for_each_cpu(i, sched_group_cpus(sg))
+		util_sum += cpu_norm_util(i, capacity);
+
+	if (util_sum > SCHED_CAPACITY_SCALE)
+		return SCHED_CAPACITY_SCALE;
+	return util_sum;
+}
+
+static int find_new_capacity(struct sched_group *sg,
+			     const struct sched_group_energy *sge)
+{
+	int idx;
+	unsigned long util = group_max_util(sg);
+
+	for (idx = 0; idx < sge->nr_cap_states; idx++) {
+		if (sge->cap_states[idx].cap >= util)
+			return idx;
+	}
+
+	return idx;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group, including resources shared only by the
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working its way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct sched_group *sg_top)
+{
+	struct sched_domain *sd;
+	int cpu, total_energy = 0;
+	struct cpumask visit_cpus;
+	struct sched_group *sg;
+
+	WARN_ON(!sg_top->sge);
+
+	cpumask_copy(&visit_cpus, sched_group_cpus(sg_top));
+
+	while (!cpumask_empty(&visit_cpus)) {
+		struct sched_group *sg_shared_cap = NULL;
+
+		cpu = cpumask_first(&visit_cpus);
+
+		/*
+		 * Is the group utilization affected by cpus outside this
+		 * sched_group?
+		 */
+		sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+		if (!sd)
+			/*
+			 * We most probably raced with hotplug; returning a
+			 * wrong energy estimation is better than entering an
+			 * infinite loop.
+			 */
+			return -EINVAL;
+
+		if (sd->parent)
+			sg_shared_cap = sd->parent->groups;
+
+		for_each_domain(cpu, sd) {
+			sg = sd->groups;
+
+			/* Has this sched_domain already been visited? */
+			if (sd->child && group_first_cpu(sg) != cpu)
+				break;
+
+			do {
+				struct sched_group *sg_cap_util;
+				unsigned long group_util;
+				int sg_busy_energy, sg_idle_energy, cap_idx;
+
+				if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+					sg_cap_util = sg_shared_cap;
+				else
+					sg_cap_util = sg;
+
+				cap_idx = find_new_capacity(sg_cap_util, sg->sge);
+				group_util = group_norm_util(sg, cap_idx);
+				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+								>> SCHED_CAPACITY_SHIFT;
+				sg_idle_energy = ((SCHED_CAPACITY_SCALE-group_util) * sg->sge->idle_states[0].power)
+								>> SCHED_CAPACITY_SHIFT;
+
+				total_energy += sg_busy_energy + sg_idle_energy;
+
+				if (!sd->child)
+					cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+				if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top)))
+					goto next_cpu;
+
+			} while (sg = sg->next, sg != sd->groups);
+		}
+next_cpu:
+		continue;
+	}
+
+	return total_energy;
+}
+
+/*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  *
  * A waker of many should wake a different task than the one last awakened
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 22b7cfd..75dcd85 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -893,6 +893,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
 DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
 
 struct sched_group_capacity {
 	atomic_t ref;
--
1.9.1