layered cpuset support #1747
base: main
Changes from all commits: d0fdfd4, dd5fdeb, 149c669, ed5f936, e27b53c, ed4c527, 590315d, 82e6ad4
@@ -5,7 +5,7 @@
     "matches": [
       [
         {
-          "CommPrefix": "geekbench"
+          "CgroupPrefix": "system.slice/docker"
         }
       ]
     ],
@@ -34,9 +34,11 @@ const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
 const volatile u32 llc_numa_id_map[MAX_LLCS];
 const volatile u32 cpu_llc_id_map[MAX_CPUS];
 const volatile u32 nr_layers = 1;
+const volatile u32 nr_cpusets = 1;
 const volatile u32 nr_nodes = 32;        /* !0 for veristat, set during init */
 const volatile u32 nr_llcs = 32;         /* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
+const volatile bool enable_cpuset = true;
 const volatile bool has_little_cores = true;
 const volatile bool xnuma_preemption = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
@@ -53,6 +55,7 @@ const volatile u64 lo_fb_wait_ns = 5000000;        /* !0 for veristat */
 const volatile u32 lo_fb_share_ppk = 128;        /* !0 for veristat */
 const volatile bool percpu_kthread_preempt = true;
 volatile u64 layer_refresh_seq_avgruntime;
+const volatile u64 cpuset_fakemasks[MAX_CPUSETS][MAX_CPUS / 64];

 /* Flag to enable or disable antistall feature */
 const volatile bool enable_antistall = true;
@@ -80,6 +83,20 @@ u32 layered_root_tgid = 0;
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;

+struct cpumask_wrapper {
+        struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+        __uint(max_entries, MAX_CPUSETS);
+        __type(key, u32);
+        __type(value, struct cpumask_wrapper);
+} cpuset_cpumask SEC(".maps");
+
 UEI_DEFINE(uei);

 struct task_hint {
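For context on how this map is used: each element wraps a bpf_cpumask kptr so that init can install the cpuset mask with bpf_kptr_xchg() and hot paths can read it through cast_mask(). A minimal sketch of the lookup pattern, mirroring the refresh_cpus_flags() change further down in this diff (the helper name here is hypothetical, not part of the patch):

static __always_inline bool cpumask_matches_cpuset(u32 cpuset_id,
                                                   const struct cpumask *cpumask)
{
        struct cpumask_wrapper *wrapper;

        /* PERCPU_ARRAY lookup returns the current CPU's copy. */
        wrapper = bpf_map_lookup_elem(&cpuset_cpumask, &cpuset_id);
        if (!wrapper || !wrapper->mask)
                return false;

        /* Aligned only when the given cpumask equals the cpuset's mask. */
        return bpf_cpumask_equal(cast_mask(wrapper->mask), cpumask);
}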
@@ -502,6 +519,7 @@ struct task_ctx {
         struct bpf_cpumask __kptr *layered_unprotected_mask;
         bool all_cpus_allowed;
         bool cpus_node_aligned;
+        bool cpus_cpuset_aligned;
         u64 runnable_at;
         u64 running_at;
         u64 runtime_avg;
@@ -1381,9 +1399,11 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
          * without making the whole scheduler node aware and should only be used
          * with open layers on non-saturated machines to avoid possible stalls.
          */
-        if ((!taskc->all_cpus_allowed &&
-             !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-            !layer->nr_cpus) {
+        if ((!taskc->all_cpus_allowed &&
+             !((layer->allow_node_aligned && taskc->cpus_node_aligned) ||
+               (enable_cpuset && taskc->cpus_cpuset_aligned)))
+            || !layer->nr_cpus) {
                 taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
                 /*
                  * Start a new lo fallback queued region if the DSQ is empty.
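In words: a task is routed to the low-priority fallback DSQ when it is affinitized (not allowed on all CPUs) and its cpumask is neither node aligned (where the layer opts in) nor, with cpuset support enabled, exactly aligned with one of the cgroup cpusets, or when the layer currently owns no CPUs. A sketch of the same condition factored into a hypothetical predicate, not part of the patch, assuming only the layer fields shown in the diff:

static __always_inline bool needs_lo_fallback(const struct task_ctx *taskc,
                                              const struct layer *layer)
{
        /* Any one of these alignments lets the task use the layer's DSQs. */
        bool affinity_ok = taskc->all_cpus_allowed ||
                           (layer->allow_node_aligned && taskc->cpus_node_aligned) ||
                           (enable_cpuset && taskc->cpus_cpuset_aligned);

        return !affinity_ok || !layer->nr_cpus;
}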
@@ -2641,7 +2661,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
                                const struct cpumask *cpumask)
 {
-        u32 node_id;
+        u32 node_id, cpuset_id;

         if (!all_cpumask) {
                 scx_bpf_error("NULL all_cpumask");
@@ -2658,7 +2678,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,

                 if (!(nodec = lookup_node_ctx(node_id)) ||
                     !(node_cpumask = cast_mask(nodec->cpumask)))
-                        return;
+                        break;

                 /* not llc aligned if partially overlaps */
                 if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2667,6 +2687,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
                         break;
                 }
         }
+        if (enable_cpuset) {
Review comment: Maybe a blank line above?
+                bpf_for(cpuset_id, 0, nr_cpusets) {
+                        struct cpumask_wrapper *wrapper;
Review comment: Blank line.
+                        wrapper = bpf_map_lookup_elem(&cpuset_cpumask, &cpuset_id);
+                        if (!wrapper || !wrapper->mask) {
+                                scx_bpf_error("error marking tasks as cpuset aligned");
+                                return;
+                        }
+                        if (bpf_cpumask_equal(cast_mask(wrapper->mask), cpumask)) {
+                                taskc->cpus_cpuset_aligned = true;
+                                return;
+                        }
+                }
+                taskc->cpus_cpuset_aligned = false;
+        }
 }

 static int init_cached_cpus(struct cached_cpus *ccpus)
@@ -3334,8 +3369,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,

 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-        struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask;
-        int i, nr_online_cpus, ret;
+        struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+                *tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+        int i, j, cpu, nr_online_cpus, ret;
+        struct cpumask_wrapper *cpumask_wrapper;

         cpumask = bpf_cpumask_create();
         if (!cpumask)
@@ -3377,6 +3414,57 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
         if (tmp_unprotected_cpumask)
                 bpf_cpumask_release(tmp_unprotected_cpumask);

+
+
Review comment: Too many blank lines.
+        if (enable_cpuset) {
+                bpf_for(i, 0, nr_cpusets) {
+                        cpumask = bpf_cpumask_create();
+
Review comment: It's also customary to not have a blank line between variable setting and the test on it. In scheduler BPF code, we've been doing
+                        if (!cpumask)
+                                return -ENOMEM;
+
+                        bpf_for(j, 0, MAX_CPUS/64) {
Review comment: Maybe add comments explaining what each block is doing?
+                                bpf_for(cpu, 0, 64) {
+                                        if (i < 0 || i >= MAX_CPUSETS) {
+                                                bpf_cpumask_release(cpumask);
+                                                return -1;
+                                        }
+                                        if (cpuset_fakemasks[i][j] & (1LLU << cpu)) {
+                                                bpf_cpumask_set_cpu((MAX_CPUS/64 - j - 1) * 64 + cpu, cpumask);
+                                        }
+                                }
+                        }
Review comment: So AFAICT the cpumask should fit all node cpumasks? This looks like it works because we clobber vmlinux.h to hold 128 64-bit numbers, which is fine to bypass verifier behavior but I'm not sure is great to depend on.
Review comment (reply): Ideally we'll rip out all unprintable trusted ptr cpumasks w/ printable arena cpumasks eventually I think, maybe. I really wish these were printable lol...
Review comment: This loop is unnecessarily convoluted. Just iterate
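The reviewer's suggestion above is cut off; one plausible reading is to iterate CPU ids directly instead of nesting word and bit loops. A sketch of that shape, keeping the patch's word ordering (word 0 of cpuset_fakemasks holds the highest-numbered CPUs), offered purely as an illustration rather than the author's code:

                        /* Hypothetical simplification: one loop over CPU ids. */
                        bpf_for(cpu, 0, nr_possible_cpus) {
                                u32 word = MAX_CPUS / 64 - 1 - cpu / 64; /* same reversed word order as above */

                                if (cpuset_fakemasks[i][word] & (1LLU << (cpu % 64)))
                                        bpf_cpumask_set_cpu(cpu, cpumask);
                        }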
+                        // pay init cost once for faster lookups later.
Review comment: Why do we need per-cpu copies? Can't this be a part of
+                        bpf_for(cpu, 0, nr_possible_cpus) {
+                                cpumask_wrapper = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+                                tmp_cpuset_cpumask = bpf_cpumask_create();
+
+                                if (!cpumask || !tmp_cpuset_cpumask || !cpumask_wrapper) {
+                                        if (cpumask)
+                                                bpf_cpumask_release(cpumask);
+                                        if (tmp_cpuset_cpumask)
+                                                bpf_cpumask_release(tmp_cpuset_cpumask);
+                                        scx_bpf_error("cpumask is null");
+                                        return -1;
+                                }
+
+                                bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
+
+                                tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_wrapper->mask, tmp_cpuset_cpumask);
+
+                                if (tmp_swap_dst_cpumask)
+                                        bpf_cpumask_release(tmp_swap_dst_cpumask);
+                        }
+
+                        if (cpumask)
+                                bpf_cpumask_release(cpumask);
+                }
+        }
+
         bpf_for(i, 0, nr_nodes) {
                 ret = create_node(i);
                 if (ret)
Review comment: Please update the comment to explain cpus_cpuset_aligned.
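One possible shape for that comment on the new task_ctx field, offered only as a suggestion (the wording is mine, not the author's):

        bool all_cpus_allowed;
        bool cpus_node_aligned;
        /*
         * Set when the task's allowed cpumask exactly matches one of the
         * cgroup cpuset cpumasks mirrored in the cpuset_cpumask map; such
         * tasks may skip the lo fallback path even though they are not
         * allowed on all CPUs.
         */
        bool cpus_cpuset_aligned;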