diff --git a/scheds/rust/scx_layered/examples/cpuset.json b/scheds/rust/scx_layered/examples/cpuset.json
index 70f6be305..656187095 100755
--- a/scheds/rust/scx_layered/examples/cpuset.json
+++ b/scheds/rust/scx_layered/examples/cpuset.json
@@ -5,7 +5,7 @@
       "matches": [
         [
           {
-            "CommPrefix": "geekbench"
+            "CgroupPrefix": "system.slice/docker"
          }
        ]
      ],
diff --git a/scheds/rust/scx_layered/src/bpf/intf.h b/scheds/rust/scx_layered/src/bpf/intf.h
index 0144ddded..4f4c06547 100644
--- a/scheds/rust/scx_layered/src/bpf/intf.h
+++ b/scheds/rust/scx_layered/src/bpf/intf.h
@@ -30,6 +30,7 @@ enum consts {
 	MAX_TASKS = 131072,
 	MAX_PATH = 4096,
 	MAX_NUMA_NODES = 64,
+	MAX_CPUSETS = 64,
 	MAX_LLCS = 64,
 	MAX_COMM = 16,
 	MAX_LAYER_MATCH_ORS = 32,
diff --git a/scheds/rust/scx_layered/src/bpf/main.bpf.c b/scheds/rust/scx_layered/src/bpf/main.bpf.c
index e0f0cf07c..8fb7d3854 100644
--- a/scheds/rust/scx_layered/src/bpf/main.bpf.c
+++ b/scheds/rust/scx_layered/src/bpf/main.bpf.c
@@ -34,9 +34,11 @@ const volatile u64 numa_cpumasks[MAX_NUMA_NODES][MAX_CPUS / 64];
 const volatile u32 llc_numa_id_map[MAX_LLCS];
 const volatile u32 cpu_llc_id_map[MAX_CPUS];
 const volatile u32 nr_layers = 1;
+const volatile u32 nr_cpusets = 1;
 const volatile u32 nr_nodes = 32; /* !0 for veristat, set during init */
 const volatile u32 nr_llcs = 32; /* !0 for veristat, set during init */
 const volatile bool smt_enabled = true;
+const volatile bool enable_cpuset = true;
 const volatile bool has_little_cores = true;
 const volatile bool xnuma_preemption = false;
 const volatile s32 __sibling_cpu[MAX_CPUS];
@@ -53,6 +55,7 @@ const volatile u64 lo_fb_wait_ns = 5000000; /* !0 for veristat */
 const volatile u32 lo_fb_share_ppk = 128; /* !0 for veristat */
 const volatile bool percpu_kthread_preempt = true;
 volatile u64 layer_refresh_seq_avgruntime;
+const volatile u64 cpuset_fakemasks[MAX_CPUSETS][MAX_CPUS / 64];
 
 /* Flag to enable or disable antistall feature */
 const volatile bool enable_antistall = true;
@@ -80,6 +83,20 @@ u32 layered_root_tgid = 0;
 u32 empty_layer_ids[MAX_LAYERS];
 u32 nr_empty_layer_ids;
 
+
+struct cpumask_wrapper {
+	struct bpf_cpumask __kptr *mask;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, MAX_CPUSETS);
+	__type(key, u32);
+	__type(value, struct cpumask_wrapper);
+} cpuset_cpumask SEC(".maps");
+
+
+
 UEI_DEFINE(uei);
 
 struct task_hint {
@@ -502,6 +519,7 @@ struct task_ctx {
 	struct bpf_cpumask __kptr *layered_unprotected_mask;
 	bool all_cpus_allowed;
 	bool cpus_node_aligned;
+	bool cpus_cpuset_aligned;
 	u64 runnable_at;
 	u64 running_at;
 	u64 runtime_avg;
@@ -1381,9 +1399,11 @@ void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags)
 	 * without making the whole scheduler node aware and should only be used
 	 * with open layers on non-saturated machines to avoid possible stalls.
 	 */
-	if ((!taskc->all_cpus_allowed &&
-	     !(layer->allow_node_aligned && taskc->cpus_node_aligned)) ||
-	    !layer->nr_cpus) {
+	if ((!taskc->all_cpus_allowed &&
+	     !((layer->allow_node_aligned && taskc->cpus_node_aligned) ||
+	     (enable_cpuset && taskc->cpus_cpuset_aligned)))
+	    || !layer->nr_cpus) {
+		taskc->dsq_id = task_cpuc->lo_fb_dsq_id;
 		/*
 		 * Start a new lo fallback queued region if the DSQ is empty.
@@ -2641,7 +2661,7 @@ void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight)
 static void refresh_cpus_flags(struct task_ctx *taskc,
 			       const struct cpumask *cpumask)
 {
-	u32 node_id;
+	u32 node_id, cpuset_id;
 
 	if (!all_cpumask) {
 		scx_bpf_error("NULL all_cpumask");
@@ -2658,7 +2678,7 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 
 		if (!(nodec = lookup_node_ctx(node_id)) ||
 		    !(node_cpumask = cast_mask(nodec->cpumask)))
-			return;
+			break;
 
 		/* not llc aligned if partially overlaps */
 		if (bpf_cpumask_intersects(node_cpumask, cpumask) &&
@@ -2667,6 +2687,21 @@ static void refresh_cpus_flags(struct task_ctx *taskc,
 			break;
 		}
 	}
+	if (enable_cpuset) {
+		bpf_for(cpuset_id, 0, nr_cpusets) {
+			struct cpumask_wrapper* wrapper;
+			wrapper = bpf_map_lookup_elem(&cpuset_cpumask, &cpuset_id);
+			if (!wrapper || !wrapper->mask) {
+				scx_bpf_error("error marking tasks as cpuset aligned");
+				return;
+			}
+			if (bpf_cpumask_equal(cast_mask(wrapper->mask), cpumask)) {
+				taskc->cpus_cpuset_aligned = true;
+				return;
+			}
+		}
+		taskc->cpus_cpuset_aligned = false;
+	}
 }
 
 static int init_cached_cpus(struct cached_cpus *ccpus)
@@ -3334,8 +3369,10 @@ static s32 init_cpu(s32 cpu, int *nr_online_cpus,
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 {
-	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask;
-	int i, nr_online_cpus, ret;
+	struct bpf_cpumask *cpumask, *tmp_big_cpumask, *tmp_unprotected_cpumask,
+		*tmp_cpuset_cpumask, *tmp_swap_dst_cpumask;
+	int i, j, cpu, nr_online_cpus, ret;
+	struct cpumask_wrapper* cpumask_wrapper;
 
 	cpumask = bpf_cpumask_create();
 	if (!cpumask)
@@ -3377,6 +3414,57 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init)
 	if (tmp_unprotected_cpumask)
 		bpf_cpumask_release(tmp_unprotected_cpumask);
 
+
+
+	if (enable_cpuset) {
+		bpf_for(i, 0, nr_cpusets) {
+			cpumask = bpf_cpumask_create();
+
+			if (!cpumask)
+				return -ENOMEM;
+
+			bpf_for(j, 0, MAX_CPUS/64) {
+				bpf_for(cpu, 0, 64) {
+					if (i < 0 || i >= MAX_CPUSETS) {
+						bpf_cpumask_release(cpumask);
+						return -1;
+					}
+					if (cpuset_fakemasks[i][j] & (1LLU << cpu)) {
+						bpf_cpumask_set_cpu((MAX_CPUS/64 - j - 1) * 64 + cpu, cpumask);
+					}
+
+				}
+			}
+
+			// pay init cost once for faster lookups later.
+			bpf_for(cpu, 0, nr_possible_cpus) {
+				cpumask_wrapper = bpf_map_lookup_percpu_elem(&cpuset_cpumask, &i, cpu);
+				tmp_cpuset_cpumask = bpf_cpumask_create();
+
+				if (!cpumask || !tmp_cpuset_cpumask || !cpumask_wrapper) {
+					if (cpumask)
+						bpf_cpumask_release(cpumask);
+					if (tmp_cpuset_cpumask)
+						bpf_cpumask_release(tmp_cpuset_cpumask);
+					scx_bpf_error("cpumask is null");
+					return -1;
+				}
+
+				bpf_cpumask_copy(tmp_cpuset_cpumask, cast_mask(cpumask));
+
+				tmp_swap_dst_cpumask = bpf_kptr_xchg(&cpumask_wrapper->mask, tmp_cpuset_cpumask);
+
+				if (tmp_swap_dst_cpumask)
+					bpf_cpumask_release(tmp_swap_dst_cpumask);
+
+			}
+
+			if (cpumask)
+				bpf_cpumask_release(cpumask);
+
+		}
+	}
+
 	bpf_for(i, 0, nr_nodes) {
 		ret = create_node(i);
 		if (ret)
diff --git a/scheds/rust/scx_layered/src/layer_core_growth.rs b/scheds/rust/scx_layered/src/layer_core_growth.rs
index 1e8ba128a..334ac0a6d 100644
--- a/scheds/rust/scx_layered/src/layer_core_growth.rs
+++ b/scheds/rust/scx_layered/src/layer_core_growth.rs
@@ -89,8 +89,8 @@ use std::collections::BTreeSet;
 
 #[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
 pub struct CpuSet {
-    cpus: BTreeSet<usize>,
-    cores: BTreeSet<usize>,
+    pub cpus: BTreeSet<usize>,
+    pub cores: BTreeSet<usize>,
 }
 
 fn parse_cpu_ranges(s: &str) -> Result> {
@@ -126,7 +126,7 @@ fn collect_cpuset_effective() -> Result>> {
 }
 
 // return cpuset layout.
-fn get_cpusets(topo: &Topology) -> Result<BTreeSet<CpuSet>> {
+pub fn get_cpusets(topo: &Topology) -> Result<BTreeSet<CpuSet>> {
     let mut cpusets: BTreeSet<CpuSet> = BTreeSet::new();
     let cpuset_cpus = collect_cpuset_effective()?;
     for x in cpuset_cpus {
diff --git a/scheds/rust/scx_layered/src/lib.rs b/scheds/rust/scx_layered/src/lib.rs
index b39555d7b..1e93220d4 100644
--- a/scheds/rust/scx_layered/src/lib.rs
+++ b/scheds/rust/scx_layered/src/lib.rs
@@ -3,7 +3,7 @@
 // This software may be used and distributed according to the terms of the
 // GNU General Public License version 2.
 mod config;
-mod layer_core_growth;
+pub mod layer_core_growth;
 
 pub mod bpf_intf;
@@ -189,7 +189,7 @@ impl CpuPool {
         cpus
     }
 
-    fn get_core_topological_id(&self, core: &Core) -> usize {
+    pub fn get_core_topological_id(&self, core: &Core) -> usize {
         *self
             .core_topology_to_id
             .get(&(core.node_id, core.llc_id, core.id))
diff --git a/scheds/rust/scx_layered/src/main.rs b/scheds/rust/scx_layered/src/main.rs
index 551e49c4f..b0ccc0f2f 100644
--- a/scheds/rust/scx_layered/src/main.rs
+++ b/scheds/rust/scx_layered/src/main.rs
@@ -27,6 +27,7 @@ use anyhow::Result;
 pub use bpf_skel::*;
 use clap::Parser;
 use crossbeam::channel::RecvTimeoutError;
+use layer_core_growth::get_cpusets;
 use lazy_static::lazy_static;
 use libbpf_rs::MapCore as _;
 use libbpf_rs::OpenObject;
@@ -67,6 +68,7 @@ const MIN_LAYER_WEIGHT: u32 = bpf_intf::consts_MIN_LAYER_WEIGHT;
 const MAX_LAYER_MATCH_ORS: usize = bpf_intf::consts_MAX_LAYER_MATCH_ORS as usize;
 const MAX_LAYER_NAME: usize = bpf_intf::consts_MAX_LAYER_NAME as usize;
 const MAX_LAYERS: usize = bpf_intf::consts_MAX_LAYERS as usize;
+const MAX_CPUS: usize = bpf_intf::consts_MAX_CPUS as usize;
 const DEFAULT_LAYER_WEIGHT: u32 = bpf_intf::consts_DEFAULT_LAYER_WEIGHT;
 const USAGE_HALF_LIFE: u32 = bpf_intf::consts_USAGE_HALF_LIFE;
 const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0;
@@ -589,6 +591,10 @@ struct Opts {
     #[clap(long, default_value = "false")]
     disable_antistall: bool,
 
+    /// Enable cpuset support
+    #[clap(long, default_value = "false")]
+    enable_cpuset: bool,
+
     /// Maximum task runnable_at delay (in seconds) before antistall turns on
     #[clap(long, default_value = "3")]
     antistall_sec: u64,
@@ -1424,6 +1430,34 @@ impl<'a> Scheduler<'a> {
         Ok(())
     }
 
+    fn init_cpusets(skel: &mut OpenBpfSkel, topo: &Topology) -> Result<()> {
+        let cpusets = get_cpusets(topo)?;
+        for (i, cpuset) in cpusets.iter().enumerate() {
+            debug!("a cpuset is: {:#?}", cpuset.clone());
+            let mut cpumask_bitvec: [u64; MAX_CPUS / 64] = [0; MAX_CPUS / 64];
+            for chunk_idx in 0..cpumask_bitvec.len() {
+                let mut cpumask_chunk = 0;
+                for cpu in 0..64 {
+                    if cpuset.cpus.contains(&(chunk_idx * 64 + cpu)) {
+                        cpumask_chunk |= 1 << cpu;
+                    }
+                }
+                cpumask_bitvec[cpumask_bitvec.len() - chunk_idx - 1] = cpumask_chunk;
+            }
+            let hex_string = cpumask_bitvec
+                .clone()
+                .iter()
+                .map(|b| format!("{:02x}", b))
+                .collect::<String>();
+            debug!("a cpuset cpumask is: {}", hex_string);
+
+            let cpuset_cpumask_slice = &mut skel.maps.rodata_data.cpuset_fakemasks[i];
+            cpuset_cpumask_slice.copy_from_slice(&cpumask_bitvec);
+        }
+        skel.maps.rodata_data.nr_cpusets = cpusets.len() as u32;
+        Ok(())
+    }
+
     fn init_nodes(skel: &mut OpenBpfSkel, _opts: &Opts, topo: &Topology) {
         skel.maps.rodata_data.nr_nodes = topo.nodes.len() as u32;
         skel.maps.rodata_data.nr_llcs = 0;
@@ -1876,6 +1910,7 @@ impl<'a> Scheduler<'a> {
         skel.maps.rodata_data.lo_fb_wait_ns = opts.lo_fb_wait_us * 1000;
         skel.maps.rodata_data.lo_fb_share_ppk = ((opts.lo_fb_share * 1024.0) as u32).clamp(1, 1024);
         skel.maps.rodata_data.enable_antistall = !opts.disable_antistall;
+        skel.maps.rodata_data.enable_cpuset = opts.enable_cpuset;
         skel.maps.rodata_data.enable_gpu_support = opts.enable_gpu_support;
 
         for (cpu, sib) in topo.sibling_cpus().iter().enumerate() {
@@ -1944,6 +1979,10 @@ impl<'a> Scheduler<'a> {
         Self::init_layers(&mut skel, &layer_specs, &topo)?;
         Self::init_nodes(&mut skel, opts, &topo);
 
+        if opts.enable_cpuset {
+            Self::init_cpusets(&mut skel, &topo)?;
+        }
+
         // We set the pin path before loading the skeleton. This will ensure
        // libbpf creates and pins the map, or reuses the pinned map fd for us,
        // so that we can keep reusing the older map already pinned on scheduler
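
The feature is opt-in: the new --enable-cpuset flag (default false) gates both init_cpusets() on the Rust side and the enable_cpuset checks in the BPF scheduler. The one subtle contract between the two sides is the u64-chunk layout of cpuset_fakemasks: init_cpusets() stores the chunk covering CPUs [chunk_idx*64, chunk_idx*64+64) at the mirrored index, and layered_init() undoes the mirroring when it calls bpf_cpumask_set_cpu(). Below is a standalone sketch (not part of the patch) that round-trips a CPU list through that layout; MAX_CPUS = 512 and the pack/unpack helpers are illustrative only.

```rust
// Mirrors the chunk layout used by init_cpusets() (Rust side) and undone by
// layered_init() (BPF side). MAX_CPUS and the helper names are assumptions
// for illustration, not part of the patch.
const MAX_CPUS: usize = 512;
const NR_CHUNKS: usize = MAX_CPUS / 64;

fn pack(cpus: &[usize]) -> [u64; NR_CHUNKS] {
    let mut bitvec = [0u64; NR_CHUNKS];
    for chunk_idx in 0..NR_CHUNKS {
        let mut chunk = 0u64;
        for cpu in 0..64 {
            if cpus.contains(&(chunk_idx * 64 + cpu)) {
                chunk |= 1 << cpu;
            }
        }
        // Mirrored placement, as in init_cpusets() in main.rs.
        bitvec[NR_CHUNKS - chunk_idx - 1] = chunk;
    }
    bitvec
}

fn unpack(bitvec: &[u64; NR_CHUNKS]) -> Vec<usize> {
    let mut cpus = Vec::new();
    for j in 0..NR_CHUNKS {
        for cpu in 0..64 {
            if bitvec[j] & (1u64 << cpu) != 0 {
                // Mirror back, as bpf_cpumask_set_cpu((MAX_CPUS/64 - j - 1) * 64 + cpu, ...)
                // does in layered_init().
                cpus.push((NR_CHUNKS - j - 1) * 64 + cpu);
            }
        }
    }
    cpus.sort();
    cpus
}

fn main() {
    let cpus = vec![0, 1, 65, 130];
    let packed = pack(&cpus);
    assert_eq!(unpack(&packed), cpus); // the two mirrorings cancel out end to end
    println!("{:x?}", packed);
}
```

Because the reversal is applied on both sides, the net effect is an identity mapping from CPU id to cpumask bit; if either side changed its indexing independently, tasks would be compared against the wrong per-cpuset cpumask in refresh_cpus_flags().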