diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b12..ccfe78b6a10a3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2611,7 +2611,7 @@ static int timerslack_ns_show(struct seq_file *m, void *v) } task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); + seq_printf(m, "%llu\n", get_task_timer_slack_ns(p)); task_unlock(p); out: diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c29..0ce8d9c66df9f2 100644 --- a/fs/select.c +++ b/fs/select.c @@ -75,7 +75,7 @@ static long __estimate_accuracy(struct timespec64 *tv) u64 select_estimate_accuracy(struct timespec64 *tv) { - u64 ret; + u64 ret, timer_slack; struct timespec64 now; /* @@ -88,8 +88,9 @@ u64 select_estimate_accuracy(struct timespec64 *tv) ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + timer_slack = get_task_timer_slack_ns(current); + if (ret < timer_slack) + return timer_slack; return ret; } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be15..9270b4cdaf835d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -407,6 +407,15 @@ struct cgroup { int nr_dying_descendants; int max_descendants; + /* + * The default process timer slacks: + * Setting timer_slack_ns sets this cgroup's slack and becomes the + * default slack of its descendants. Defaults to U64_MAX when unset. + * When timer_slack_ns is unset, the inherited default_timer_slack_ns is used. + */ + u64 timer_slack_ns; + u64 default_timer_slack_ns; + /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3410aecffdb477..9382aaf0fec88a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -147,6 +147,12 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); +struct cgroup_subsys_state * +css_filter_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data); + struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); @@ -243,6 +249,11 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) +#define css_filter_for_each_descendant_pre(pos, css, filter, filter_data) \ + for ((pos) = css_filter_next_descendant_pre(NULL, (css), (filter), (filter_data)); \ + (pos); \ + (pos) = css_filter_next_descendant_pre((pos), (css), (filter), (filter_data))) + /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f102f..33e9b3f18eafae 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,7 +13,7 @@ #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> @@ -97,14 +97,15 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, - * the absolute expiry time in 
the hrtimers internal + * the earliest expiry time in the hrtimers internal * representation. The time is related to the clock on - which the timer is based. Is setup by adding - slack to the _softexpires value. For non range timers - identical to _softexpires. - @_softexpires: the absolute earliest expiry time of the hrtimer. - The time which was given as expiry time when the timer - was armed. + which the timer is based. + @_hardexpires: The latest time this timer is allowed to expire. + This is the timer expiry time with the timer slack added to it. + For non range timers identical to node.expires. + @_subtree_least_expires: The least hard expiry time among all the nodes in + the subtree from this node, i.e. when the next timer should + fire. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) @@ -117,7 +118,8 @@ enum hrtimer_restart { */ struct hrtimer { struct timerqueue_node node; - ktime_t _softexpires; + ktime_t _hardexpires; + ktime_t _subtree_least_expires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; @@ -240,66 +242,88 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; - timer->_hardexpires = time; + timer->_hardexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { - timer->_softexpires = time; - timer->node.expires = ktime_add_safe(time, delta); + timer->node.expires = time; + timer->_hardexpires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { - timer->_softexpires = time; - timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); + timer->node.expires = time; + timer->_hardexpires = ktime_add_safe(time, ns_to_ktime(delta)); } + static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; - timer->_softexpires = tv64; + timer->_hardexpires = tv64; +} + +static inline void hrtimer_set_subtree_least_expires(struct hrtimer *timer, ktime_t time) +{ + timer->_subtree_least_expires = time; +} + +static inline void hrtimer_set_subtree_least_expires_tv64(struct hrtimer *timer, s64 tv64) +{ + timer->_subtree_least_expires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); - timer->_softexpires = ktime_add_safe(timer->_softexpires, time); + timer->_hardexpires = ktime_add_safe(timer->_hardexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); - timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); + timer->_hardexpires = ktime_add_ns(timer->_hardexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { - return timer->node.expires; + return timer->_hardexpires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { - return timer->_softexpires; + return timer->node.expires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { - return timer->node.expires; + return timer->_hardexpires; } + +static inline ktime_t hrtimer_get_subtree_least_expires(const struct hrtimer *timer) +{ + return timer->_subtree_least_expires; +} + +static inline s64 
hrtimer_get_subtree_least_expires_tv64(const struct hrtimer *timer) +{ + return timer->_subtree_least_expires; +} + static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { - return timer->_softexpires; + return timer->node.expires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { - return ktime_to_ns(timer->node.expires); + return ktime_to_ns(timer->_hardexpires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->node.expires, timer->base->get_time()); + return ktime_sub(timer->_hardexpires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) @@ -329,7 +353,7 @@ extern unsigned int hrtimer_resolution; static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { - ktime_t rem = ktime_sub(timer->node.expires, now); + ktime_t rem = ktime_sub(timer->_hardexpires, now); /* * Adjust relative timers for the extra we added in @@ -523,6 +547,12 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); +#ifdef CONFIG_HIGH_RES_TIMERS +extern void hrtimer_run_softexpired_timers(void); +#else +static inline void hrtimer_run_softexpired_timers(void) { } +#endif + /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index d1c53e9d8c7532..c2f9253c709300 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -28,6 +28,7 @@ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); + void (*insert)(struct rb_node *parent, struct rb_node *node); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, @@ -60,6 +61,56 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } +static __always_inline bool +__rb_add_augmented(struct rb_node *node, struct rb_node **link, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (augment) + augment->insert(parent, node); + + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + rb_link_node(node, parent, link); + + return leftmost; +} + +static __always_inline struct rb_node * +rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node **link = &tree->rb_root.rb_node; + bool leftmost; + + leftmost = __rb_add_augmented(node, link, less, augment); + rb_insert_augmented_cached(node, tree, leftmost, augment); + + return leftmost ? 
node : NULL; +} + +static __always_inline void +rb_add_augmented(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node **link = &tree->rb_node; + + __rb_add_augmented(node, link, less, augment); + rb_insert_augmented(node, tree, augment); +} + /* * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h index 853d08f7562bda..f6ffb8767fd538 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1346,9 +1346,13 @@ struct task_struct { /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. + * The default timer slack used is 50 usec. + * The effective timer slack should be retrieved with + * get_task_timer_slack_ns(task). */ u64 timer_slack_ns; u64 default_timer_slack_ns; +#define TASK_TIMER_SLACK_NS 50000 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; @@ -2420,4 +2424,20 @@ static inline void sched_core_fork(struct task_struct *p) { } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_CGROUPS +extern u64 cgroup_timer_slack_ns(const struct task_struct *task); +#else +static inline u64 cgroup_timer_slack_ns(const struct task_struct *task) +{ + return TASK_TIMER_SLACK_NS; +} +#endif + +static inline u64 get_task_timer_slack_ns(const struct task_struct *task) +{ + if (task->timer_slack_ns == U64_MAX) + return cgroup_timer_slack_ns(task); + return task->timer_slack_ns; +} + #endif diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index adc80e29168ea0..24f51ba75d2c8e 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -2,7 +2,7 @@ #ifndef _LINUX_TIMERQUEUE_H #define _LINUX_TIMERQUEUE_H -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/ktime.h> @@ -11,8 +11,15 @@ struct timerqueue_node { ktime_t expires; }; +/** + * struct timerqueue_head - timerqueue base + * @rb_root: rbtree root + * @augment: If not NULL, contains augmentation callbacks to use when + * modifying timerqueue rbtree. 
+ */ struct timerqueue_head { struct rb_root_cached rb_root; + const struct rb_augment_callbacks *augment; }; @@ -53,8 +60,27 @@ static inline bool timerqueue_node_expires(struct timerqueue_node *node) return node->expires; } -static inline void timerqueue_init_head(struct timerqueue_head *head) +static inline +void timerqueue_init_head_augmented(struct timerqueue_head *head, + const struct rb_augment_callbacks *augment) { head->rb_root = RB_ROOT_CACHED; + head->augment = augment; +} + +static inline void timerqueue_init_head(struct timerqueue_head *head) +{ + timerqueue_init_head_augmented(head, NULL); +} + +static inline +struct timerqueue_node *timerqueue_getroot(const struct timerqueue_head *head) +{ + struct rb_node *rbnode = head->rb_root.rb_root.rb_node; + + if (!rbnode) + return NULL; + + return rb_entry(rbnode, struct timerqueue_node, node); } #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b099e..761264e78d7d1a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -546,7 +546,7 @@ do { \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ - current->timer_slack_ns); \ + get_task_timer_slack_ns(current)); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b1c..cd265c26fa2d4c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -130,7 +130,7 @@ struct task_struct init_task .journal_info = NULL, INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), - .timer_slack_ns = 50000, /* 50 usec default slack */ + .timer_slack_ns = U64_MAX, /* using default slack */ .thread_pid = &init_struct_pid, .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), diff --git a/ipc/sem.c b/ipc/sem.c index 00f88aa01ac5a0..f4a36c7139cb36 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2166,7 +2166,8 @@ long __do_semtimedop(int semid, struct sembuf *sops, rcu_read_unlock(); timed_out = !schedule_hrtimeout_range(exp, - current->timer_slack_ns, HRTIMER_MODE_ABS); + get_task_timer_slack_ns(current), + HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d2d..ee0b6ad799a1b9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2013,6 +2013,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + cgrp->timer_slack_ns = U64_MAX; + cgrp->default_timer_slack_ns = TASK_TIMER_SLACK_NS; /* 50 usec default slack */ INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); @@ -3663,6 +3665,122 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, return nbytes; } +/** + * cgroup_timer_slack_ns - Get the effective timer slack for a task + */ +u64 cgroup_timer_slack_ns(const struct task_struct *task) +{ + struct cgroup *cgrp = task_dfl_cgroup(task); + u64 timer_slack; + + if (!cgrp) + return TASK_TIMER_SLACK_NS; + + timer_slack = READ_ONCE(cgrp->timer_slack_ns); + if (timer_slack < U64_MAX) + return timer_slack; + + timer_slack = READ_ONCE(cgrp->default_timer_slack_ns); + if (timer_slack < U64_MAX) + return timer_slack; + + return TASK_TIMER_SLACK_NS; +} + +static int cgroup_timer_slack_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 timer_slack = 
READ_ONCE(cgrp->timer_slack_ns); + static const char *source[] = { "", "(parent) ", "(default) " }; + const char **source_selector = source; + + if (timer_slack == U64_MAX) { + source_selector++; + timer_slack = READ_ONCE(cgrp->default_timer_slack_ns); + } + + if (timer_slack == U64_MAX) { + source_selector++; + timer_slack = TASK_TIMER_SLACK_NS; + } + + seq_printf(seq, "%s%llu\n", *source_selector, timer_slack); + return 0; +} + +static bool __css_filter_match_unmodified(struct cgroup_subsys_state *css, + void *css_origin) +{ + struct cgroup_subsys_state *origin = css_origin; + + /* Don't include children of parents that have their slack set */ + return css->parent->cgroup->timer_slack_ns == U64_MAX + || css->parent == origin; +} + +static ssize_t cgroup_timer_slack_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + ssize_t ret; + u64 timer_slack; + u64 default_timer_slack; + + buf = strstrip(buf); + if (!strcmp(buf, "default")) { + timer_slack = U64_MAX; + } else { + ret = kstrtoull(buf, 0, &timer_slack); + if (ret) + return ret; + } + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * When "unsetting" the timer slack, need to propagate the previous + * timer slack to the descendants + */ + if (timer_slack == U64_MAX) + default_timer_slack = cgrp->default_timer_slack_ns; + else + default_timer_slack = timer_slack; + + /* + * Update the default timer slack to all descendants, except subtrees + * that have their own timer slacks set. We do, however, need to update + * the default value even for cgroups that have the timer slack set! + * (see filter function). + */ + spin_lock_irq(&css_set_lock); + css_filter_for_each_descendant_pre(css, + &cgrp->self, + __css_filter_match_unmodified, + &cgrp->self) { + struct cgroup *dcgrp = css->cgroup; + + /* current cgroup keeps the parent default */ + if (dcgrp == cgrp) + continue; + + if (cgroup_is_dead(dcgrp)) + continue; + + dcgrp->default_timer_slack_ns = default_timer_slack; + } + spin_unlock_irq(&css_set_lock); + + cgrp->timer_slack_ns = timer_slack; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; @@ -4563,9 +4681,31 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } /** - * css_next_descendant_pre - find the next descendant for pre-order walk + * _css_filter_next_child - Find the next matching child of a given css + * + * Behaves as css_next_child() except that it skips children not matching the + * filter. + */ +static struct cgroup_subsys_state * +_css_filter_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data) +{ + do { + pos = css_next_child(pos, parent); + if (pos && filter(pos, filter_data)) + return pos; + } while (pos); + + return NULL; +} + +/** + * css_filter_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk + * @filter: Function that decides whether we traverse into the subtree. * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. 
@root is @@ -4584,8 +4724,10 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *root) +css_filter_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data) { struct cgroup_subsys_state *next; @@ -4595,14 +4737,14 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, if (!pos) return root; - /* visit the first child if exists */ - next = css_next_child(NULL, pos); + /* Find a matching child for pos */ + next = _css_filter_next_child(NULL, pos, filter, filter_data); if (next) return next; - /* no child, visit my or the closest ancestor's next sibling */ + /* no matching child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, pos->parent); + next = _css_filter_next_child(pos, pos->parent, filter, filter_data); if (next) return next; pos = pos->parent; @@ -4610,6 +4752,19 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_filter_next_descendant_pre); + +static inline bool __css_match_all(struct cgroup_subsys_state *pos, void *data) +{ + return true; +} + +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) +{ + return css_filter_next_descendant_pre(pos, root, __css_match_all, NULL); +} EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** @@ -5239,6 +5394,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, + { + .name = "cgroup.timer_slack_ns", + .seq_show = cgroup_timer_slack_show, + .write = cgroup_timer_slack_write, + }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, @@ -5605,6 +5765,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; + /* inherit parent timer slack if set, or else use the subtree default */ + cgrp->default_timer_slack_ns = parent->timer_slack_ns == U64_MAX ? 
+ parent->default_timer_slack_ns : parent->timer_slack_ns; + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe354189785..81bfaeac79753a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2175,7 +2175,7 @@ static __latent_entropy struct task_struct *copy_process( memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif - p->default_timer_slack_ns = current->timer_slack_ns; + p->default_timer_slack_ns = get_task_timer_slack_ns(current); #ifdef CONFIG_PSI p->psi_flags = 0; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index cba8b1a6a4cc27..5b4f6920a0a779 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -781,7 +781,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, return -EINVAL; to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); + get_task_timer_slack_ns(current)); /* * The waiter is allocated on our stack, manipulated by the requeue diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b94082033b..53c61c896659ee 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -642,7 +642,7 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time q.bitset = bitset; to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); + get_task_timer_slack_ns(current)); retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d74..abc91bc12816d5 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -170,6 +170,9 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; + /* We can avoid the next wakeup by running timers preemptively */ + hrtimer_run_softexpired_timers(); + /* * Check if the idle task must be rescheduled. If it is the * case, exit the function after re-enabling the local irq. 
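The lookup order this series establishes is: a task's own timer_slack_ns (set via prctl(PR_SET_TIMERSLACK)) wins; a task value of U64_MAX defers to the cgroup, where an explicit cgroup.timer_slack_ns wins over the default_timer_slack_ns inherited from the nearest configured ancestor; with nothing configured anywhere, the historic 50 usec fallback applies. The following is a standalone userspace sketch of that lookup — the structs are simplified stand-ins for task_struct and struct cgroup, not kernel code, though the resolution order mirrors get_task_timer_slack_ns() and cgroup_timer_slack_ns() above:

    #include <stdint.h>
    #include <stdio.h>

    #define TASK_TIMER_SLACK_NS 50000ULL    /* 50 usec, as in the patch */
    #define SLACK_UNSET UINT64_MAX          /* U64_MAX means "not set" */

    struct cgroup {
        uint64_t timer_slack_ns;            /* explicit value for this subtree */
        uint64_t default_timer_slack_ns;    /* inherited from an ancestor */
    };

    struct task {
        uint64_t timer_slack_ns;            /* per-task value, e.g. from prctl() */
        struct cgroup *cgrp;
    };

    /* Mirrors cgroup_timer_slack_ns(): explicit value, then inherited default */
    static uint64_t cgroup_slack(const struct cgroup *cgrp)
    {
        if (!cgrp)
            return TASK_TIMER_SLACK_NS;
        if (cgrp->timer_slack_ns != SLACK_UNSET)
            return cgrp->timer_slack_ns;
        if (cgrp->default_timer_slack_ns != SLACK_UNSET)
            return cgrp->default_timer_slack_ns;
        return TASK_TIMER_SLACK_NS;
    }

    /* Mirrors get_task_timer_slack_ns(): the task's own value wins */
    static uint64_t task_slack(const struct task *t)
    {
        if (t->timer_slack_ns != SLACK_UNSET)
            return t->timer_slack_ns;
        return cgroup_slack(t->cgrp);
    }

    int main(void)
    {
        struct cgroup grp = {
            .timer_slack_ns = SLACK_UNSET,
            .default_timer_slack_ns = 1000000,  /* 1 ms, propagated down */
        };
        struct task t = { .timer_slack_ns = SLACK_UNSET, .cgrp = &grp };

        printf("%llu\n", (unsigned long long)task_slack(&t));  /* 1000000 */
        t.timer_slack_ns = 10000;   /* as if prctl(PR_SET_TIMERSLACK, 10000) */
        printf("%llu\n", (unsigned long long)task_slack(&t));  /* 10000 */
        return 0;
    }

In terms of the new control file: writing a nanosecond value to cgroup.timer_slack_ns pins the slack for the subtree, writing the string "default" reverts to the inherited value, and a child that has not set its own value reports the inherited one with a "(parent) " prefix, per cgroup_timer_slack_show() above.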
diff --git a/kernel/signal.c b/kernel/signal.c index ae26da61c4d9fa..3f98c677d6839f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3603,7 +3603,8 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + ret = schedule_hrtimeout_range(to, + get_task_timer_slack_ns(tsk), HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); diff --git a/kernel/sys.c b/kernel/sys.c index 88b31f096fb2d9..1bbebc38247725 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2447,10 +2447,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - if (current->timer_slack_ns > ULONG_MAX) + if (get_task_timer_slack_ns(current) > ULONG_MAX) error = ULONG_MAX; else - error = current->timer_slack_ns; + error = get_task_timer_slack_ns(current); break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab62603c..08861733c53c69 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -512,6 +512,12 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, struct hrtimer *timer; next = timerqueue_getnext(&base->active); + if (!next) + continue; + /* Get next absolute timeout */ + timer = container_of(timerqueue_getroot(&base->active), + struct hrtimer, node); + expires = hrtimer_get_subtree_least_expires(timer); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ @@ -520,8 +526,15 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, continue; timer = container_of(next, struct hrtimer, node); + + /* + * Can't cheaply compute the ideal slack when a timer is + * excluded. Go with the safest value, which is + * the earliest possible soft timer expiry. + */ + expires = hrtimer_get_softexpires(timer); } - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + expires = ktime_sub(expires, base->offset); if (expires < expires_next) { expires_next = expires; @@ -1088,6 +1101,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + hrtimer_set_subtree_least_expires(timer, hrtimer_get_expires(timer)); return timerqueue_add(&base->active, &timer->node); } @@ -1774,6 +1788,11 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static inline ktime_t __hrtimer_get_softexpires_safe(struct hrtimer *timer) +{ + return timer ? 
hrtimer_get_softexpires(timer) : KTIME_MAX; +} + #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1802,7 +1821,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ cpu_base->expires_next = KTIME_MAX; - if (!ktime_before(now, cpu_base->softirq_expires_next)) { + if (ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer))) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -1882,9 +1901,135 @@ static inline void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } +/* Do a cheap test to see if there are soft-expired timers present */ +static inline bool __hrtimer_has_softexpired_timers(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now; + unsigned long flags; + bool has_expired; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + has_expired = + ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer)) || + ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->next_timer)); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return has_expired; +} + +/** + * hrtimer_run_softexpired_timers - Run timers that have "soft" expired. + * + * Called with interrupts disabled. + */ +void hrtimer_run_softexpired_timers(void) +{ + if (!hrtimer_hres_active()) + return; + + if (__hrtimer_has_softexpired_timers()) + __hrtimer_peek_ahead_timers(); +} + +#define hrtimer_entry(rb_ptr) \ + container_of(rb_entry(rb_ptr, struct timerqueue_node, node), \ + struct hrtimer, node) + +static s64 get_rb_node_subtree_expires_tv64(const struct rb_node *rb) +{ + return rb + ? hrtimer_get_subtree_least_expires_tv64(hrtimer_entry(rb)) + : KTIME_MAX; +} + +static s64 get_subtree_least_expires_tv64(const struct rb_node *parent) +{ + if (!parent) + return KTIME_MAX; + return min(get_rb_node_subtree_expires_tv64(parent->rb_left), + get_rb_node_subtree_expires_tv64(parent->rb_right)); +} + +/** + * hrtimer_rb_augment_propagate - Propagate subtree least_expires values upwards + * @rb: Node to start from + * @stop: Node to stop at. NULL = Propagate to the root. 
+ */ +static +void hrtimer_rb_augment_propagate(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + struct hrtimer *timer = hrtimer_entry(rb); + s64 least_expires_tv64 = min(hrtimer_get_expires_tv64(timer), + get_subtree_least_expires_tv64(rb)); + + hrtimer_set_subtree_least_expires_tv64(timer, + least_expires_tv64); + rb = rb_parent(rb); + } +} + +/** + * hrtimer_rb_augment_copy - Copy the subtree least_expires value + */ +static +void hrtimer_rb_augment_copy(struct rb_node *rb_from, struct rb_node *rb_to) +{ + struct hrtimer *to_timer = hrtimer_entry(rb_to); + const struct hrtimer *from_timer = hrtimer_entry(rb_from); + + hrtimer_set_subtree_least_expires_tv64(to_timer, + hrtimer_get_subtree_least_expires_tv64(from_timer)); +} + +/** + * hrtimer_rb_augment_rotate - swap and recalculate augmentation during rbtree rotation + */ +static +void hrtimer_rb_augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct hrtimer *old_timer = hrtimer_entry(rb_old); + s64 least_expires_tv64; + + hrtimer_rb_augment_copy(rb_old, rb_new); + + least_expires_tv64 = min(hrtimer_get_expires_tv64(old_timer), + get_subtree_least_expires_tv64(rb_old)); + hrtimer_set_subtree_least_expires_tv64(old_timer, + least_expires_tv64); +} + +/** + * hrtimer_rb_augment_insert - timerqueue augment callback on insert + */ +static void hrtimer_rb_augment_insert(struct rb_node *rb_parent, + struct rb_node *rb_node) +{ + struct hrtimer *parent_timer = hrtimer_entry(rb_parent); + struct hrtimer *node_timer = hrtimer_entry(rb_node); + ktime_t node_expires = hrtimer_get_expires(node_timer); + + if (ktime_before(node_expires, + hrtimer_get_subtree_least_expires(parent_timer))) + hrtimer_set_subtree_least_expires(parent_timer, node_expires); +} + +static const struct rb_augment_callbacks hrtimer_rb_augment_callbacks_struct = { + .propagate = hrtimer_rb_augment_propagate, + .copy = hrtimer_rb_augment_copy, + .rotate = hrtimer_rb_augment_rotate, + .insert = hrtimer_rb_augment_insert, +}; + +static const struct rb_augment_callbacks *hrtimer_rb_augment_callbacks = + &hrtimer_rb_augment_callbacks_struct; + #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } +static const struct rb_augment_callbacks *hrtimer_rb_augment_callbacks = NULL; #endif /* !CONFIG_HIGH_RES_TIMERS */ @@ -1915,7 +2060,7 @@ void hrtimer_run_queues(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - if (!ktime_before(now, cpu_base->softirq_expires_next)) { + if (ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer))) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -2088,7 +2233,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, int ret = 0; u64 slack; - slack = current->timer_slack_ns; + slack = get_task_timer_slack_ns(current); if (dl_task(current) || rt_task(current)) slack = 0; @@ -2167,7 +2312,8 @@ int hrtimers_prepare_cpu(unsigned int cpu) clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_init_head_augmented(&clock_b->active, + hrtimer_rb_augment_callbacks); } cpu_base->cpu = cpu; diff --git a/lib/rbtree.c b/lib/rbtree.c index c4ac5c2421f255..bcbd8440023a2a 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -424,11 +424,13 @@ EXPORT_SYMBOL(__rb_erase_color); static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} static 
inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_insert(struct rb_node *parent, struct rb_node *node) {} static const struct rb_augment_callbacks dummy_callbacks = { .propagate = dummy_propagate, .copy = dummy_copy, - .rotate = dummy_rotate + .rotate = dummy_rotate, + .insert = dummy_insert, }; void rb_insert_color(struct rb_node *node, struct rb_root *root) diff --git a/lib/timerqueue.c b/lib/timerqueue.c index cdb9c7658478f0..e9ba77ccc6ec6c 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -37,6 +37,10 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + if (head->augment) + return rb_add_augmented_cached(&node->node, &head->rb_root, + __timerqueue_less, + head->augment); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -54,7 +58,12 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); - rb_erase_cached(&node->node, &head->rb_root); + if (head->augment) + rb_erase_augmented_cached(&node->node, &head->rb_root, + head->augment); + else + rb_erase_cached(&node->node, &head->rb_root); + RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
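The augmentation these callbacks maintain is the classic subtree-minimum invariant: every queued hrtimer caches the smallest hard expiry (_subtree_least_expires) of its rbtree subtree, so the root always knows the latest moment the CPU must really be woken while the tree itself stays ordered by soft expiry. Below is a standalone sketch of that invariant on a plain (unbalanced) binary search tree — no rotations, so only the insert/propagate halves of the kernel's rb_augment_callbacks have an analogue here:

    #include <stdint.h>
    #include <stdio.h>

    #define KTIME_MAX INT64_MAX

    struct node {
        int64_t expires;        /* soft expiry: the sort key (node.expires) */
        int64_t hard_expires;   /* soft expiry + slack (_hardexpires) */
        int64_t subtree_least;  /* min hard_expires in this subtree */
        struct node *left, *right;
    };

    static int64_t subtree_least(const struct node *n)
    {
        return n ? n->subtree_least : KTIME_MAX;
    }

    /* Same computation as hrtimer_rb_augment_propagate(): a node's cached
     * value is the min of its own hard expiry and its children's caches. */
    static void update(struct node *n)
    {
        int64_t v = n->hard_expires;

        if (subtree_least(n->left) < v)
            v = subtree_least(n->left);
        if (subtree_least(n->right) < v)
            v = subtree_least(n->right);
        n->subtree_least = v;
    }

    /* Unbalanced BST insert ordered by soft expiry; the kernel uses an
     * rbtree and additionally fixes the cached values during rotations. */
    static struct node *insert(struct node *root, struct node *n)
    {
        if (!root)
            return n;
        if (n->expires < root->expires)
            root->left = insert(root->left, n);
        else
            root->right = insert(root->right, n);
        update(root);
        return root;
    }

    int main(void)
    {
        struct node a = { .expires = 100, .hard_expires = 150, .subtree_least = 150 };
        struct node b = { .expires = 90,  .hard_expires = 200, .subtree_least = 200 };
        struct node c = { .expires = 120, .hard_expires = 130, .subtree_least = 130 };
        struct node *root = NULL;

        root = insert(root, &a);
        root = insert(root, &b);
        root = insert(root, &c);

        /* The leftmost node (b) fires first by soft expiry, but the hardware
         * event can be programmed as late as the subtree minimum: 130. */
        printf("next mandatory expiry: %lld\n", (long long)root->subtree_least);
        return 0;
    }

This is why __hrtimer_next_event_base() above reads hrtimer_get_subtree_least_expires() from the tree root: the leftmost node still determines which timer runs first, while the cached subtree minimum gives the deadline for programming the next hardware event, letting several range timers be satisfied by a single wakeup.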