diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b12..ccfe78b6a10a3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2611,7 +2611,7 @@ static int timerslack_ns_show(struct seq_file *m, void *v) } task_lock(p); - seq_printf(m, "%llu\n", p->timer_slack_ns); + seq_printf(m, "%llu\n", get_task_timer_slack_ns(p)); task_unlock(p); out: diff --git a/fs/select.c b/fs/select.c index 0ee55af1a55c29..0ce8d9c66df9f2 100644 --- a/fs/select.c +++ b/fs/select.c @@ -75,7 +75,7 @@ static long __estimate_accuracy(struct timespec64 *tv) u64 select_estimate_accuracy(struct timespec64 *tv) { - u64 ret; + u64 ret, timer_slack; struct timespec64 now; /* @@ -88,8 +88,9 @@ u64 select_estimate_accuracy(struct timespec64 *tv) ktime_get_ts64(&now); now = timespec64_sub(*tv, now); ret = __estimate_accuracy(&now); - if (ret < current->timer_slack_ns) - return current->timer_slack_ns; + timer_slack = get_task_timer_slack_ns(current); + if (ret < timer_slack) + return timer_slack; return ret; } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 8a0d5466c7be15..9270b4cdaf835d 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -407,6 +407,15 @@ struct cgroup { int nr_dying_descendants; int max_descendants; + /* + * The default process timer slacks: + * Setting timer_slack_ns sets this cgroup's slack and becomes the + * default slack of its descendants. Defaults to U64_MAX when unset. + * When timer_slack_ns is unset, the inherited default_timer_slack_ns is used. + */ + u64 timer_slack_ns; + u64 default_timer_slack_ns; + /* * Each non-empty css_set associated with this cgroup contributes * one to nr_populated_csets. The counter is zero iff this cgroup diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 3410aecffdb477..9382aaf0fec88a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -147,6 +147,12 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); +struct cgroup_subsys_state * +css_filter_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data); + struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); @@ -243,6 +249,11 @@ void css_task_iter_end(struct css_task_iter *it); for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) +#define css_filter_for_each_descendant_pre(pos, css, filter, filter_data) \ + for ((pos) = css_filter_next_descendant_pre(NULL, (css), (filter), (filter_data)); \ + (pos); \ + (pos) = css_filter_next_descendant_pre((pos), (css), (filter), (filter_data))) + /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176f102f..33e9b3f18eafae 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,7 +13,7 @@ #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> @@ -97,14 +97,15 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, - * the absolute expiry time in 
the hrtimers internal + * the earliest expiry time in the hrtimers internal * representation. The time is related to the clock on - which the timer is based. Is setup by adding - slack to the _softexpires value. For non range timers - identical to _softexpires. - @_softexpires: the absolute earliest expiry time of the hrtimer. - The time which was given as expiry time when the timer - was armed. + which the timer is based. + @_hardexpires: The latest time this timer is allowed to expire. + This is the timer expiry time with the timer slack added to it. + For non range timers identical to node.expires. + @_subtree_least_expires: The least hard expiry time among all the nodes in + the subtree from this node, i.e. when the next timer should + fire. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) @@ -117,7 +118,8 @@ enum hrtimer_restart { */ struct hrtimer { struct timerqueue_node node; - ktime_t _softexpires; + ktime_t _hardexpires; + ktime_t _subtree_least_expires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; @@ -240,66 +242,88 @@ struct hrtimer_cpu_base { static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; - timer->_hardexpires = time; + timer->_hardexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { - timer->_softexpires = time; - timer->node.expires = ktime_add_safe(time, delta); + timer->node.expires = time; + timer->_hardexpires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { - timer->_softexpires = time; - timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); + timer->node.expires = time; + timer->_hardexpires = ktime_add_safe(time, ns_to_ktime(delta)); } + static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; - timer->_softexpires = tv64; + timer->_hardexpires = tv64; +} + +static inline void hrtimer_set_subtree_least_expires(struct hrtimer *timer, ktime_t time) +{ + timer->_subtree_least_expires = time; +} + +static inline void hrtimer_set_subtree_least_expires_tv64(struct hrtimer *timer, s64 tv64) +{ + timer->_subtree_least_expires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); - timer->_softexpires = ktime_add_safe(timer->_softexpires, time); + timer->_hardexpires = ktime_add_safe(timer->_hardexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); - timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); + timer->_hardexpires = ktime_add_ns(timer->_hardexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { - return timer->node.expires; + return timer->_hardexpires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { - return timer->_softexpires; + return timer->node.expires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { - return timer->node.expires; + return timer->_hardexpires; } + +static inline ktime_t hrtimer_get_subtree_least_expires(const struct hrtimer *timer) +{ + return timer->_subtree_least_expires; +} + +static inline s64 
hrtimer_get_subtree_least_expires_tv64(const struct hrtimer *timer) +{ + return timer->_subtree_least_expires; +} + static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { - return timer->_softexpires; + return timer->node.expires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { - return ktime_to_ns(timer->node.expires); + return ktime_to_ns(timer->_hardexpires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { - return ktime_sub(timer->node.expires, timer->base->get_time()); + return ktime_sub(timer->_hardexpires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) @@ -329,7 +353,7 @@ extern unsigned int hrtimer_resolution; static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { - ktime_t rem = ktime_sub(timer->node.expires, now); + ktime_t rem = ktime_sub(timer->_hardexpires, now); /* * Adjust relative timers for the extra we added in @@ -523,6 +547,12 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); +#ifdef CONFIG_HIGH_RES_TIMERS +extern void hrtimer_run_softexpired_timers(void); +#else +static inline void hrtimer_run_softexpired_timers(void) { } +#endif + /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index d1c53e9d8c7532..c2f9253c709300 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -28,6 +28,7 @@ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); + void (*insert)(struct rb_node *parent, struct rb_node *node); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, @@ -60,6 +61,56 @@ rb_insert_augmented_cached(struct rb_node *node, rb_insert_augmented(node, &root->rb_root, augment); } +static __always_inline bool +__rb_add_augmented(struct rb_node *node, struct rb_node **link, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node *parent = NULL; + bool leftmost = true; + + while (*link) { + parent = *link; + if (augment) + augment->insert(parent, node); + + if (less(node, parent)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = false; + } + } + rb_link_node(node, parent, link); + + return leftmost; +} + +static __always_inline struct rb_node * +rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node **link = &tree->rb_root.rb_node; + bool leftmost; + + leftmost = __rb_add_augmented(node, link, less, augment); + rb_insert_augmented_cached(node, tree, leftmost, augment); + + return leftmost ? 
node : NULL; +} + +static __always_inline void +rb_add_augmented(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + const struct rb_augment_callbacks *augment) +{ + struct rb_node **link = &tree->rb_node; + + __rb_add_augmented(node, link, less, augment); + rb_insert_augmented(node, tree, augment); +} + /* * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h index 853d08f7562bda..f6ffb8767fd538 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1346,9 +1346,13 @@ struct task_struct { /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. + * The default timer slack used is 50 usec. + * The effective timer slack should be retrieved with + * get_task_timer_slack_ns(task). */ u64 timer_slack_ns; u64 default_timer_slack_ns; +#define TASK_TIMER_SLACK_NS 50000 #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; @@ -2420,4 +2424,20 @@ static inline void sched_core_fork(struct task_struct *p) { } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_CGROUPS +extern u64 cgroup_timer_slack_ns(const struct task_struct *task); +#else +static inline u64 cgroup_timer_slack_ns(const struct task_struct *task) +{ + return TASK_TIMER_SLACK_NS; +} +#endif + +static inline u64 get_task_timer_slack_ns(const struct task_struct *task) +{ + if (task->timer_slack_ns == U64_MAX) + return cgroup_timer_slack_ns(task); + return task->timer_slack_ns; +} + #endif diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index adc80e29168ea0..24f51ba75d2c8e 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -2,7 +2,7 @@ #ifndef _LINUX_TIMERQUEUE_H #define _LINUX_TIMERQUEUE_H -#include <linux/rbtree.h> +#include <linux/rbtree_augmented.h> #include <linux/ktime.h> @@ -11,8 +11,15 @@ struct timerqueue_node { ktime_t expires; }; +/** + * struct timerqueue_head - timerqueue base + * @rb_root: rbtree root + * @augment: If not NULL, contains augmentation callbacks to use when + * modifying timerqueue rbtree. 
+ */ struct timerqueue_head { struct rb_root_cached rb_root; + const struct rb_augment_callbacks *augment; }; @@ -53,8 +60,27 @@ static inline bool timerqueue_node_expires(struct timerqueue_node *node) return node->expires; } -static inline void timerqueue_init_head(struct timerqueue_head *head) +static inline +void timerqueue_init_head_augmented(struct timerqueue_head *head, + const struct rb_augment_callbacks *augment) { head->rb_root = RB_ROOT_CACHED; + head->augment = augment; +} + +static inline void timerqueue_init_head(struct timerqueue_head *head) +{ + timerqueue_init_head_augmented(head, NULL); +} + +static inline +struct timerqueue_node *timerqueue_getroot(const struct timerqueue_head *head) +{ + struct rb_node *rbnode = head->rb_root.rb_root.rb_node; + + if (!rbnode) + return NULL; + + return rb_entry(rbnode, struct timerqueue_node, node); } #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b099e..761264e78d7d1a 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -546,7 +546,7 @@ do { \ HRTIMER_MODE_REL); \ if ((timeout) != KTIME_MAX) { \ hrtimer_set_expires_range_ns(&__t.timer, timeout, \ - current->timer_slack_ns); \ + get_task_timer_slack_ns(current)); \ hrtimer_sleeper_start_expires(&__t, HRTIMER_MODE_REL); \ } \ \ diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bfe6b1c..cd265c26fa2d4c 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -130,7 +130,7 @@ struct task_struct init_task .journal_info = NULL, INIT_CPU_TIMERS(init_task) .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), - .timer_slack_ns = 50000, /* 50 usec default slack */ + .timer_slack_ns = U64_MAX, /* using default slack */ .thread_pid = &init_struct_pid, .thread_group = LIST_HEAD_INIT(init_task.thread_group), .thread_node = LIST_HEAD_INIT(init_signals.thread_head), diff --git a/ipc/sem.c b/ipc/sem.c index 00f88aa01ac5a0..f4a36c7139cb36 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -2166,7 +2166,8 @@ long __do_semtimedop(int semid, struct sembuf *sops, rcu_read_unlock(); timed_out = !schedule_hrtimeout_range(exp, - current->timer_slack_ns, HRTIMER_MODE_ABS); + get_task_timer_slack_ns(current), + HRTIMER_MODE_ABS); /* * fastpath: the semop has completed, either successfully or diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index c099cf3fa02d2d..ee0b6ad799a1b9 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2013,6 +2013,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + cgrp->timer_slack_ns = U64_MAX; + cgrp->default_timer_slack_ns = TASK_TIMER_SLACK_NS; /* 50 usec default slack */ INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); @@ -3663,6 +3665,122 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, return nbytes; } +/** + * cgroup_timer_slack_ns - Get the effective timer slack for a task + */ +u64 cgroup_timer_slack_ns(const struct task_struct *task) +{ + struct cgroup *cgrp = task_dfl_cgroup(task); + u64 timer_slack; + + if (!cgrp) + return TASK_TIMER_SLACK_NS; + + timer_slack = READ_ONCE(cgrp->timer_slack_ns); + if (timer_slack < U64_MAX) + return timer_slack; + + timer_slack = READ_ONCE(cgrp->default_timer_slack_ns); + if (timer_slack < U64_MAX) + return timer_slack; + + return TASK_TIMER_SLACK_NS; +} + +static int cgroup_timer_slack_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 timer_slack = 
READ_ONCE(cgrp->timer_slack_ns); + static const char *source[] = { "", "(parent) ", "(default) " }; + const char **source_selector = source; + + if (timer_slack == U64_MAX) { + source_selector++; + timer_slack = READ_ONCE(cgrp->default_timer_slack_ns); + } + + if (timer_slack == U64_MAX) { + source_selector++; + timer_slack = TASK_TIMER_SLACK_NS; + } + + seq_printf(seq, "%s%llu\n", *source_selector, timer_slack); + return 0; +} + +static bool __css_filter_match_unmodified(struct cgroup_subsys_state *css, + void *css_origin) +{ + struct cgroup_subsys_state *origin = css_origin; + + /* Don't include children of parents that have their slack set */ + return css->parent->cgroup->timer_slack_ns == U64_MAX + || css->parent == origin; +} + +static ssize_t cgroup_timer_slack_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + ssize_t ret; + u64 timer_slack; + u64 default_timer_slack; + + buf = strstrip(buf); + if (!strcmp(buf, "default")) { + timer_slack = U64_MAX; + } else { + ret = kstrtoull(buf, 0, &timer_slack); + if (ret) + return ret; + } + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * When "unsetting" the timer slack, need to propagate the previous + * timer slack to the descendants + */ + if (timer_slack == U64_MAX) + default_timer_slack = cgrp->default_timer_slack_ns; + else + default_timer_slack = timer_slack; + + /* + * Update the default timer slack to all descendants, except subtrees + * that have their own timer slacks set. We do, however, need to update + * the default value even for cgroups that have the timer slack set! + * (see filter function). + */ + spin_lock_irq(&css_set_lock); + css_filter_for_each_descendant_pre(css, + &cgrp->self, + __css_filter_match_unmodified, + &cgrp->self) { + struct cgroup *dcgrp = css->cgroup; + + /* current cgroup keeps the parent default */ + if (dcgrp == cgrp) + continue; + + if (cgroup_is_dead(dcgrp)) + continue; + + dcgrp->default_timer_slack_ns = default_timer_slack; + } + spin_unlock_irq(&css_set_lock); + + cgrp->timer_slack_ns = timer_slack; + + cgroup_kn_unlock(of->kn); + + return nbytes; +} + static int cgroup_events_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; @@ -4563,9 +4681,31 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, } /** - * css_next_descendant_pre - find the next descendant for pre-order walk + * _css_filter_next_child - Find the next matching child of a given css + * + * Behaves as css_next_child() except that it skips children not matching the + * filter. + */ +static struct cgroup_subsys_state * +_css_filter_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data) +{ + do { + pos = css_next_child(pos, parent); + if (pos && filter(pos, filter_data)) + return pos; + } while (pos); + + return NULL; +} + +/** + * css_filter_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) * @root: css whose descendants to walk + * @filter: Function that decides whether we traverse into the subtree. * * To be used by css_for_each_descendant_pre(). Find the next descendant * to visit for pre-order traversal of @root's descendants. 
@root is @@ -4584,8 +4724,10 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * responsibility to synchronize against on/offlining. */ struct cgroup_subsys_state * -css_next_descendant_pre(struct cgroup_subsys_state *pos, - struct cgroup_subsys_state *root) +css_filter_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root, + bool (*filter)(struct cgroup_subsys_state *pos, void *data), + void *filter_data) { struct cgroup_subsys_state *next; @@ -4595,14 +4737,14 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, if (!pos) return root; - /* visit the first child if exists */ - next = css_next_child(NULL, pos); + /* Find a matching child for pos */ + next = _css_filter_next_child(NULL, pos, filter, filter_data); if (next) return next; - /* no child, visit my or the closest ancestor's next sibling */ + /* no matching child, visit my or the closest ancestor's next sibling */ while (pos != root) { - next = css_next_child(pos, pos->parent); + next = _css_filter_next_child(pos, pos->parent, filter, filter_data); if (next) return next; pos = pos->parent; @@ -4610,6 +4752,19 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, return NULL; } +EXPORT_SYMBOL_GPL(css_filter_next_descendant_pre); + +static inline bool __css_match_all(struct cgroup_subsys_state *pos, void *data) +{ + return true; +} + +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) +{ + return css_filter_next_descendant_pre(pos, root, __css_match_all, NULL); +} EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** @@ -5239,6 +5394,11 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_max_depth_show, .write = cgroup_max_depth_write, }, + { + .name = "cgroup.timer_slack_ns", + .seq_show = cgroup_timer_slack_show, + .write = cgroup_timer_slack_write, + }, { .name = "cgroup.stat", .seq_show = cgroup_stat_show, @@ -5605,6 +5765,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; + /* inherit parent timer slack if set, or else use the subtree default */ + cgrp->default_timer_slack_ns = parent->timer_slack_ns == U64_MAX ? 
+ parent->default_timer_slack_ns : parent->timer_slack_ns; + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; diff --git a/kernel/fork.c b/kernel/fork.c index 9f7fe354189785..81bfaeac79753a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2175,7 +2175,7 @@ static __latent_entropy struct task_struct *copy_process( memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif - p->default_timer_slack_ns = current->timer_slack_ns; + p->default_timer_slack_ns = get_task_timer_slack_ns(current); #ifdef CONFIG_PSI p->psi_flags = 0; diff --git a/kernel/futex/requeue.c b/kernel/futex/requeue.c index cba8b1a6a4cc27..5b4f6920a0a779 100644 --- a/kernel/futex/requeue.c +++ b/kernel/futex/requeue.c @@ -781,7 +781,7 @@ int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, return -EINVAL; to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); + get_task_timer_slack_ns(current)); /* * The waiter is allocated on our stack, manipulated by the requeue diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index ba01b94082033b..53c61c896659ee 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -642,7 +642,7 @@ int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time q.bitset = bitset; to = futex_setup_timer(abs_time, &timeout, flags, - current->timer_slack_ns); + get_task_timer_slack_ns(current)); retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d74..abc91bc12816d5 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -170,6 +170,9 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; + /* We can avoid the next wakeup by running timers preemptively */ + hrtimer_run_softexpired_timers(); + /* * Check if the idle task must be rescheduled. If it is the * case, exit the function after re-enabling the local irq. 
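The lookup order this series establishes is: a task's own timer_slack_ns (set via prctl(PR_SET_TIMERSLACK)) wins; a task value of U64_MAX defers to the cgroup, where an explicit cgroup.timer_slack_ns wins over the default_timer_slack_ns inherited from the nearest configured ancestor; with nothing configured anywhere, the historic 50 usec fallback applies. The following is a standalone userspace sketch of that lookup — the structs are simplified stand-ins for task_struct and struct cgroup, not kernel code, though the resolution order mirrors get_task_timer_slack_ns() and cgroup_timer_slack_ns() above:

    #include <stdint.h>
    #include <stdio.h>

    #define TASK_TIMER_SLACK_NS 50000ULL    /* 50 usec, as in the patch */
    #define SLACK_UNSET UINT64_MAX          /* U64_MAX means "not set" */

    struct cgroup {
        uint64_t timer_slack_ns;            /* explicit value for this subtree */
        uint64_t default_timer_slack_ns;    /* inherited from an ancestor */
    };

    struct task {
        uint64_t timer_slack_ns;            /* per-task value, e.g. from prctl() */
        struct cgroup *cgrp;
    };

    /* Mirrors cgroup_timer_slack_ns(): explicit value, then inherited default */
    static uint64_t cgroup_slack(const struct cgroup *cgrp)
    {
        if (!cgrp)
            return TASK_TIMER_SLACK_NS;
        if (cgrp->timer_slack_ns != SLACK_UNSET)
            return cgrp->timer_slack_ns;
        if (cgrp->default_timer_slack_ns != SLACK_UNSET)
            return cgrp->default_timer_slack_ns;
        return TASK_TIMER_SLACK_NS;
    }

    /* Mirrors get_task_timer_slack_ns(): the task's own value wins */
    static uint64_t task_slack(const struct task *t)
    {
        if (t->timer_slack_ns != SLACK_UNSET)
            return t->timer_slack_ns;
        return cgroup_slack(t->cgrp);
    }

    int main(void)
    {
        struct cgroup grp = {
            .timer_slack_ns = SLACK_UNSET,
            .default_timer_slack_ns = 1000000,  /* 1 ms, propagated down */
        };
        struct task t = { .timer_slack_ns = SLACK_UNSET, .cgrp = &grp };

        printf("%llu\n", (unsigned long long)task_slack(&t));  /* 1000000 */
        t.timer_slack_ns = 10000;   /* as if prctl(PR_SET_TIMERSLACK, 10000) */
        printf("%llu\n", (unsigned long long)task_slack(&t));  /* 10000 */
        return 0;
    }

In terms of the new control file: writing a nanosecond value to cgroup.timer_slack_ns pins the slack for the subtree, writing the string "default" reverts to the inherited value, and a child that has not set its own value reports the inherited one with a "(parent) " prefix, per cgroup_timer_slack_show() above.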
diff --git a/kernel/signal.c b/kernel/signal.c index ae26da61c4d9fa..3f98c677d6839f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3603,7 +3603,8 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, spin_unlock_irq(&tsk->sighand->siglock); __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); - ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + ret = schedule_hrtimeout_range(to, + get_task_timer_slack_ns(tsk), HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); diff --git a/kernel/sys.c b/kernel/sys.c index 88b31f096fb2d9..1bbebc38247725 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2447,10 +2447,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - if (current->timer_slack_ns > ULONG_MAX) + if (get_task_timer_slack_ns(current) > ULONG_MAX) error = ULONG_MAX; else - error = current->timer_slack_ns; + error = get_task_timer_slack_ns(current); break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3ae661ab62603c..08861733c53c69 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -512,6 +512,12 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, struct hrtimer *timer; next = timerqueue_getnext(&base->active); + if (!next) + continue; + /* Get next absolute timeout */ + timer = container_of(timerqueue_getroot(&base->active), + struct hrtimer, node); + expires = hrtimer_get_subtree_least_expires(timer); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ @@ -520,8 +526,15 @@ static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, continue; timer = container_of(next, struct hrtimer, node); + + /* + * Can't cheaply compute the ideal slack when a timer is + * excluded. Go with the safest value, which is + * the earliest possible soft timer expiry. + */ + expires = hrtimer_get_softexpires(timer); } - expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + expires = ktime_sub(expires, base->offset); if (expires < expires_next) { expires_next = expires; @@ -1088,6 +1101,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + hrtimer_set_subtree_least_expires(timer, hrtimer_get_expires(timer)); return timerqueue_add(&base->active, &timer->node); } @@ -1774,6 +1788,11 @@ static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static inline ktime_t __hrtimer_get_softexpires_safe(struct hrtimer *timer) +{ + return timer ? 
hrtimer_get_softexpires(timer) : KTIME_MAX; +} + #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ @@ -1802,7 +1821,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) */ cpu_base->expires_next = KTIME_MAX; - if (!ktime_before(now, cpu_base->softirq_expires_next)) { + if (ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer))) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -1882,9 +1901,135 @@ static inline void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } +/* Do a cheap test to see if there are soft-expired timers present */ +static inline bool __hrtimer_has_softexpired_timers(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now; + unsigned long flags; + bool has_expired; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + now = hrtimer_update_base(cpu_base); + has_expired = + ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer)) || + ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->next_timer)); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + return has_expired; +} + +/** + * hrtimer_run_softexpired_timers - Run timers that have "soft" expired. + * + * Called with interrupts disabled. + */ +void hrtimer_run_softexpired_timers(void) +{ + if (!hrtimer_hres_active()) + return; + + if (__hrtimer_has_softexpired_timers()) + __hrtimer_peek_ahead_timers(); +} + +#define hrtimer_entry(rb_ptr) \ + container_of(rb_entry(rb_ptr, struct timerqueue_node, node), \ + struct hrtimer, node) + +static s64 get_rb_node_subtree_expires_tv64(const struct rb_node *rb) +{ + return rb + ? hrtimer_get_subtree_least_expires_tv64(hrtimer_entry(rb)) + : KTIME_MAX; +} + +static s64 get_subtree_least_expires_tv64(const struct rb_node *parent) +{ + if (!parent) + return KTIME_MAX; + return min(get_rb_node_subtree_expires_tv64(parent->rb_left), + get_rb_node_subtree_expires_tv64(parent->rb_right)); +} + +/** + * hrtimer_rb_augment_propagate - Propagate subtree least_expires values upwards + * @rb: Node to start from + * @stop: Node to stop at. NULL = Propagate to the root. 
+ */ +static +void hrtimer_rb_augment_propagate(struct rb_node *rb, struct rb_node *stop) +{ + while (rb != stop) { + struct hrtimer *timer = hrtimer_entry(rb); + s64 least_expires_tv64 = min(hrtimer_get_expires_tv64(timer), + get_subtree_least_expires_tv64(rb)); + + hrtimer_set_subtree_least_expires_tv64(timer, + least_expires_tv64); + rb = rb_parent(rb); + } +} + +/** + * hrtimer_rb_augment_copy - Copy the subtree least_expires value + */ +static +void hrtimer_rb_augment_copy(struct rb_node *rb_from, struct rb_node *rb_to) +{ + struct hrtimer *to_timer = hrtimer_entry(rb_to); + const struct hrtimer *from_timer = hrtimer_entry(rb_from); + + hrtimer_set_subtree_least_expires_tv64(to_timer, + hrtimer_get_subtree_least_expires_tv64(from_timer)); +} + +/** + * hrtimer_rb_augment_rotate - swap and recalculate augmentation during rbtree rotation + */ +static +void hrtimer_rb_augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) +{ + struct hrtimer *old_timer = hrtimer_entry(rb_old); + s64 least_expires_tv64; + + hrtimer_rb_augment_copy(rb_old, rb_new); + + least_expires_tv64 = min(hrtimer_get_expires_tv64(old_timer), + get_subtree_least_expires_tv64(rb_old)); + hrtimer_set_subtree_least_expires_tv64(old_timer, + least_expires_tv64); +} + +/** + * hrtimer_rb_augment_insert - timerqueue augment callback on insert + */ +static void hrtimer_rb_augment_insert(struct rb_node *rb_parent, + struct rb_node *rb_node) +{ + struct hrtimer *parent_timer = hrtimer_entry(rb_parent); + struct hrtimer *node_timer = hrtimer_entry(rb_node); + ktime_t node_expires = hrtimer_get_expires(node_timer); + + if (ktime_before(node_expires, + hrtimer_get_subtree_least_expires(parent_timer))) + hrtimer_set_subtree_least_expires(parent_timer, node_expires); +} + +static const struct rb_augment_callbacks hrtimer_rb_augment_callbacks_struct = { + .propagate = hrtimer_rb_augment_propagate, + .copy = hrtimer_rb_augment_copy, + .rotate = hrtimer_rb_augment_rotate, + .insert = hrtimer_rb_augment_insert, +}; + +static const struct rb_augment_callbacks *hrtimer_rb_augment_callbacks = + &hrtimer_rb_augment_callbacks_struct; + #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } +static const struct rb_augment_callbacks *hrtimer_rb_augment_callbacks = NULL; #endif /* !CONFIG_HIGH_RES_TIMERS */ @@ -1915,7 +2060,7 @@ void hrtimer_run_queues(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - if (!ktime_before(now, cpu_base->softirq_expires_next)) { + if (ktime_after(now, __hrtimer_get_softexpires_safe(cpu_base->softirq_next_timer))) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); @@ -2088,7 +2233,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, int ret = 0; u64 slack; - slack = current->timer_slack_ns; + slack = get_task_timer_slack_ns(current); if (dl_task(current) || rt_task(current)) slack = 0; @@ -2167,7 +2312,8 @@ int hrtimers_prepare_cpu(unsigned int cpu) clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_init_head_augmented(&clock_b->active, + hrtimer_rb_augment_callbacks); } cpu_base->cpu = cpu; diff --git a/lib/rbtree.c b/lib/rbtree.c index c4ac5c2421f255..bcbd8440023a2a 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -424,11 +424,13 @@ EXPORT_SYMBOL(__rb_erase_color); static inline void dummy_propagate(struct rb_node *node, struct rb_node *stop) {} static 
inline void dummy_copy(struct rb_node *old, struct rb_node *new) {} static inline void dummy_rotate(struct rb_node *old, struct rb_node *new) {} +static inline void dummy_insert(struct rb_node *parent, struct rb_node *node) {} static const struct rb_augment_callbacks dummy_callbacks = { .propagate = dummy_propagate, .copy = dummy_copy, - .rotate = dummy_rotate + .rotate = dummy_rotate, + .insert = dummy_insert, }; void rb_insert_color(struct rb_node *node, struct rb_root *root) diff --git a/lib/timerqueue.c b/lib/timerqueue.c index cdb9c7658478f0..e9ba77ccc6ec6c 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -37,6 +37,10 @@ bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) /* Make sure we don't add nodes that are already added */ WARN_ON_ONCE(!RB_EMPTY_NODE(&node->node)); + if (head->augment) + return rb_add_augmented_cached(&node->node, &head->rb_root, + __timerqueue_less, + head->augment); return rb_add_cached(&node->node, &head->rb_root, __timerqueue_less); } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -54,7 +58,12 @@ bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); - rb_erase_cached(&node->node, &head->rb_root); + if (head->augment) + rb_erase_augmented_cached(&node->node, &head->rb_root, + head->augment); + else + rb_erase_cached(&node->node, &head->rb_root); + RB_CLEAR_NODE(&node->node); return !RB_EMPTY_ROOT(&head->rb_root.rb_root);
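The augmentation these callbacks maintain is the classic subtree-minimum invariant: every queued hrtimer caches the smallest hard expiry (_subtree_least_expires) of its rbtree subtree, so the root always knows the latest moment the CPU must really be woken while the tree itself stays ordered by soft expiry. Below is a standalone sketch of that invariant on a plain (unbalanced) binary search tree — no rotations, so only the insert/propagate halves of the kernel's rb_augment_callbacks have an analogue here:

    #include <stdint.h>
    #include <stdio.h>

    #define KTIME_MAX INT64_MAX

    struct node {
        int64_t expires;        /* soft expiry: the sort key (node.expires) */
        int64_t hard_expires;   /* soft expiry + slack (_hardexpires) */
        int64_t subtree_least;  /* min hard_expires in this subtree */
        struct node *left, *right;
    };

    static int64_t subtree_least(const struct node *n)
    {
        return n ? n->subtree_least : KTIME_MAX;
    }

    /* Same computation as hrtimer_rb_augment_propagate(): a node's cached
     * value is the min of its own hard expiry and its children's caches. */
    static void update(struct node *n)
    {
        int64_t v = n->hard_expires;

        if (subtree_least(n->left) < v)
            v = subtree_least(n->left);
        if (subtree_least(n->right) < v)
            v = subtree_least(n->right);
        n->subtree_least = v;
    }

    /* Unbalanced BST insert ordered by soft expiry; the kernel uses an
     * rbtree and additionally fixes the cached values during rotations. */
    static struct node *insert(struct node *root, struct node *n)
    {
        if (!root)
            return n;
        if (n->expires < root->expires)
            root->left = insert(root->left, n);
        else
            root->right = insert(root->right, n);
        update(root);
        return root;
    }

    int main(void)
    {
        struct node a = { .expires = 100, .hard_expires = 150, .subtree_least = 150 };
        struct node b = { .expires = 90,  .hard_expires = 200, .subtree_least = 200 };
        struct node c = { .expires = 120, .hard_expires = 130, .subtree_least = 130 };
        struct node *root = NULL;

        root = insert(root, &a);
        root = insert(root, &b);
        root = insert(root, &c);

        /* The leftmost node (b) fires first by soft expiry, but the hardware
         * event can be programmed as late as the subtree minimum: 130. */
        printf("next mandatory expiry: %lld\n", (long long)root->subtree_least);
        return 0;
    }

This is why __hrtimer_next_event_base() above reads hrtimer_get_subtree_least_expires() from the tree root: the leftmost node still determines which timer runs first, while the cached subtree minimum gives the deadline for programming the next hardware event, letting several range timers be satisfied by a single wakeup.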