From 3baae1286ea7b0494a62ff435c1cdce94b66565d Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 01:46:26 -1000
Subject: [PATCH 1/4] scx: Always use rq->scx instead of scx_rq

Some functions were using an scx_rq local variable to cache rq->scx, and
dequeue_task_scx() was taking scx_rq as an argument. "scx_rq->" isn't any
shorter than "rq->scx.", and the inconsistency only adds to confusion.
Let's always use rq.
---
 kernel/sched/ext.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 77ca3353537f..db9ccb67d437 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1673,10 +1673,10 @@ static bool task_linked_on_dsq(struct task_struct *p)
 	return !list_empty(&p->scx.dsq_node.list);
 }
 
-static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p)
+static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
 {
 	struct scx_dispatch_q *dsq = p->scx.dsq;
-	bool is_local = dsq == &scx_rq->local_dsq;
+	bool is_local = dsq == &rq->scx.local_dsq;
 
 	if (!dsq) {
 		WARN_ON_ONCE(task_linked_on_dsq(p));
@@ -2011,8 +2011,6 @@ static void ops_dequeue(struct task_struct *p, u64 deq_flags)
 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
 {
-	struct scx_rq *scx_rq = &rq->scx;
-
 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
 		WARN_ON_ONCE(task_runnable(p));
 		return;
 	}
@@ -2046,10 +2044,10 @@ static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 	p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
 
 	p->scx.flags &= ~SCX_TASK_QUEUED;
-	scx_rq->nr_running--;
+	rq->scx.nr_running--;
 	sub_nr_running(rq, 1);
 
-	dispatch_dequeue(scx_rq, p);
+	dispatch_dequeue(rq, p);
 }
 
 static void yield_task_scx(struct rq *rq)
@@ -2203,17 +2201,15 @@ static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq,
 			       struct task_struct *p)
 {
-	struct scx_rq *scx_rq = &rq->scx;
-
 	lockdep_assert_held(&dsq->lock);	/* released on return */
 
 	/* @dsq is locked and @p is on this rq */
 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
 
 	task_unlink_from_dsq(p, dsq);
-	list_add_tail(&p->scx.dsq_node.list, &scx_rq->local_dsq.list);
+	list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list);
 	dsq_mod_nr(dsq, -1);
-	dsq_mod_nr(&scx_rq->local_dsq, 1);
-	WRITE_ONCE(p->scx.dsq, &scx_rq->local_dsq);
+	dsq_mod_nr(&rq->scx.local_dsq, 1);
+	WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq);
 	raw_spin_unlock(&dsq->lock);
 }
@@ -2531,14 +2527,13 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
 static int balance_one(struct rq *rq, struct task_struct *prev,
 		       struct rq_flags *rf, bool local)
 {
-	struct scx_rq *scx_rq = &rq->scx;
 	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
 	bool has_tasks = false;
 
 	lockdep_assert_rq_held(rq);
-	scx_rq->flags |= SCX_RQ_BALANCING;
+	rq->scx.flags |= SCX_RQ_BALANCING;
 
 	if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2582,7 +2577,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev,
 	}
 
 	/* if there already are tasks to run, nothing to do */
-	if (scx_rq->local_dsq.nr)
+	if (rq->scx.local_dsq.nr)
 		goto has_tasks;
 	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
 		goto has_tasks;
@@ -2610,7 +2605,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev,
 
 		flush_dispatch_buf(rq, rf);
 
-		if (scx_rq->local_dsq.nr)
+		if (rq->scx.local_dsq.nr)
 			goto has_tasks;
 		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
 			goto has_tasks;
@@ -2635,7 +2630,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev,
 has_tasks:
 	has_tasks = true;
 out:
-	scx_rq->flags &= ~SCX_RQ_BALANCING;
+	rq->scx.flags &= ~SCX_RQ_BALANCING;
 	return has_tasks;
 }
 
@@ -2689,7 +2684,7 @@ static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
 		 * dispatched. Call ops_dequeue() to notify the BPF scheduler.
 		 */
 		ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
-		dispatch_dequeue(&rq->scx, p);
+		dispatch_dequeue(rq, p);
 	}
 
 	p->se.exec_start = rq_clock_task(rq);
@@ -6145,14 +6140,12 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 {
 	u32 nr_enqueued, i;
 	struct rq *rq;
-	struct scx_rq *scx_rq;
 
 	if (!scx_kf_allowed(SCX_KF_CPU_RELEASE))
 		return 0;
 
 	rq = cpu_rq(smp_processor_id());
 	lockdep_assert_rq_held(rq);
-	scx_rq = &rq->scx;
 
 	/*
 	 * Get the number of tasks on the local DSQ before iterating over it to
@@ -6160,7 +6153,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 	 * the task to stay on the local DSQ, and we want to prevent the BPF
 	 * scheduler from causing us to loop indefinitely.
 	 */
-	nr_enqueued = scx_rq->local_dsq.nr;
+	nr_enqueued = rq->scx.local_dsq.nr;
 	for (i = 0; i < nr_enqueued; i++) {
 		struct task_struct *p;
 
@@ -6169,7 +6162,7 @@ __bpf_kfunc u32 scx_bpf_reenqueue_local(void)
 			     SCX_OPSS_NONE);
 		WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
 		WARN_ON_ONCE(p->scx.holding_cpu != -1);
-		dispatch_dequeue(scx_rq, p);
+		dispatch_dequeue(rq, p);
 		do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1);
 	}
 

From 3a4476944ee208025306fd3dfe7bc5e44596e411 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 01:46:27 -1000
Subject: [PATCH 2/4] scx: Replace scx_cpu_online percpu var with SCX_RQ_ONLINE flag

No need for this to be a separate variable.

- As this removes the symbol name collision, rename test_rq_online() to
  scx_rq_online().

- The [un]likely() annotation is moved from the callers into scx_rq_online().

- The on/offline status should agree with ops->cpu_on/offline(). In the
  existing code, the two states could deviate when rq_on/offline_scx() were
  called for sched domain updates. Fix it so that they always agree.
---
 kernel/sched/ext.c   | 33 ++++++++++++---------------------
 kernel/sched/sched.h | 10 ++++++++--
 2 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index db9ccb67d437..d41ff1ebdb22 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -921,16 +921,6 @@ static unsigned long __percpu *scx_kick_cpus_pnt_seqs;
  */
 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
 
-/*
- * Has the current CPU been onlined from the perspective of scx?
- *
- * A hotplugged CPU may begin scheduling tasks before the core scheduler will
- * call into rq_online_scx(). Track whether we've had a chance to invoke
- * ops.cpu_online() so we can skip invoking ops.enqueue() or ops.dispatch() on
- * that CPU until the scheduler knows about the hotplug event.
- */
-static DEFINE_PER_CPU(bool, scx_cpu_online);
-
 /* dispatch queues */
 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
 
@@ -1804,9 +1794,9 @@ static void direct_dispatch(struct task_struct *p, u64 enq_flags)
 	dispatch_enqueue(dsq, p, enq_flags);
 }
 
-static bool test_rq_online(struct rq *rq)
+static bool scx_rq_online(struct rq *rq)
 {
-	return per_cpu(scx_cpu_online, cpu_of(rq));
+	return likely(rq->scx.flags & SCX_RQ_ONLINE);
 }
 
 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
@@ -1826,7 +1816,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
 	 * offline. We're just trying to on/offline the CPU. Don't bother the
 	 * BPF scheduler.
 	 */
-	if (unlikely(!test_rq_online(rq)))
+	if (!scx_rq_online(rq))
 		goto local;
 
 	if (scx_ops_bypassing()) {
@@ -2228,7 +2218,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
 		return false;
 	if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
 		return false;
-	if (unlikely(!test_rq_online(rq)))
+	if (!scx_rq_online(rq))
 		return false;
 	return true;
 }
@@ -2383,7 +2373,7 @@ dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id,
 	 * instead, which should always be safe. As this is an allowed
 	 * behavior, don't trigger an ops error.
 	 */
-	if (unlikely(!test_rq_online(dst_rq)))
+	if (!scx_rq_online(dst_rq))
 		dst_rq = src_rq;
 
 	if (src_rq == dst_rq) {
@@ -2583,8 +2573,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev,
 	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
 		goto has_tasks;
 
-	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() ||
-	    unlikely(!test_rq_online(rq)))
+	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
 		goto out;
 
 	dspc->rq = rq;
@@ -3223,16 +3212,18 @@ static void handle_hotplug(struct rq *rq, bool online)
 
 static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason)
 {
-	if (reason == RQ_ONOFF_HOTPLUG)
+	if (reason == RQ_ONOFF_HOTPLUG) {
 		handle_hotplug(rq, true);
-	per_cpu(scx_cpu_online, cpu_of(rq)) = true;
+		rq->scx.flags |= SCX_RQ_ONLINE;
+	}
 }
 
 static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason)
 {
-	per_cpu(scx_cpu_online, cpu_of(rq)) = false;
-	if (reason == RQ_ONOFF_HOTPLUG)
+	if (reason == RQ_ONOFF_HOTPLUG) {
+		rq->scx.flags &= ~SCX_RQ_ONLINE;
 		handle_hotplug(rq, false);
+	}
 }
 
 #else	/* CONFIG_SMP */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d31db189977a..2bb22f72ba75 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -721,8 +721,14 @@ struct cfs_rq {
 #ifdef CONFIG_SCHED_CLASS_EXT
 /* scx_rq->flags, protected by the rq lock */
 enum scx_rq_flags {
-	SCX_RQ_BALANCING	= 1 << 0,
-	SCX_RQ_CAN_STOP_TICK	= 1 << 1,
+	/*
+	 * A hotplugged CPU starts scheduling before rq_online_scx(). Track
+	 * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called
+	 * only while the BPF scheduler considers the CPU to be online.
+	 */
+	SCX_RQ_ONLINE		= 1 << 0,
+	SCX_RQ_BALANCING	= 1 << 1,
+	SCX_RQ_CAN_STOP_TICK	= 1 << 2,
 };
 
 struct scx_rq {

From 7d196e5e19a8c9840da6135e2570d463cee3b056 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 01:46:27 -1000
Subject: [PATCH 3/4] scx: Allocate scx_dsp_ctx together with scx_dsp_buf

Keeping the two as separate variables only makes things cumbersome.

- Move scx_dsp_buf into scx_dsp_ctx and dynamically allocate the latter.

- Rename scx_dsp_ctx->buf_cursor to ->cursor for brevity.
---
 kernel/sched/ext.c | 51 +++++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index d41ff1ebdb22..1160bed2895b 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -942,16 +942,16 @@ struct scx_dsp_buf_ent {
 };
 
 static u32 scx_dsp_max_batch;
-static struct scx_dsp_buf_ent __percpu *scx_dsp_buf;
 
 struct scx_dsp_ctx {
 	struct rq		*rq;
 	struct rq_flags		*rf;
-	u32			buf_cursor;
+	u32			cursor;
 	u32			nr_tasks;
+	struct scx_dsp_buf_ent	buf[];
 };
 
-static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx);
+static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
 
 /* string formatting from BPF */
 struct scx_bstr_buf {
@@ -2500,24 +2500,24 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
 
 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	u32 u;
 
-	for (u = 0; u < dspc->buf_cursor; u++) {
-		struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u];
+	for (u = 0; u < dspc->cursor; u++) {
+		struct scx_dsp_buf_ent *ent = &dspc->buf[u];
 
 		finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id,
 				ent->enq_flags);
 	}
 
-	dspc->nr_tasks += dspc->buf_cursor;
-	dspc->buf_cursor = 0;
+	dspc->nr_tasks += dspc->cursor;
+	dspc->cursor = 0;
 }
 
 static int balance_one(struct rq *rq, struct task_struct *prev,
 		       struct rq_flags *rf, bool local)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
 	int nr_loops = SCX_DSP_MAX_LOOPS;
 	bool has_tasks = false;
@@ -4460,8 +4460,8 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 	} while (dsq == ERR_PTR(-EAGAIN));
 	rhashtable_walk_exit(&rht_iter);
 
-	free_percpu(scx_dsp_buf);
-	scx_dsp_buf = NULL;
+	free_percpu(scx_dsp_ctx);
+	scx_dsp_ctx = NULL;
 	scx_dsp_max_batch = 0;
 
 	free_exit_info(scx_exit_info);
@@ -4945,11 +4945,12 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
 	if (ret)
 		goto err_disable;
 
-	WARN_ON_ONCE(scx_dsp_buf);
+	WARN_ON_ONCE(scx_dsp_ctx);
 	scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
-	scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch,
-				     __alignof__(scx_dsp_buf[0]));
-	if (!scx_dsp_buf) {
+	scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
+						   scx_dsp_max_batch),
+				     __alignof__(struct scx_dsp_ctx));
+	if (!scx_dsp_ctx) {
 		ret = -ENOMEM;
 		goto err_disable;
 	}
@@ -5829,8 +5830,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
 
 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
 {
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct task_struct *ddsp_task;
-	int idx;
 
 	ddsp_task = __this_cpu_read(direct_dispatch_task);
 	if (ddsp_task) {
@@ -5838,19 +5839,17 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags
 		return;
 	}
 
-	idx = __this_cpu_read(scx_dsp_ctx.buf_cursor);
-	if (unlikely(idx >= scx_dsp_max_batch)) {
+	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
 		scx_ops_error("dispatch buffer overflow");
 		return;
 	}
 
-	this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){
+	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
 		.task = p,
 		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
 		.dsq_id = dsq_id,
 		.enq_flags = enq_flags,
 	};
-	__this_cpu_inc(scx_dsp_ctx.buf_cursor);
 }
 
 __bpf_kfunc_start_defs();
@@ -5962,7 +5961,7 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
 		return 0;
 
-	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor);
+	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
 }
 
 /**
@@ -5973,13 +5972,13 @@ __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
  */
 __bpf_kfunc void scx_bpf_dispatch_cancel(void)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 
 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
 		return;
 
-	if (dspc->buf_cursor > 0)
-		dspc->buf_cursor--;
+	if (dspc->cursor > 0)
+		dspc->cursor--;
 	else
 		scx_ops_error("dispatch buffer underflow");
 }
@@ -6001,7 +6000,7 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void)
  */
 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
 {
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct scx_dispatch_q *dsq;
 
 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
@@ -6047,7 +6046,7 @@ __bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p)
 {
 	struct bpf_iter_scx_dsq_kern *kit = (void *)it;
 	struct scx_dispatch_q *dsq, *kit_dsq;
-	struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
+	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
 	struct rq *task_rq;
 	u64 kit_dsq_seq;
 

From 6a5cf232bc8a875724668f7ff747aeec09178812 Mon Sep 17 00:00:00 2001
From: Tejun Heo
Date: Fri, 17 May 2024 01:59:32 -1000
Subject: [PATCH 4/4] scx/common.bpf.h: Sync from SCX repo

---
 tools/sched_ext/include/scx/common.bpf.h | 34 ++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 4a360d5f551a..b59aa7c57e78 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -243,6 +243,40 @@ BPF_PROG(name, ##args)
 #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
 #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
 
+/*
+ * bpf_log2 - Compute the base 2 logarithm of a 32-bit exponential value.
+ * @v: The value for which we're computing the base 2 logarithm.
+ */
+static inline u32 bpf_log2(u32 v)
+{
+	u32 r;
+	u32 shift;
+
+	r = (v > 0xFFFF) << 4; v >>= r;
+	shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+	shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+	shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+	r |= (v >> 1);
+	return r;
+}
+
+/*
+ * bpf_log2l - Compute the base 2 logarithm of a 64-bit exponential value.
+ * @v: The value for which we're computing the base 2 logarithm.
+ */
+static inline u32 bpf_log2l(u64 v)
+{
+	u32 hi = v >> 32;
+	if (hi)
+		return bpf_log2(hi) + 32 + 1;
+	else
+		return bpf_log2(v) + 1;
+}
+
+/* useful compiler attributes */
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
 void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
 void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
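
For readers wondering how the bpf_log2l() helper added in patch 4 tends to be used: it returns the 1-based position of the most significant set bit, which makes it convenient for bucketing measurements into a power-of-two histogram from a sched_ext BPF scheduler. The sketch below is illustrative only and is not part of this series; it relies on scx/common.bpf.h, the BPF_STRUCT_OPS() macro, and the upstream SCX_SLICE_DFL and p->scx.slice definitions, while the program name, bucket count, and hist_ops layout are made up for the example.

/* Illustrative sketch only -- not part of the patches above. */
#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

/* 2^25 ns (~33ms) comfortably covers the 20ms default slice */
#define NR_BUCKETS	26

/* one counter per power-of-two bucket of consumed slice, in nanoseconds */
u64 slice_hist[NR_BUCKETS];

/* ops.stopping() runs every time a task stops running on a CPU */
void BPF_STRUCT_OPS(hist_stopping, struct task_struct *p, bool runnable)
{
	/* p->scx.slice starts at SCX_SLICE_DFL and counts down while running */
	u64 used = SCX_SLICE_DFL - p->scx.slice;
	/* bpf_log2l() == 1-based index of the highest set bit */
	u32 bucket = bpf_log2l(used);

	if (bucket >= NR_BUCKETS)
		bucket = NR_BUCKETS - 1;
	__sync_fetch_and_add(&slice_hist[bucket], 1);
}

/* only .stopping is overridden; default enqueue/dispatch behavior is kept */
SEC(".struct_ops.link")
struct sched_ext_ops hist_ops = {
	.stopping	= (void *)hist_stopping,
	.name		= "slice_hist",
};

Dumping slice_hist (for example with bpftool map dump on the program's .bss) then gives a rough log2 profile of how much of the default slice tasks actually consume; the userspace loader and error handling are omitted here.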