kernel: priority queues: declare as static inlines
After the move to C files we saw a drop in performance when running
latency_measure. This patch declares the priority queue functions as
static inlines, with minor optimizations.

The result for one metric (on qemu):

3.6, before anything was changed:

  Get data from LIFO (w/ ctx switch): 13087 ns

after original change (46484da):

  Get data from LIFO (w/ ctx switch): 13663 ns

with this change:

  Get data from LIFO (w/ ctx switch): 12543 ns

So overall, this is a net gain of ~500 ns that can be seen across the board
on many of the metrics.

(cherry picked from commit 4593f0d)

Original-Signed-off-by: Anas Nashif <[email protected]>
GitOrigin-RevId: 4593f0d
Change-Id: I24bcab9462c3fdfd398032be51be0378be059279
Reviewed-on: https://chromium-review.googlesource.com/c/chromiumos/third_party/zephyr/+/5476004
Commit-Queue: Dawid Niedźwiecki <[email protected]>
Tested-by: Dawid Niedźwiecki <[email protected]>
Reviewed-by: Dawid Niedźwiecki <[email protected]>
Tested-by: ChromeOS Prod (Robot) <[email protected]>
nashif authored and Chromeos LUCI committed Apr 23, 2024
1 parent f3f4ef8 commit 819b7ec
Showing 3 changed files with 141 additions and 148 deletions.
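
For context, a minimal before/after sketch of the pattern this commit applies: a queue helper that was declared in the header and defined in a C file moves into the header as a static inline, so the scheduler hot path no longer pays call overhead. The helper name foo_peek() is hypothetical and used only for illustration; the real functions are the z_priq_* routines shown in the diff below.

#include <zephyr/kernel.h>    /* struct k_thread, ALWAYS_INLINE, CONTAINER_OF */
#include <zephyr/sys/dlist.h> /* sys_dlist_t, sys_dnode_t, sys_dlist_peek_head() */

/* Before (declaration in priority_q.h, definition in priority_queues.c):
 *
 *     struct k_thread *foo_peek(sys_dlist_t *pq);
 *
 * Every caller goes through an out-of-line function call.
 *
 * After: the body lives in the header as a static inline, letting the
 * compiler fold it into the scheduler fast path.
 */
static ALWAYS_INLINE struct k_thread *foo_peek(sys_dlist_t *pq)
{
	sys_dnode_t *n = sys_dlist_peek_head(pq);

	/* Head of the dlist is the best candidate; NULL when the queue is empty. */
	return (n == NULL) ? NULL
			   : CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
}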
155 changes: 135 additions & 20 deletions kernel/include/priority_q.h
@@ -7,13 +7,20 @@
#ifndef ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_
#define ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_

#include <zephyr/sys/math_extras.h>
#include <zephyr/sys/dlist.h>

/* Dump Scheduling */
extern int32_t z_sched_prio_cmp(struct k_thread *thread_1,
struct k_thread *thread_2);

bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);

/* Dumb Scheduling */
#if defined(CONFIG_SCHED_DUMB)
#define _priq_run_add z_priq_dumb_add
#define _priq_run_remove z_priq_dumb_remove
# if defined(CONFIG_SCHED_CPU_MASK)
# define _priq_run_best _priq_dumb_mask_best
# define _priq_run_best z_priq_dumb_mask_best
# else
# define _priq_run_best z_priq_dumb_best
# endif /* CONFIG_SCHED_CPU_MASK */
@@ -25,11 +32,11 @@
/* Multi Queue Scheduling */
#elif defined(CONFIG_SCHED_MULTIQ)

# if defined(CONFIG_64BIT)
# define NBITS 64
# else
# define NBITS 32
# endif
#if defined(CONFIG_64BIT)
#define NBITS 64
#else
#define NBITS 32
#endif /* CONFIG_64BIT */

#define _priq_run_add z_priq_mq_add
#define _priq_run_remove z_priq_mq_remove
@@ -40,30 +47,99 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq, struct k_thread

/* Scalable Wait Queue */
#if defined(CONFIG_WAITQ_SCALABLE)
#define z_priq_wait_add z_priq_rb_add
#define _priq_wait_add z_priq_rb_add
#define _priq_wait_remove z_priq_rb_remove
#define _priq_wait_best z_priq_rb_best
/* Dump Wait Queue */
/* Dumb Wait Queue */
#elif defined(CONFIG_WAITQ_DUMB)
#define z_priq_wait_add z_priq_dumb_add
#define _priq_wait_add z_priq_dumb_add
#define _priq_wait_remove z_priq_dumb_remove
#define _priq_wait_best z_priq_dumb_best
#endif

/* Dumb Scheduling*/
struct k_thread *z_priq_dumb_best(sys_dlist_t *pq);
void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread);
static ALWAYS_INLINE void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
{
ARG_UNUSED(pq);

/* Scalable Scheduling */
void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread);
void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread);
sys_dlist_remove(&thread->base.qnode_dlist);
}

/* Multi Queue Scheduling */
struct k_thread *z_priq_mq_best(struct _priq_mq *pq);
struct k_thread *z_priq_rb_best(struct _priq_rb *pq);
static ALWAYS_INLINE struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
{
struct k_thread *thread = NULL;
sys_dnode_t *n = sys_dlist_peek_head(pq);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
}
return thread;
}

bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b);
static ALWAYS_INLINE void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
{
struct k_thread *t;

thread->base.order_key = pq->next_order_key++;

/* Renumber at wraparound. This is tiny code, and in practice
* will almost never be hit on real systems. BUT on very
* long-running systems where a priq never completely empties
* AND that contains very large numbers of threads, it can be
* a latency glitch to loop over all the threads like this.
*/
if (!pq->next_order_key) {
RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
t->base.order_key = pq->next_order_key++;
}
}

rb_insert(&pq->tree, &thread->base.qnode_rb);
}

static ALWAYS_INLINE void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
{
rb_remove(&pq->tree, &thread->base.qnode_rb);

if (!pq->tree.root) {
pq->next_order_key = 0;
}
}

static ALWAYS_INLINE struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
{
struct k_thread *thread = NULL;
struct rbnode *n = rb_get_min(&pq->tree);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
}
return thread;
}

static ALWAYS_INLINE struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
{
struct k_thread *thread = NULL;

for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
if (!pq->bitmask[i]) {
continue;
}

#ifdef CONFIG_64BIT
sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
#else
sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
#endif
sys_dnode_t *n = sys_dlist_peek_head(l);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
break;
}
}

return thread;
}


#ifdef CONFIG_SCHED_MULTIQ
@@ -105,4 +181,43 @@ static ALWAYS_INLINE void z_priq_mq_remove(struct _priq_mq *pq,
}
}
#endif /* CONFIG_SCHED_MULTIQ */



#ifdef CONFIG_SCHED_CPU_MASK
static ALWAYS_INLINE struct k_thread *z_priq_dumb_mask_best(sys_dlist_t *pq)
{
/* With masks enabled we need to be prepared to walk the list
* looking for one we can run
*/
struct k_thread *thread;

SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
return thread;
}
}
return NULL;
}
#endif /* CONFIG_SCHED_CPU_MASK */


#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
struct k_thread *thread)
{
struct k_thread *t;

SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
if (z_sched_prio_cmp(thread, t) > 0) {
sys_dlist_insert(&t->base.qnode_dlist,
&thread->base.qnode_dlist);
return;
}
}

sys_dlist_append(pq, &thread->base.qnode_dlist);
}
#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */

#endif /* ZEPHYR_KERNEL_INCLUDE_PRIORITY_Q_H_ */
91 changes: 1 addition & 90 deletions kernel/priority_queues.c
@@ -7,26 +7,7 @@
#include <zephyr/kernel.h>
#include <ksched.h>
#include <zephyr/sys/math_extras.h>

void z_priq_dumb_remove(sys_dlist_t *pq, struct k_thread *thread)
{
ARG_UNUSED(pq);

__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

sys_dlist_remove(&thread->base.qnode_dlist);
}

struct k_thread *z_priq_dumb_best(sys_dlist_t *pq)
{
struct k_thread *thread = NULL;
sys_dnode_t *n = sys_dlist_peek_head(pq);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
}
return thread;
}
#include <zephyr/sys/dlist.h>

bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
{
@@ -47,73 +28,3 @@ bool z_priq_rb_lessthan(struct rbnode *a, struct rbnode *b)
? 1 : 0;
}
}

void z_priq_rb_add(struct _priq_rb *pq, struct k_thread *thread)
{
struct k_thread *t;

__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

thread->base.order_key = pq->next_order_key++;

/* Renumber at wraparound. This is tiny code, and in practice
* will almost never be hit on real systems. BUT on very
* long-running systems where a priq never completely empties
* AND that contains very large numbers of threads, it can be
* a latency glitch to loop over all the threads like this.
*/
if (!pq->next_order_key) {
RB_FOR_EACH_CONTAINER(&pq->tree, t, base.qnode_rb) {
t->base.order_key = pq->next_order_key++;
}
}

rb_insert(&pq->tree, &thread->base.qnode_rb);
}

void z_priq_rb_remove(struct _priq_rb *pq, struct k_thread *thread)
{
__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

rb_remove(&pq->tree, &thread->base.qnode_rb);

if (!pq->tree.root) {
pq->next_order_key = 0;
}
}

struct k_thread *z_priq_rb_best(struct _priq_rb *pq)
{
struct k_thread *thread = NULL;
struct rbnode *n = rb_get_min(&pq->tree);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_rb);
}
return thread;
}

struct k_thread *z_priq_mq_best(struct _priq_mq *pq)
{
struct k_thread *thread = NULL;

for (int i = 0; i < PRIQ_BITMAP_SIZE; ++i) {
if (!pq->bitmask[i]) {
continue;
}

#ifdef CONFIG_64BIT
sys_dlist_t *l = &pq->queues[i * 64 + u64_count_trailing_zeros(pq->bitmask[i])];
#else
sys_dlist_t *l = &pq->queues[i * 32 + u32_count_trailing_zeros(pq->bitmask[i])];
#endif
sys_dnode_t *n = sys_dlist_peek_head(l);

if (n != NULL) {
thread = CONTAINER_OF(n, struct k_thread, base.qnode_dlist);
break;
}
}

return thread;
}
43 changes: 5 additions & 38 deletions kernel/sched.c
@@ -82,43 +82,6 @@ int32_t z_sched_prio_cmp(struct k_thread *thread_1,
return 0;
}

#ifdef CONFIG_SCHED_CPU_MASK
static ALWAYS_INLINE struct k_thread *_priq_dumb_mask_best(sys_dlist_t *pq)
{
/* With masks enabled we need to be prepared to walk the list
* looking for one we can run
*/
struct k_thread *thread;

SYS_DLIST_FOR_EACH_CONTAINER(pq, thread, base.qnode_dlist) {
if ((thread->base.cpu_mask & BIT(_current_cpu->id)) != 0) {
return thread;
}
}
return NULL;
}
#endif /* CONFIG_SCHED_CPU_MASK */

#if defined(CONFIG_SCHED_DUMB) || defined(CONFIG_WAITQ_DUMB)
static ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq,
struct k_thread *thread)
{
struct k_thread *t;

__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

SYS_DLIST_FOR_EACH_CONTAINER(pq, t, base.qnode_dlist) {
if (z_sched_prio_cmp(thread, t) > 0) {
sys_dlist_insert(&t->base.qnode_dlist,
&thread->base.qnode_dlist);
return;
}
}

sys_dlist_append(pq, &thread->base.qnode_dlist);
}
#endif /* CONFIG_SCHED_DUMB || CONFIG_WAITQ_DUMB */

static ALWAYS_INLINE void *thread_runq(struct k_thread *thread)
{
#ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
@@ -150,11 +113,15 @@ static ALWAYS_INLINE void *curr_cpu_runq(void)

static ALWAYS_INLINE void runq_add(struct k_thread *thread)
{
__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

_priq_run_add(thread_runq(thread), thread);
}

static ALWAYS_INLINE void runq_remove(struct k_thread *thread)
{
__ASSERT_NO_MSG(!z_is_idle_thread_object(thread));

_priq_run_remove(thread_runq(thread), thread);
}

@@ -616,7 +583,7 @@ static void add_to_waitq_locked(struct k_thread *thread, _wait_q_t *wait_q)

if (wait_q != NULL) {
thread->base.pended_on = wait_q;
z_priq_wait_add(&wait_q->waitq, thread);
_priq_wait_add(&wait_q->waitq, thread);
}
}

