From 58949b2b566c0b301da1513a1a2970857082f4f4 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 12:08:49 -0500
Subject: [PATCH 01/11] i#7157 dyn inject: Improve ord tracking for synthetic
 records

Fixes handling of some cases related to ordinal tracking when there are queued synthetic records.

get_input_record_ordinal, which has existing logic to discount records that are read-ahead by the scheduler, was missing
logic to correctly account for synthetic records in the queue. This caused a higher count to be subtracted from the
record ordinal returned by the input stream, particularly when a new input is switched to (in which case the guarding
condition for the read-ahead adjustment was satisfied).

Added tracking for whether each queued record is real, that is, read from the input reader or synthetic.

Fixes an issue with instrs_pre_read  tracking which should be decremented only when a real record is passed on to the caller.

Adds a test that verifies the bookkeeping for use_input_ordinals case, which was affected by the above issue.

Issue: #7157
---
 .../scheduler/scheduler_dynamic.cpp           |   9 +-
 .../drcachesim/scheduler/scheduler_impl.cpp   | 143 ++++++++++--------
 clients/drcachesim/scheduler/scheduler_impl.h |  20 ++-
 .../drcachesim/scheduler/scheduler_replay.cpp |   3 +-
 .../drcachesim/tests/scheduler_unit_tests.cpp |  35 +++--
 5 files changed, 132 insertions(+), 78 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_dynamic.cpp b/clients/drcachesim/scheduler/scheduler_dynamic.cpp
index 8f8dad07eb8..979c2c66910 100644
--- a/clients/drcachesim/scheduler/scheduler_dynamic.cpp
+++ b/clients/drcachesim/scheduler/scheduler_dynamic.cpp
@@ -458,7 +458,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
         // boundaries so we live with those being before the switch.
         // XXX: Once we insert kernel traces, we may have to try harder
         // to stop before the post-syscall records.
-        if (this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
+        if (this->record_type_is_instr_boundary(record,
+                                                outputs_[output].last_record.record) &&
             // We want to delay the context switch until after the injected syscall trace.
             !outputs_[output].in_syscall_code) {
             if (input->switch_to_input != sched_type_t::INVALID_INPUT_ORDINAL) {
@@ -507,7 +508,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
         this->process_marker(*input, output, marker_type, marker_value);
     }
     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
-        this->record_type_is_instr_boundary(record, outputs_[output].last_record) &&
+        this->record_type_is_instr_boundary(record,
+                                            outputs_[output].last_record.record) &&
         !outputs_[output].in_context_switch_code) {
         ++input->instrs_in_quantum;
         if (input->instrs_in_quantum > options_.quantum_duration_instrs) {
@@ -546,7 +548,8 @@ scheduler_dynamic_tmpl_t<RecordType, ReaderType>::check_for_input_switch(
             // We only switch on instruction boundaries.  We could possibly switch
             // in between (e.g., scatter/gather long sequence of reads/writes) by
             // setting input->switching_pre_instruction.
-            this->record_type_is_instr_boundary(record, outputs_[output].last_record)) {
+            this->record_type_is_instr_boundary(record,
+                                                outputs_[output].last_record.record)) {
             if (outputs_[output].in_syscall_code) {
                 // XXX: Maybe this should be printed only once per-syscall-instance to
                 // reduce log spam.
diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 3ff55edf368..bcc3476d401 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -564,8 +564,8 @@ scheduler_impl_tmpl_t<trace_entry_t, record_reader_t>::insert_switch_tid_pid(
     tid.size = 0;
     tid.addr = static_cast<addr_t>(input.tid);
 
-    input.queue.push_front(pid);
-    input.queue.push_front(tid);
+    input.queue.emplace_front(pid, false);
+    input.queue.emplace_front(tid, false);
 }
 
 /***************************************************************************
@@ -850,7 +850,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::init(
                 ? spec_type_t::USE_NOPS
                 // TODO i#5843: Add more flags for other options.
                 : spec_type_t::LAST_FROM_TRACE,
-            static_cast<int>(get_time_micros()), create_invalid_record(), verbosity_);
+            static_cast<int>(get_time_micros()),
+            next_record_t(create_invalid_record(), false), verbosity_);
         if (options_.single_lockstep_output)
             outputs_.back().stream = global_stream_.get();
         if (options_.schedule_record_ostream != nullptr) {
@@ -1523,8 +1524,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             // Maybe we should disallow it so we don't need checks like this?
             options_.mapping != sched_type_t::MAP_AS_PREVIOUSLY) {
             RecordType record = create_invalid_record();
+            bool real_record = false;
             stream_status_t res =
-                advance_region_of_interest(/*output=*/-1, record, input);
+                advance_region_of_interest(/*output=*/-1, record, input, real_record);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 input.next_timestamp =
                     static_cast<uintptr_t>(input.reader->get_last_timestamp());
@@ -1548,7 +1550,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             // the non-consuming queue loop vs the consuming and queue-pushback
             // reader loop.
             for (const auto &record : input.queue) {
-                if (!process_next_initial_record(input, record, found_filetype,
+                if (!process_next_initial_record(input, record.record, found_filetype,
                                                  found_timestamp))
                     break;
             }
@@ -1592,7 +1594,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
                 // we skip (see skip_instructions()).  Thus, we abort with an error.
                 if (record_type_is_instr(record))
                     break;
-                input.queue.push_back(record);
+                input.queue.emplace_back(record, true);
+                ++input.real_records_in_queue;
                 ++(*input.reader);
             }
         }
@@ -1632,7 +1635,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::open_reader(
         RecordType record = **reader;
         if (record_type_has_tid(record, tid))
             break;
-        input.queue.push_back(record);
+        input.queue.emplace_back(record, true);
+        ++input.real_records_in_queue;
         ++(*reader);
     }
     if (tid == INVALID_THREAD_ID) {
@@ -1821,10 +1825,14 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_record_ordinal(
         return 0;
     uint64_t ord = inputs_[index].reader->get_record_ordinal();
     if (get_instr_ordinal(inputs_[index]) == 0) {
+        uint64_t adjust =
+            inputs_[index].cur_from_queue && inputs_[index].cur_real_record ? 1 : 0;
+        adjust += inputs_[index].real_records_in_queue;
+        assert(ord >= adjust);
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         // If this gets any more complex, the scheduler stream should track its
         // own counts for every input and just ignore the input stream's tracking.
-        ord -= inputs_[index].queue.size() + (inputs_[index].cur_from_queue ? 1 : 0);
+        ord -= adjust;
     }
     return ord;
 }
@@ -1852,7 +1860,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_first_timestamp(
         return 0;
     uint64_t res = inputs_[index].reader->get_first_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
-        (!inputs_[index].queue.empty() || inputs_[index].cur_from_queue)) {
+        (inputs_[index].real_records_in_queue > 0 ||
+         (inputs_[index].cur_from_queue && inputs_[index].cur_real_record))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1871,7 +1880,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
         return 0;
     uint64_t res = inputs_[index].reader->get_last_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
-        (!inputs_[index].queue.empty() || inputs_[index].cur_from_queue)) {
+        (inputs_[index].real_records_in_queue > 0 ||
+         (inputs_[index].cur_from_queue && inputs_[index].cur_real_record))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1881,7 +1891,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
-    output_ordinal_t output, RecordType &record, input_info_t &input)
+    output_ordinal_t output, RecordType &record, input_info_t &input, bool &real_record)
 {
     assert(input.lock->owned_by_cur_thread());
     uint64_t cur_instr = get_instr_ordinal(input);
@@ -1913,7 +1923,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
                     if (status != sched_type_t::STATUS_OK)
                         return status;
                 }
-                input.queue.push_back(create_thread_exit(input.tid));
+                input.queue.push_back({ create_thread_exit(input.tid), false });
                 stream_status_t status = mark_input_eof(input);
                 // For early EOF we still need our synthetic exit so do not return it yet.
                 if (status != sched_type_t::STATUS_OK &&
@@ -1932,8 +1942,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
         if (input.cur_region > 0) {
             VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                    input.index);
-            input.queue.push_back(record);
+            input.queue.emplace_back(record, real_record);
             record = create_region_separator_marker(input.tid, input.cur_region);
+            real_record = false;
         }
         return sched_type_t::STATUS_OK;
     }
@@ -2011,11 +2022,12 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::clear_input_queue(input_info_t &i
     int i = 0;
     while (!input.queue.empty()) {
         assert(i == 0 ||
-               (!record_type_is_instr(input.queue.front()) &&
-                !record_type_is_encoding(input.queue.front())));
+               (!record_type_is_instr(input.queue.front().record) &&
+                !record_type_is_encoding(input.queue.front().record)));
         ++i;
         input.queue.pop_front();
     }
+    input.real_records_in_queue = 0;
 }
 
 template <typename RecordType, typename ReaderType>
@@ -2032,8 +2044,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &i
     // For a skip of 0 we still need to clear non-instrs from the queue, but
     // should not have an instr in there.
     assert(skip_amount > 0 || input.queue.empty() ||
-           (!record_type_is_instr(input.queue.front()) &&
-            !record_type_is_encoding(input.queue.front())));
+           (!record_type_is_instr(input.queue.front().record) &&
+            !record_type_is_encoding(input.queue.front().record)));
     clear_input_queue(input);
     input.reader->skip_instructions(skip_amount);
     VPRINT(this, 3, "skip_instructions: input=%d amount=%" PRIu64 "\n", input.index,
@@ -2072,7 +2084,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &i
         VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                input.index);
         input.queue.push_back(
-            create_region_separator_marker(input.tid, input.cur_region));
+            { create_region_separator_marker(input.tid, input.cur_region), false });
     }
     return sched_type_t::STATUS_SKIPPED;
 }
@@ -2288,36 +2300,38 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::set_cur_input(
         stream->page_size_ = inputs_[input].reader->get_page_size();
     }
 
-    if (inputs_[input].pid != INVALID_PID) {
-        insert_switch_tid_pid(inputs_[input]);
-    }
-
-    if (!switch_sequence_.empty() &&
-        outputs_[output].stream->get_instruction_ordinal() > 0) {
-        switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
-        if (prev_workload != inputs_[input].workload)
-            switch_type = sched_type_t::SWITCH_PROCESS;
-        else
-            switch_type = sched_type_t::SWITCH_THREAD;
-        // Inject kernel context switch code.  Since the injected records belong to this
-        // input (the kernel is acting on behalf of this input) we insert them into the
-        // input's queue, but ahead of any prior queued items.  This is why we walk in
-        // reverse, for the push_front calls to the deque.  We update the tid of the
-        // records here to match.  They are considered as is_record_synthetic() and do
-        // not affect input stream ordinals.
-        // XXX: These will appear before the top headers of a new thread which is slightly
-        // odd to have regular records with the new tid before the top headers.
-        if (!switch_sequence_[switch_type].empty()) {
-            for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
-                 i >= 0; --i) {
-                RecordType record = switch_sequence_[switch_type][i];
-                record_type_set_tid(record, inputs_[input].tid);
-                inputs_[input].queue.push_front(record);
+    if (prev_workload >= 0) {
+        if (inputs_[input].pid != INVALID_PID) {
+            insert_switch_tid_pid(inputs_[input]);
+        }
+        if (!switch_sequence_.empty() &&
+            outputs_[output].stream->get_instruction_ordinal() > 0) {
+            switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
+            if (prev_workload != inputs_[input].workload)
+                switch_type = sched_type_t::SWITCH_PROCESS;
+            else
+                switch_type = sched_type_t::SWITCH_THREAD;
+            // Inject kernel context switch code.  Since the injected records belong to
+            // this input (the kernel is acting on behalf of this input) we insert them
+            // into the input's queue, but ahead of any prior queued items.  This is why
+            // we walk in reverse, for the push_front calls to the deque.  We update the
+            // tid of the records here to match.  They are considered as
+            // is_record_synthetic() and do not affect input stream ordinals.
+            // XXX: These will appear before the top headers of a new thread which is
+            // slightly odd to have regular records with the new tid before the top
+            // headers.
+            if (!switch_sequence_[switch_type].empty()) {
+                for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
+                     i >= 0; --i) {
+                    RecordType record = switch_sequence_[switch_type][i];
+                    record_type_set_tid(record, inputs_[input].tid);
+                    inputs_[input].queue.emplace_front(record, false);
+                }
+                VPRINT(this, 3,
+                       "Inserted %zu switch records for type %d from %d.%d to %d.%d\n",
+                       switch_sequence_[switch_type].size(), switch_type, prev_workload,
+                       outputs_[output].prev_input, inputs_[input].workload, input);
             }
-            VPRINT(this, 3,
-                   "Inserted %zu switch records for type %d from %d.%d to %d.%d\n",
-                   switch_sequence_[switch_type].size(), switch_type, prev_workload,
-                   outputs_[output].prev_input, inputs_[input].workload, input);
         }
     }
 
@@ -2517,6 +2531,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     }
     while (true) {
         input->cur_from_queue = false;
+        input->cur_real_record = false;
         if (input->needs_init) {
             // We pay the cost of this conditional to support ipc_reader_t::init() which
             // blocks and must be called right before reading its first record.
@@ -2528,9 +2543,13 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             input->needs_init = false;
         }
         if (!input->queue.empty()) {
-            record = input->queue.front();
-            input->queue.pop_front();
+            record = input->queue.front().record;
             input->cur_from_queue = true;
+            input->cur_real_record = input->queue.front().is_real;
+            input->queue.pop_front();
+            if (input->cur_real_record) {
+                --input->real_records_in_queue;
+            }
         } else {
             // We again have a flag check because reader_t::init() does an initial ++
             // and so we want to skip that on the first record but perform a ++ prior
@@ -2563,12 +2582,14 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 continue;
             } else {
                 record = **input->reader;
+                input->cur_real_record = true;
             }
         }
         VPRINT(this, 5,
                "next_record[%d]: candidate record from %d (@%" PRId64 "): ", output,
                input->index, get_instr_ordinal(*input));
-        if (input->instrs_pre_read > 0 && record_type_is_instr(record))
+        if (input->instrs_pre_read > 0 && input->cur_real_record &&
+            record_type_is_instr(record))
             --input->instrs_pre_read;
         VDO(this, 5, print_record(record););
         bool need_new_input = false;
@@ -2586,7 +2607,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             // We have to put the candidate record in the queue before we release
             // the lock since another output may grab this input.
             VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output);
-            input->queue.push_back(record);
+            input->queue.emplace_back(record, input->cur_real_record);
             lock.unlock();
             res = pick_next_input(output, blocked_time);
             if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT &&
@@ -2610,8 +2631,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 // we've already reset to 0.
                 if (!preempt && options_.mapping == sched_type_t::MAP_TO_ANY_OUTPUT) {
                     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
-                        record_type_is_instr_boundary(record,
-                                                      outputs_[output].last_record)) {
+                        record_type_is_instr_boundary(
+                            record, outputs_[output].last_record.record)) {
                         assert(inputs_[prev_input].instrs_in_quantum > 0);
                         --inputs_[prev_input].instrs_in_quantum;
                     } else if (options_.quantum_unit == sched_type_t::QUANTUM_TIME) {
@@ -2630,7 +2651,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 lock.lock();
                 if (res != sched_type_t::STATUS_SKIPPED) {
                     // Get our candidate record back.
-                    record = input->queue.back();
+                    record = input->queue.back().record;
+                    input->cur_real_record = input->queue.back().is_real;
                     input->queue.pop_back();
                 }
             }
@@ -2643,7 +2665,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
         if (input->needs_roi && options_.mapping != sched_type_t::MAP_AS_PREVIOUSLY &&
             !input->regions_of_interest.empty()) {
             input_ordinal_t prev_input = input->index;
-            res = advance_region_of_interest(output, record, *input);
+            res = advance_region_of_interest(output, record, *input,
+                                             input->cur_real_record);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 // We need either the queue or to re-de-ref the reader so we loop,
                 // but we do not want to come back here.
@@ -2670,7 +2693,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     update_next_record(output, record);
     VDO(this, 4, print_record(record););
 
-    outputs_[output].last_record = record;
+    outputs_[output].last_record = { record, input->cur_real_record };
     record_type_has_tid(record, input->last_record_tid);
     record_type_has_pid(record, input->pid);
     return sched_type_t::STATUS_OK;
@@ -2737,11 +2760,11 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal
                                                                   input_info_t *&input)
 {
     auto &outinfo = outputs_[output];
-    if (record_type_is_invalid(outinfo.last_record))
+    if (record_type_is_invalid(outinfo.last_record.record))
         return sched_type_t::STATUS_INVALID;
     if (!outinfo.speculation_stack.empty())
         return sched_type_t::STATUS_INVALID;
-    record = outinfo.last_record;
+    record = outinfo.last_record.record;
     input = &inputs_[outinfo.cur_input];
     std::lock_guard<mutex_dbg_owned> lock(*input->lock);
     VPRINT(this, 4, "next_record[%d]: unreading last record, from %d\n", output,
@@ -2753,7 +2776,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal
     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
         record_type_is_instr(record))
         --input->instrs_in_quantum;
-    outinfo.last_record = create_invalid_record();
+    outinfo.last_record = { create_invalid_record(), false };
     return sched_type_t::STATUS_OK;
 }
 
@@ -2765,7 +2788,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::start_speculation(
     auto &outinfo = outputs_[output];
     if (outinfo.speculation_stack.empty()) {
         if (queue_current_record) {
-            if (record_type_is_invalid(outinfo.last_record))
+            if (record_type_is_invalid(outinfo.last_record.record))
                 return sched_type_t::STATUS_INVALID;
             inputs_[outinfo.cur_input].queue.push_back(outinfo.last_record);
         }
diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h
index 29a429be858..cb83d4a0a59 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.h
+++ b/clients/drcachesim/scheduler/scheduler_impl.h
@@ -159,6 +159,15 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
 protected:
     typedef speculator_tmpl_t<RecordType> spec_type_t;
 
+    struct next_record_t {
+        next_record_t(RecordType record, bool is_real)
+            : record(record)
+            , is_real(is_real)
+        {
+        }
+        RecordType record;
+        bool is_real = false;
+    };
     struct input_info_t {
         input_info_t()
             : lock(new mutex_dbg_owned)
@@ -195,8 +204,11 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // If non-empty these records should be returned before incrementing the reader.
         // This is used for read-ahead and inserting synthetic records.
         // We use a deque so we can iterate over it.
-        std::deque<RecordType> queue;
+
+        std::deque<next_record_t> queue;
+        uint64_t real_records_in_queue = 0;
         bool cur_from_queue;
+        bool cur_real_record;
         std::set<output_ordinal_t> binding;
         int priority = 0;
         std::vector<range_t> regions_of_interest;
@@ -423,7 +435,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         output_info_t(scheduler_impl_tmpl_t<RecordType, ReaderType> *scheduler_impl,
                       output_ordinal_t ordinal,
                       typename spec_type_t::speculator_flags_t speculator_flags,
-                      int rand_seed, RecordType last_record_init, int verbosity = 0)
+                      int rand_seed, next_record_t last_record_init, int verbosity = 0)
             : self_stream(scheduler_impl, ordinal, verbosity)
             , stream(&self_stream)
             , ready_queue(rand_seed)
@@ -467,7 +479,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // while this field holds the instruction's start PC.  The use case is for
         // queueing a read-ahead instruction record for start_speculation().
         addr_t prev_speculate_pc = 0;
-        RecordType last_record; // Set to TRACE_TYPE_INVALID in constructor.
+        next_record_t last_record; // Set to TRACE_TYPE_INVALID in constructor.
         // A list of schedule segments. During replay, this is read by other threads,
         // but it is only written at init time.
         std::vector<schedule_record_t> record;
@@ -687,7 +699,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
     // If STATUS_SKIPPED or STATUS_STOLE is returned, a new next record needs to be read.
     stream_status_t
     advance_region_of_interest(output_ordinal_t output, RecordType &record,
-                               input_info_t &input);
+                               input_info_t &input, bool &real_record);
 
     // Discards the contents of the input queue.  Meant to be used when skipping
     // input records.
diff --git a/clients/drcachesim/scheduler/scheduler_replay.cpp b/clients/drcachesim/scheduler/scheduler_replay.cpp
index a79f037cfef..a582a838e22 100644
--- a/clients/drcachesim/scheduler/scheduler_replay.cpp
+++ b/clients/drcachesim/scheduler/scheduler_replay.cpp
@@ -425,7 +425,8 @@ scheduler_replay_tmpl_t<RecordType, ReaderType>::pick_next_input_for_mode(
         // a synthetic thread exit record.  We need to first throw out the
         // queued candidate record, if any.
         this->clear_input_queue(inputs_[index]);
-        inputs_[index].queue.push_back(this->create_thread_exit(inputs_[index].tid));
+        inputs_[index].queue.emplace_back(this->create_thread_exit(inputs_[index].tid),
+                                          false);
         VPRINT(this, 2, "early end for input %d\n", index);
         // We're done with this entry but we need the queued record to be read,
         // so we do not move past the entry.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 0b134488f81..2c9baf75de0 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5770,7 +5770,7 @@ test_unscheduled()
 }
 
 static void
-test_kernel_switch_sequences()
+test_kernel_switch_sequences(bool use_input_ordinals)
 {
     std::cerr << "\n----------------\nTesting kernel switch sequences\n";
     static constexpr memref_tid_t TID_IN_SWITCHES = 1;
@@ -5838,9 +5838,18 @@ test_kernel_switch_sequences()
         }
         sched_inputs.emplace_back(std::move(readers));
     }
+    dynamorio::drmemtrace::scheduler_tmpl_t<
+        dynamorio::drmemtrace::_memref_t,
+        dynamorio::drmemtrace::reader_t>::scheduler_flags_t flags =
+        scheduler_t::SCHEDULER_DEFAULTS;
+    if (use_input_ordinals) {
+        flags = static_cast<dynamorio::drmemtrace::scheduler_tmpl_t<
+            dynamorio::drmemtrace::_memref_t,
+            dynamorio::drmemtrace::reader_t>::scheduler_flags_t>(
+            flags | scheduler_t::SCHEDULER_USE_INPUT_ORDINALS);
+    }
     scheduler_t::scheduler_options_t sched_ops(scheduler_t::MAP_TO_ANY_OUTPUT,
-                                               scheduler_t::DEPENDENCY_TIMESTAMPS,
-                                               scheduler_t::SCHEDULER_DEFAULTS,
+                                               scheduler_t::DEPENDENCY_TIMESTAMPS, flags,
                                                /*verbosity=*/3);
     sched_ops.quantum_duration_instrs = INSTR_QUANTUM;
     sched_ops.kernel_switch_reader = std::move(switch_reader);
@@ -5888,9 +5897,12 @@ test_kernel_switch_sequences()
                 sched_as_string[i] +=
                     'A' + static_cast<char>(memref.instr.tid - TID_BASE);
             }
+            bool now_switch = false;
             if (memref.marker.type == TRACE_TYPE_MARKER &&
-                memref.marker.marker_type == TRACE_MARKER_TYPE_CONTEXT_SWITCH_START)
+                memref.marker.marker_type == TRACE_MARKER_TYPE_CONTEXT_SWITCH_START) {
+                now_switch = true;
                 in_switch[i] = true;
+            }
             if (in_switch[i]) {
                 // Test that switch code is marked synthetic.
                 assert(outputs[i]->is_record_synthetic());
@@ -5899,10 +5911,13 @@ test_kernel_switch_sequences()
                 assert(outputs[i]->get_input_interface()->get_record_ordinal() ==
                            prev_in_ord[i] ||
                        // Won't match if we just switched inputs.
-                       (memref.marker.type == TRACE_TYPE_MARKER &&
-                        memref.marker.marker_type ==
-                            TRACE_MARKER_TYPE_CONTEXT_SWITCH_START));
-                assert(outputs[i]->get_record_ordinal() > prev_out_ord[i]);
+                       now_switch);
+                if (use_input_ordinals) {
+                    assert(outputs[i]->get_record_ordinal() == prev_out_ord[i] ||
+                           now_switch);
+                } else {
+                    assert(outputs[i]->get_record_ordinal() > prev_out_ord[i]);
+                }
             } else
                 assert(!outputs[i]->is_record_synthetic());
             if (type_is_instr(memref.instr.type))
@@ -6698,7 +6713,6 @@ test_main(int argc, const char *argv[])
     assert(argc == 2);
     // Avoid races with lazy drdecode init (b/279350357).
     dr_standalone_init();
-
     test_serial();
     test_parallel();
     test_param_checks();
@@ -6730,7 +6744,8 @@ test_main(int argc, const char *argv[])
     test_inactive();
     test_direct_switch();
     test_unscheduled();
-    test_kernel_switch_sequences();
+    test_kernel_switch_sequences(/*use_input_ordinals=*/true);
+    test_kernel_switch_sequences(/*use_input_ordinals=*/false);
     test_random_schedule();
     test_record_scheduler();
     test_rebalancing();

From c4f2595f6fe19b5306f991b66ee4b9890e8be468 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 13:38:01 -0500
Subject: [PATCH 02/11] Add note about get_instruction_ordinal not being WAI
 for USE_INPUT_ORDINALS

---
 clients/drcachesim/scheduler/scheduler_impl.cpp   | 6 ++++--
 clients/drcachesim/tests/scheduler_unit_tests.cpp | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index bcc3476d401..b02425f58b3 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -2304,8 +2304,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::set_cur_input(
         if (inputs_[input].pid != INVALID_PID) {
             insert_switch_tid_pid(inputs_[input]);
         }
-        if (!switch_sequence_.empty() &&
-            outputs_[output].stream->get_instruction_ordinal() > 0) {
+        if (!switch_sequence_.empty()) {
             switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
             if (prev_workload != inputs_[input].workload)
                 switch_type = sched_type_t::SWITCH_PROCESS;
@@ -2731,6 +2730,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::update_next_record(output_ordinal
             VPRINT(this, 2, "output %d base timestamp = %zu\n", output,
                    outputs_[output].base_timestamp);
         }
+        // FIXME: When USE_INPUT_ORDINALS is enabled, this returns the input-local
+        // instruction ordinal (which not only is not global, but it may also include
+        // the read-ahead instructions).
         uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal();
         uint64_t idle_count = outputs_[output].idle_count;
         uintptr_t new_time = static_cast<uintptr_t>(
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 2c9baf75de0..a7d5c5ee085 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5772,7 +5772,9 @@ test_unscheduled()
 static void
 test_kernel_switch_sequences(bool use_input_ordinals)
 {
-    std::cerr << "\n----------------\nTesting kernel switch sequences\n";
+    std::cerr
+        << "\n----------------\nTesting kernel switch sequences for use_input_ordinals: "
+        << use_input_ordinals << "\n";
     static constexpr memref_tid_t TID_IN_SWITCHES = 1;
     static constexpr addr_t PROCESS_SWITCH_PC_START = 0xfeed101;
     static constexpr addr_t THREAD_SWITCH_PC_START = 0xcafe101;

From f53f9b0cea49ea3c51caf102cada76b0cf9a6fd0 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 13:58:33 -0500
Subject: [PATCH 03/11] Cleanup and renaming

---
 .../drcachesim/scheduler/scheduler_impl.cpp   | 50 ++++++++++---------
 clients/drcachesim/scheduler/scheduler_impl.h | 14 +++---
 .../drcachesim/tests/scheduler_unit_tests.cpp |  2 +
 3 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index b02425f58b3..a462567f92e 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -95,6 +95,9 @@ typedef dynamorio::drmemtrace::record_file_reader_t<std::ifstream>
     default_record_file_reader_t;
 #endif
 
+static constexpr bool IS_REAL = true;
+static constexpr bool IS_SYNTHETIC = false;
+
 std::string
 replay_file_checker_t::check(archive_istream_t *infile)
 {
@@ -564,8 +567,8 @@ scheduler_impl_tmpl_t<trace_entry_t, record_reader_t>::insert_switch_tid_pid(
     tid.size = 0;
     tid.addr = static_cast<addr_t>(input.tid);
 
-    input.queue.emplace_front(pid, false);
-    input.queue.emplace_front(tid, false);
+    input.queue.emplace_front(pid, IS_SYNTHETIC);
+    input.queue.emplace_front(tid, IS_SYNTHETIC);
 }
 
 /***************************************************************************
@@ -851,7 +854,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::init(
                 // TODO i#5843: Add more flags for other options.
                 : spec_type_t::LAST_FROM_TRACE,
             static_cast<int>(get_time_micros()),
-            next_record_t(create_invalid_record(), false), verbosity_);
+            cached_record_t(create_invalid_record(), false), verbosity_);
         if (options_.single_lockstep_output)
             outputs_.back().stream = global_stream_.get();
         if (options_.schedule_record_ostream != nullptr) {
@@ -1524,9 +1527,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
             // Maybe we should disallow it so we don't need checks like this?
             options_.mapping != sched_type_t::MAP_AS_PREVIOUSLY) {
             RecordType record = create_invalid_record();
-            bool real_record = false;
+            bool is_record_real = false;
             stream_status_t res =
-                advance_region_of_interest(/*output=*/-1, record, input, real_record);
+                advance_region_of_interest(/*output=*/-1, record, input, is_record_real);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 input.next_timestamp =
                     static_cast<uintptr_t>(input.reader->get_last_timestamp());
@@ -1594,7 +1597,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_initial_input_content(
                 // we skip (see skip_instructions()).  Thus, we abort with an error.
                 if (record_type_is_instr(record))
                     break;
-                input.queue.emplace_back(record, true);
+                input.queue.emplace_back(record, IS_REAL);
                 ++input.real_records_in_queue;
                 ++(*input.reader);
             }
@@ -1635,7 +1638,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::open_reader(
         RecordType record = **reader;
         if (record_type_has_tid(record, tid))
             break;
-        input.queue.emplace_back(record, true);
+        input.queue.emplace_back(record, IS_REAL);
         ++input.real_records_in_queue;
         ++(*reader);
     }
@@ -1826,7 +1829,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_record_ordinal(
     uint64_t ord = inputs_[index].reader->get_record_ordinal();
     if (get_instr_ordinal(inputs_[index]) == 0) {
         uint64_t adjust =
-            inputs_[index].cur_from_queue && inputs_[index].cur_real_record ? 1 : 0;
+            inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real ? 1 : 0;
         adjust += inputs_[index].real_records_in_queue;
         assert(ord >= adjust);
         // Account for get_initial_input_content() readahead for filetype/timestamp.
@@ -1861,7 +1864,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_first_timestamp(
     uint64_t res = inputs_[index].reader->get_first_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
         (inputs_[index].real_records_in_queue > 0 ||
-         (inputs_[index].cur_from_queue && inputs_[index].cur_real_record))) {
+         (inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1881,7 +1884,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
     uint64_t res = inputs_[index].reader->get_last_timestamp();
     if (get_instr_ordinal(inputs_[index]) == 0 &&
         (inputs_[index].real_records_in_queue > 0 ||
-         (inputs_[index].cur_from_queue && inputs_[index].cur_real_record))) {
+         (inputs_[index].cur_from_queue && inputs_[index].is_cur_record_real))) {
         // Account for get_initial_input_content() readahead for filetype/timestamp.
         res = 0;
     }
@@ -1891,7 +1894,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::get_input_last_timestamp(
 template <typename RecordType, typename ReaderType>
 typename scheduler_tmpl_t<RecordType, ReaderType>::stream_status_t
 scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
-    output_ordinal_t output, RecordType &record, input_info_t &input, bool &real_record)
+    output_ordinal_t output, RecordType &record, input_info_t &input,
+    bool &is_record_real)
 {
     assert(input.lock->owned_by_cur_thread());
     uint64_t cur_instr = get_instr_ordinal(input);
@@ -1942,9 +1946,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
         if (input.cur_region > 0) {
             VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                    input.index);
-            input.queue.emplace_back(record, real_record);
+            input.queue.emplace_back(record, is_record_real);
             record = create_region_separator_marker(input.tid, input.cur_region);
-            real_record = false;
+            is_record_real = false;
         }
         return sched_type_t::STATUS_OK;
     }
@@ -2324,7 +2328,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::set_cur_input(
                      i >= 0; --i) {
                     RecordType record = switch_sequence_[switch_type][i];
                     record_type_set_tid(record, inputs_[input].tid);
-                    inputs_[input].queue.emplace_front(record, false);
+                    inputs_[input].queue.emplace_front(record, IS_SYNTHETIC);
                 }
                 VPRINT(this, 3,
                        "Inserted %zu switch records for type %d from %d.%d to %d.%d\n",
@@ -2530,7 +2534,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     }
     while (true) {
         input->cur_from_queue = false;
-        input->cur_real_record = false;
+        input->is_cur_record_real = false;
         if (input->needs_init) {
             // We pay the cost of this conditional to support ipc_reader_t::init() which
             // blocks and must be called right before reading its first record.
@@ -2544,9 +2548,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
         if (!input->queue.empty()) {
             record = input->queue.front().record;
             input->cur_from_queue = true;
-            input->cur_real_record = input->queue.front().is_real;
+            input->is_cur_record_real = input->queue.front().is_real;
             input->queue.pop_front();
-            if (input->cur_real_record) {
+            if (input->is_cur_record_real) {
                 --input->real_records_in_queue;
             }
         } else {
@@ -2581,13 +2585,13 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 continue;
             } else {
                 record = **input->reader;
-                input->cur_real_record = true;
+                input->is_cur_record_real = true;
             }
         }
         VPRINT(this, 5,
                "next_record[%d]: candidate record from %d (@%" PRId64 "): ", output,
                input->index, get_instr_ordinal(*input));
-        if (input->instrs_pre_read > 0 && input->cur_real_record &&
+        if (input->instrs_pre_read > 0 && input->is_cur_record_real &&
             record_type_is_instr(record))
             --input->instrs_pre_read;
         VDO(this, 5, print_record(record););
@@ -2606,7 +2610,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             // We have to put the candidate record in the queue before we release
             // the lock since another output may grab this input.
             VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output);
-            input->queue.emplace_back(record, input->cur_real_record);
+            input->queue.emplace_back(record, input->is_cur_record_real);
             lock.unlock();
             res = pick_next_input(output, blocked_time);
             if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT &&
@@ -2651,7 +2655,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                 if (res != sched_type_t::STATUS_SKIPPED) {
                     // Get our candidate record back.
                     record = input->queue.back().record;
-                    input->cur_real_record = input->queue.back().is_real;
+                    input->is_cur_record_real = input->queue.back().is_real;
                     input->queue.pop_back();
                 }
             }
@@ -2665,7 +2669,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             !input->regions_of_interest.empty()) {
             input_ordinal_t prev_input = input->index;
             res = advance_region_of_interest(output, record, *input,
-                                             input->cur_real_record);
+                                             input->is_cur_record_real);
             if (res == sched_type_t::STATUS_SKIPPED) {
                 // We need either the queue or to re-de-ref the reader so we loop,
                 // but we do not want to come back here.
@@ -2692,7 +2696,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
     update_next_record(output, record);
     VDO(this, 4, print_record(record););
 
-    outputs_[output].last_record = { record, input->cur_real_record };
+    outputs_[output].last_record = { record, input->is_cur_record_real };
     record_type_has_tid(record, input->last_record_tid);
     record_type_has_pid(record, input->pid);
     return sched_type_t::STATUS_OK;
diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h
index cb83d4a0a59..4f8bfbe93b2 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.h
+++ b/clients/drcachesim/scheduler/scheduler_impl.h
@@ -159,8 +159,8 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
 protected:
     typedef speculator_tmpl_t<RecordType> spec_type_t;
 
-    struct next_record_t {
-        next_record_t(RecordType record, bool is_real)
+    struct cached_record_t {
+        cached_record_t(RecordType record, bool is_real)
             : record(record)
             , is_real(is_real)
         {
@@ -205,10 +205,10 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // This is used for read-ahead and inserting synthetic records.
         // We use a deque so we can iterate over it.
 
-        std::deque<next_record_t> queue;
+        std::deque<cached_record_t> queue;
         uint64_t real_records_in_queue = 0;
         bool cur_from_queue;
-        bool cur_real_record;
+        bool is_cur_record_real;
         std::set<output_ordinal_t> binding;
         int priority = 0;
         std::vector<range_t> regions_of_interest;
@@ -435,7 +435,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         output_info_t(scheduler_impl_tmpl_t<RecordType, ReaderType> *scheduler_impl,
                       output_ordinal_t ordinal,
                       typename spec_type_t::speculator_flags_t speculator_flags,
-                      int rand_seed, next_record_t last_record_init, int verbosity = 0)
+                      int rand_seed, cached_record_t last_record_init, int verbosity = 0)
             : self_stream(scheduler_impl, ordinal, verbosity)
             , stream(&self_stream)
             , ready_queue(rand_seed)
@@ -479,7 +479,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // while this field holds the instruction's start PC.  The use case is for
         // queueing a read-ahead instruction record for start_speculation().
         addr_t prev_speculate_pc = 0;
-        next_record_t last_record; // Set to TRACE_TYPE_INVALID in constructor.
+        cached_record_t last_record; // Set to TRACE_TYPE_INVALID in constructor.
         // A list of schedule segments. During replay, this is read by other threads,
         // but it is only written at init time.
         std::vector<schedule_record_t> record;
@@ -699,7 +699,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
     // If STATUS_SKIPPED or STATUS_STOLE is returned, a new next record needs to be read.
     stream_status_t
     advance_region_of_interest(output_ordinal_t output, RecordType &record,
-                               input_info_t &input, bool &real_record);
+                               input_info_t &input, bool &is_record_real);
 
     // Discards the contents of the input queue.  Meant to be used when skipping
     // input records.
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index a7d5c5ee085..27d807642b9 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5916,6 +5916,7 @@ test_kernel_switch_sequences(bool use_input_ordinals)
                        now_switch);
                 if (use_input_ordinals) {
                     assert(outputs[i]->get_record_ordinal() == prev_out_ord[i] ||
+                           // Won't match if we just switched inputs.
                            now_switch);
                 } else {
                     assert(outputs[i]->get_record_ordinal() > prev_out_ord[i]);
@@ -6715,6 +6716,7 @@ test_main(int argc, const char *argv[])
     assert(argc == 2);
     // Avoid races with lazy drdecode init (b/279350357).
     dr_standalone_init();
+
     test_serial();
     test_parallel();
     test_param_checks();

From 1fbc2753bd34418a5da3ca7049eb318cc629ca45 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 14:14:27 -0500
Subject: [PATCH 04/11] Add FIXME for instrs_pre_read tracking issue

---
 clients/drcachesim/scheduler/scheduler_impl.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index a462567f92e..adbb259f654 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -2591,6 +2591,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
         VPRINT(this, 5,
                "next_record[%d]: candidate record from %d (@%" PRId64 "): ", output,
                input->index, get_instr_ordinal(*input));
+        // FIXME: This is likely too premature; we should either move it to later,
+        // or undo the decrement for cases we don't actually end up returning the
+        // pre-read instruction to the caller.
         if (input->instrs_pre_read > 0 && input->is_cur_record_real &&
             record_type_is_instr(record))
             --input->instrs_pre_read;

From 5f682f06319f0d7867ebe55b77a94d28ac171e31 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 14:39:03 -0500
Subject: [PATCH 05/11] Some more cleanup

---
 clients/drcachesim/scheduler/scheduler_impl.cpp   | 9 +++++----
 clients/drcachesim/scheduler/scheduler_replay.cpp | 2 +-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index adbb259f654..805751a6c55 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -1927,7 +1927,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::advance_region_of_interest(
                     if (status != sched_type_t::STATUS_OK)
                         return status;
                 }
-                input.queue.push_back({ create_thread_exit(input.tid), false });
+                input.queue.push_back({ create_thread_exit(input.tid), IS_SYNTHETIC });
                 stream_status_t status = mark_input_eof(input);
                 // For early EOF we still need our synthetic exit so do not return it yet.
                 if (status != sched_type_t::STATUS_OK &&
@@ -2088,7 +2088,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::skip_instructions(input_info_t &i
         VPRINT(this, 3, "skip_instructions input=%d: inserting separator marker\n",
                input.index);
         input.queue.push_back(
-            { create_region_separator_marker(input.tid, input.cur_region), false });
+            { create_region_separator_marker(input.tid, input.cur_region),
+              IS_SYNTHETIC });
     }
     return sched_type_t::STATUS_SKIPPED;
 }
@@ -2738,7 +2739,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::update_next_record(output_ordinal
                    outputs_[output].base_timestamp);
         }
         // FIXME: When USE_INPUT_ORDINALS is enabled, this returns the input-local
-        // instruction ordinal (which not only is not global, but it may also include
+        // instruction ordinal (which not only is not global, but it would also count
         // the read-ahead instructions).
         uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal();
         uint64_t idle_count = outputs_[output].idle_count;
@@ -2785,7 +2786,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::unread_last_record(output_ordinal
     if (options_.quantum_unit == sched_type_t::QUANTUM_INSTRUCTIONS &&
         record_type_is_instr(record))
         --input->instrs_in_quantum;
-    outinfo.last_record = { create_invalid_record(), false };
+    outinfo.last_record = { create_invalid_record(), IS_SYNTHETIC };
     return sched_type_t::STATUS_OK;
 }
 
diff --git a/clients/drcachesim/scheduler/scheduler_replay.cpp b/clients/drcachesim/scheduler/scheduler_replay.cpp
index a582a838e22..86bcf1994cd 100644
--- a/clients/drcachesim/scheduler/scheduler_replay.cpp
+++ b/clients/drcachesim/scheduler/scheduler_replay.cpp
@@ -426,7 +426,7 @@ scheduler_replay_tmpl_t<RecordType, ReaderType>::pick_next_input_for_mode(
         // queued candidate record, if any.
         this->clear_input_queue(inputs_[index]);
         inputs_[index].queue.emplace_back(this->create_thread_exit(inputs_[index].tid),
-                                          false);
+                                          /*is_real=*/false);
         VPRINT(this, 2, "early end for input %d\n", index);
         // We're done with this entry but we need the queued record to be read,
         // so we do not move past the entry.

From 5efb673cf8d2f71af448bdd6858a65cbf0c8cd71 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 15:31:28 -0500
Subject: [PATCH 06/11] Include switch sequence injection count in scheduler
 stats

---
 clients/drcachesim/common/memtrace_stream.h              | 6 +++++-
 clients/drcachesim/scheduler/scheduler_impl.cpp          | 7 ++++++-
 clients/drcachesim/tests/core_serial.templatex           | 1 +
 .../drcachesim/tests/schedule_stats_nopreempt.templatex  | 5 +++++
 clients/drcachesim/tests/scheduler_unit_tests.cpp        | 9 +++++++++
 clients/drcachesim/tests/switch_insertion.templatex      | 2 +-
 clients/drcachesim/tools/schedule_stats.cpp              | 7 ++++++-
 clients/drcachesim/tools/schedule_stats.h                | 4 +++-
 8 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/clients/drcachesim/common/memtrace_stream.h b/clients/drcachesim/common/memtrace_stream.h
index b479ff5785c..44e48891e3a 100644
--- a/clients/drcachesim/common/memtrace_stream.h
+++ b/clients/drcachesim/common/memtrace_stream.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2022-2024 Google, Inc.  All rights reserved.
+ * Copyright (c) 2022-2025 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -112,6 +112,10 @@ class memtrace_stream_t {
          * inputs from being scheduled onto an output.
          */
         SCHED_STAT_HIT_OUTPUT_LIMIT,
+        /**
+         * Counts the instances when the context switch sequence was injected.
+         */
+        SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS,
         /** Count of statistic types. */
         SCHED_STAT_TYPE_COUNT,
     };
diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 805751a6c55..8d7d605b191 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -659,7 +659,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_RUNQUEUE_STEALS]);
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue rebalances",
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_RUNQUEUE_REBALANCES]);
-        VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Ouput limits hit",
+        VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Output limits hit",
                outputs_[i].stats[memtrace_stream_t::SCHED_STAT_HIT_OUTPUT_LIMIT]);
 #ifndef NDEBUG
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue lock acquired",
@@ -667,6 +667,9 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
         VPRINT(this, 1, "  %-35s: %9" PRId64 "\n", "Runqueue lock contended",
                outputs_[i].ready_queue.lock->get_count_contended());
 #endif
+        VPRINT(
+            this, 1, "  %-35s: %9" PRId64 "\n", "Switch sequence injections",
+            outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS]);
     }
 }
 
@@ -2325,6 +2328,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::set_cur_input(
             // slightly odd to have regular records with the new tid before the top
             // headers.
             if (!switch_sequence_[switch_type].empty()) {
+                ++outputs_[output]
+                      .stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS];
                 for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
                      i >= 0; --i) {
                     RecordType record = switch_sequence_[switch_type][i];
diff --git a/clients/drcachesim/tests/core_serial.templatex b/clients/drcachesim/tests/core_serial.templatex
index 1fdc9b85311..4ad9fe5dfe0 100644
--- a/clients/drcachesim/tests/core_serial.templatex
+++ b/clients/drcachesim/tests/core_serial.templatex
@@ -8,6 +8,7 @@ Total counts:
       106490 instructions per context switch
            6 voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100.00% voluntary switches
         0.00% direct switches
            4 switches input-to-input
diff --git a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
index b04b4d291de..e511c57a31a 100644
--- a/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
+++ b/clients/drcachesim/tests/schedule_stats_nopreempt.templatex
@@ -8,6 +8,7 @@ Total counts:
       106490 instructions per context switch
            6 voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100\.00% voluntary switches
         0\.00% direct switches
            5 switches input-to-input
@@ -45,6 +46,7 @@ Core #0 counts:
       *[0-9]* instructions per context switch
            . voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100\.00% voluntary switches
         0\.00% direct switches
 .*
@@ -56,6 +58,7 @@ Core #1 counts:
       *[0-9]* instructions per context switch
            . voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100\.00% voluntary switches
         0\.00% direct switches
 .*
@@ -67,6 +70,7 @@ Core #2 counts:
       *[0-9]* instructions per context switch
            . voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100\.00% voluntary switches
         0\.00% direct switches
 .*
@@ -78,6 +82,7 @@ Core #3 counts:
       *[0-9]* instructions per context switch
            . voluntary context switches
            0 direct context switches
+           0 context switch sequence injections
       100\.00% voluntary switches
         0\.00% direct switches
 .*
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index 27d807642b9..bbb9b47ca6a 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5876,6 +5876,7 @@ test_kernel_switch_sequences(bool use_input_ordinals)
     std::vector<bool> in_switch(NUM_OUTPUTS, false);
     std::vector<uint64> prev_in_ord(NUM_OUTPUTS, 0);
     std::vector<uint64> prev_out_ord(NUM_OUTPUTS, 0);
+    std::vector<uint64> switch_seq_count(NUM_OUTPUTS, 0);
     while (num_eof < NUM_OUTPUTS) {
         for (int i = 0; i < NUM_OUTPUTS; i++) {
             if (eof[i])
@@ -5930,7 +5931,9 @@ test_kernel_switch_sequences(bool use_input_ordinals)
                 case TRACE_MARKER_TYPE_VERSION: sched_as_string[i] += 'v'; break;
                 case TRACE_MARKER_TYPE_TIMESTAMP: sched_as_string[i] += '0'; break;
                 case TRACE_MARKER_TYPE_CONTEXT_SWITCH_END:
+                    assert(in_switch[i]);
                     in_switch[i] = false;
+                    ++switch_seq_count[i];
                     ANNOTATE_FALLTHROUGH;
                 case TRACE_MARKER_TYPE_CONTEXT_SWITCH_START:
                     if (memref.marker.marker_value == scheduler_t::SWITCH_PROCESS)
@@ -5948,6 +5951,12 @@ test_kernel_switch_sequences(bool use_input_ordinals)
             prev_out_ord[i] = outputs[i]->get_record_ordinal();
         }
     }
+    for (int i = 0; i < NUM_OUTPUTS; i++) {
+        assert(switch_seq_count[i] > 0);
+        assert(switch_seq_count[i] ==
+               static_cast<uint64>(outputs[i]->get_schedule_statistic(
+                   memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS)));
+    }
     // Check the high-level strings.
     for (int i = 0; i < NUM_OUTPUTS; i++) {
         std::cerr << "cpu #" << i << " schedule: " << sched_as_string[i] << "\n";
diff --git a/clients/drcachesim/tests/switch_insertion.templatex b/clients/drcachesim/tests/switch_insertion.templatex
index 91f6dc04667..b8668f659dd 100644
--- a/clients/drcachesim/tests/switch_insertion.templatex
+++ b/clients/drcachesim/tests/switch_insertion.templatex
@@ -3,6 +3,6 @@ Total counts:
       [1-9][0-9][0-9][0-9][0-9][0-9] total \(fetched\) instructions
         5971 total unique \(fetched\) instructions
       [1-9][0-9][0-9][0-9][0-9][0-9] total userspace instructions
-         [1-9][0-9][0-9] total kernel instructions
+          [1-9][0-9] total kernel instructions
       [1-9][0-9][0-9][0-9][0-9][0-9] total non-fetched instructions
 .*
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index 5d4b26b6b81..abc1ae6e044 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2017-2024 Google, Inc.  All rights reserved.
+ * Copyright (c) 2017-2025 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -166,6 +166,9 @@ schedule_stats_t::get_scheduler_stats(memtrace_stream_t *stream, counters_t &cou
         memtrace_stream_t::SCHED_STAT_RUNQUEUE_REBALANCES));
     counters.at_output_limit = static_cast<int64_t>(
         stream->get_schedule_statistic(memtrace_stream_t::SCHED_STAT_HIT_OUTPUT_LIMIT));
+    counters.switch_sequence_injections =
+        static_cast<int64_t>(stream->get_schedule_statistic(
+            memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS));
 
     // XXX: Currently, schedule_stats is measuring swap-ins to a real input.  If we
     // want to match what "perf" targeting this app would record, which is swap-outs,
@@ -420,6 +423,8 @@ schedule_stats_t::print_counters(const counters_t &counters)
               << counters.voluntary_switches << " voluntary context switches\n";
     std::cerr << std::setw(12) << counters.direct_switches
               << " direct context switches\n";
+    std::cerr << std::setw(12) << counters.switch_sequence_injections
+              << " context switch sequence injections\n";
     print_percentage(static_cast<double>(counters.voluntary_switches),
                      static_cast<double>(counters.total_switches),
                      "% voluntary switches\n");
diff --git a/clients/drcachesim/tools/schedule_stats.h b/clients/drcachesim/tools/schedule_stats.h
index cc082a84232..95133455d4e 100644
--- a/clients/drcachesim/tools/schedule_stats.h
+++ b/clients/drcachesim/tools/schedule_stats.h
@@ -1,5 +1,5 @@
 /* **********************************************************
- * Copyright (c) 2023-2024 Google, Inc.  All rights reserved.
+ * Copyright (c) 2023-2025 Google, Inc.  All rights reserved.
  * **********************************************************/
 
 /*
@@ -216,6 +216,7 @@ class schedule_stats_t : public analysis_tool_t {
             syscalls += rhs.syscalls;
             maybe_blocking_syscalls += rhs.maybe_blocking_syscalls;
             direct_switch_requests += rhs.direct_switch_requests;
+            switch_sequence_injections += rhs.switch_sequence_injections;
             observed_migrations += rhs.observed_migrations;
             waits += rhs.waits;
             idles += rhs.idles;
@@ -252,6 +253,7 @@ class schedule_stats_t : public analysis_tool_t {
         int64_t syscalls = 0;
         int64_t maybe_blocking_syscalls = 0;
         int64_t direct_switch_requests = 0;
+        int64_t switch_sequence_injections = 0;
         // Our observed migrations will be <= the scheduler's reported migrations
         // for a dynamic schedule as we don't know the initial runqueue allocation
         // and so can't see the migration of an input that didn't execute in the

From e790c3da39e6c4925c4023ea45b0439e0849bed0 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 17:18:46 -0500
Subject: [PATCH 07/11] Fix injection count

---
 .../drcachesim/scheduler/scheduler_impl.cpp   | 96 +++++++++++--------
 clients/drcachesim/scheduler/scheduler_impl.h |  4 +-
 2 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 8d7d605b191..9d8c1f0c358 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -2308,42 +2308,6 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::set_cur_input(
         stream->page_size_ = inputs_[input].reader->get_page_size();
     }
 
-    if (prev_workload >= 0) {
-        if (inputs_[input].pid != INVALID_PID) {
-            insert_switch_tid_pid(inputs_[input]);
-        }
-        if (!switch_sequence_.empty()) {
-            switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
-            if (prev_workload != inputs_[input].workload)
-                switch_type = sched_type_t::SWITCH_PROCESS;
-            else
-                switch_type = sched_type_t::SWITCH_THREAD;
-            // Inject kernel context switch code.  Since the injected records belong to
-            // this input (the kernel is acting on behalf of this input) we insert them
-            // into the input's queue, but ahead of any prior queued items.  This is why
-            // we walk in reverse, for the push_front calls to the deque.  We update the
-            // tid of the records here to match.  They are considered as
-            // is_record_synthetic() and do not affect input stream ordinals.
-            // XXX: These will appear before the top headers of a new thread which is
-            // slightly odd to have regular records with the new tid before the top
-            // headers.
-            if (!switch_sequence_[switch_type].empty()) {
-                ++outputs_[output]
-                      .stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS];
-                for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
-                     i >= 0; --i) {
-                    RecordType record = switch_sequence_[switch_type][i];
-                    record_type_set_tid(record, inputs_[input].tid);
-                    inputs_[input].queue.emplace_front(record, IS_SYNTHETIC);
-                }
-                VPRINT(this, 3,
-                       "Inserted %zu switch records for type %d from %d.%d to %d.%d\n",
-                       switch_sequence_[switch_type].size(), switch_type, prev_workload,
-                       outputs_[output].prev_input, inputs_[input].workload, input);
-            }
-        }
-    }
-
     inputs_[input].prev_time_in_quantum =
         outputs_[output].cur_time->load(std::memory_order_acquire);
 
@@ -2436,27 +2400,77 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::pick_next_input(output_ordinal_t
     }
     // We can't easily place these stats inside set_cur_input() as we call that to
     // temporarily give up our input.
-    update_switch_stats(output, prev_index, index);
+    on_context_switch(output, prev_index, index);
     set_cur_input(output, index);
     return res;
 }
 
 template <typename RecordType, typename ReaderType>
 void
-scheduler_impl_tmpl_t<RecordType, ReaderType>::update_switch_stats(
+scheduler_impl_tmpl_t<RecordType, ReaderType>::on_context_switch(
     output_ordinal_t output, input_ordinal_t prev_input, input_ordinal_t new_input)
 {
+    bool do_inject_switch_seq = false;
     if (prev_input == new_input)
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_NOP];
     else if (prev_input != sched_type_t::INVALID_INPUT_ORDINAL &&
-             new_input != sched_type_t::INVALID_INPUT_ORDINAL)
+             new_input != sched_type_t::INVALID_INPUT_ORDINAL) {
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_INPUT];
-    else if (new_input == sched_type_t::INVALID_INPUT_ORDINAL)
+        do_inject_switch_seq = true;
+    } else if (new_input == sched_type_t::INVALID_INPUT_ORDINAL)
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE];
     else {
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_IDLE_TO_INPUT];
         // Reset the flag so we'll try to steal if we go idle again.
         outputs_[output].tried_to_steal_on_idle = false;
+        do_inject_switch_seq = true;
+    }
+
+    // We want to insert the context switch sequence on input-to-input and
+    // idle-to-input cases. This is a better control point to do that than
+    // set_cur_input. Here we get the stolen input events too, and we don't have
+    // to filter out the init-time set_cur_input cases.
+    // RFC: Was there any other benefit to doing this in set_cur_input before?
+    // It is harder to differentiate idle-to-input events from start of output when
+    // USE_INPUT_ORDINALS is set, since get_instruction_ordinal() does not return the
+    // intended global value in that case.
+    if (do_inject_switch_seq) {
+        if (inputs_[new_input].pid != INVALID_PID) {
+            insert_switch_tid_pid(inputs_[new_input]);
+        }
+        if (!switch_sequence_.empty()) {
+            switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
+            if (prev_input == sched_type_t::INVALID_INPUT_ORDINAL ||
+                inputs_[prev_input].workload != inputs_[new_input].workload)
+                switch_type = sched_type_t::SWITCH_PROCESS;
+            else
+                switch_type = sched_type_t::SWITCH_THREAD;
+            // Inject kernel context switch code.  Since the injected records belong to
+            // this input (the kernel is acting on behalf of this input) we insert them
+            // into the input's queue, but ahead of any prior queued items.  This is why
+            // we walk in reverse, for the push_front calls to the deque.  We update the
+            // tid of the records here to match.  They are considered as
+            // is_record_synthetic() and do not affect input stream ordinals.
+            // XXX: These will appear before the top headers of a new thread which is
+            // slightly odd to have regular records with the new tid before the top
+            // headers.
+            if (!switch_sequence_[switch_type].empty()) {
+                ++outputs_[output]
+                      .stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS];
+                for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
+                     i >= 0; --i) {
+                    RecordType record = switch_sequence_[switch_type][i];
+                    record_type_set_tid(record, inputs_[new_input].tid);
+                    inputs_[new_input].queue.emplace_front(record, IS_SYNTHETIC);
+                }
+                VPRINT(this, 3, "Inserted %zu switch for type %d from %d.%d to %d.%d\n",
+                       switch_sequence_[switch_type].size(), switch_type,
+                       prev_input != sched_type_t::INVALID_INPUT_ORDINAL
+                           ? inputs_[prev_input].workload
+                           : -1,
+                       prev_input, inputs_[new_input].workload, new_input);
+            }
+        }
     }
 }
 
@@ -2688,7 +2702,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             } else if (res == sched_type_t::STATUS_STOLE) {
                 // We need to loop to get the new record.
                 input = &inputs_[outputs_[output].cur_input];
-                update_switch_stats(output, prev_input, input->index);
+                on_context_switch(output, prev_input, input->index);
                 lock.unlock();
                 lock = std::unique_lock<mutex_dbg_owned>(*input->lock);
                 lock.lock();
diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h
index 4f8bfbe93b2..3b4c7fe07e2 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.h
+++ b/clients/drcachesim/scheduler/scheduler_impl.h
@@ -418,8 +418,8 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
     scale_blocked_time(uint64_t initial_time) const;
 
     void
-    update_switch_stats(output_ordinal_t output, input_ordinal_t prev_input,
-                        input_ordinal_t new_input);
+    on_context_switch(output_ordinal_t output, input_ordinal_t prev_input,
+                      input_ordinal_t new_input);
 
     ///
     ///////////////////////////////////////////////////////////////////////////

From 3502e28d5cbfc3f8e5e6ea4ba534334c3a7818ed Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Mon, 24 Feb 2025 19:20:02 -0500
Subject: [PATCH 08/11] Change switch_insertion to use schedule_stats to verify
 switch injection count

---
 .../drcachesim/scheduler/scheduler_impl.cpp   | 12 ++++---
 .../tests/switch_insertion.templatex          | 32 +++++++++++++++----
 suite/tests/CMakeLists.txt                    |  2 +-
 3 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 9d8c1f0c358..e7712d2eec5 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -2431,9 +2431,11 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::on_context_switch(
     // set_cur_input. Here we get the stolen input events too, and we don't have
     // to filter out the init-time set_cur_input cases.
     // RFC: Was there any other benefit to doing this in set_cur_input before?
-    // It is harder to differentiate idle-to-input events from start of output when
-    // USE_INPUT_ORDINALS is set, since get_instruction_ordinal() does not return the
-    // intended global value in that case.
+    // If this logic is kept a part of set_cur_input, when USE_INPUT_ORDINALS is
+    // set, it is harder to differentiate between idle-to-input events and start
+    // of output; this is because get_instruction_ordinal() does not return the
+    // intended global value in that case (it returns the input-local value
+    // which is also not adjusted for the scheduler read-ahead, so is non-zero).
     if (do_inject_switch_seq) {
         if (inputs_[new_input].pid != INVALID_PID) {
             insert_switch_tid_pid(inputs_[new_input]);
@@ -2758,8 +2760,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::update_next_record(output_ordinal
                    outputs_[output].base_timestamp);
         }
         // FIXME: When USE_INPUT_ORDINALS is enabled, this returns the input-local
-        // instruction ordinal (which not only is not global, but it would also count
-        // the read-ahead instructions).
+        // instruction ordinal (which not only is not global, but also counts the
+        // read-ahead instructions).
         uint64_t instr_ord = outputs_[output].stream->get_instruction_ordinal();
         uint64_t idle_count = outputs_[output].idle_count;
         uintptr_t new_time = static_cast<uintptr_t>(
diff --git a/clients/drcachesim/tests/switch_insertion.templatex b/clients/drcachesim/tests/switch_insertion.templatex
index b8668f659dd..096cf5355da 100644
--- a/clients/drcachesim/tests/switch_insertion.templatex
+++ b/clients/drcachesim/tests/switch_insertion.templatex
@@ -1,8 +1,28 @@
-Basic counts tool results:
+Schedule stats tool results:
 Total counts:
-      [1-9][0-9][0-9][0-9][0-9][0-9] total \(fetched\) instructions
-        5971 total unique \(fetched\) instructions
-      [1-9][0-9][0-9][0-9][0-9][0-9] total userspace instructions
-          [1-9][0-9] total kernel instructions
-      [1-9][0-9][0-9][0-9][0-9][0-9] total non-fetched instructions
+           4 cores
+.*
+      638960 instructions
+          11 total context switches
+.*
+           6 voluntary context switches
+           0 direct context switches
+          11 context switch sequence injections
+.*
+           9 switches input-to-input
+           5 switches input-to-idle
+           2 switches idle-to-input
+.*
+Core #0 counts:
+.*
+      117031 instructions
+           8 total context switches
+.*
+           3 voluntary context switches
+           0 direct context switches
+           8 context switch sequence injections
+.*
+           6 switches input-to-input
+           3 switches input-to-idle
+           2 switches idle-to-input
 .*
diff --git a/suite/tests/CMakeLists.txt b/suite/tests/CMakeLists.txt
index aa1c39b6d21..9ec5d3d6f47 100644
--- a/suite/tests/CMakeLists.txt
+++ b/suite/tests/CMakeLists.txt
@@ -4067,7 +4067,7 @@ if (BUILD_CLIENTS)
         set(switch_file
           "${PROJECT_SOURCE_DIR}/clients/drcachesim/tests/mock_switch_sequences.x64.zip")
         torunonly_simtool(switch_insertion ${ci_shared_app}
-          "-indir ${thread_trace_dir} -tool basic_counts -core_sharded -sched_quantum 1000 -sched_switch_file ${switch_file}"
+          "-indir ${thread_trace_dir} -tool schedule_stats -core_sharded -sched_quantum 1000 -sched_switch_file ${switch_file}"
           "")
         set(tool.switch_insertion_rawtemp ON) # no preprocessor
 

From 052145309d5287faf3a4ad8980da3faa1d067f9c Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Tue, 25 Feb 2025 16:18:36 -0500
Subject: [PATCH 09/11] Reviewer suggested edits from diffbase PRs

---
 clients/drcachesim/common/memtrace_stream.h   |  4 +-
 .../drcachesim/scheduler/scheduler_impl.cpp   | 93 ++++++++++---------
 .../drcachesim/tests/scheduler_unit_tests.cpp |  2 +-
 clients/drcachesim/tools/schedule_stats.cpp   |  2 +-
 4 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/clients/drcachesim/common/memtrace_stream.h b/clients/drcachesim/common/memtrace_stream.h
index 44e48891e3a..b02b4dfddf7 100644
--- a/clients/drcachesim/common/memtrace_stream.h
+++ b/clients/drcachesim/common/memtrace_stream.h
@@ -113,9 +113,9 @@ class memtrace_stream_t {
          */
         SCHED_STAT_HIT_OUTPUT_LIMIT,
         /**
-         * Counts the instances when the context switch sequence was injected.
+         * Counts the instances when the kernel context switch sequence was injected.
          */
-        SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS,
+        SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS,
         /** Count of statistic types. */
         SCHED_STAT_TYPE_COUNT,
     };
diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index e7712d2eec5..9febd602219 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -669,7 +669,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
 #endif
         VPRINT(
             this, 1, "  %-35s: %9" PRId64 "\n", "Switch sequence injections",
-            outputs_[i].stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS]);
+            outputs_[i]
+                .stats[memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS]);
     }
 }
 
@@ -2417,9 +2418,13 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::on_context_switch(
              new_input != sched_type_t::INVALID_INPUT_ORDINAL) {
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_INPUT];
         do_inject_switch_seq = true;
-    } else if (new_input == sched_type_t::INVALID_INPUT_ORDINAL)
+    } else if (new_input == sched_type_t::INVALID_INPUT_ORDINAL) {
+        // XXX: For now, we do not inject a kernel context switch sequence on
+        // input-to-idle transitions (note that we do so on idle-to-input though).
+        // However, we may want to inject some other suitable sequence, but we're not
+        // sure yet.
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_INPUT_TO_IDLE];
-    else {
+    } else {
         ++outputs_[output].stats[memtrace_stream_t::SCHED_STAT_SWITCH_IDLE_TO_INPUT];
         // Reset the flag so we'll try to steal if we go idle again.
         outputs_[output].tried_to_steal_on_idle = false;
@@ -2430,50 +2435,46 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::on_context_switch(
     // idle-to-input cases. This is a better control point to do that than
     // set_cur_input. Here we get the stolen input events too, and we don't have
     // to filter out the init-time set_cur_input cases.
-    // RFC: Was there any other benefit to doing this in set_cur_input before?
-    // If this logic is kept a part of set_cur_input, when USE_INPUT_ORDINALS is
-    // set, it is harder to differentiate between idle-to-input events and start
-    // of output; this is because get_instruction_ordinal() does not return the
-    // intended global value in that case (it returns the input-local value
-    // which is also not adjusted for the scheduler read-ahead, so is non-zero).
-    if (do_inject_switch_seq) {
-        if (inputs_[new_input].pid != INVALID_PID) {
-            insert_switch_tid_pid(inputs_[new_input]);
-        }
-        if (!switch_sequence_.empty()) {
-            switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
-            if (prev_input == sched_type_t::INVALID_INPUT_ORDINAL ||
-                inputs_[prev_input].workload != inputs_[new_input].workload)
-                switch_type = sched_type_t::SWITCH_PROCESS;
-            else
-                switch_type = sched_type_t::SWITCH_THREAD;
-            // Inject kernel context switch code.  Since the injected records belong to
-            // this input (the kernel is acting on behalf of this input) we insert them
-            // into the input's queue, but ahead of any prior queued items.  This is why
-            // we walk in reverse, for the push_front calls to the deque.  We update the
-            // tid of the records here to match.  They are considered as
-            // is_record_synthetic() and do not affect input stream ordinals.
-            // XXX: These will appear before the top headers of a new thread which is
-            // slightly odd to have regular records with the new tid before the top
-            // headers.
-            if (!switch_sequence_[switch_type].empty()) {
-                ++outputs_[output]
-                      .stats[memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS];
-                for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1;
-                     i >= 0; --i) {
-                    RecordType record = switch_sequence_[switch_type][i];
-                    record_type_set_tid(record, inputs_[new_input].tid);
-                    inputs_[new_input].queue.emplace_front(record, IS_SYNTHETIC);
-                }
-                VPRINT(this, 3, "Inserted %zu switch for type %d from %d.%d to %d.%d\n",
-                       switch_sequence_[switch_type].size(), switch_type,
-                       prev_input != sched_type_t::INVALID_INPUT_ORDINAL
-                           ? inputs_[prev_input].workload
-                           : -1,
-                       prev_input, inputs_[new_input].workload, new_input);
-            }
-        }
+    if (!do_inject_switch_seq)
+        return;
+    if (inputs_[new_input].pid != INVALID_PID) {
+        insert_switch_tid_pid(inputs_[new_input]);
     }
+    if (switch_sequence_.empty())
+        return;
+    switch_type_t switch_type = sched_type_t::SWITCH_INVALID;
+    if ( // XXX: idle-to-input transitions are assumed to be process switches
+         // for now. But we may want to improve this heuristic.
+        prev_input == sched_type_t::INVALID_INPUT_ORDINAL ||
+        inputs_[prev_input].workload != inputs_[new_input].workload)
+        switch_type = sched_type_t::SWITCH_PROCESS;
+    else
+        switch_type = sched_type_t::SWITCH_THREAD;
+    if (switch_sequence_[switch_type].empty())
+        return;
+    // Inject kernel context switch code.  Since the injected records belong to
+    // this input (the kernel is acting on behalf of this input) we insert them
+    // into the input's queue, but ahead of any prior queued items.  This is why
+    // we walk in reverse, for the push_front calls to the deque.  We update the
+    // tid of the records here to match.  They are considered as
+    // is_record_synthetic() and do not affect input stream ordinals.
+    // XXX: These will appear before the top headers of a new thread which is
+    // slightly odd to have regular records with the new tid before the top
+    // headers.
+    ++outputs_[output]
+          .stats[memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS];
+    for (int i = static_cast<int>(switch_sequence_[switch_type].size()) - 1; i >= 0;
+         --i) {
+        RecordType record = switch_sequence_[switch_type][i];
+        record_type_set_tid(record, inputs_[new_input].tid);
+        inputs_[new_input].queue.emplace_front(record, IS_SYNTHETIC);
+    }
+    VPRINT(this, 3, "Inserted %zu switch for type %d from %d.%d to %d.%d\n",
+           switch_sequence_[switch_type].size(), switch_type,
+           prev_input != sched_type_t::INVALID_INPUT_ORDINAL
+               ? inputs_[prev_input].workload
+               : -1,
+           prev_input, inputs_[new_input].workload, new_input);
 }
 
 template <typename RecordType, typename ReaderType>
diff --git a/clients/drcachesim/tests/scheduler_unit_tests.cpp b/clients/drcachesim/tests/scheduler_unit_tests.cpp
index bbb9b47ca6a..ee2721d43ab 100644
--- a/clients/drcachesim/tests/scheduler_unit_tests.cpp
+++ b/clients/drcachesim/tests/scheduler_unit_tests.cpp
@@ -5955,7 +5955,7 @@ test_kernel_switch_sequences(bool use_input_ordinals)
         assert(switch_seq_count[i] > 0);
         assert(switch_seq_count[i] ==
                static_cast<uint64>(outputs[i]->get_schedule_statistic(
-                   memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS)));
+                   memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS)));
     }
     // Check the high-level strings.
     for (int i = 0; i < NUM_OUTPUTS; i++) {
diff --git a/clients/drcachesim/tools/schedule_stats.cpp b/clients/drcachesim/tools/schedule_stats.cpp
index abc1ae6e044..fd7b4849bb4 100644
--- a/clients/drcachesim/tools/schedule_stats.cpp
+++ b/clients/drcachesim/tools/schedule_stats.cpp
@@ -168,7 +168,7 @@ schedule_stats_t::get_scheduler_stats(memtrace_stream_t *stream, counters_t &cou
         stream->get_schedule_statistic(memtrace_stream_t::SCHED_STAT_HIT_OUTPUT_LIMIT));
     counters.switch_sequence_injections =
         static_cast<int64_t>(stream->get_schedule_statistic(
-            memtrace_stream_t::SCHED_STAT_SWITCH_SEQUENCE_INJECTIONS));
+            memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS));
 
     // XXX: Currently, schedule_stats is measuring swap-ins to a real input.  If we
     // want to match what "perf" targeting this app would record, which is swap-outs,

From 6bc5d2ca494d659d746a1393e45797101163fcd0 Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Tue, 25 Feb 2025 16:32:41 -0500
Subject: [PATCH 10/11] Add real_records_in_queue tracking

---
 clients/drcachesim/scheduler/scheduler_impl.cpp | 9 ++++++---
 clients/drcachesim/scheduler/scheduler_impl.h   | 2 +-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 1f25bae38a0..0d0ad3d2fcc 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -858,7 +858,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::init(
                 // TODO i#5843: Add more flags for other options.
                 : spec_type_t::LAST_FROM_TRACE,
             static_cast<int>(get_time_micros()),
-            cached_record_t(create_invalid_record(), false), verbosity_);
+            cached_record_t(create_invalid_record(), IS_SYNTHETIC), verbosity_);
         if (options_.single_lockstep_output)
             outputs_.back().stream = global_stream_.get();
         if (options_.schedule_record_ostream != nullptr) {
@@ -2573,9 +2573,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             input->cur_from_queue = true;
             input->is_cur_record_real = input->queue.front().is_real;
             input->queue.pop_front();
-            if (input->is_cur_record_real) {
+            if (input->is_cur_record_real)
                 --input->real_records_in_queue;
-            }
         } else {
             // We again have a flag check because reader_t::init() does an initial ++
             // and so we want to skip that on the first record but perform a ++ prior
@@ -2637,6 +2636,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
             // the lock since another output may grab this input.
             VPRINT(this, 5, "next_record[%d]: queuing candidate record\n", output);
             input->queue.emplace_back(record, input->is_cur_record_real);
+            if (input->is_cur_record_real)
+                ++input->real_records_in_queue;
             lock.unlock();
             res = pick_next_input(output, blocked_time);
             if (res != sched_type_t::STATUS_OK && res != sched_type_t::STATUS_WAIT &&
@@ -2682,6 +2683,8 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::next_record(output_ordinal_t outp
                     // Get our candidate record back.
                     record = input->queue.back().record;
                     input->is_cur_record_real = input->queue.back().is_real;
+                    if (input->is_cur_record_real)
+                        --input->real_records_in_queue;
                     input->queue.pop_back();
                 }
             }
diff --git a/clients/drcachesim/scheduler/scheduler_impl.h b/clients/drcachesim/scheduler/scheduler_impl.h
index 3b4c7fe07e2..6e886f456fe 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.h
+++ b/clients/drcachesim/scheduler/scheduler_impl.h
@@ -204,7 +204,7 @@ template <typename RecordType, typename ReaderType> class scheduler_impl_tmpl_t
         // If non-empty these records should be returned before incrementing the reader.
         // This is used for read-ahead and inserting synthetic records.
         // We use a deque so we can iterate over it.
-
+        // Remember to adjust real_records_in_queue when adding or removing from this.
         std::deque<cached_record_t> queue;
         uint64_t real_records_in_queue = 0;
         bool cur_from_queue;

From b16b7c5c0fe270db5b0750e9e5374bb1df601c4a Mon Sep 17 00:00:00 2001
From: Abhinav Anil Sharma <sharmaabhinav@google.com>
Date: Tue, 25 Feb 2025 16:37:48 -0500
Subject: [PATCH 11/11] Include 'kernel' in log

---
 clients/drcachesim/scheduler/scheduler_impl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clients/drcachesim/scheduler/scheduler_impl.cpp b/clients/drcachesim/scheduler/scheduler_impl.cpp
index 0d0ad3d2fcc..a077f2325cb 100644
--- a/clients/drcachesim/scheduler/scheduler_impl.cpp
+++ b/clients/drcachesim/scheduler/scheduler_impl.cpp
@@ -668,7 +668,7 @@ scheduler_impl_tmpl_t<RecordType, ReaderType>::~scheduler_impl_tmpl_t()
                outputs_[i].ready_queue.lock->get_count_contended());
 #endif
         VPRINT(
-            this, 1, "  %-35s: %9" PRId64 "\n", "Switch sequence injections",
+            this, 1, "  %-35s: %9" PRId64 "\n", "Kernel switch sequence injections",
             outputs_[i]
                 .stats[memtrace_stream_t::SCHED_STAT_KERNEL_SWITCH_SEQUENCE_INJECTIONS]);
     }